proxi.model.ArticleInflater.java Source code

Java tutorial

Introduction

Here is the source code for proxi.model.ArticleInflater.java

Source

/*
 *  ArticleInflater.java
 *  
 *  This file is part of Proxi project.
 *  
 *  Victor Purcallas Marchesi <vpurcallas@gmail.com>
 *  
 *  This class inflates the Articles so they can be added to a given Diary.
 *  It also inflates the Commentaries and adds them to the Article.
 *  
 *  To do the task this class requires the selenium-server-standalone jar
 *  The process begins opening a Browser thanks to the Selenium library and
 *  parsing the required URLs contained in the required Diary
 *  The Diary contains the necessary RegEx to parse the HTML
 *  
 *        
 *
 *  Proxi project is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Proxi project is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Proxi project.  If not, see <http://www.gnu.org/licenses/>. 
 */

package proxi.model;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.joda.time.DateTime;
import org.openqa.selenium.By;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.Point;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;

import proxi.model.objects.*;

/**
 * Builds an {@link Article} (basic data plus its commentaries) by loading a
 * URL in a Selenium-driven Firefox browser and reading the page with the
 * XPath expressions supplied by a {@link Diary}.
 * <p>
 * The browser is held in a static field and is shared by every instance of
 * this class; it is opened lazily by {@link #run(String, Diary)} and released
 * by {@link #close()} / {@link #closeDriver()}. Access to the shared browser
 * is not synchronized.
 *
 * @author VictorPurMar <vpurcallas@gmail.com>
 *
 */
public class ArticleInflater {
    private static final String HTTP_PREFIX = "http://";
    private static final String HTTPS_PREFIX = "https://";

    // Placeholder stored when no date/time can be read from the page
    private static final String WRONG_DATE = "Fecha erronea";

    // Web driver of Selenium
    // FirefoxDriver in that case
    private static WebDriver driver;

    private String analyzedUrl;
    private Diary diary;
    // Commentaries already added, used to skip repeated ones
    private Set<Commentary> analyzedComments;
    // Running counter used to number commentaries that expose no number
    private int commentCounter;

    // Constructor

    public ArticleInflater() {
        analyzedComments = new HashSet<Commentary>();
        commentCounter = 0;
    }

    // Public Methods

    /**
     * Start the process to inflate an Article from the given url using the
     * parsing rules of the received Diary.
     *
     * @param url
     *            contains the url that will be parsed using selenium
     *            FirefoxDriver; "http://" is prepended when the url has no
     *            http/https scheme
     * @param diary
     *            is a Diary that will be stored as an attribute
     * @return Article inflated with content, its commentaries ordered by
     *         number; {@code null} when the page could not be loaded or its
     *         basic data could not be parsed
     */
    public Article run(String url, Diary diary) {

        this.diary = diary;
        this.analyzedUrl = url;

        // Open or reuse the browser
        if (driver == null) {
            driver = new FirefoxDriver();
            // Move the window off-screen and shrink it so it stays out of the way
            driver.manage().window().setPosition(new Point(-2000, -2000));
            driver.manage().window().setSize(new Dimension(100, 100));
        }

        // Make the article from the web
        Article article = articleController();

        // articleController returns null on failure: nothing to order then
        if (article != null) {
            orderByTime(article);
        }

        return article;
    }

    /**
     * Releases the shared browser. Equivalent to {@link #closeDriver()}.
     */
    public void close() {
        closeDriver();
    }

    /**
     * Quits the shared browser, if one is open, and forgets it so that the
     * next {@link #run(String, Diary)} opens a fresh one. Does nothing when no
     * browser has been opened.
     */
    public static void closeDriver() {
        if (driver == null) {
            return;
        }
        try {
            driver.quit();
        } finally {
            // Never keep a reference to a dead session
            driver = null;
        }
    }

    // Private methods

    /**
     * Loads the analyzed url and inflates the Article: first the basic data,
     * then the commentaries. Any failure is printed and not propagated.
     *
     * @return Article with all the content that could be read; {@code null}
     *         when the failure happened before the Article was created
     */
    private Article articleController() {
        Article article = null;

        try {
            if (!hasHttpScheme(analyzedUrl)) {
                analyzedUrl = HTTP_PREFIX + analyzedUrl;
            }
            driver.get(analyzedUrl);

            // Article maker and basic data inflater
            article = articleBasicMaker();

            // Article commentary inflater
            article = commentInflater(article);

        } catch (Exception e) {
            e.printStackTrace();
        }
        return article;
    }

    /**
     * Tells whether the url already starts with an http or https scheme
     * (compared ignoring case).
     *
     * @param url
     *            url to check, must not be null
     * @return true when the url starts with "http://" or "https://"
     */
    private static boolean hasHttpScheme(String url) {
        return url.regionMatches(true, 0, HTTP_PREFIX, 0, HTTP_PREFIX.length())
                || url.regionMatches(true, 0, HTTPS_PREFIX, 0, HTTPS_PREFIX.length());
    }

    /**
     * Makes an Article parsing the loaded page with the XPath expressions
     * contained in this.diary.
     *
     * @return an Article without comments
     */
    private Article articleBasicMaker() {

        // Article common data inflater
        String title = driver.findElement(By.xpath(this.diary.getTitleRegEx())).getText();
        String subtitle = driver.findElement(By.xpath(this.diary.getSubtitleRegEx())).getText();
        String author = DataFixer.authorFixer(this.diary.getDiaryBasicUrl(),
                driver.findElement(By.xpath(this.diary.getAuthorRegEx())).getText());

        // Date: text of the date element, else the "datetime" attribute of
        // the comment-time element, else a placeholder
        String date;
        try {
            date = driver.findElement(By.xpath(this.diary.getDateRegEx())).getText();
        } catch (Exception x) {
            try {
                String attribute = driver.findElement(By.xpath(this.diary.getCommentTimeRegEx()))
                        .getAttribute("datetime");
                date = (attribute != null) ? attribute : WRONG_DATE;
            } catch (Exception y) {
                date = WRONG_DATE;
            }
        }

        String url = analyzedUrl;

        // Uses the host part of the url ("scheme://host/...") as diaryName
        String[] cleanDiary = analyzedUrl.split("/");
        String diaryName = cleanDiary[2];

        // Making the article
        Article article = new Article(title, subtitle, author, date, url, diaryName);

        try {
            DateTime dt = DataFixer.dataFixer(article);
            article.setDateTime(dt);
        } catch (Exception e) {
            System.err.print("Impossible to add DateTime to article");
        }
        return article;
    }

    /**
     * Inflates the given Article with comments, in the way the Diary
     * specifies: optionally opening the comments page first, and then either
     * paging with the "next" button or reading the comments already showed.
     *
     * @param article
     *            Article
     * @return Article inflated with comments
     */
    private Article commentInflater(Article article) {

        if (this.diary.getCommentsPage() != null) {
            WebElement element = driver.findElement(By.xpath(this.diary.getCommentsPage()));
            element.click();

            try {
                // wait to page load
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Keep the interruption visible to the caller
                Thread.currentThread().interrupt();
            }
        }

        if (this.diary.getNextButton() != null) {
            // Repeat the show more click until all the commentaries are showed
            loopNextClicker(article);

        } else {
            // Article Commentaries adder
            article = showedCommentsFiller(article);

        }
        return article;
    }

    /**
     * Receives an Article whose Diary uses a "next" button. Reads the showed
     * comments and clicks the button, repeating until the button can no
     * longer be found or clicked, or the thread is interrupted.
     *
     * @param article
     *            Article that receives all the comments
     */
    private void loopNextClicker(Article article) {
        boolean exit = false;
        while (!exit) {
            // Article Commentaries adder
            article = showedCommentsFiller(article);
            try {
                WebElement element = driver.findElement(By.xpath(this.diary.getNextButton()));
                element.click();
                // Correct one unexpected behavior and allow to process the
                // URL to the browser
                Thread.sleep(500);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                exit = true;
            } catch (Exception e) {
                // No (clickable) next button left: every page has been read
                exit = true;
            }
        }
    }

    /**
     * Adds the showed comments to an Article. Each Commentary is built with
     * the XPath expressions contained in the this.diary attributes, and is
     * added only when this.analyzedComments does not contain it yet.
     *
     * @param article
     *            Article
     * @return Article with the showed comments added
     */
    private Article showedCommentsFiller(Article article) {
        // Single snapshot of the showed commentaries, used for both the
        // count and the iteration
        ArrayList<WebElement> showed = new ArrayList<WebElement>(
                driver.findElements(By.xpath(this.diary.getCommentaryRegEx())));

        // Count the commentaries
        this.commentCounter += showed.size();

        for (WebElement e : showed) {

            // Commentary text, with line breaks removed
            String comment = e.findElement(By.xpath(this.diary.getCommentTextRegEx())).getText();
            comment = comment.replaceAll("\\n+", "");

            // Number: read from the page, else taken (descending) from the counter
            int n;
            try {
                n = Integer.parseInt(e.findElement(By.xpath(this.diary.getCommentNumberRegEx())).getText());
            } catch (Exception x) {
                n = commentCounter;
                commentCounter--;
            }

            // Author
            String commentaryAuthor = e.findElement(By.xpath(this.diary.getCommentAuthorRegEx())).getText();

            // Time: element text, else its "datetime" attribute, else a placeholder
            String time;
            try {
                time = e.findElement(By.xpath(this.diary.getCommentTimeRegEx())).getText();
            } catch (Exception x) {
                try {
                    String attribute = e.findElement(By.xpath(this.diary.getCommentTimeRegEx()))
                            .getAttribute("datetime");
                    time = (attribute != null) ? attribute : WRONG_DATE;
                } catch (Exception y) {
                    time = WRONG_DATE;
                }
            }

            // This condition add only non repeated comments
            // NOTE(review): relies on Commentary.equals/hashCode, not visible here - confirm
            Commentary c = new Commentary(commentaryAuthor, time, n, comment);
            if (this.analyzedComments.add(c)) {
                DateTime dt = DataFixer.dataFixer(article, c);
                c.setDateTime(dt);

                article.addCommentary(c);
            }
        }
        return article;
    }

    /**
     * Orders the commentaries of the article by ascending commentary number
     * and stores the ordered list back in the article. Does nothing when the
     * article has no commentaries list.
     *
     * @param ar
     *            Article
     */
    private void orderByTime(Article ar) {
        ArrayList<Commentary> commentaries = (ArrayList<Commentary>) ar.getCommentaries();
        if (commentaries == null) {
            return;
        }
        Collections.sort(commentaries, new Comparator<Commentary>() {
            @Override
            public int compare(Commentary s1, Commentary s2) {
                // Three-way comparison: no subtraction overflow, and equal
                // numbers compare as 0 as the Comparator contract requires
                if (s1.getNumber() < s2.getNumber()) {
                    return -1;
                }
                if (s1.getNumber() > s2.getNumber()) {
                    return 1;
                }
                return 0;
            }
        });
        ar.setCommentaries(commentaries);
    }
}