de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesCommentsScraper.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesCommentsScraper.java

Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.argumentation.data.roomfordebate;

import de.tudarmstadt.ukp.argumentation.cleaning.TextCleaningUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;

import java.io.IOException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes reader comments attached to Room for Debate articles.
 * <p>
 * {@link #readHTML(String)} renders a page in Firefox and returns its HTML;
 * {@link #extractComments(String)} turns such HTML into a list of {@link Comment} objects.
 *
 * @author Ivan Habernal
 */
public class NYTimesCommentsScraper {

    /**
     * {@link SimpleDateFormat} pattern of absolute comment dates (parsed with the English locale).
     */
    private static final String ABSOLUTE_DATE_FORMAT = "dd MMM yyyy";

    /**
     * Relative comment dates such as "3 days ago"; the singular "1 day ago" is accepted as well.
     * Compiled once because {@link Pattern} is immutable and safe to share.
     */
    private static final Pattern DAYS_AGO_PATTERN = Pattern.compile("(\\d+) days? ago");

    /**
     * Downloads the page using {@link FirefoxDriver} and returns the rendered HTML.
     * <p>
     * Rolling out the entire discussion is currently disabled (see the TODO in the body), so
     * only the comments present after the initial page load are part of the returned HTML.
     * The browser session is always terminated, even if loading the page fails.
     *
     * @param articleUrl url, e.g. {@code http://www.nytimes.com/roomfordebate/2015/02/04/regulate-internet-providers/the-internet-is-back-to-solid-regulatory-ground}
     * @return generated HTML code of the entire page
     * @throws InterruptedException kept in the signature for the (currently disabled) roll-out
     *                              loop, which sleeps between clicks
     */
    public String readHTML(String articleUrl) throws InterruptedException {
        WebDriver driver = new FirefoxDriver();
        try {
            // load the url
            driver.get(articleUrl);

            // roll-out the entire discussion
            // TODO fix that, is broken in actual versions of Selenium/Firefox
            /*
            List<WebElement> commentsExpandElements;
            do {
            commentsExpandElements = driver.findElements(By.cssSelector("div.comments-expand"));

            // click on each of them
            for (WebElement commentsExpandElement : commentsExpandElements) {
                // only if visible & enabled
                if (commentsExpandElement.isDisplayed() && commentsExpandElement.isEnabled()) {
                    commentsExpandElement.click();

                    // give it some time to load new comments
                    Thread.sleep(3000);
                }
            }
            }
            // until there is one remaining that doesn't do anything...
            while (commentsExpandElements.size() > 1);
            */

            // get the html
            return driver.getPageSource();
        } finally {
            // quit() (rather than close()) ends the whole session, and doing it in finally
            // prevents a leaked browser when loading or reading the page throws
            driver.quit();
        }
    }

    /**
     * Extracts comments from the given HTML.
     * <p>
     * Every {@code article} element inside {@code #commentsContainer} becomes one
     * {@link Comment}. Metadata is read from its {@code header} child, the recommendation
     * count from its {@code footer} child and the text from its {@code p} child. Pieces that
     * are missing in the HTML are simply left unset on the comment.
     *
     * @param html loaded html
     * @return list of comments in document order (never null)
     * @throws IOException declared for callers; not thrown by the parsing done here
     */
    public List<Comment> extractComments(String html) throws IOException {
        List<Comment> result = new ArrayList<Comment>();

        Document doc = Jsoup.parse(html, getBaseName());

        for (Element element : doc.select("#commentsContainer article")) {
            Comment comment = new Comment();

            // id
            comment.setId(element.attr("data-id"));

            // parent id; "0" marks a top-level comment and attr() yields "" when the
            // attribute is absent -- neither denotes a real parent
            String parentId = element.attr("data-parentid");
            comment.setParentId(parentId.isEmpty() || "0".equals(parentId) ? null : parentId);

            // previous comment (if available)
            Comment previousComment = result.isEmpty() ? null : result.get(result.size() - 1);
            // if the previous comment has parent, the current comment is a reaction to the previous one
            if (previousComment != null && previousComment.getParentId() != null) {
                comment.setPreviousPostId(previousComment.getId());
            }

            // now metadata and content (direct children only)
            for (Node child : element.childNodes()) {
                if (!(child instanceof Element)) {
                    continue;
                }

                Element childElement = (Element) child;
                String nodeName = childElement.nodeName();

                if ("header".equals(nodeName)) {
                    readHeader(childElement, comment);
                } else if ("footer".equals(nodeName)) {
                    // recommendations
                    String recommendCount = childElement.select("span.recommend-count").text();
                    if (!recommendCount.isEmpty()) {
                        comment.setVoteUpCount(Integer.valueOf(recommendCount));
                    }
                } else if ("p".equals(nodeName)) {
                    // the text, cleaned and normalized
                    String text = paragraphElementToString(childElement);
                    comment.setText(TextCleaningUtils.normalizeWithParagraphs(text));
                }
            }

            result.add(comment);
        }

        return result;
    }

    /**
     * Copies commenter name, location, trusted flag and timestamp from a comment header.
     *
     * @param header  the {@code header} element of a comment
     * @param comment comment to fill
     */
    private void readHeader(Element header, Comment comment) {
        // first() returns null for an empty selection, so a missing element no longer
        // aborts the whole extraction with NoSuchElementException
        Element name = header.select("h3.commenter").first();
        if (name != null) {
            comment.setCommenterName(TextCleaningUtils.normalizeWithParagraphs(name.text()));
        }

        Element location = header.select("span.commenter-location").first();
        if (location != null) {
            comment.setCommenterLocation(
                    TextCleaningUtils.normalizeWithParagraphs(location.text()));
        }

        comment.setCommenterTrusted(header.select("i.trusted-icon").size() == 1);

        // time
        Date timestamp = parseTimestamp(header.select("a.comment-time").text());
        if (timestamp != null) {
            comment.setTimestamp(timestamp);
        }
    }

    /**
     * Parses the displayed comment time, given either as an absolute date
     * ({@code dd MMM yyyy}, English month names) or relatively as "x days ago".
     *
     * @param dateText text of the comment-time link
     * @return parsed date; for the relative form the current time shifted back by the given
     * number of days; {@code null} if the text matches neither form
     */
    private static Date parseTimestamp(String dateText) {
        try {
            // SimpleDateFormat is not thread-safe, hence a fresh instance per call
            return new SimpleDateFormat(ABSOLUTE_DATE_FORMAT, Locale.ENGLISH).parse(dateText);
        } catch (ParseException ignored) {
            // not an absolute date; maybe it's "x days ago"
        }

        Matcher matcher = DAYS_AGO_PATTERN.matcher(dateText);
        if (!matcher.find()) {
            return null;
        }

        int xDaysAgo = Integer.parseInt(matcher.group(1));

        // translate to Java date
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_YEAR, -xDaysAgo);
        return cal.getTime();
    }

    /**
     * Converts the direct children of a comment paragraph to plain text: text nodes are
     * copied, {@code br} becomes a line break, {@code a} is replaced by its {@code href}
     * value and any other element by its text (the latter two surrounded by spaces).
     *
     * @param pElement paragraph element
     * @return plain text
     */
    public String paragraphElementToString(Element pElement) {
        StringBuilder sb = new StringBuilder();

        for (Node child : pElement.childNodes()) {
            if (child instanceof TextNode) {
                sb.append(((TextNode) child).text());
            } else if (child instanceof Element) {
                Element element = (Element) child;
                String tagName = element.tag().getName();

                if ("br".equals(tagName)) {
                    // append new line for break
                    sb.append("\n");
                } else if ("a".equals(tagName)) {
                    // extract link from a.href
                    sb.append(" ").append(element.attr("href")).append(" ");
                } else {
                    // or just add the text
                    sb.append(" ").append(element.text()).append(" ");
                }
            }
        }

        return sb.toString();
    }

    /**
     * Returns the base URI handed to Jsoup when parsing the HTML in
     * {@link #extractComments(String)}.
     *
     * @return base name of the site
     */
    public String getBaseName() {
        return "www.nytimes.com";
    }
}