de.tudarmstadt.ukp.argumentation.data.roomfordebate.DataFetcher.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.argumentation.data.roomfordebate.DataFetcher.java

Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.argumentation.data.roomfordebate;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.URLEncoder;
import java.util.*;

/**
 * Main class for scraping data from Room for Debate.
 * <p>
 * The {@link #main(String[])} entry point reads a list of URLs from the classpath resource
 * {@code roomfordebate-urls.txt}, stores the HTML of each page in a local folder (pages already
 * present are not fetched again) and then exports every stored page as two plain-text files:
 * one with the article and one with the first {@value #FIRST_N_COMMENTS} comments.
 *
 * @author Ivan Habernal
 */
public class DataFetcher {

    /** Maximum number of comments exported per article. */
    private static final int FIRST_N_COMMENTS = 8;

    /** Character encoding used for all files read and written by this class. */
    private static final String ENCODING = "utf-8";

    /**
     * Crawls all URLs from the given list and stores them in the output folder; files that
     * already exist in the output folder are skipped. The file name is the URL-encoded URL
     * with the {@code .html} suffix.
     *
     * @param urls      list of urls for Room for debate
     * @param outputDir output
     * @throws IOException if a page cannot be written, or if reading a page was interrupted
     *                     (the thread's interrupt flag is restored in that case)
     */
    public static void crawlPages(List<String> urls, File outputDir) throws IOException {
        for (String url : urls) {
            // file name
            File outFile = new File(outputDir, URLEncoder.encode(url, ENCODING) + ".html");

            if (outFile.exists()) {
                continue;
            }

            NYTimesCommentsScraper nyTimesCommentsScraper = new NYTimesCommentsScraper();

            String html;
            try {
                html = nyTimesCommentsScraper.readHTML(url);
            } catch (InterruptedException e) {
                // keep the interruption visible to callers further up the stack
                Thread.currentThread().interrupt();
                throw new IOException(e);
            }

            // write with the same encoding that main() uses when reading the file back
            FileUtils.writeStringToFile(outFile, html, ENCODING);
        }
    }

    /**
     * Crawls the pages listed in {@code roomfordebate-urls.txt} and exports articles and comments.
     *
     * @param args {@code args[0]} is the folder for crawled HTML pages, {@code args[1]} is the
     *             folder for the exported text files; both are created if missing
     * @throws Exception if the arguments are missing, a folder cannot be created, the URL list
     *                   cannot be read, or crawling fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Expected two arguments: <crawledPagesFolder> <outputFolder>");
        }

        File crawledPagesFolder = new File(args[0]);
        ensureDirectory(crawledPagesFolder);

        File outputFolder = new File(args[1]);
        ensureDirectory(outputFolder);

        // read links from text file
        List<String> urls = readUrls("roomfordebate-urls.txt");

        // download all
        crawlPages(urls, crawledPagesFolder);

        // sort by file name so that the generated numeric IDs are assigned in a stable order
        List<File> files = new ArrayList<>(FileUtils.listFiles(crawledPagesFolder, null, false));
        Collections.sort(files, new Comparator<File>() {
            @Override
            public int compare(File o1, File o2) {
                return o1.getName().compareTo(o2.getName());
            }
        });

        int idCounter = 0;

        for (File file : files) {
            NYTimesCommentsScraper commentsScraper = new NYTimesCommentsScraper();
            NYTimesArticleExtractor extractor = new NYTimesArticleExtractor();

            String html = FileUtils.readFileToString(file, ENCODING);

            // the counter advances even if this file fails below, so IDs follow file order
            idCounter++;
            File outputFileArticle = new File(outputFolder, String.format("Cx%03d.txt", idCounter));
            File outputFileComments = new File(outputFolder, String.format("Dx%03d.txt", idCounter));

            try {
                List<Comment> comments = commentsScraper.extractComments(html);
                Article article = extractor.extractArticle(html);

                saveArticleToText(article, outputFileArticle);
                System.out.println("Saved to " + outputFileArticle);

                saveCommentsToText(comments, outputFileComments, article);
                System.out.println("Saved to " + outputFileComments);
            } catch (IOException ex) {
                // report the failing file and carry on with the remaining ones
                System.err.println(file.getName() + "\n" + ex.getMessage());
            }
        }
    }

    /**
     * Makes sure the given directory exists, creating it (and missing parents) if necessary.
     *
     * @param dir directory
     * @throws IOException if the directory does not exist and cannot be created
     */
    private static void ensureDirectory(File dir) throws IOException {
        // the trailing isDirectory() tolerates the directory being created concurrently
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory " + dir);
        }
    }

    /**
     * Reads URLs from a classpath resource, one per line; blank lines and lines whose first
     * non-whitespace character is {@code #} are ignored.
     *
     * @param resourceName name of the resource on the classpath
     * @return trimmed URLs in the order they appear in the resource
     * @throws IOException if the resource is not found or cannot be read
     */
    private static List<String> readUrls(String resourceName) throws IOException {
        List<String> urls = new ArrayList<>();

        try (InputStream urlsStream = DataFetcher.class.getClassLoader()
                .getResourceAsStream(resourceName)) {
            if (urlsStream == null) {
                throw new IOException("Cannot find resource " + resourceName + " on the classpath");
            }

            LineIterator iterator = IOUtils.lineIterator(urlsStream, ENCODING);
            while (iterator.hasNext()) {
                String line = iterator.nextLine().trim();
                // ignore commented url (line starts with #)
                if (!line.isEmpty() && !line.startsWith("#")) {
                    urls.add(line);
                }
            }
        }

        return urls;
    }

    /**
     * Saves first N comments of a given article to a text file. Comments are numbered from 1
     * in the order they are written; a comment whose parent is among the written comments is
     * marked with {@code ReactsTo #<parent number>}.
     *
     * @param comments   comments
     * @param outputFile output file
     * @param article    corresponding article
     * @throws IOException if the file cannot be opened or written
     */
    private static void saveCommentsToText(List<Comment> comments, File outputFile, Article article)
            throws IOException {
        // take first N comments
        List<Comment> firstComments = comments
                .subList(0, Math.min(comments.size(), FIRST_N_COMMENTS));

        // collect IDs for mapping to 1..N
        List<String> ids = new ArrayList<>(firstComments.size());
        for (Comment comment : firstComments) {
            ids.add(comment.getId());
        }

        try (PrintWriter pw = new PrintWriter(outputFile, ENCODING)) {
            // header
            pw.printf("Debate title: %s%n%nDebate description: %s%n%nArticle title: %s%n%n", article.getDebateTitle(),
                    article.getDebateDescription(), article.getTitle());

            for (Comment comment : firstComments) {
                // only reference a parent that is itself part of the output; otherwise the
                // reference would point to the non-existent comment #0
                String reactsTo = "";
                if (comment.getParentId() != null) {
                    int parentIndex = ids.indexOf(comment.getParentId());
                    if (parentIndex >= 0) {
                        reactsTo = " ReactsTo #" + (parentIndex + 1);
                    }
                }

                pw.printf("#%s %s%s%n%n%s%n%n%n", ids.indexOf(comment.getId()) + 1,
                        comment.getCommenterName().replaceAll("\\s+", "_"),
                        reactsTo,
                        StringUtils.join(comment.getText().split("\n"), "\n\n"));
            }

            // PrintWriter swallows I/O errors; surface them to the caller
            if (pw.checkError()) {
                throw new IOException("Failed to write " + outputFile);
            }
        }
    }

    /**
     * Saves article to the output file: a header with debate title, debate description and
     * article title, followed by the article text with every line break doubled.
     *
     * @param article    article
     * @param outputFile file
     * @throws IOException if the file cannot be opened or written
     */
    public static void saveArticleToText(Article article, File outputFile) throws IOException {
        try (PrintWriter pw = new PrintWriter(outputFile, ENCODING)) {
            pw.printf("Debate title: %s%n%nDebate description: %s%n%nArticle title: %s%n%n%s", article.getDebateTitle(),
                    article.getDebateDescription(), article.getTitle(),
                    StringUtils.join(article.getText().split("\n"), "\n\n"));

            // PrintWriter swallows I/O errors; surface them to the caller
            if (pw.checkError()) {
                throw new IOException("Failed to write " + outputFile);
            }
        }
    }
}