Java tutorial
/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.argumentation.data.roomfordebate;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.URLEncoder;
import java.util.*;

/**
 * Main class for scraping data from Room for Debate
 *
 * @author Ivan Habernal
 */
public class DataFetcher
{
    private static final int FIRST_N_COMMENTS = 8;

    /**
     * Crawls all URLs from the given list and stores them in the output folder; files that
     * already exist in the output folder are skipped
     *
     * @param urls      list of URLs for Room for Debate
     * @param outputDir output folder
     * @throws IOException ex
     */
    public static void crawlPages(List<String> urls, File outputDir)
            throws IOException
    {
        for (String url : urls) {
            // the URL-encoded address serves as the cache file name
            File outFile = new File(outputDir, URLEncoder.encode(url, "utf-8") + ".html");

            if (!outFile.exists()) {
                NYTimesCommentsScraper nyTimesCommentsScraper = new NYTimesCommentsScraper();

                String html;
                try {
                    html = nyTimesCommentsScraper.readHTML(url);
                }
                catch (InterruptedException e) {
                    throw new IOException(e);
                }

                // store with the same encoding used later when reading the cached pages back
                FileUtils.writeStringToFile(outFile, html, "utf-8");
            }
        }
    }

    public static void main(String[] args)
            throws Exception
    {
        File crawledPagesFolder = new File(args[0]);
        if (!crawledPagesFolder.exists()) {
            crawledPagesFolder.mkdirs();
        }

        File outputFolder = new File(args[1]);
        if (!outputFolder.exists()) {
            outputFolder.mkdirs();
        }

        // read links from a text file on the classpath
        final String urlsResourceName = "roomfordebate-urls.txt";
        InputStream urlsStream = DataFetcher.class.getClassLoader()
                .getResourceAsStream(urlsResourceName);
        if (urlsStream == null) {
            throw new IOException(
                    "Cannot find resource " + urlsResourceName + " on the classpath");
        }

        // read list of URLs
        List<String> urls = new ArrayList<>();
        LineIterator iterator = IOUtils.lineIterator(urlsStream, "utf-8");
        while (iterator.hasNext()) {
            // ignore commented URLs (lines starting with #) and blank lines
            String line = iterator.nextLine();
            if (!line.startsWith("#") && !line.trim().isEmpty()) {
                urls.add(line.trim());
            }
        }

        // download all pages that are not cached yet
        crawlPages(urls, crawledPagesFolder);

        List<File> files = new ArrayList<>(FileUtils.listFiles(crawledPagesFolder, null, false));
        Collections.sort(files, new Comparator<File>()
        {
            @Override
            public int compare(File o1, File o2)
            {
                return o1.getName().compareTo(o2.getName());
            }
        });

        int idCounter = 0;

        for (File file : files) {
            NYTimesCommentsScraper commentsScraper = new NYTimesCommentsScraper();
            NYTimesArticleExtractor extractor = new NYTimesArticleExtractor();

            String html = FileUtils.readFileToString(file, "utf-8");

            idCounter++;

            File outputFileArticle = new File(outputFolder, String.format("Cx%03d.txt", idCounter));
            File outputFileComments = new File(outputFolder, String.format("Dx%03d.txt", idCounter));

            try {
                List<Comment> comments = commentsScraper.extractComments(html);
                Article article = extractor.extractArticle(html);

                saveArticleToText(article, outputFileArticle);
                System.out.println("Saved to " + outputFileArticle);

                saveCommentsToText(comments, outputFileComments, article);
                System.out.println("Saved to " + outputFileComments);
            }
            catch (IOException ex) {
                System.err.println(file.getName() + "\n" + ex.getMessage());
            }
        }
    }

    /**
     * Saves the first N comments of a given article to a text file
     *
     * @param comments   comments
     * @param outputFile output file
     * @param article    corresponding article
     * @throws IOException exception
     */
    private static void saveCommentsToText(List<Comment> comments, File outputFile,
            Article article)
            throws IOException
    {
        PrintWriter pw = new PrintWriter(outputFile, "utf-8");

        // take at most the first FIRST_N_COMMENTS comments
        List<Comment> firstComments = comments.subList(0,
                Math.min(comments.size(), FIRST_N_COMMENTS));

        // collect IDs for mapping comments to 1..N
        List<String> ids = new ArrayList<>();
        for (Comment comment : firstComments) {
            ids.add(comment.getId());
        }

        // header
        pw.printf("Debate title: %s%n%nDebate description: %s%n%nArticle title: %s%n%n",
                article.getDebateTitle(), article.getDebateDescription(), article.getTitle());

        for (Comment comment : firstComments) {
            pw.printf("#%s %s%s%n%n%s%n%n%n", ids.indexOf(comment.getId()) + 1,
                    comment.getCommenterName().replaceAll("\\s+", "_"),
                    comment.getParentId() != null ?
                            " ReactsTo #" + (ids.indexOf(comment.getParentId()) + 1) : "",
                    StringUtils.join(comment.getText().split("\n"), "\n\n"));
        }

        IOUtils.closeQuietly(pw);
    }

    /**
     * Saves the article to the output file
     *
     * @param article    article
     * @param outputFile output file
     * @throws IOException exception
     */
    public static void saveArticleToText(Article article, File outputFile)
            throws IOException
    {
        PrintWriter pw = new PrintWriter(outputFile, "utf-8");

        pw.printf("Debate title: %s%n%nDebate description: %s%n%nArticle title: %s%n%n%s",
                article.getDebateTitle(), article.getDebateDescription(), article.getTitle(),
                StringUtils.join(article.getText().split("\n"), "\n\n"));

        IOUtils.closeQuietly(pw);
    }
}
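
A minimal sketch of how the class above might be driven, assuming the compiled classes and a roomfordebate-urls.txt resource (one URL per line, lines starting with # ignored) are on the classpath. The DataFetcherDemo class, the folder names, and the example URL below are placeholders introduced only for illustration and are not part of the original project:

package de.tudarmstadt.ukp.argumentation.data.roomfordebate;

import java.io.File;
import java.util.Collections;
import java.util.List;

/**
 * Illustration only -- not part of the original project.
 */
public class DataFetcherDemo
{
    public static void main(String[] args) throws Exception
    {
        // Variant 1: cache a single page programmatically. The URL is a placeholder;
        // crawlPages skips any URL whose cached HTML file already exists.
        List<String> urls = Collections.singletonList(
                "http://www.nytimes.com/roomfordebate/some-debate-page");
        File cacheDir = new File("crawled-pages");
        DataFetcher.crawlPages(urls, cacheDir);

        // Variant 2: run the full pipeline. URLs are read from the classpath resource
        // roomfordebate-urls.txt, pages are cached in the first folder, and the
        // CxNNN.txt / DxNNN.txt exports are written into the second folder.
        DataFetcher.main(new String[] { "crawled-pages", "roomfordebate-output" });
    }
}

Because crawlPages only downloads pages whose cached HTML file is missing, re-running either variant reuses the cached pages instead of hitting the site again.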