// Java tutorial
/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.argumentation.data.roomfordebate; import de.tudarmstadt.ukp.argumentation.cleaning.TextCleaningUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import org.openqa.selenium.WebDriver; import org.openqa.selenium.firefox.FirefoxDriver; import java.io.IOException; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Scraping comments to Room for Debate articles * * @author Ivan Habernal */ public class NYTimesCommentsScraper { /** * Downloads the page and rolls out the entire discussion using {@link FirefoxDriver}. * * @param articleUrl url, e.g. 
{@code http://www.nytimes.com/roomfordebate/2015/02/04/regulate-internet-providers/the-internet-is-back-to-solid-regulatory-ground} * @return generated HTML code of the entire page * @throws InterruptedException */ public String readHTML(String articleUrl) throws InterruptedException { // load the url WebDriver driver = new FirefoxDriver(); driver.get(articleUrl); // roll-out the entire discussion // TODO fix that, is broken in actual versions of Selenium/Firefox /* List<WebElement> commentsExpandElements; do { commentsExpandElements = driver.findElements(By.cssSelector("div.comments-expand")); // click on each of them for (WebElement commentsExpandElement : commentsExpandElements) { // only if visible & enabled if (commentsExpandElement.isDisplayed() && commentsExpandElement.isEnabled()) { commentsExpandElement.click(); // give it some time to load new comments Thread.sleep(3000); } } } // until there is one remaining that doesn't do anything... while (commentsExpandElements.size() > 1); */ // get the html String result = driver.getPageSource(); // close firefox driver.close(); return result; } /** * Extracts comments from the input html stream * * @param html loaded html * @return list of comments (never null) * @throws IOException exception */ public List<Comment> extractComments(String html) throws IOException { List<Comment> result = new ArrayList<Comment>(); Document doc = Jsoup.parse(html, getBaseName()); for (Element element : doc.select("#commentsContainer article")) { Comment comment = new Comment(); // id comment.setId(element.attr("data-id")); // parent id comment.setParentId(!element.attr("data-parentid").equals("0") ? element.attr("data-parentid") : null); // previous comment id (if available) Comment previousComment = result.isEmpty() ? 
null : result.get(result.size() - 1); // if the previous comment has parent, the current comment is a reaction to the previous one if (previousComment != null && previousComment.getParentId() != null) { comment.setPreviousPostId(previousComment.getId()); } // now metadata and content for (Node child : element.childNodes()) { if (child instanceof Element) { Element childElement = (Element) child; if ("header".equals(childElement.nodeName())) { comment.setCommenterName(TextCleaningUtils.normalizeWithParagraphs( childElement.select("h3.commenter").iterator().next().text())); comment.setCommenterLocation(TextCleaningUtils.normalizeWithParagraphs( childElement.select("span.commenter-location").iterator().next().text())); comment.setCommenterTrusted(childElement.select("i.trusted-icon").size() == 1); // time DateFormat df = new SimpleDateFormat("dd MMM yyyy", Locale.ENGLISH); String dateText = childElement.select("a.comment-time").text(); try { Date date = df.parse(dateText); comment.setTimestamp(date); } catch (ParseException e) { // maybe it's "x days ago" Pattern p = Pattern.compile("(\\d+) days ago"); Matcher m = p.matcher(dateText); while (m.find()) { // get the value int xDaysAgo = Integer.valueOf(m.group(1)); // translate to Java date Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_YEAR, (-xDaysAgo)); Date date = cal.getTime(); comment.setTimestamp(date); } } } // recommendations else if ("footer".equals(childElement.nodeName())) { Elements select = childElement.select("span.recommend-count"); if (!select.text().isEmpty()) { comment.setVoteUpCount(Integer.valueOf(select.text())); } } // the text else if ("p".equals(childElement.nodeName())) { String text = paragraphElementToString(childElement); // and do some cleaning and normalization String normalized = TextCleaningUtils.normalizeWithParagraphs(text); comment.setText(normalized); } } } result.add(comment); } return result; } /** * Extracts elements from the html comments (paragraph breaks, links) 
* * @param pElement paragraph element * @return plain text */ public String paragraphElementToString(Element pElement) { StringBuilder sb = new StringBuilder(); for (Node child : pElement.childNodes()) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; sb.append(textNode.text()); } else if (child instanceof Element) { Element element = (Element) child; // append new line for break if ("br".equals(element.tag().getName())) { sb.append("\n"); } else if ("a".equals(element.tag().getName())) { // extract link from a.href sb.append(" ").append(element.attr("href")).append(" "); } else { // or just add the text sb.append(" ").append(element.text()).append(" "); } } } return sb.toString(); } public String getBaseName() { return "www.nytimes.com"; } }