List of usage examples for org.jsoup.nodes Element select
public Elements select(String cssQuery)
From source file:scrapper.BSEScrapper.java
/**
 * Fetches the page at {@code url} and stores three pieces of text from it on
 * this object: the Sensex value, the change and the change percentage.
 *
 * <p>Each value is the combined text of whatever matches a hard-coded id
 * selector ({@code #ref_15173681_l}, {@code #ref_15173681_c} and
 * {@code #ref_15173681_cp} respectively).
 *
 * @param url address of the page to download
 * @throws Exception whatever the download or the setters raise
 */
public void parse(String url) throws Exception {
    final Element page = Jsoup.connect(url).get();

    final String sensexText = page.select("#ref_15173681_l").text();
    setSensexValue(sensexText);

    final String changeText = page.select("#ref_15173681_c").text();
    setChange(changeText);

    final String changePercentageText = page.select("#ref_15173681_cp").text();
    setChangePercentage(changePercentageText);
}
From source file:us.colloquy.sandbox.FileProcessor.java
@Test public void getURIForAllDiaries() { Set<DocumentPointer> uriList = new HashSet<>(); //String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries"; //// w w w . ja v a 2 s. c o m String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49"; Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) { stream.forEach(results::add); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } // System.out.println("========================== " + res.toString() + " =========================="); boolean startPrinting = false; boolean newFile = true; for (Element element : doc.getElementsByTag("navPoint")) { //get nav label and content Element navLabelElement = element.select("navLabel").first(); Element srsElement = element.select("content").first(); String navLabel = ""; String srs = ""; if (navLabelElement != null) { navLabel = navLabelElement.text().replaceAll("\\*", "").trim(); } if (srsElement != null) { srs = srsElement.attr("src"); } if ("??".matches(navLabel)) { startPrinting = false; // System.out.println("----------------- end of file pointer ---------------"); } if 
(StringUtils.isNotEmpty(navLabel) && navLabel.matches("??.*|?? ?.*") && newFile) { newFile = false; startPrinting = true; title = navLabel; } if (startPrinting) { // System.out.println("----------------- file pointer ---------------"); // System.out.println(navLabel + "\t" + srs); DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + srs.replaceAll("#.*", ""), title); uriList.add(documentPointer); } // for (Element child : element.children()) // { // String label = child.text(); // // if (StringUtils.isNotEmpty(label)) // { // if (label.matches("??\\s\\d{4}.*")) // { // System.out.println("------------------"); // } // // String url = child.getElementsByTag("content").attr("src"); // // if (label.matches(".*\\d{1,3}.*[?--?]+.*") && // StringUtils.isNotEmpty(url)) // { // DocumentPointer letterPointer = new DocumentPointer(parent.toString() // + File.separator + url.replaceAll("#.*", ""), title); // // uriList.add(letterPointer); //// System.out.println("nav point: " + label + " src " + parent.toString() //// + System.lineSeparator() + url.replaceAll("#.*","")); // // // } else if (label.matches(".*\\d{1,3}.*") && // StringUtils.isNotEmpty(url) && useOnlyNumber) // { // DocumentPointer letterPointer = new DocumentPointer(parent.toString() // + File.separator + url.replaceAll("#.*", ""), title); // // uriList.add(letterPointer); //// System.out.println("nav point: " + label + " src " + parent.toString() //// + System.lineSeparator() + url.replaceAll("#.*","")); // // // } else // { // // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); // } // // // } // } } // System.out.println("========================== END OF FILE =========================="); } } catch (Exception e) { e.printStackTrace(); } System.out.println("Size: " + uriList.size()); for (DocumentPointer pointer : uriList) { //parse and System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); } }
From source file:us.colloquy.util.DiaryParser.java
@Test public void useJsoup() { //File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries/dnevnik_1893(2)/OEBPS/Text/0001_1006_2001.xhtml"); // File input = new File(System.getProperty("user.home") + "/IdeaProjects/ElasticTest/temp/dnevnik_1862(1)/OEBPS/Text/0001_1006_2001.xhtml"); File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml"); String previousYear = ""; String sourse = "pointer"; List<DiaryEntry> diaryEntrys = new ArrayList<>(); try {//from w w w .ja v a 2 s . com Document doc = Jsoup.parse(input, "UTF-8"); for (Element element : doc.getElementsByClass("section")) { DiaryEntry diaryEntry = null; StringBuilder contentBuilder = new StringBuilder(); for (Element child : element.children()) { // for (Attribute att : child.attributes()) // { // // System.out.println(att.getKey() + " " + att.getValue()); // } //we need to assume that each element is a continuation unless the entry is a date that starts a new entry //the problem is to distinguish between an entry that contains date and place vs date within an entry //lets try to see if element is a date DiaryEntry diaryEntryToCollectDate = new DiaryEntry(); //we send it in two cases when text matches year or when text has em element Element em = child.select("em").first(); if (em == null && StringUtils.isNotEmpty(child.text())) { Matcher m = yearPattern.matcher(child.text()); if (m.find()) { child.text(m.group(1)); previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child); } } if (em != null) { previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child); } if (diaryEntryToCollectDate.getDate() != null) //this is the begginng of a new entry { System.out.println("Found date: " + diaryEntryToCollectDate.getDate()); //create new DiaryEntry if (diaryEntry != null) { diaryEntry.setEntry(contentBuilder.toString()); //add consecutive entries here 
diaryEntrys.add(diaryEntry); } diaryEntry = new DiaryEntry(); diaryEntry.setSource(sourse); diaryEntry.setDate(diaryEntryToCollectDate.getDate()); diaryEntry.setPlace(diaryEntryToCollectDate.getPlace()); contentBuilder = new StringBuilder(); } if (StringUtils.isNotEmpty(child.text()) && child.text().length() > 8) { contentBuilder.append(child.text() + "\n"); } // // System.out.println(child.tag() + "\n"); // System.out.println(child.outerHtml() + "\n" + child.text()); } //whatever we still have, add here: if (StringUtils.isNotEmpty(contentBuilder.toString()) && diaryEntry != null) { diaryEntry.setEntry(contentBuilder.toString()); diaryEntrys.add(diaryEntry); } } } catch (IOException e) { e.printStackTrace(); } for (DiaryEntry diaryEntry : diaryEntrys) { System.out.println(diaryEntry.toString()); } }
From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java
@Override public Debate parseDebate(InputStream inputStream) throws IOException { Debate result = new Debate(); Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/"); // Set the Url of the doc // title/*ww w.jav a2 s . c o m*/ Element body = doc.body(); Map<String, Elements> proConElements = new HashMap<>(); proConElements.put("pro", body.select("div[class=column pro]")); proConElements.put("con", body.select("div[class=column con]")); // Elements pro = body.select("div[class=column pro]"); // System.out.println(pro); // title result.setTitle(Utils.normalize(body.select("h2").text())); for (Map.Entry<String, Elements> entry : proConElements.entrySet()) { // stance String stance = entry.getKey(); Elements comments = entry.getValue().select("ul.comments > li[class^=comment]"); for (Element element : comments) { Element divContent = element.select("div.contents").iterator().next(); // extract argument content Argument argument = extractArgumentFromDivContent(divContent); // extract ID String parentId = element.attr("id").replace(":", "_"); if (parentId == null) { throw new IllegalStateException("Parent id must be known"); } argument.setId(parentId); // set stance - we know it argument.setStance(stance); result.getArgumentList().add(argument); Elements divReplies = element.select("li[class^=reply]"); // System.out.println(divReplies.size()); for (Element divReply : divReplies) { Element replyDivContent = divReply.select("div.contents").iterator().next(); // extract reply argument Argument replyArgument = extractArgumentFromDivContent(replyDivContent); // set id and parentId String id = element.attr("id").replace(":", "_"); replyArgument.setId(id); if (id == null) { throw new IllegalStateException("Id must be known"); } replyArgument.setParentId(parentId); // add to debate result.getArgumentList().add(replyArgument); } } } return result; }
From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java
protected static Argument extractArgumentFromDivContent(Element divContent) { Argument argument = new Argument(); Element blockquote = divContent.select("blockquote").iterator().next(); // System.out.println("----------"); String text = ProConOrgParser.extractPlainTextFromTextElement(blockquote); argument.setText(text);//from w w w. ja v a 2s . c om String votesUpText = divContent.select("span.votes-up").text(); String votesDownText = divContent.select("span.votes-down").text(); int votesUp = Integer.valueOf(votesUpText); int votesDown = Integer.valueOf(votesDownText); argument.setVoteUpCount(votesUp); argument.setVoteDownCount(Math.abs(votesDown)); argument.setAuthor(divContent.select("span.name").text()); // time DateFormat df = new SimpleDateFormat("MMM. dd, yyyy", Locale.ENGLISH); String dateText = divContent.select("span.date").text(); try { Date date = df.parse(dateText); argument.setTimestamp(date); } catch (ParseException e) { // e.printStackTrace(); } // System.out.println(argument); return argument; }
From source file:xxx.web.comments.debates.impl.ProConOrgParser.java
@Override public Debate parseDebate(InputStream inputStream) throws IOException { Debate result = new Debate(); Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/"); // Set the Url of the doc // title//from w ww . j a va 2 s.c om Element body = doc.body(); Elements debateTitleElements = body.select("h2"); // Elements debateTitleElements = body.select("p[class=title]").select("p[style]"); if (debateTitleElements.first() == null) { // not a debate return null; } String title = Utils.normalize(debateTitleElements.first().text()); result.setTitle(title); Elements proConTr = body.select("tr > td > b:contains(PRO \\(yes\\))"); if (proConTr == null || proConTr.parents() == null || proConTr.parents().first() == null || proConTr.parents().first().parents() == null || proConTr.parents().first().parents().first() == null || proConTr.parents().first().parents().first().nextElementSibling() == null) { // not a pro-con debate return null; } Element trAnswers = proConTr.parents().first().parents().first().nextElementSibling(); // the PRO side Element proTd = trAnswers.select("td").get(0); Element conTd = trAnswers.select("td").get(1); // System.out.println(proTd.select("blockquote").size()); // System.out.println(conTd.select("blockquote").size()); for (Element text : proTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("pro"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); // set ID idCounter++; argument.setId("pcq_" + idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } for (Element text : conTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("con"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); idCounter++; argument.setId("pcq_" + 
idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } // show some stats: Map<String, Integer> map = new HashMap<>(); map.put("pro", 0); map.put("con", 0); for (Argument argument : result.getArgumentList()) { map.put(argument.getStance(), map.get(argument.getStance()) + 1); } System.out.println(map); return result; }
From source file:xxx.web.comments.roomfordebate.NYTimesArticleExtractor.java
public Article extractArticle(String html) throws ParseException, IOException { Article result = new Article(); Document doc = Jsoup.parse(html, getBaseName()); Element element = doc.select("article.rfd").iterator().next(); // System.out.println(element); String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", ""); // time// ww w.j a v a 2 s . co m try { DateFormat df = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } catch (ParseException e) { // June 24, 2015 DateFormat df = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } // title result.setTitle(Utils.normalize(element.select("h1").text())); // text StringBuilder sb = new StringBuilder(); for (Element p : element.select("div.nytint-post > p")) { sb.append(p.text()); sb.append("\n"); } result.setText(Utils.normalize(sb.toString())); // debate title result.setDebateTitle(Utils.normalize(doc.select("div.nytint-discussion-overview > h2").text())); // debate url result.setDebateUrl(doc.select("div.nytint-discussion-overview > h2 > a").iterator().next().attr("href")); // document url result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content")); // debate description result.setDebateDescription(Utils.normalize(((TextNode) doc.select("div.nytint-discussion-overview > p") .iterator().next().childNodes().iterator().next()).text())); // aurhor result.setAuthor(element.select("div.nytint-mugshots > img").iterator().next().attr("alt")); // topics for (Element a : element.select("p.nytint-tags > a")) { result.getTopics().add(a.attr("href")); } return result; }