List of usage examples for org.jsoup.nodes.Document.body()
public Element body()
From source file:sachin.bws.site.Template.java
public String getTemplate(String pageSource) { Document doc = Jsoup.parse(pageSource); String bodyClass[] = doc.body().attr("class").split("\\u0020"); List<String> classes = Arrays.asList(bodyClass); Collections.reverse(classes); for (String t : classes) { if (templates.contains(t)) { return t; }// www . j av a 2 s .com } return "****"; }
From source file:utils.AutoLinkRenderer.java
private AutoLinkRenderer parse(Pattern pattern, ToLink toLink) { Document doc = Jsoup.parse(body); Document.OutputSettings settings = doc.outputSettings(); settings.prettyPrint(false);//from w ww . j av a 2 s . com Elements elements = doc.getElementsMatchingOwnText(pattern); for (Element el : elements) { if (isIgnoreElement(el)) { continue; } List<TextNode> textNodeList = el.textNodes(); for (TextNode node : textNodeList) { String result = convertLink(node.toString(), pattern, toLink); node.text(StringUtils.EMPTY); node.after(result); } } this.body = doc.body().html(); return this; }
From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java
@Override public Debate parseDebate(InputStream inputStream) throws IOException { Debate result = new Debate(); Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/"); // Set the Url of the doc // title//from www.j a v a 2 s .c o m Element body = doc.body(); Map<String, Elements> proConElements = new HashMap<>(); proConElements.put("pro", body.select("div[class=column pro]")); proConElements.put("con", body.select("div[class=column con]")); // Elements pro = body.select("div[class=column pro]"); // System.out.println(pro); // title result.setTitle(Utils.normalize(body.select("h2").text())); for (Map.Entry<String, Elements> entry : proConElements.entrySet()) { // stance String stance = entry.getKey(); Elements comments = entry.getValue().select("ul.comments > li[class^=comment]"); for (Element element : comments) { Element divContent = element.select("div.contents").iterator().next(); // extract argument content Argument argument = extractArgumentFromDivContent(divContent); // extract ID String parentId = element.attr("id").replace(":", "_"); if (parentId == null) { throw new IllegalStateException("Parent id must be known"); } argument.setId(parentId); // set stance - we know it argument.setStance(stance); result.getArgumentList().add(argument); Elements divReplies = element.select("li[class^=reply]"); // System.out.println(divReplies.size()); for (Element divReply : divReplies) { Element replyDivContent = divReply.select("div.contents").iterator().next(); // extract reply argument Argument replyArgument = extractArgumentFromDivContent(replyDivContent); // set id and parentId String id = element.attr("id").replace(":", "_"); replyArgument.setId(id); if (id == null) { throw new IllegalStateException("Id must be known"); } replyArgument.setParentId(parentId); // add to debate result.getArgumentList().add(replyArgument); } } } return result; }
From source file:xxx.web.comments.debates.impl.ProConOrgParser.java
@Override public Debate parseDebate(InputStream inputStream) throws IOException { Debate result = new Debate(); Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/"); // Set the Url of the doc // title/* www . j av a 2s . c o m*/ Element body = doc.body(); Elements debateTitleElements = body.select("h2"); // Elements debateTitleElements = body.select("p[class=title]").select("p[style]"); if (debateTitleElements.first() == null) { // not a debate return null; } String title = Utils.normalize(debateTitleElements.first().text()); result.setTitle(title); Elements proConTr = body.select("tr > td > b:contains(PRO \\(yes\\))"); if (proConTr == null || proConTr.parents() == null || proConTr.parents().first() == null || proConTr.parents().first().parents() == null || proConTr.parents().first().parents().first() == null || proConTr.parents().first().parents().first().nextElementSibling() == null) { // not a pro-con debate return null; } Element trAnswers = proConTr.parents().first().parents().first().nextElementSibling(); // the PRO side Element proTd = trAnswers.select("td").get(0); Element conTd = trAnswers.select("td").get(1); // System.out.println(proTd.select("blockquote").size()); // System.out.println(conTd.select("blockquote").size()); for (Element text : proTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("pro"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); // set ID idCounter++; argument.setId("pcq_" + idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } for (Element text : conTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("con"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); idCounter++; argument.setId("pcq_" + 
idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } // show some stats: Map<String, Integer> map = new HashMap<>(); map.put("pro", 0); map.put("con", 0); for (Argument argument : result.getArgumentList()) { map.put(argument.getStance(), map.get(argument.getStance()) + 1); } System.out.println(map); return result; }