Usage examples for org.jsoup.select.Elements.traverse
public Elements traverse(NodeVisitor nodeVisitor)
From source file:hello.Scraper.java
@Splitter(inputChannel = "channel1", outputChannel = "channel2") public List<Element> scrape(ResponseEntity<String> payload) { String html = payload.getBody(); final Document htmlDoc = Jsoup.parse(html); final Elements anchorNodes = htmlDoc.select("body").select("ul").select("li"); final List<Element> anchorList = new ArrayList<Element>(); anchorNodes.traverse(new NodeVisitor() { @Override// w w w . j a v a2 s. c o m public void head(org.jsoup.nodes.Node node, int depth) { if (node instanceof org.jsoup.nodes.Element) { Element e = (Element) node; anchorList.add(e); } } @Override public void tail(Node node, int depth) { } }); return anchorList; }
From source file:crawler.AScraper.java
@Splitter(inputChannel = "channel1", outputChannel = "channel2") public List<Element> scrape(ResponseEntity<String> payload) { String html = payload.getBody(); final Document htmlDoc; try {//from w ww . ja v a 2 s . c o m htmlDoc = Jsoup.parse(new String(html.getBytes("ISO-8859-1"), "GBK")); } catch (UnsupportedEncodingException e) { LOG.error("Unsupported page encoding."); return null; } final Elements anchorNodes = htmlDoc.select("body").select("div[id^=read]").select("a"); final List<Element> anchorList = new ArrayList<>(); anchorNodes.traverse(new NodeVisitor() { @Override public void head(org.jsoup.nodes.Node node, int depth) { if (node instanceof org.jsoup.nodes.Element) { Element e = (Element) node; if (StringUtils.containsIgnoreCase(e.text(), ANCHOR_TEXT_PATTERN)) { anchorList.add(e); } } } @Override public void tail(Node node, int depth) { } }); return anchorList; }