List of usage examples for org.jsoup.nodes Element html
public String html()
From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java
/**
 * Extracts movie metadata from a crawled page and queues the page's outgoing links.
 *
 * What the visible code does, in order:
 * - Returns null when {@code page} is null, or when the page URL contains "game/".
 * - Parses the page bytes (using the page's own charset) into a jsoup Document whose base URL
 *   comes from urlUtils.getBaseUrl(...).
 * - For every "a" element, resolves the absolute "href"; links that are non-blank and accepted by
 *   filterUrls(...) are wrapped in a WebURL (with conf.jobName) and added to pendingUrls.
 *   Failures while queueing are logged and do not abort extraction.
 * - Returns null when the document has no "#Zoom" element.
 * - Fills a result map with the keys "movie" (text of "h1"), "type" (second space-separated token
 *   of the "h2 a" text, or "unknow"), "url", and - inside the selector loop - "img", "jq",
 *   "download" plus whatever getInfoName(...) adds. A non-blank "nd" entry is replaced by its
 *   Integer.parseInt value.
 * - Stores the map on the ExtractedPage, hands it to pendingStore and returns it.
 * - An UnsupportedEncodingException is logged and results in a null return.
 *
 * NOTE(review): this listing was collapsed onto two physical lines during extraction, so the
 * original line comments now run into the code that followed them; it is not compilable as shown.
 * NOTE(review): `selects` is assigned null and then `selects.entrySet()` is iterated, which would
 * throw NullPointerException - the commented-out `conf.getSelects()` was presumably the intended
 * source; confirm against the project.
 * NOTE(review): several literals are empty ("") or "?" where the surrounding logic expects a
 * marker character (replace, indexOf, startsWith, split, contains). These look like non-ASCII
 * characters lost to an encoding problem - TODO restore from the original file. As written,
 * split("?") is an invalid regular expression.
 * NOTE(review): every `s.trim();` statement discards its result, so getInfoName(...) receives the
 * untrimmed string.
 *
 * @param page the fetched page; may be null
 * @return the populated ExtractedPage, or null in the cases listed above
 */
@Override public ExtractedPage<?, ?> onExtract(Page page) { if (null != page) { try {/*from ww w . ja va 2 s . co m*/ Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/")) return null; // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) { try { WebURL url = new WebURL(); url.setURL(linkHref); url.setJobName(conf.jobName); pendingUrls.addUrl(url); } catch (QueueException e) { log.error(e.getMessage()); } catch (Exception e) { log.error(e.getMessage()); } } } } // ?? // Map<String, String> selects = conf.getSelects(); Map<String, String> selects = null; ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>(); epage.setUrl(page.getWebURL()); HashMap<String, Object> result = new HashMap<>(); Elements text = doc.select("#Zoom"); if (null == text || text.size() == 0) { return null; } String name = doc.select("h1").text(); name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", ""); result.put("movie", name); // result.put("_id", name); String ts[] = doc.select("h2 a").text().split(" "); if (ts.length >= 2) { result.put("type", ts[1].trim()); } else { result.put("type", "unknow"); } result.put("url", page.getWebURL().getURL()); for (Entry<String, String> entry : selects.entrySet()) { Elements elements = doc.select(entry.getValue()); if (elements.isEmpty()) return null; else { if ("content".equals(entry.getKey())) { for (Element element : elements) { // Elements imgs = element.select("img[src]"); StringBuilder sb = new StringBuilder(); for (Element img : imgs) { sb.append(img.attr("src")).append(";"); } result.put("img", sb.toString()); // ? 
Elements movieInfos = element.select("p"); for (Element info : movieInfos) { String infotext = info.text(); try { String infotext_ = info.html(); int start, end = 0; start = infotext_.indexOf(""); if (start > 0) { end = infotext_.lastIndexOf(""); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } else { end = infotext_.lastIndexOf("."); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } } } infotext_ = null; } catch (Exception e) { e.printStackTrace(); } if (infotext.startsWith("")) { String ss[] = infotext.split(""); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.startsWith("?")) { String ss[] = infotext.split("?"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains("")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains(":")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } } // if(result.size()<5){ // result.put("content", value) // } // ? Elements elements2 = elements.select("td"); sb.setLength(0); for (Element download : elements2) { sb.append(download.text()).append(";"); } result.put("download", sb.toString()); } } } // result.put(entry.getKey(), elements.html()); } if (StringUtils.isNotBlank((String) result.get("nd"))) { result.put("nd", Integer.parseInt((String) result.get("nd"))); } epage.setMessages(result); try { pendingStore.addExtracedPage(epage); } catch (QueueException e) { log.error(e.getMessage()); } return epage; } catch (UnsupportedEncodingException e) { log.error(e.getMessage()); e.printStackTrace(); } } return null; }
From source file:org.structr.web.importer.Importer.java
public DOMNode createComponentChildNodes(final DOMNode parent, final Page page) throws FrameworkException { final Element head = parsedDocument.head(); final Element body = parsedDocument.body(); if (head != null && !head.html().isEmpty()) { // create Head element and append nodes to it final Head headElement = (Head) page.createElement("head"); createChildNodes(head, headElement, page); // head is a special case return headElement; }/*from w w w . ja va2s . com*/ if (body != null && !body.html().isEmpty()) { return createChildNodes(body, parent, page); } // fallback, no head no body => document is parent return createChildNodes(parsedDocument, parent, page); }
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); String searchUrl = ""; String searchTerm = ""; String imdb = ""; // only title search if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search for everything: " + searchTerm); } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search with title: " + searchTerm); } else {/*from w w w. j a va 2 s . c o m*/ LOGGER.debug("empty searchString"); return resultList; } searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); Document doc = null; try { Url url = new CachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(searchUrl); } if (doc == null) { return resultList; } // only look for movie links Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php"); LOGGER.debug("found " + filme.size() + " search results"); if (filme.isEmpty()) { if (!doc.getElementsByTag("title").text().contains("Suche nach")) { // redirected to detail page MediaSearchResult msr = new MediaSearchResult(providerInfo.getId()); Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id="); if (el.size() > 0) { msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)")); } 
msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim()); el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { msr.setYear(el.get(0).text()); } resultList.add(msr); } return resultList; } // <a // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1" // class="normLight">Avatar - Aufbruch nach Pandora</B> // <nobr>(2009)</nobr><br /><span class="smallLight" // style="color:#ccc;">Avatar</span></a> // map to merge 2 results :/ Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>(); for (Element a : filme) { try { String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-"); MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); if (res.containsKey(id)) { LOGGER.debug("dupe found; merging with previous searchresult"); sr = res.get(id); } if (StringUtils.isNotEmpty(imdb)) { sr.setIMDBId(imdb); } if (StringUtils.isEmpty(sr.getId())) { sr.setId(id); } if (StringUtils.isEmpty(sr.getTitle())) { if (a.html().contains("nobr")) { sr.setTitle(a.ownText()); } else { sr.setTitle(a.text()); } } LOGGER.debug("found movie " + sr.getTitle()); if (StringUtils.isEmpty(sr.getOriginalTitle())) { sr.setOriginalTitle(a.getElementsByTag("span").text()); } if (StringUtils.isEmpty(sr.getYear())) { sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*")); // any // 4 // digit } sr.setMediaType(MediaType.MOVIE); sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id); // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(), // "images(.*?)\\"")); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle())); } // populate extra args MetadataUtil.copySearchQueryToSearchResult(options, sr); res.put(id, sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } for (String r : 
res.keySet()) { resultList.add(res.get(r)); } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }
From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java
private void parseCategory(Element c, Topic parent, TopicMap t) throws TopicMapException, ParseException { Topic cTopic = parent;//from w w w . j av a2 s . c om Elements children = c.children(); for (Element child : children) { if (child.tagName().equals("h3")) { String cLocator = parent.getSubjectLocator().toString(); cLocator += "/" + urlEncode(child.html()); String cName = child.ownText(); cTopic = getOrCreateTopic(t, cLocator); cTopic.setSubjectLocator(new Locator(cLocator)); cTopic.setBaseName(cName + " (Bookmark)"); cTopic.setDisplayName(LANG, cName); makeSubclassOf(t, cTopic, parent); } } for (Element child : children) { if (!child.tagName().equals("dl")) continue; for (Element grandChild : child.children()) { if (!grandChild.tagName().equals("dt")) continue; for (Element ggChild : grandChild.children()) { if (ggChild.tagName().equals("a")) parseItem(ggChild, cTopic, t); else if (ggChild.tagName().equals("dl")) parseCategory(grandChild, cTopic, t); } } } }
From source file:webcralwerproject1.Webcrawler.java
public String contentprocessor() { File folder = new File(DirectoryName + "/" + crawlcount); FileWriter f_write = null;/*from ww w. j a v a 2 s .com*/ Elements p, c = null; String contentprocessfile = "./crawler" + crawlcount + "content.html"; if (!folder.exists()) { } else { try { File[] listOfFiles = folder.listFiles(); f_write = new FileWriter(contentprocessfile, true); //Open repo directory and loop through all files for (File file : listOfFiles) { if (file.isFile()) { File input = new File(file.getAbsolutePath()); Document doc = Jsoup.parse(input, "UTF-8"); String title = doc.select("title").toString(); Elements n = doc.select("nav").remove(); // String d =doc.select("div.id"); doc.select("head").remove(); doc.select("link").remove(); doc.select("style").remove(); doc.select("meta").remove(); doc.select("script").remove(); doc.select("figure").remove(); doc.select("img").remove(); doc.select("footer").remove(); doc.select("input[type = search]").remove(); doc.select("form").remove(); doc.select("button").remove(); doc.select("video").remove(); doc.select("div:empty").remove(); doc.select("div#footer").remove(); doc.select("div#id").remove(); doc.select("div#nav").remove(); doc.select("div#navigation").remove(); doc.select("div.footer").remove(); doc.select("div.header").remove(); doc.select("li > a[href]").remove(); Elements linksOnPage = doc.select("body a[href]"); for (Element link : linksOnPage) { if (link.html() == null) { link.remove();//<a></a> } else if (link.html().length() <= 4) {// does not contains title of the page link.remove(); } else { int child = link.parentNode().childNodeSize(); if (child == 1) {//only element remove link.remove(); } } } f_write.write(doc.text()); } f_write.write("<br>"); } f_write.close(); } catch (Exception e) { System.out.println("Inside Contentprocessor" + e); } return contentprocessfile; } return null; }
From source file:xxx.web.comments.debates.impl.ProConOrgParser.java
@Override public Debate parseDebate(InputStream inputStream) throws IOException { Debate result = new Debate(); Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/"); // Set the Url of the doc // title//from w w w.ja v a2 s . c om Element body = doc.body(); Elements debateTitleElements = body.select("h2"); // Elements debateTitleElements = body.select("p[class=title]").select("p[style]"); if (debateTitleElements.first() == null) { // not a debate return null; } String title = Utils.normalize(debateTitleElements.first().text()); result.setTitle(title); Elements proConTr = body.select("tr > td > b:contains(PRO \\(yes\\))"); if (proConTr == null || proConTr.parents() == null || proConTr.parents().first() == null || proConTr.parents().first().parents() == null || proConTr.parents().first().parents().first() == null || proConTr.parents().first().parents().first().nextElementSibling() == null) { // not a pro-con debate return null; } Element trAnswers = proConTr.parents().first().parents().first().nextElementSibling(); // the PRO side Element proTd = trAnswers.select("td").get(0); Element conTd = trAnswers.select("td").get(1); // System.out.println(proTd.select("blockquote").size()); // System.out.println(conTd.select("blockquote").size()); for (Element text : proTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("pro"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); // set ID idCounter++; argument.setId("pcq_" + idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } for (Element text : conTd.select("blockquote > div[class=editortext]")) { Argument argument = new Argument(); argument.setStance("con"); argument.setText(extractPlainTextFromTextElement(text)); argument.setOriginalHTML(text.html()); idCounter++; argument.setId("pcq_" + 
idCounter); if (!argument.getText().isEmpty()) { result.getArgumentList().add(argument); } else { System.err.println("Failed to extract text from " + text.html()); } } // show some stats: Map<String, Integer> map = new HashMap<>(); map.put("pro", 0); map.put("con", 0); for (Argument argument : result.getArgumentList()) { map.put(argument.getStance(), map.get(argument.getStance()) + 1); } System.out.println(map); return result; }