List of usage examples for the org.jsoup.nodes.Document method getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:org.loklak.api.search.WordpressCrawlerService.java
public static SusiThought crawlWordpress(String blogURL) { Document blogHTML = null; Elements articles = null;// w ww . j a v a 2 s . co m Elements articleList_title = null; Elements articleList_content = null; Elements articleList_dateTime = null; Elements articleList_author = null; String[][] blogPosts = new String[100][4]; // blogPosts[][0] = Blog Title // blogPosts[][1] = Posted On // blogPosts[][2] = Author // blogPosts[][3] = Blog Content Integer numberOfBlogs = 0; Integer iterator = 0; try { blogHTML = Jsoup.connect(blogURL).get(); } catch (IOException e) { e.printStackTrace(); } articles = blogHTML.getElementsByTag("article"); iterator = 0; for (Element article : articles) { articleList_title = article.getElementsByClass("entry-title"); for (Element blogs : articleList_title) { blogPosts[iterator][0] = blogs.text().toString(); } articleList_dateTime = article.getElementsByClass("posted-on"); for (Element blogs : articleList_dateTime) { blogPosts[iterator][1] = blogs.text().toString(); } articleList_author = article.getElementsByClass("byline"); for (Element blogs : articleList_author) { blogPosts[iterator][2] = blogs.text().toString(); } articleList_content = article.getElementsByClass("entry-content"); for (Element blogs : articleList_content) { blogPosts[iterator][3] = blogs.text().toString(); } iterator++; } numberOfBlogs = iterator; JSONArray blog = new JSONArray(); for (int k = 0; k < numberOfBlogs; k++) { JSONObject blogpost = new JSONObject(); blogpost.put("blog_url", blogURL); blogpost.put("title", blogPosts[k][0]); blogpost.put("posted_on", blogPosts[k][1]); blogpost.put("author", blogPosts[k][2]); blogpost.put("content", blogPosts[k][3]); blog.put(blogpost); } SusiThought json = new SusiThought(); json.setData(blog); return json; }
From source file:org.openhab.tools.analysis.checkstyle.AboutHtmlCheck.java
private void checkLicenseHeader(Document processedAboutHtmlFileDocument) throws CheckstyleException { Elements processedAboutHtmlFileHeaderTags = processedAboutHtmlFileDocument.getElementsByTag(HEADER_3_TAG); if (!isElementProvided(processedAboutHtmlFileHeaderTags, LICENSE_HEADER)) { log(0, "Invalid or missing license header in the about.html file. " + VALID_ABOUT_HTML_FILE_LINK_MSG + validAboutHtmlFileURL); }//www . j a v a 2s. co m }
From source file:org.openhab.tools.analysis.checkstyle.AboutHtmlCheck.java
private void checkLicenseParagraph(Document processedAboutHtmlFileDocument) { Document validAboutHtmlFileDocument = Jsoup.parse(validAboutHtmlFileContent); Elements validAboutHtmlFileParagraphTags = validAboutHtmlFileDocument.getElementsByTag(PARAGRAPH_TAG); // the paragraph with index 1 in the valid about.html file // is the license paragraph Element validAboutHtmlFileLicenseParagraph = validAboutHtmlFileParagraphTags.get(1); String validAboutHtmlFileLicenseParagraphContent = validAboutHtmlFileLicenseParagraph.html(); Elements processedFileParagraphTags = processedAboutHtmlFileDocument.getElementsByTag(PARAGRAPH_TAG); if (!isElementProvided(processedFileParagraphTags, validAboutHtmlFileLicenseParagraphContent)) { log(0, "Invalid or missing license paragraph in the about.html file. " + VALID_ABOUT_HTML_FILE_LINK_MSG + validAboutHtmlFileURL); }//from www . ja v a 2 s . c o m }
From source file:org.openmrs.module.radiology.report.template.DefaultMrrtReportTemplateFileParser.java
/** * @see MrrtReportTemplateFileParser#parse(String) *///from w w w.j av a 2s . c o m @Override public MrrtReportTemplate parse(String mrrtTemplate) throws IOException { validator.validate(mrrtTemplate); final Document doc = Jsoup.parse(mrrtTemplate, ""); final MrrtReportTemplate result = new MrrtReportTemplate(); initializeTemplate(result, doc); try { addTermsToTemplate(result, doc.getElementsByTag("script").get(0).toString()); } catch (ParserConfigurationException | SAXException e) { throw new APIException("radiology.report.template.parser.error", null, e); } return result; }
From source file:org.openmrs.module.radiology.report.template.DefaultMrrtReportTemplateFileParser.java
private final void initializeTemplate(MrrtReportTemplate template, Document doc) { final Elements metaTags = doc.getElementsByTag("meta"); template.setPath(doc.baseUri());// ww w. j ava2s . c o m template.setCharset(metaTags.attr("charset")); for (Element metaTag : metaTags) { final String name = metaTag.attr("name"); final String content = metaTag.attr("content"); switch (name) { case DCTERMS_TITLE: template.setDcTermsTitle(content); break; case DCTERMS_DESCRIPTION: template.setDcTermsDescription(content); break; case DCTERMS_IDENTIFIER: template.setDcTermsIdentifier(content); break; case DCTERMS_TYPE: template.setDcTermsType(content); break; case DCTERMS_LANGUAGE: template.setDcTermsLanguage(content); break; case DCTERMS_PUBLISHER: template.setDcTermsPublisher(content); break; case DCTERMS_RIGHTS: template.setDcTermsRights(content); break; case DCTERMS_LICENSE: template.setDcTermsLicense(content); break; case DCTERMS_DATE: template.setDcTermsDate(content); break; case DCTERMS_CREATOR: template.setDcTermsCreator(content); break; default: log.debug("Unhandled meta tag " + name); } } }
From source file:org.openmrs.module.radiology.report.template.XsdMrrtReportTemplateValidator.java
/**
 * Validates a template in two steps: the {@code <meta>} tags are run through
 * {@code metaTagsValidationEngine}, then the whole text is validated against the
 * schema returned by {@code getSchemaFile()}. Every warning, error and fatal error
 * reported by the schema validator is logged at debug level and added to the
 * validation result, which is finally checked with {@code assertOk()}.
 *
 * <p>The template text is encoded explicitly as UTF-8 before being handed to the
 * XML validator, so the outcome no longer depends on the platform default charset.
 *
 * @param mrrtTemplate template text to validate
 * @throws IOException if the template cannot be read by the validator
 * @throws APIException if schema loading or validation raises a {@code SAXException}
 * @see MrrtReportTemplateValidator#validate(String)
 */
@Override
public void validate(String mrrtTemplate) throws IOException {
    final Document document = Jsoup.parse(mrrtTemplate, "");
    final Elements metatags = document.getElementsByTag("meta");
    final ValidationResult validationResult = metaTagsValidationEngine.run(metatags);

    final SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI);
    try (InputStream in = IOUtils.toInputStream(mrrtTemplate, "UTF-8")) {
        final Schema schema = factory.newSchema(getSchemaFile());
        final Validator validator = schema.newValidator();
        validator.setErrorHandler(new ErrorHandler() {

            @Override
            public void warning(SAXParseException exception) throws SAXException {
                record(exception);
            }

            @Override
            public void error(SAXParseException exception) throws SAXException {
                record(exception);
            }

            @Override
            public void fatalError(SAXParseException exception) throws SAXException {
                record(exception);
            }

            // All severities are treated alike: log and collect, never abort parsing.
            private void record(SAXParseException exception) {
                log.debug(exception.getMessage(), exception);
                validationResult.addError(exception.getMessage(), "");
            }
        });
        validator.validate(new StreamSource(in));
        validationResult.assertOk();
    } catch (SAXException e) {
        log.error(e.getMessage(), e);
        throw new APIException("radiology.report.template.validation.error", null, e);
    }
}
From source file:org.opens.tanaguru.processing.ProcessRemarkServiceImplTest.java
/** * Test of setDocument method, of class ProcessRemarkServiceImpl. *///from w ww .ja va 2s. c o m public void testGetSnippetFromElement() { ProcessRemarkServiceImpl instance = new ProcessRemarkServiceImpl(null, null, null, null); //--------------------------------------------------------------------// //-----------------------Test1----------------------------------------// //--------------------------------------------------------------------// String rawHtml = "<label> <span>Rechercher:</span> " + "<input type=\"text\" onkeyup=\"return CatchEnter(event);\" " + "class=\"text\" id=\"searchfield\" " + "name=\"search&qudsqqqssqdsqdsqdo\" /></label>"; Document document = Jsoup.parse(rawHtml); Element element = document.getElementsByTag("label").iterator().next(); String snippet = StringEscapeUtils.unescapeHtml4(instance.getSnippetFromElement(element)); String expectedSnippet = "<label> <span>Rechercher:</span> " + "<input type=\"text\" onkeyup=\"return CatchEnter(event);\" " + "class=\"text\" id=\"searchfield\" " + "name=\"search&qudsqqqssqdsqdsqdo\" />[...]</label>"; assertEquals(expectedSnippet, snippet); //--------------------------------------------------------------------// //-----------------------Test2----------------------------------------// //--------------------------------------------------------------------// rawHtml = "<label> <span>New Rechercher:</span> " + "<p title=\"some title here\" onkeyup=\"return CatchEnter(event);\" " + " id=\"searchfield\" class=\"myclass other-class1 other-class2\" > " + "anything</p></label>"; document = Jsoup.parse(rawHtml); element = document.getElementsByTag("label").iterator().next(); snippet = StringEscapeUtils.unescapeHtml4(instance.getSnippetFromElement(element)); expectedSnippet = "<label> <span>New Rechercher:</span> " + "<p title=\"some title here\" onkeyup=\"return CatchEnter(event);\"" + " id=\"searchfield\" class=\"myclass other-class1 other-class2\">" + "[...]</p>[...]</label>"; assertEquals(expectedSnippet, 
snippet); //--------------------------------------------------------------------// //-----------------------Test3----------------------------------------// //--------------------------------------------------------------------// rawHtml = "<iframe align=\"left\" width=\"315px\" " + "scrolling=\"no\" height=\"160px\" frameborder=\"0\" " + "id=\"link-meteo\" src=\"http://www.anyUrl.com/module/onelocationsearch?ShowSearch=true&StartDate=2012-06-01&Days=2&location=bruxelles&url=http://meteo1.lavenir.net&cssfile=http://lavenir.net/extra/weather/styles.css\">" + "</iframe> "; document = Jsoup.parse(rawHtml); element = document.getElementsByTag("iframe").iterator().next(); snippet = StringEscapeUtils.unescapeHtml4(instance.getSnippetFromElement(element)); expectedSnippet = rawHtml.trim(); assertEquals(expectedSnippet, snippet); //--------------------------------------------------------------------// //-----------------------Test4----------------------------------------// //--------------------------------------------------------------------// rawHtml = " <center> <script type=\"text/javascript\"> if (articledetail == false) initAdhese('IMU.SUPER.WIDE'); </script> " + "<script src=\"http://anyUrl.com/ad3/sl_ave_home_-IMU.SUPER.WIDE/lafr/rn92/pv1/brFirefox;Firefox17;Linux;screenundefined/in;prx;;gmbl;/?t=1381234838205\" type=\"text/javascript\"></script> " + " <div class=\"adhese_300x250\"> <script src=\"http://1.adhesecdn.be/pool/lib/68641.js?t=1371729603000\"></script> " + "<script src=\"http://anyUrl.com/pagead/show_ads.js\" type=\"text/javascript\"></script>" + "<ins style=\"display:inline-table;border:none;height:250px;margin:0;padding:0;position:relative;visibility:visible;width:300px\">" + "<ins style=\"display:block;border:none;height:250px;margin:0;padding:0;position:relative;visibility:visible;width:300px\" id=\"aswift_1_anchor\">" + "<iframe width=\"300\" scrolling=\"no\" height=\"250\" frameborder=\"0\" style=\"left:0;position:absolute;top:0;\" name=\"aswift_1\" 
id=\"aswift_1\" onload=\"var i=this.id,s=window.google_iframe_oncopy,H=s&&s.handlers,h=H&&H[i],w=this.contentWindow,d;try{d=w.document}catch(e){}if(h&&d&&(!d.body||!d.body.firstChild)){if(h.call){setTimeout(h,0)}else if(h.match){w.location.replace(h)}}\" allowtransparency=\"true\" hspace=\"0\" vspace=\"0\" marginheight=\"0\" marginwidth=\"0\"></iframe>" + "</ins>" + "</ins>" + "</div> " + "</center> "; document = Jsoup.parse(rawHtml); element = document.getElementsByTag("center").iterator().next(); snippet = StringEscapeUtils.unescapeHtml4(instance.getSnippetFromElement(element)); expectedSnippet = "<center> <script type=\"text/javascript\"> if (articledetail == false) initAdhese('IMU.SUPER.WIDE'); </script> " + "<script src=\"http://anyUrl.com/ad3/sl_ave_home_-IMU.SUPER.WIDE/lafr/rn92/pv1/brFirefox;Firefox17;Linux;screenundefined/in;prx;;gmbl;/?t=1381234838205\" type=\"text/javascript\">[...]</script>" + "[...]</center>"; assertEquals(expectedSnippet, snippet); }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java
/**
 * Extracts outgoing links and movie information from a crawled page.
 *
 * <p>Every anchor whose absolute URL is non-blank and accepted by {@code filterUrls}
 * is queued in {@code pendingUrls}. Pages whose URL contains {@code "game/"} and pages
 * without a {@code #Zoom} element yield {@code null}. Otherwise a result map
 * ("movie", "type", "url", ...) is built, attached to an {@code ExtractedPage} and
 * queued in {@code pendingStore}.
 *
 * NOTE(review): several string literals below are empty ({@code ""}) or {@code "?"}.
 * This looks like non-ASCII text that was lost in transit; confirm against the
 * original source before relying on this logic.
 *
 * @param page crawled page; {@code null} yields {@code null}
 * @return the extracted page, or {@code null} when the page is skipped or its content
 *         cannot be decoded
 */
@Override
public ExtractedPage<?, ?> onExtract(Page page) {
    if (null != page) {
        try {
            Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL()));
            // Pages under "game/" are not processed.
            if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/"))
                return null;
            // Collect the links on the page and queue those accepted by filterUrls.
            Elements links = doc.getElementsByTag("a");
            if (!links.isEmpty()) {
                for (Element link : links) {
                    String linkHref = link.absUrl("href");
                    if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) {
                        try {
                            WebURL url = new WebURL();
                            url.setURL(linkHref);
                            url.setJobName(conf.jobName);
                            pendingUrls.addUrl(url);
                        } catch (QueueException e) {
                            log.error(e.getMessage());
                        } catch (Exception e) {
                            log.error(e.getMessage());
                        }
                    }
                }
            }
            // Content extraction.
            // Map<String, String> selects = conf.getSelects();
            // NOTE(review): selects is always null here, so selects.entrySet() below
            // throws NullPointerException for every page that reaches that loop; the
            // only catch on this try is for UnsupportedEncodingException.
            Map<String, String> selects = null;
            ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>();
            epage.setUrl(page.getWebURL());
            HashMap<String, Object> result = new HashMap<>();
            // Pages without a #Zoom element are skipped.
            Elements text = doc.select("#Zoom");
            if (null == text || text.size() == 0) {
                return null;
            }
            String name = doc.select("h1").text();
            // NOTE(review): replace("", "") is a no-op; the first and third targets
            // were presumably non-ASCII bracket characters - TODO confirm.
            name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", "");
            result.put("movie", name);
            // result.put("_id", name);
            // The second space-separated token of the "h2 a" text is used as the type.
            String ts[] = doc.select("h2 a").text().split(" ");
            if (ts.length >= 2) {
                result.put("type", ts[1].trim());
            } else {
                result.put("type", "unknow");
            }
            result.put("url", page.getWebURL().getURL());
            for (Entry<String, String> entry : selects.entrySet()) {
                Elements elements = doc.select(entry.getValue());
                // A selector without any match aborts the whole extraction.
                if (elements.isEmpty())
                    return null;
                else {
                    if ("content".equals(entry.getKey())) {
                        for (Element element : elements) {
                            // Image sources, joined with ';'.
                            Elements imgs = element.select("img[src]");
                            StringBuilder sb = new StringBuilder();
                            for (Element img : imgs) {
                                sb.append(img.attr("src")).append(";");
                            }
                            result.put("img", sb.toString());
                            // Movie information, one <p> at a time.
                            Elements movieInfos = element.select("p");
                            for (Element info : movieInfos) {
                                String infotext = info.text();
                                try {
                                    // Stores a substring of the paragraph HTML under "jq".
                                    // NOTE(review): indexOf("") is always 0, so the
                                    // start > 0 branch cannot run with these literals.
                                    String infotext_ = info.html();
                                    int start, end = 0;
                                    start = infotext_.indexOf("");
                                    if (start > 0) {
                                        end = infotext_.lastIndexOf("");
                                        if (end > 0 && start < end) {
                                            result.put("jq", infotext_.substring(start, end));
                                        } else {
                                            end = infotext_.lastIndexOf(".");
                                            if (end > 0 && start < end) {
                                                result.put("jq", infotext_.substring(start, end));
                                            }
                                        }
                                    }
                                    infotext_ = null;
                                } catch (Exception e) {
                                    e.printStackTrace();
                                }
                                // NOTE(review): startsWith("") is true for every string, so
                                // the three else-if branches are unreachable as written.
                                // NOTE(review): in every loop below the result of s.trim()
                                // is discarded, so getInfoName receives the untrimmed value.
                                if (infotext.startsWith("")) {
                                    String ss[] = infotext.split("");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.startsWith("?")) {
                                    // NOTE(review): "?" alone is not a valid regular
                                    // expression for String.split - verify the literal.
                                    String ss[] = infotext.split("?");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains("")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains(":")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                }
                            }
                            // if(result.size()<5){
                            // result.put("content", value)
                            // }
                            // Text of every <td> under the selected elements, joined with ';',
                            // stored under "download".
                            Elements elements2 = elements.select("td");
                            sb.setLength(0);
                            for (Element download : elements2) {
                                sb.append(download.text()).append(";");
                            }
                            result.put("download", sb.toString());
                        }
                    }
                }
                // result.put(entry.getKey(), elements.html());
            }
            // Convert "nd" from text to an Integer when present.
            // NOTE(review): Integer.parseInt throws NumberFormatException on
            // non-numeric text and nothing here catches it.
            if (StringUtils.isNotBlank((String) result.get("nd"))) {
                result.put("nd", Integer.parseInt((String) result.get("nd")));
            }
            epage.setMessages(result);
            try {
                pendingStore.addExtracedPage(epage);
            } catch (QueueException e) {
                log.error(e.getMessage());
            }
            return epage;
        } catch (UnsupportedEncodingException e) {
            log.error(e.getMessage());
            e.printStackTrace();
        }
    }
    return null;
}
From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java
/** * ??//from ww w . j ava2 s . c o m * @param url * @return */ public HashMap<String, Object> getInformation(Page page) { HashMap<String, Object> map = Maps.newHashMap(); String url = page.getWebURL().getURL(); try { ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>(); epage.setUrl(page.getWebURL()); Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); Submitor.submitUrl(weburl); } } } if (url.contains("/show_page/")) { String title = doc.select(".title .name").text(); if (StringUtils.isBlank(title)) return null; map.put("title", title); String category = doc.select(".title .type a").text(); if (StringUtils.isBlank(category)) return null; map.put("category", category); String _year = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text()); if (StringUtils.isNotBlank(_year)) { int year = Integer.parseInt(_year); map.put("year", year); } String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text()); map.put("score", score); String alias = doc.select(".alias").text(); if (alias.contains(":")) { map.put("translation", alias.split(":")[1]); } String img = doc.select(".thumb img").attr("src"); if (StringUtils.isBlank(img)) return null; map.put("thumbnail", img); String area = doc.select(".row2 .area a").text(); if (StringUtils.isBlank(area)) return null; map.put("area", area); String[] type = doc.select(".row2 .type a").text().split(" "); if (null == type || type.length == 0) return null; map.put("type", Sets.newHashSet(type)); String director = doc.select(".row2 .director a").text(); map.put("director", director); String _duration = 
CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text()); if (StringUtils.isNotBlank(_duration)) { int duration = Integer.parseInt(_duration); map.put("duration", duration); } String _hot = CharMatcher.anyOf(",").removeFrom(doc.select(".row2 .vr .num").text()); _hot = CharMatcher.DIGIT.retainFrom(_hot); if (StringUtils.isNotBlank(_hot)) { int hot = Integer.parseInt(_hot); map.put("hot", hot); } String sumary = doc.select(".detail .long").text(); map.put("summary", sumary); // Elements elements = doc.select(".baseaction a"); HashMap<String, String> playList = Maps.newHashMap(); for (Element element : elements) { String n = element.text(); String urlString = element.attr("href"); if (StringUtils.isBlank(urlString)) return null; Document d2 = Jsoup.parse(new URL(urlString), 10000); if (null != d2) { String x = d2.select("#link2").attr("value"); if (StringUtils.isBlank(x)) return null; playList.put(n, x); } } map.put("online", playList); } else if (url.contains("/v_show/")) { Document d3 = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); // ???Url?Url Elements links2 = d3.getElementsByTag("a"); if (!links2.isEmpty()) { for (Element link : links2) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); try { pendingUrls.addUrl(weburl); } catch (QueueException e) { log.error(e.getMessage()); } } } } String p = d3.select("h1.title a").attr("href"); if (StringUtils.isBlank(p)) return null; return getInformation(p); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if (map != null && map.size() > 4) { if (null == map.get("year")) { map.put("year", 1800); } } return map; }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java
private HashMap<String, Object> getInformation(String p) { HashMap<String, Object> map = Maps.newHashMap(); try {/* ww w . ja va2 s . co m*/ if (p.contains("/show_page/")) { Document doc = Jsoup.parse(new URL(p), 15000); // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); try { pendingUrls.addUrl(weburl); } catch (QueueException e) { log.error(e.getMessage()); } } } } String title = doc.select(".title .name").text(); if (StringUtils.isBlank(title)) return null; map.put("title", title); String category = doc.select(".title .type a").text(); if (StringUtils.isBlank(category)) return null; map.put("category", category); String _year = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text()); if (StringUtils.isNotBlank(_year)) { int year = Integer.parseInt(_year); map.put("year", year); } String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text()); map.put("score", score); String alias = doc.select(".alias").text(); if (alias.contains(":")) { map.put("translation", alias.split(":")[1]); } String img = doc.select(".thumb img").attr("src"); if (StringUtils.isBlank(img)) return null; map.put("thumbnail", Lists.newArrayList(img)); String area = doc.select(".row2 .area a").text(); if (StringUtils.isBlank(area)) return null; map.put("area", area); String[] type = doc.select(".row2 .type a").text().split(" "); if (null == type || type.length == 0) return null; map.put("type", Lists.newArrayList(type)); String director = doc.select(".row2 .director a").text(); map.put("director", director); String _duration = CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text()); if (StringUtils.isNotBlank(_duration)) { int duration = Integer.parseInt(_duration); map.put("duration", duration); } String _hot = 
CharMatcher.anyOf(",").removeFrom(doc.select(".row2 .vr .num").text()); if (StringUtils.isNotBlank(_hot)) { int hot = Integer.parseInt(_hot); map.put("hot", hot); } String sumary = doc.select(".detail .long").text(); map.put("summary", sumary); // Elements elements = doc.select(".baseaction a"); HashMap<String, String> playList = Maps.newHashMap(); for (Element element : elements) { String n = element.text(); String urlString = element.attr("href"); if (StringUtils.isBlank(urlString)) return null; Document d2 = Jsoup.parse(new URL(urlString), 10000); if (null != d2) { String x = d2.select("#link2").attr("value"); if (StringUtils.isBlank(x)) return null; playList.put(n, x); } } map.put("online", playList); } else return null; } catch (Exception e) { return map; } return map; }