List of usage examples for org.jsoup.nodes.Element.html()
public String html()
From source file:com.romeikat.datamessie.core.processing.service.cleaning.extract.TagExctractor.java
private String extractContent(final RawContent rawContent, final Document document, final String tagSelector) { if (tagSelector == null || tagSelector.isEmpty()) { return null; }//from w w w . ja va 2s .com // Parse tag selector String tagName = null; String idName = null; List<String> classNames = null; final String warningMessage = "Could not apply tag selecting rule on document " + document.getId() + " (" + document.getUrl() + ") due to malformed tag selector " + tagSelector + " of source " + document.getSourceId(); try { final String[] parts = tagSelector.split("#"); tagName = parts[0]; if (tagName.isEmpty()) { tagName = null; } if (parts.length >= 2) { idName = parts[1]; if (idName.isEmpty()) { idName = null; } } if (parts.length >= 3) { classNames = Arrays.asList(parts[2].split(" ")); } if (tagName == null || idName == null && classNames == null) { LOG.warn(warningMessage); return null; } } catch (final Exception e) { LOG.warn(warningMessage, e); return null; } // With tag selector, search for appropriate element final org.jsoup.nodes.Document jsoupDocument = Jsoup.parse(rawContent.getContent()); final List<Element> matchingElements = new ArrayList<Element>(); final Elements elementsWithTagName = jsoupDocument.getElementsByTag(tagName); for (final Element elementWithTagName : elementsWithTagName) { final boolean idNameMatches = idName == null || elementWithTagName.id().equals(idName); final boolean classNamesMatch = classNames == null || elementWithTagName.classNames().containsAll(classNames); if (idNameMatches && classNamesMatch) { matchingElements.add(elementWithTagName); } } // Unique match found if (matchingElements.size() == 1) { final Element matchingElement = matchingElements.get(0); return matchingElement.html(); } // No unique match found return null; }
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
private static List<ConfluencePage> handlePagination() { final List<ConfluencePage> confluencePages = new ArrayList<>(); final SwaggerConfluenceConfig swaggerConfluenceConfig = SWAGGER_CONFLUENCE_CONFIG.get(); final PaginationMode paginationMode = swaggerConfluenceConfig.getPaginationMode(); final Document originalDocument = SWAGGER_DOCUMENT.get(); final Document transformedDocument = originalDocument.clone(); final Elements categoryElements = transformedDocument.select(".sect1"); // Remove ToC form the transformed document final Elements toc = transformedDocument.select(".toc"); toc.html(""); toc.unwrap();/* w w w.j a v a2 s . c o m*/ // For Single Page Mode, the incoming XHTML can be used directly. if (paginationMode == SINGLE_PAGE) { final ConfluencePage confluencePage = ConfluencePageBuilder.aConfluencePage() .withPageType(PageType.ROOT).withOriginalTitle(swaggerConfluenceConfig.getTitle()) .withConfluenceTitle(buildConfluenceTitle(swaggerConfluenceConfig.getTitle(), null, null)) .build(); if (swaggerConfluenceConfig.isIncludeTableOfContentsOnSinglePage()) { confluencePage.setXhtml(originalDocument.html()); } else { confluencePage.setXhtml(transformedDocument.html()); } confluencePages.add(confluencePage); return confluencePages; } // Before beginning further processing, we need to know if we're in individual // page mode or not, as that will effect how we split the DOM. If we're in this // mode then the category pages will contain inner table of contents. final boolean individualPages = (paginationMode == INDIVIDUAL_PAGES); // From here on, if we're still proceeding then we know the meat of the document // will go in sub-pages. 
So for the master page, we will use the table of contents final Elements tocElements = originalDocument.select(".toc"); final List<String> innerTocXHtmlList = new ArrayList<>(); final Elements innerTocElements = originalDocument.select(".sectlevel2"); for (final Element innerTocElement : innerTocElements) { // If we're in individual page mode, then we collect the inner ToCs if (individualPages) { final StringBuilder tocHtml = new StringBuilder(); tocHtml.append("<div id=\"toc\" class=\"toc\">"); tocHtml.append("<h4 id=\"toctitle\">Table of Contents</h4>"); tocHtml.append("<div><ul class=\"sectlevel1\">"); tocHtml.append(innerTocElement.html()); tocHtml.append("</ul></div></div>"); innerTocXHtmlList.add(tocHtml.toString()); } // If we're in category page mode, then we strip out the inner table of contents. else { innerTocElement.html(""); innerTocElement.unwrap(); } } // Build the Root Page w/ the Appropriate Level of Table of Contents final ConfluencePage rootConfluencePage = ConfluencePageBuilder.aConfluencePage() .withPageType(PageType.ROOT).withOriginalTitle(swaggerConfluenceConfig.getTitle()) .withConfluenceTitle(buildConfluenceTitle(swaggerConfluenceConfig.getTitle(), null, null)) .withXhtml(tocElements.html()).build(); confluencePages.add(rootConfluencePage); int category = 1; // Now we process the category pages for (final Element categoryElement : categoryElements) { // Fetch the title from the first child, which is the header element final String categoryTitle = categoryElement.children().first().text(); // If we're in individual mode then we need these to be sub table of contents if (individualPages) { final ConfluencePage categoryConfluencePage = ConfluencePageBuilder.aConfluencePage() .withPageType(PageType.CATEGORY).withOriginalTitle(categoryTitle) .withConfluenceTitle(buildConfluenceTitle(categoryTitle, category, null)) .withXhtml(innerTocXHtmlList.get(category - 1)).build(); confluencePages.add(categoryConfluencePage); final Elements 
individualElements = categoryElement.getElementsByClass("sect2"); int individual = 1; for (final Element individualElement : individualElements) { final String individualTitle = individualElement.children().first().text(); final ConfluencePage individualConfluencePage = ConfluencePageBuilder.aConfluencePage() .withPageType(INDIVIDUAL).withOriginalTitle(individualTitle) .withConfluenceTitle(buildConfluenceTitle(individualTitle, category, individual)) .withXhtml(individualElement.html()).build(); confluencePages.add(individualConfluencePage); individual++; } category++; continue; } // If we're in category mode, we use the remaining page data final ConfluencePage categoryConfluencePage = ConfluencePageBuilder.aConfluencePage() .withPageType(PageType.CATEGORY).withOriginalTitle(categoryTitle) .withConfluenceTitle(buildConfluenceTitle(categoryTitle, category, null)) .withXhtml(categoryElement.html()).build(); confluencePages.add(categoryConfluencePage); category++; } return confluencePages; }
From source file:org.jlucrum.datafetcher.FetcherNasdaqOmxNordic.java
/**
 * Fetches a historical data series for an instrument and returns it keyed by date.
 *
 * <p>The parsed response document is looked up in {@code cache} first; only on a cache miss is
 * the query posted to {@code this.url}, and the parsed result is then stored in the cache.
 * Every table row after the first (header) row contributes one entry: the first cell is used as
 * the date key and the cell at index {@code dataMap[type]} as the value.
 *
 * @param name     instrument name; translated through {@code stockMap} when a mapping exists
 * @param fromDate first date of the requested range
 * @param toDate   last date of the requested range
 * @param type     index into {@code dataMap} selecting which column is read as the value
 * @return map from the date cell's text to the parsed value; empty when nothing could be read
 *         or an {@link IOException} occurred (the exception is logged, not rethrown)
 * @throws NumberFormatException if a value cell does not hold a number after its thousands
 *         separators have been removed
 */
public Map<String, Double> getData(String name, DateTime fromDate, DateTime toDate, int type) {
    HttpPost httpPost = new HttpPost(this.url);
    Map<String, Double> retMap = new HashMap<String, Double>();

    httpclient = getClient();

    List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();

    String fixedName = stockMap.get(name);
    if (fixedName == null) {
        fixedName = name;
    }

    nameValuePairs.add(new BasicNameValuePair("xmlquery",
            "<post> " + "<param name=\"SubSystem\" value=\"History\"/> "
                    + "<param name=\"Action\" value=\"GetDataSeries\"/>"
                    + "<param name=\"AppendIntraDay\" value=\"no\"/>"
                    + "<param name=\"Instrument\" value=\"" + fixedName + "\"/>"
                    + "<param name=\"FromDate\" value=\"" + dateFormatter.print(fromDate) + "\"/>"
                    + "<param name=\"ToDate\" value=\"" + dateFormatter.print(toDate) + "\"/> "
                    + "<param name=\"hi__a\" value=\"0,1,2,4,21,8,10,11,12,9\"/> "
                    + "<param name=\"ext_xslt\" value=\"/nordicV3/hi_table_shares_adjusted.xsl\"/> "
                    + "<param name=\"ext_xslt_options\" value=\",undefined,\"/> "
                    + "<param name=\"ext_xslt_lang\" value=\"en\"/> "
                    + "<param name=\"ext_xslt_hiddenattrs\" value=\",ip,iv,\"/> "
                    + "<param name=\"ext_xslt_tableId\" value=\"historicalTable\"/> "
                    + "<param name=\"app\" value=\"/osakkeet/Historialliset_kurssitiedot/\"/> "
                    + "</post>"));

    try {
        Document doc = (Document) cache.getData(fixedName, fromDate.toString(), toDate.toString());
        if (doc == null) {
            httpPost.setEntity(new UrlEncodedFormEntity(nameValuePairs, HTTP.UTF_8));
            HttpResponse response = httpclient.execute(httpPost);
            HttpEntity entity = response.getEntity();
            String resString = EntityUtils.toString(entity, "UTF-8");
            if (debug) {
                System.out.printf("Respond:%s", resString);
            }
            doc = Jsoup.parse(resString);
            cache.putData(fixedName, fromDate.toString(), toDate.toString(), doc);
            System.out.printf("Fetched from network:%s\n", name);
        }

        Iterator<Element> rows = doc.select("tr").iterator();
        // Skip the header row, but only if there is one: a response without any table rows
        // must yield an empty result instead of a NoSuchElementException.
        if (rows.hasNext()) {
            rows.next();
        }

        while (rows.hasNext()) {
            // getAllElements() lists the row itself at index 0, so the cells start at index 1.
            Elements cells = rows.next().getAllElements();

            /* Output Example:
            <tr id="historicalTable-">
            <td>2011-09-08</td>
            <td>25.29</td>
            <td>24.38</td>
            <td>24.93</td>
            <td>24.92</td>
            <td>895,389</td>
            <td>22,298,455</td>
            <td>5,524</td>
            </tr>
            */
            int dataIndex = dataMap[type];
            // Rows that are too short to hold both the date and the requested column are skipped.
            if (cells.size() <= Math.max(1, dataIndex)) {
                continue;
            }

            // Serialize each cell once; html() never returns null, so emptiness is the only check.
            String date = cells.get(1).html();
            String value = cells.get(dataIndex).html();
            if (date.isEmpty() || value.isEmpty()) {
                continue;
            }

            // Values use ',' as thousands separator (see the example above).
            retMap.put(date, Double.valueOf(value.replace(",", "")));

            if (debug) {
                System.out.printf("Date:%s data:%s\n", date, value);
            }
        }

        System.out.printf("Fetched %s/%s from NasdaqOmxNordic:%d\n", name, fixedName, retMap.size());

    } catch (IOException ex) {
        Logger.getLogger(FetcherNasdaqOmxNordic.class.getName()).log(Level.SEVERE, null, ex);
    }

    return retMap;
}
From source file:web.analyzer.utils.Utils.java
/**
 * Collects all heading elements of a document.
 *
 * <p>Every element selected by {@code "*"} is visited in selection order. An element whose tag
 * name is contained in {@code HEADING_TAG} is recorded together with its inner HTML and the
 * current value of a running counter.
 *
 * <p>NOTE(review): the counter is named {@code level} but is not the DOM depth. It counts the
 * elements visited since the last childless element (the heading itself included) and is reset
 * to 0 after each childless element. The previous version also reassigned the iterated
 * collection inside the loop, which has no effect on a running for-each; that dead store has
 * been removed and the output is unchanged. Confirm whether real nesting depth was intended.
 *
 * @param doc document to scan
 * @return the headings found, in selection order
 */
public List<Heading> docHeadingsProcess(Document doc) {
    List<Heading> headingList = new ArrayList<Heading>();
    int level = 0;

    for (Element ele : doc.select("*")) {
        level++;

        if (HEADING_TAG.contains(ele.tagName())) {
            headingList.add(new Heading(ele.tagName(), ele.html(), level));
        }

        // A childless element restarts the counter for the elements that follow it.
        if (ele.children().isEmpty()) {
            level = 0;
        }
    }

    return headingList;
}
From source file:com.liato.bankdroid.banking.banks.AppeakPoker.java
/**
 * Opens the login target and reads the chip balance from the returned page.
 *
 * <p>The balance is taken from the element selected by
 * {@code #content > table tr:eq(2) td:eq(1)}; its inner HTML is stored in {@code mChips}.
 *
 * @return the {@code urlopen} instance used for the request
 * @throws LoginException if the page does not contain the expected element
 * @throws BankException  if the request fails with an I/O error; only the message of the
 *                        underlying exception is carried over
 */
@Override
public Urllib login() throws LoginException, BankException {
    try {
        final LoginPackage loginPackage = preLogin();
        final String page = urlopen.open(loginPackage.getLoginTarget());
        final Element chipsCell = Jsoup.parse(page).select("#content > table tr:eq(2) td:eq(1)").first();

        // Without the expected cell the login is treated as rejected.
        if (chipsCell == null) {
            throw new LoginException(res.getText(R.string.invalid_username).toString());
        }
        mChips = chipsCell.html();
    } catch (IOException ioFailure) {
        // ClientProtocolException is an IOException and was handled identically, so a single
        // handler covers both.
        throw new BankException(ioFailure.getMessage());
    }
    return urlopen;
}
From source file:accountgen.controller.Controller.java
/**
 * Reads a person's name from the first {@code h3} inside the elements of class
 * {@code address} and stores it on the given person.
 *
 * <p>The text up to the first space becomes the first name and everything after it becomes the
 * last name; the middle name is always set to the empty string. Both parts are HTML-unescaped
 * and trimmed. A name without any space yields an empty last name.
 *
 * <p>The remainder is taken by position rather than by splitting the name on the first name.
 * Splitting treated the first name as a regular expression and cut at every occurrence of it,
 * which produced an empty last name for names such as "John Johnson" and failed on
 * single-word names.
 *
 * <p>NOTE(review): a document without a matching {@code h3} still fails with a
 * {@link NullPointerException}, as before.
 *
 * @param doc document containing the address block
 * @param p   person whose first, middle and last name are set
 */
private void setName(Document doc, Person p) {
    Elements e = doc.getElementsByClass("address");
    Element name = e.select("h3").first();

    final String fullName = name.html();
    final int firstSpace = fullName.indexOf(' ');
    final String firstPart = firstSpace < 0 ? fullName : fullName.substring(0, firstSpace);
    final String remainder = firstSpace < 0 ? "" : fullName.substring(firstSpace + 1);

    p.setFirstname(StringEscapeUtils.unescapeHtml4(firstPart).trim());
    p.setMiddlename("");
    p.setLastname(StringEscapeUtils.unescapeHtml4(remainder).trim());
}
From source file:com.johan.vertretungsplan.parser.UntisCommonParser.java
/** * Parst eine "Nachrichten zum Tag"-Tabelle aus Untis-Vertretungsplnen * //from w w w.j ava 2 s.com * @param table * das <code>table</code>-Element des HTML-Dokuments, das geparst * werden soll * @param data * Daten von der Schule (aus <code>Schule.getData()</code>) * @param tag * der {@link VertretungsplanTag} in dem die Nachrichten * gespeichert werden sollen */ protected void parseNachrichten(Element table, JSONObject data, VertretungsplanTag tag) { Elements zeilen = table.select("tr:not(:contains(Nachrichten zum Tag))"); for (Element i : zeilen) { Elements spalten = i.select("td"); String info = ""; for (Element b : spalten) { info += "\n" + TextNode.createFromEncoded(b.html(), null).getWholeText(); } info = info.substring(1); // remove first \n tag.getNachrichten().add(info); } }
From source file:com.johan.vertretungsplan.parser.SVPlanParser.java
public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); // JSONArray urls = schule.getData().getJSONArray("urls"); String encoding = schule.getData().getString("encoding"); List<Document> docs = new ArrayList<Document>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); loadUrl(url.getString("url"), encoding, docs); }//from ww w .ja v a2 s.co m LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>(); for (Document doc : docs) { if (doc.select(".svp-tabelle").size() > 0) { VertretungsplanTag tag = new VertretungsplanTag(); String date = "Unbekanntes Datum"; if (doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").size() > 0) date = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").text(); else if (doc.title().startsWith("Vertretungsplan fr ")) date = doc.title().substring("Vertretungsplan fr ".length()); tag.setDatum(date); if (doc.select(".svp-uploaddatum").size() > 0) tag.setStand(doc.select(".svp-uploaddatum").text().replace("Aktualisierung: ", "")); Elements rows = doc.select(".svp-tabelle tr"); String lastLesson = ""; for (Element row : rows) { if (row.hasClass("svp-header")) continue; Vertretung vertretung = new Vertretung(); List<String> affectedClasses = new ArrayList<String>(); for (Element column : row.select("td")) { if (!hasData(column.text())) { continue; } String type = column.className(); if (type.startsWith("svp-stunde")) { vertretung.setLesson(column.text()); lastLesson = column.text(); } else if (type.startsWith("svp-klasse")) affectedClasses = Arrays.asList(column.text().split(", ")); else if (type.startsWith("svp-esfehlt")) vertretung.setPreviousTeacher(column.text()); else if (type.startsWith("svp-esvertritt")) vertretung.setTeacher(column.text()); else if (type.startsWith("svp-fach")) vertretung.setSubject(column.text()); else if 
(type.startsWith("svp-bemerkung")) { vertretung.setDesc(column.text()); vertretung.setType(recognizeType(column.text())); } else if (type.startsWith("svp-raum")) vertretung.setRoom(column.text()); if (vertretung.getLesson() == null) vertretung.setLesson(lastLesson); } if (vertretung.getType() == null) { vertretung.setType("Vertretung"); } for (String klasse : affectedClasses) { KlassenVertretungsplan kv = tag.getKlassen().get(klasse); if (kv == null) kv = new KlassenVertretungsplan(klasse); kv.add(vertretung); tag.getKlassen().put(klasse, kv); } } List<String> nachrichten = new ArrayList<String>(); if (doc.select("h2:contains(Mitteilungen)").size() > 0) { Element h2 = doc.select("h2:contains(Mitteilungen)").first(); Element sibling = h2.nextElementSibling(); while (sibling != null && sibling.tagName().equals("p")) { for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) nachrichten.add(nachricht); } sibling = sibling.nextElementSibling(); } } tag.setNachrichten(nachrichten); tage.put(date, tag); } else { throw new IOException("keine SVPlan-Tabelle gefunden"); } } Vertretungsplan v = new Vertretungsplan(); v.setTage(new ArrayList<VertretungsplanTag>(tage.values())); return v; }
From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java
public void processByUi4j() { // Disable fields in view. scrapeView.setWebsiteUrlTextFieldEnabled(false); scrapeView.setSelectorTextFieldEnabled(false); scrapeView.setScrapeButtonEnabled(false); scrapeView.setWorkInProgress(true);/*w ww. j a va 2 s. c om*/ scrapeView.setOutput(""); scrapeView.setProgressBarTaskText("initializing"); logger.info("Start processing..."); long beginTime = System.currentTimeMillis(); // Output input parameters. if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector() + "\", \""); } // Navigate to blank page. scrapeView.setProgressBarTaskText("requesting page"); logger.info("Requesting page..."); Page page = browserEngine.navigate(scrapeView.getWebsiteUrl()); //page.show(); logger.info("Requesting of page completed."); scrapeView.setProgressBarTaskText("viewing page as HTML"); logger.info("View page as HTML"); String html = page.getDocument().getBody().getInnerHTML(); // Unescape html. 
scrapeView.setProgressBarTaskText("unescaping HTML"); logger.info("Unescape html"); html = StringEscapeUtils.unescapeHtml4(html); logger.info("Get selector"); String selector = scrapeView.getSelector(); if (!html.isEmpty() && !selector.isEmpty()) { scrapeView.setProgressBarTaskText("parsing HTML"); logger.info("Parse HTML"); Document doc = Jsoup.parse(html); scrapeView.setProgressBarTaskText("selecting elements in HTML"); logger.info("select elements in HTML"); Elements selectedElements = doc.select(selector); if (!selectedElements.isEmpty()) { scrapeView.setProgressBarTaskText("parsing selected elements"); logger.info("Parse extracted elements"); StringBuilder sb = new StringBuilder(); for (Element element : selectedElements) { String body = element.html(); sb.append(body); sb.append("\n"); sb.append("\n"); } scrapeView.setOutput(sb.toString()); } } browserEngine.clearCookies(); long endTime = System.currentTimeMillis(); logger.info("Process time: " + (endTime - beginTime) + " ms."); logger.info("Processing complete."); // Enable fields in view. scrapeView.setWorkInProgress(false); scrapeView.setScrapeButtonEnabled(true); scrapeView.setSelectorTextFieldEnabled(true); scrapeView.setWebsiteUrlTextFieldEnabled(true); }
From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java
public void processByJxBrowser() { // Disable fields in view. scrapeView.setWebsiteUrlTextFieldEnabled(false); scrapeView.setSelectorTextFieldEnabled(false); scrapeView.setScrapeButtonEnabled(false); scrapeView.setWorkInProgress(true);//from w w w . j av a 2s . com scrapeView.setOutput(""); scrapeView.setProgressBarTaskText("initializing"); logger.info("Start processing..."); long beginTime = System.currentTimeMillis(); // Output input parameters. if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector() + "\", \""); } // Navigate to blank page. scrapeView.setProgressBarTaskText("requesting page"); logger.info("Requesting page..."); browser.loadURL(scrapeView.getWebsiteUrl()); // Wait for loading. while (browser.isLoading()) { try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } } logger.info("Requesting of page completed."); scrapeView.setProgressBarTaskText("viewing page as HTML"); logger.info("View page as HTML"); String html = browser.getHTML(); // Unescape html. 
scrapeView.setProgressBarTaskText("unescaping HTML"); logger.info("Unescape html"); html = StringEscapeUtils.unescapeHtml4(html); logger.info("Get selector"); String selector = scrapeView.getSelector(); if (!html.isEmpty() && !selector.isEmpty()) { scrapeView.setProgressBarTaskText("parsing HTML"); logger.info("Parse HTML"); Document doc = Jsoup.parse(html); scrapeView.setProgressBarTaskText("selecting elements in HTML"); logger.info("select elements in HTML"); Elements selectedElements = doc.select(selector); if (!selectedElements.isEmpty()) { scrapeView.setProgressBarTaskText("parsing selected elements"); logger.info("Parse extracted elements"); StringBuilder sb = new StringBuilder(); for (Element element : selectedElements) { String body = element.html(); sb.append(body); sb.append("\n"); sb.append("\n"); } scrapeView.setOutput(sb.toString()); } } browser.stop(); long endTime = System.currentTimeMillis(); logger.info("Process time: " + (endTime - beginTime) + " ms."); logger.info("Processing complete."); // Enable fields in view. scrapeView.setWorkInProgress(false); scrapeView.setScrapeButtonEnabled(true); scrapeView.setSelectorTextFieldEnabled(true); scrapeView.setWebsiteUrlTextFieldEnabled(true); }