Example usage for org.jsoup.nodes Element html

Usage examples for the html() method of org.jsoup.nodes.Element

Introduction

This page collects usage examples for the html() method of org.jsoup.nodes.Element.

Prototype

public String html() 

Source Link

Document

Retrieves the element's inner HTML.

Usage

From source file:com.romeikat.datamessie.core.processing.service.cleaning.extract.TagExctractor.java

/**
 * Extracts the inner HTML of the single element in the raw content that matches a tag selector.
 *
 * The selector is split on {@code #} into up to three parts: tag name, id and a
 * space-separated list of class names. A tag name plus at least one of id / class names is
 * required; otherwise a warning is logged and {@code null} is returned.
 *
 * @param rawContent  provides the HTML to search
 * @param document    only used to build the warning message (id, URL, source id)
 * @param tagSelector selector to apply; may be {@code null} or empty
 * @return the inner HTML of the matching element, or {@code null} if the selector is missing
 *         or malformed, or if the number of matching elements is not exactly one
 */
private String extractContent(final RawContent rawContent, final Document document, final String tagSelector) {
    if (tagSelector == null || tagSelector.isEmpty()) {
        return null;
    }

    // Built up front because both failure paths below log the same text.
    final String warningMessage = "Could not apply tag selecting rule on document " + document.getId() + " ("
            + document.getUrl() + ") due to malformed tag selector " + tagSelector + " of source "
            + document.getSourceId();

    // Break the selector into its tag / id / class components
    String tag;
    String id = null;
    List<String> requiredClasses = null;
    try {
        final String[] pieces = tagSelector.split("#");
        tag = pieces[0].isEmpty() ? null : pieces[0];
        if (pieces.length >= 2) {
            id = pieces[1].isEmpty() ? null : pieces[1];
        }
        if (pieces.length >= 3) {
            requiredClasses = Arrays.asList(pieces[2].split(" "));
        }
        final boolean hasQualifier = id != null || requiredClasses != null;
        if (tag == null || !hasQualifier) {
            LOG.warn(warningMessage);
            return null;
        }
    } catch (final Exception e) {
        LOG.warn(warningMessage, e);
        return null;
    }

    // Scan all elements with the tag name and remember the last one that satisfies the selector
    final org.jsoup.nodes.Document parsed = Jsoup.parse(rawContent.getContent());
    Element lastMatch = null;
    int matchCount = 0;
    for (final Element candidate : parsed.getElementsByTag(tag)) {
        if (id != null && !candidate.id().equals(id)) {
            continue;
        }
        if (requiredClasses != null && !candidate.classNames().containsAll(requiredClasses)) {
            continue;
        }
        matchCount++;
        lastMatch = candidate;
    }

    // Only an unambiguous (single) match yields content
    return matchCount == 1 ? lastMatch.html() : null;
}

From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java

/**
 * Splits the Swagger XHTML document held in {@code SWAGGER_DOCUMENT} into Confluence pages
 * according to the pagination mode from {@code SWAGGER_CONFLUENCE_CONFIG}.
 *
 * <ul>
 * <li>{@code SINGLE_PAGE}: returns one ROOT page containing either the original document
 * (table of contents included) or a clone with the {@code .toc} elements removed.</li>
 * <li>Otherwise: the first page is a ROOT page whose XHTML is the {@code .toc} markup of the
 * original document, followed by one CATEGORY page per {@code .sect1} element. In
 * {@code INDIVIDUAL_PAGES} mode each category page holds an inner table of contents and is
 * followed by one INDIVIDUAL page per {@code .sect2} element inside it.</li>
 * </ul>
 *
 * Note that outside {@code INDIVIDUAL_PAGES} mode the {@code .sectlevel2} elements of the
 * original document are emptied and unwrapped in place.
 *
 * @return the pages in creation order, root page first
 */
private static List<ConfluencePage> handlePagination() {
    final List<ConfluencePage> confluencePages = new ArrayList<>();
    final SwaggerConfluenceConfig swaggerConfluenceConfig = SWAGGER_CONFLUENCE_CONFIG.get();

    final PaginationMode paginationMode = swaggerConfluenceConfig.getPaginationMode();

    final Document originalDocument = SWAGGER_DOCUMENT.get();
    final Document transformedDocument = originalDocument.clone();

    // Selected before the ToC is stripped; these elements belong to the clone, not the original.
    final Elements categoryElements = transformedDocument.select(".sect1");

    // Remove ToC from the transformed document
    final Elements toc = transformedDocument.select(".toc");
    toc.html("");
    toc.unwrap();

    // For Single Page Mode, the incoming XHTML can be used directly.
    if (paginationMode == SINGLE_PAGE) {
        final ConfluencePage confluencePage = ConfluencePageBuilder.aConfluencePage()
                .withPageType(PageType.ROOT).withOriginalTitle(swaggerConfluenceConfig.getTitle())
                .withConfluenceTitle(buildConfluenceTitle(swaggerConfluenceConfig.getTitle(), null, null))
                .build();

        // The original still carries its ToC; the transformed clone had it removed above.
        if (swaggerConfluenceConfig.isIncludeTableOfContentsOnSinglePage()) {
            confluencePage.setXhtml(originalDocument.html());
        } else {
            confluencePage.setXhtml(transformedDocument.html());
        }

        confluencePages.add(confluencePage);

        return confluencePages;
    }

    // Before beginning further processing, we need to know if we're in individual
    // page mode or not, as that will affect how we split the DOM. If we're in this
    // mode then the category pages will contain inner table of contents.
    final boolean individualPages = (paginationMode == INDIVIDUAL_PAGES);

    // From here on, if we're still proceeding then we know the meat of the document
    // will go in sub-pages. So for the master page, we will use the table of contents
    final Elements tocElements = originalDocument.select(".toc");

    final List<String> innerTocXHtmlList = new ArrayList<>();
    final Elements innerTocElements = originalDocument.select(".sectlevel2");

    for (final Element innerTocElement : innerTocElements) {
        // If we're in individual page mode, then we collect the inner ToCs
        if (individualPages) {
            // Wrap the inner list items in a standalone ToC container for the category page.
            final StringBuilder tocHtml = new StringBuilder();
            tocHtml.append("<div id=\"toc\" class=\"toc\">");
            tocHtml.append("<h4 id=\"toctitle\">Table of Contents</h4>");
            tocHtml.append("<div><ul class=\"sectlevel1\">");
            tocHtml.append(innerTocElement.html());
            tocHtml.append("</ul></div></div>");
            innerTocXHtmlList.add(tocHtml.toString());
        }
        // If we're in category page mode, then we strip out the inner table of contents.
        else {
            // This mutates originalDocument, so the root page ToC built below no longer
            // contains the second-level entries.
            innerTocElement.html("");
            innerTocElement.unwrap();
        }
    }

    // Build the Root Page w/ the Appropriate Level of Table of Contents
    final ConfluencePage rootConfluencePage = ConfluencePageBuilder.aConfluencePage()
            .withPageType(PageType.ROOT).withOriginalTitle(swaggerConfluenceConfig.getTitle())
            .withConfluenceTitle(buildConfluenceTitle(swaggerConfluenceConfig.getTitle(), null, null))
            .withXhtml(tocElements.html()).build();
    confluencePages.add(rootConfluencePage);

    // 1-based running number of the category, passed to buildConfluenceTitle.
    int category = 1;

    // Now we process the category pages
    for (final Element categoryElement : categoryElements) {
        // Fetch the title from the first child, which is the header element
        final String categoryTitle = categoryElement.children().first().text();

        // If we're in individual mode then we need these to be sub table of contents
        if (individualPages) {

            // NOTE(review): the lookup by (category - 1) assumes there is one .sectlevel2 list
            // per .sect1 section, in the same order - confirm against the generated XHTML.
            final ConfluencePage categoryConfluencePage = ConfluencePageBuilder.aConfluencePage()
                    .withPageType(PageType.CATEGORY).withOriginalTitle(categoryTitle)
                    .withConfluenceTitle(buildConfluenceTitle(categoryTitle, category, null))
                    .withXhtml(innerTocXHtmlList.get(category - 1)).build();
            confluencePages.add(categoryConfluencePage);

            final Elements individualElements = categoryElement.getElementsByClass("sect2");

            // 1-based running number of the individual page within its category.
            int individual = 1;

            for (final Element individualElement : individualElements) {
                final String individualTitle = individualElement.children().first().text();
                final ConfluencePage individualConfluencePage = ConfluencePageBuilder.aConfluencePage()
                        .withPageType(INDIVIDUAL).withOriginalTitle(individualTitle)
                        .withConfluenceTitle(buildConfluenceTitle(individualTitle, category, individual))
                        .withXhtml(individualElement.html()).build();
                confluencePages.add(individualConfluencePage);

                individual++;
            }

            category++;
            continue;
        }

        // If we're in category mode, we use the remaining page data
        final ConfluencePage categoryConfluencePage = ConfluencePageBuilder.aConfluencePage()
                .withPageType(PageType.CATEGORY).withOriginalTitle(categoryTitle)
                .withConfluenceTitle(buildConfluenceTitle(categoryTitle, category, null))
                .withXhtml(categoryElement.html()).build();
        confluencePages.add(categoryConfluencePage);

        category++;
    }

    return confluencePages;
}

From source file:org.jlucrum.datafetcher.FetcherNasdaqOmxNordic.java

/**
 * Fetches a historical data series for one instrument and returns it keyed by date string.
 *
 * The parsed response document is looked up in {@code cache} first; only on a cache miss is
 * the query posted to {@code this.url}, and the parsed result is then stored in the cache.
 * Every table row after the first (header) row contributes one entry: the first cell is the
 * key and the cell at index {@code dataMap[type]} is the value, with thousands separators
 * (commas) removed. Rows with empty or missing cells are skipped.
 *
 * @param name     instrument name; translated through {@code stockMap} when a mapping exists
 * @param fromDate start of the requested range
 * @param toDate   end of the requested range
 * @param type     index into {@code dataMap} selecting which column to read
 * @return map from date text to value; empty when nothing could be read or an
 *         {@link IOException} occurred (the exception is logged, not rethrown)
 * @throws NumberFormatException if a selected cell is not numeric after removing commas
 */
public Map<String, Double> getData(String name, DateTime fromDate, DateTime toDate, int type) {
    HttpPost httpPost = new HttpPost(this.url);
    HttpResponse response = null;
    HashMap<String, Double> retMap = new HashMap<String, Double>();

    httpclient = getClient();

    List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
    String fixedName = stockMap.get(name);
    if (fixedName == null) {
        fixedName = name;
    }

    nameValuePairs.add(new BasicNameValuePair("xmlquery",
            "<post> " + "<param name=\"SubSystem\" value=\"History\"/> "
                    + "<param name=\"Action\" value=\"GetDataSeries\"/>"
                    + "<param name=\"AppendIntraDay\" value=\"no\"/>" + "<param name=\"Instrument\" value=\""
                    + fixedName + "\"/>" + "<param name=\"FromDate\" value=\"" + dateFormatter.print(fromDate)
                    + "\"/>" + "<param name=\"ToDate\" value=\"" + dateFormatter.print(toDate) + "\"/> "
                    + "<param name=\"hi__a\" value=\"0,1,2,4,21,8,10,11,12,9\"/> "
                    + "<param name=\"ext_xslt\" value=\"/nordicV3/hi_table_shares_adjusted.xsl\"/> "
                    + "<param name=\"ext_xslt_options\" value=\",undefined,\"/> "
                    + "<param name=\"ext_xslt_lang\" value=\"en\"/> "
                    + "<param name=\"ext_xslt_hiddenattrs\" value=\",ip,iv,\"/> "
                    + "<param name=\"ext_xslt_tableId\" value=\"historicalTable\"/> "
                    + "<param name=\"app\" value=\"/osakkeet/Historialliset_kurssitiedot/\"/> " + "</post>"));

    try {

        Document doc = (Document) cache.getData(fixedName, fromDate.toString(), toDate.toString());
        if (doc == null) {
            httpPost.setEntity(new UrlEncodedFormEntity(nameValuePairs, HTTP.UTF_8));
            response = httpclient.execute(httpPost);
            HttpEntity entity = response.getEntity();
            String resString = EntityUtils.toString(entity, "UTF-8");
            if (debug) {
                System.out.printf("Respond:%s", resString);
            }

            doc = Jsoup.parse(resString);
            cache.putData(fixedName, fromDate.toString(), toDate.toString(), doc);
            System.out.printf("Fetched from network:%s\n", name);
        }

        Elements elems = doc.select("tr");

        Iterator<Element> iter = elems.iterator();
        // Skip the header row, but only if there is one: a response without any <tr>
        // (e.g. an error page) must not blow up with NoSuchElementException.
        if (iter.hasNext()) {
            iter.next();
        }
        while (iter.hasNext()) {
            Element elem = iter.next();
            // getAllElements() lists the <tr> itself at index 0, so the cells start at 1.
            Elements dataElems = elem.getAllElements();
            /* Output Example:
            <tr id="historicalTable-">
              <td>2011-09-08</td>
              <td>25.29</td>
              <td>24.38</td>
              <td>24.93</td>
              <td>24.92</td>
              <td>895,389</td>
              <td>22,298,455</td>
              <td>5,524</td>
            </tr>
             */
            final int dataIndex = dataMap[type];
            // Ignore rows that do not have both the date cell and the requested data cell.
            if (dataElems.size() <= Math.max(1, dataIndex)) {
                continue;
            }

            final String dateText = dataElems.get(1).html();
            final String dataText = dataElems.get(dataIndex).html();
            if (dateText == null || dateText.length() == 0 || dataText == null || dataText.length() == 0) {
                continue;
            }

            retMap.put(dateText, Double.valueOf(dataText.replace(",", "")));

            if (debug) {
                System.out.printf("Date:%s data:%s\n", dateText, dataText);
            }
        }

        System.out.printf("Fetched %s/%s from NasdaqOmxNordic:%d\n", name, fixedName, retMap.size());
    } catch (IOException ex) {
        Logger.getLogger(FetcherNasdaqOmxNordic.class.getName()).log(Level.SEVERE, null, ex);
    }

    return retMap;
}

From source file:web.analyzer.utils.Utils.java

/**
 * Collects every element of the document whose tag name is contained in {@code HEADING_TAG}.
 *
 * Each heading is recorded with its tag name, its inner HTML and a "level" value. The level
 * is a running counter over the elements returned by {@code doc.select("*")}: it is
 * incremented for every visited element and reset to zero after an element without children.
 * NOTE(review): this counter is not the element's nesting depth in general - confirm that
 * callers expect exactly this value.
 *
 * @param doc the parsed document to scan
 * @return the headings in the order in which they were visited
 */
public List<Heading> docHeadingsProcess(Document doc) {
    final List<Heading> headings = new ArrayList<Heading>();
    int level = 0;

    for (final Element current : doc.select("*")) {
        level++;

        final String tag = current.tagName();
        if (HEADING_TAG.contains(tag)) {
            headings.add(new Heading(tag, current.html(), level));
        }

        // A childless element ends the current run, so counting starts over.
        if (current.children().size() == 0) {
            level = 0;
        }
    }

    return headings;
}

From source file:com.liato.bankdroid.banking.banks.AppeakPoker.java

/**
 * Opens the login target and stores the inner HTML of the chips cell in {@code mChips}.
 *
 * @return the {@code urlopen} instance used for the request
 * @throws LoginException if the expected table cell is not present in the response
 * @throws BankException  if the request fails with an I/O or protocol error
 */
@Override
public Urllib login() throws LoginException, BankException {
    try {
        final LoginPackage loginPackage = preLogin();
        final String page = urlopen.open(loginPackage.getLoginTarget());
        final Element chipsCell = Jsoup.parse(page).select("#content > table tr:eq(2) td:eq(1)").first();

        // Without the cell the login is treated as failed.
        if (chipsCell == null) {
            throw new LoginException(res.getText(R.string.invalid_username).toString());
        }
        mChips = chipsCell.html();
    } catch (ClientProtocolException e) {
        throw new BankException(e.getMessage());
    } catch (IOException e) {
        throw new BankException(e.getMessage());
    }
    return urlopen;
}

From source file:accountgen.controller.Controller.java

/**
 * Reads the person's name from the first {@code h3} inside the elements with class
 * {@code address} and stores it on the given person.
 *
 * The text before the first space becomes the first name and everything after it the last
 * name; the middle name is always set to the empty string. A name without any space yields
 * an empty last name.
 *
 * NOTE(review): the heading is dereferenced without a null check - presumably the page
 * always contains an {@code h3} inside {@code .address}; confirm.
 *
 * @param doc the parsed page containing the address block
 * @param p   the person to update
 */
private void setName(Document doc, Person p) {
    Elements e = doc.getElementsByClass("address");
    Element name = e.select("h3").first();

    // Split at the first space by position. The previous code fed the first name into
    // String.split() as a regular expression, which failed for single-word names, for names
    // containing regex metacharacters and when the first name re-occurred in the last name.
    final String fullName = name.html().trim();
    final int firstSpace = fullName.indexOf(' ');
    final String first = firstSpace < 0 ? fullName : fullName.substring(0, firstSpace);
    final String rest = firstSpace < 0 ? "" : fullName.substring(firstSpace + 1);

    p.setFirstname(StringEscapeUtils.unescapeHtml4(first).trim());
    p.setMiddlename("");
    p.setLastname(StringEscapeUtils.unescapeHtml4(rest).trim());
}

From source file:com.johan.vertretungsplan.parser.UntisCommonParser.java

/**
 * Parses a "Nachrichten zum Tag" (messages of the day) table from an Untis substitution
 * plan.
 *
 * Every table row that does not contain the text "Nachrichten zum Tag" becomes one message:
 * the contents of its {@code td} cells are joined with line breaks and appended to the
 * messages of the given day. Rows without any {@code td} cell are skipped.
 *
 * @param table
 *            the <code>table</code> element of the HTML document that should be parsed
 * @param data
 *            data of the school (from <code>Schule.getData()</code>); not read by this
 *            method
 * @param tag
 *            the {@link VertretungsplanTag} in which the messages are stored
 */
protected void parseNachrichten(Element table, JSONObject data, VertretungsplanTag tag) {
    Elements zeilen = table.select("tr:not(:contains(Nachrichten zum Tag))");
    for (Element zeile : zeilen) {
        Elements spalten = zeile.select("td");
        if (spalten.size() == 0) {
            // Nothing to join; previously this case failed in substring(1) on an empty string.
            continue;
        }

        StringBuilder info = new StringBuilder();
        boolean first = true;
        for (Element spalte : spalten) {
            // Separate cells with a newline, without a leading one before the first cell.
            if (!first) {
                info.append('\n');
            }
            first = false;
            info.append(TextNode.createFromEncoded(spalte.html(), null).getWholeText());
        }
        tag.getNachrichten().add(info.toString());
    }
}

From source file:com.johan.vertretungsplan.parser.SVPlanParser.java

/**
 * Logs in, loads every URL configured for the school and converts the loaded SVPlan pages
 * into a {@link Vertretungsplan}.
 *
 * Each page becomes one {@link VertretungsplanTag}. Its date comes from the
 * {@code .svp-plandatum-heute} / {@code .svp-plandatum-morgen} elements, or else from the
 * page title when it starts with "Vertretungsplan für "; otherwise "Unbekanntes Datum" is
 * used. Every non-header row of {@code .svp-tabelle} becomes one {@link Vertretung} that is
 * added to each class listed in the row. Paragraphs following an {@code h2} containing
 * "Mitteilungen" are stored as the day's messages.
 *
 * @return the plan containing one day per distinct date, in loading order
 * @throws IOException   if a loaded page contains no {@code .svp-tabelle} element
 * @throws JSONException if the school data lacks the expected "urls" / "encoding" entries
 */
public Vertretungsplan getVertretungsplan() throws IOException, JSONException {
    new LoginHandler(schule).handleLogin(executor, cookieStore, username, password);

    JSONArray urls = schule.getData().getJSONArray("urls");
    String encoding = schule.getData().getString("encoding");
    List<Document> docs = new ArrayList<Document>();

    for (int i = 0; i < urls.length(); i++) {
        JSONObject url = urls.getJSONObject(i);
        loadUrl(url.getString("url"), encoding, docs);
    }

    // Title prefix that precedes the plan date. The umlaut had been lost from the literal,
    // so the prefix could never match; it is spelled with a Unicode escape to keep it
    // independent of the source file encoding.
    final String titlePrefix = "Vertretungsplan f\u00fcr ";

    LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>();
    for (Document doc : docs) {
        if (doc.select(".svp-tabelle").size() > 0) {
            VertretungsplanTag tag = new VertretungsplanTag();
            String date = "Unbekanntes Datum";
            Elements planDatum = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen");
            if (planDatum.size() > 0) {
                date = planDatum.text();
            } else if (doc.title().startsWith(titlePrefix)) {
                date = doc.title().substring(titlePrefix.length());
            }
            tag.setDatum(date);
            Elements uploadDatum = doc.select(".svp-uploaddatum");
            if (uploadDatum.size() > 0) {
                tag.setStand(uploadDatum.text().replace("Aktualisierung: ", ""));
            }

            Elements rows = doc.select(".svp-tabelle tr");
            // Rows may omit the lesson cell; they then inherit the most recent lesson seen.
            String lastLesson = "";
            for (Element row : rows) {
                if (row.hasClass("svp-header")) {
                    continue;
                }

                Vertretung vertretung = new Vertretung();
                List<String> affectedClasses = new ArrayList<String>();

                for (Element column : row.select("td")) {
                    if (!hasData(column.text())) {
                        continue;
                    }
                    // The cell's class attribute tells which field it carries.
                    String type = column.className();
                    if (type.startsWith("svp-stunde")) {
                        vertretung.setLesson(column.text());
                        lastLesson = column.text();
                    } else if (type.startsWith("svp-klasse")) {
                        affectedClasses = Arrays.asList(column.text().split(", "));
                    } else if (type.startsWith("svp-esfehlt")) {
                        vertretung.setPreviousTeacher(column.text());
                    } else if (type.startsWith("svp-esvertritt")) {
                        vertretung.setTeacher(column.text());
                    } else if (type.startsWith("svp-fach")) {
                        vertretung.setSubject(column.text());
                    } else if (type.startsWith("svp-bemerkung")) {
                        vertretung.setDesc(column.text());
                        vertretung.setType(recognizeType(column.text()));
                    } else if (type.startsWith("svp-raum")) {
                        vertretung.setRoom(column.text());
                    }

                    if (vertretung.getLesson() == null) {
                        vertretung.setLesson(lastLesson);
                    }
                }

                if (vertretung.getType() == null) {
                    vertretung.setType("Vertretung");
                }

                // Register the substitution with every class it affects.
                for (String klasse : affectedClasses) {
                    KlassenVertretungsplan kv = tag.getKlassen().get(klasse);
                    if (kv == null) {
                        kv = new KlassenVertretungsplan(klasse);
                    }
                    kv.add(vertretung);
                    tag.getKlassen().put(klasse, kv);
                }
            }

            List<String> nachrichten = new ArrayList<String>();
            Elements mitteilungen = doc.select("h2:contains(Mitteilungen)");
            if (mitteilungen.size() > 0) {
                Element h2 = mitteilungen.first();
                // Walk the run of <p> siblings that directly follows the heading.
                Element sibling = h2.nextElementSibling();
                while (sibling != null && sibling.tagName().equals("p")) {
                    for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                            .split("<br />\\s*<br />")) {
                        if (hasData(nachricht)) {
                            nachrichten.add(nachricht);
                        }
                    }
                    sibling = sibling.nextElementSibling();
                }
            }
            tag.setNachrichten(nachrichten);

            tage.put(date, tag);
        } else {
            throw new IOException("keine SVPlan-Tabelle gefunden");
        }
    }
    Vertretungsplan v = new Vertretungsplan();
    v.setTage(new ArrayList<VertretungsplanTag>(tage.values()));

    return v;
}

From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java

/**
 * Loads the website entered in the view with the Ui4j browser engine, applies the entered
 * selector to the page's body HTML and writes the inner HTML of every selected element to
 * the view's output, separated by blank lines.
 *
 * The input controls are disabled while the work runs and are re-enabled afterwards even if
 * loading or parsing fails with an exception.
 */
public void processByUi4j() {
    // Disable fields in view.
    scrapeView.setWebsiteUrlTextFieldEnabled(false);
    scrapeView.setSelectorTextFieldEnabled(false);
    scrapeView.setScrapeButtonEnabled(false);
    scrapeView.setWorkInProgress(true);
    scrapeView.setOutput("");

    try {
        scrapeView.setProgressBarTaskText("initializing");
        logger.info("Start processing...");
        long beginTime = System.currentTimeMillis();

        // Output input parameters.
        if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) {
            logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector()
                    + "\"");
        }

        // Navigate to the requested page.
        scrapeView.setProgressBarTaskText("requesting page");
        logger.info("Requesting page...");
        Page page = browserEngine.navigate(scrapeView.getWebsiteUrl());
        //page.show();
        logger.info("Requesting of page completed.");

        scrapeView.setProgressBarTaskText("viewing page as HTML");
        logger.info("View page as HTML");
        String html = page.getDocument().getBody().getInnerHTML();

        // Unescape html.
        scrapeView.setProgressBarTaskText("unescaping HTML");
        logger.info("Unescape html");
        html = StringEscapeUtils.unescapeHtml4(html);

        logger.info("Get selector");
        String selector = scrapeView.getSelector();
        if (!html.isEmpty() && !selector.isEmpty()) {
            scrapeView.setProgressBarTaskText("parsing HTML");
            logger.info("Parse HTML");
            Document doc = Jsoup.parse(html);

            scrapeView.setProgressBarTaskText("selecting elements in HTML");
            logger.info("select elements in HTML");
            Elements selectedElements = doc.select(selector);

            if (!selectedElements.isEmpty()) {
                scrapeView.setProgressBarTaskText("parsing selected elements");
                logger.info("Parse extracted elements");
                StringBuilder sb = new StringBuilder();
                for (Element element : selectedElements) {
                    // Each element's inner HTML is followed by an empty line.
                    sb.append(element.html());
                    sb.append("\n");
                    sb.append("\n");
                }
                scrapeView.setOutput(sb.toString());
            }
        }

        browserEngine.clearCookies();

        long endTime = System.currentTimeMillis();
        logger.info("Process time: " + (endTime - beginTime) + " ms.");
        logger.info("Processing complete.");
    } finally {
        // Enable fields in view, also after a failure, so the UI never stays locked.
        scrapeView.setWorkInProgress(false);
        scrapeView.setScrapeButtonEnabled(true);
        scrapeView.setSelectorTextFieldEnabled(true);
        scrapeView.setWebsiteUrlTextFieldEnabled(true);
    }
}

From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java

/**
 * Loads the website entered in the view with the JxBrowser instance, applies the entered
 * selector to the page HTML and writes the inner HTML of every selected element to the
 * view's output, separated by blank lines.
 *
 * The method polls once per second until the browser reports that loading has finished. If
 * the thread is interrupted while waiting, the interrupt flag is restored and processing
 * continues with whatever the browser has loaded so far. The input controls are disabled
 * while the work runs and are re-enabled afterwards even if processing fails with an
 * exception.
 */
public void processByJxBrowser() {
    // Disable fields in view.
    scrapeView.setWebsiteUrlTextFieldEnabled(false);
    scrapeView.setSelectorTextFieldEnabled(false);
    scrapeView.setScrapeButtonEnabled(false);
    scrapeView.setWorkInProgress(true);
    scrapeView.setOutput("");

    try {
        scrapeView.setProgressBarTaskText("initializing");
        logger.info("Start processing...");
        long beginTime = System.currentTimeMillis();

        // Output input parameters.
        if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) {
            logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector()
                    + "\"");
        }

        // Navigate to the requested page.
        scrapeView.setProgressBarTaskText("requesting page");
        logger.info("Requesting page...");
        browser.loadURL(scrapeView.getWebsiteUrl());
        // Wait for loading.
        while (browser.isLoading()) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Keep the interruption visible to the caller and stop waiting; sleeping
                // again with the flag set would only throw immediately on every pass.
                Thread.currentThread().interrupt();
                logger.info("Interrupted while waiting for the page to load.");
                break;
            }
        }
        logger.info("Requesting of page completed.");

        scrapeView.setProgressBarTaskText("viewing page as HTML");
        logger.info("View page as HTML");
        String html = browser.getHTML();

        // Unescape html.
        scrapeView.setProgressBarTaskText("unescaping HTML");
        logger.info("Unescape html");
        html = StringEscapeUtils.unescapeHtml4(html);

        logger.info("Get selector");
        String selector = scrapeView.getSelector();
        if (!html.isEmpty() && !selector.isEmpty()) {
            scrapeView.setProgressBarTaskText("parsing HTML");
            logger.info("Parse HTML");
            Document doc = Jsoup.parse(html);

            scrapeView.setProgressBarTaskText("selecting elements in HTML");
            logger.info("select elements in HTML");
            Elements selectedElements = doc.select(selector);

            if (!selectedElements.isEmpty()) {
                scrapeView.setProgressBarTaskText("parsing selected elements");
                logger.info("Parse extracted elements");
                StringBuilder sb = new StringBuilder();
                for (Element element : selectedElements) {
                    // Each element's inner HTML is followed by an empty line.
                    sb.append(element.html());
                    sb.append("\n");
                    sb.append("\n");
                }
                scrapeView.setOutput(sb.toString());
            }
        }

        browser.stop();

        long endTime = System.currentTimeMillis();
        logger.info("Process time: " + (endTime - beginTime) + " ms.");
        logger.info("Processing complete.");
    } finally {
        // Enable fields in view, also after a failure, so the UI never stays locked.
        scrapeView.setWorkInProgress(false);
        scrapeView.setScrapeButtonEnabled(true);
        scrapeView.setSelectorTextFieldEnabled(true);
        scrapeView.setWebsiteUrlTextFieldEnabled(true);
    }
}