Example usage for org.jsoup.nodes Element getElementsByTag

List of usage examples for org.jsoup.nodes Element getElementsByTag

Introduction

On this page you can find example usages of org.jsoup.nodes Element getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:org.keycloak.testsuite.util.saml.RequiredConsentBuilder.java

/**
 * Builds the HTTP request that submits the first form found on a consent page.
 * <p>
 * The inputs with id "kc-login" and "kc-cancel" are mutually exclusive: the
 * former is sent only when {@code approveConsent} is set, the latter only when
 * it is not. Every other input of the form is sent with its current value.
 *
 * @param consentPage HTML source of the consent page
 * @param currentURI URI against which the form action is resolved for a POST
 * @return a POST request when the form method is "post" (case-insensitive),
 *         otherwise a GET request carrying the values as query parameters
 * @throws IllegalArgumentException if the page contains no form element
 */
public HttpUriRequest handleConsentPage(String consentPage, URI currentURI) {
    List<NameValuePair> parameters = new LinkedList<>();

    // Only the first form is ever processed: both branches below return.
    for (Element form : Jsoup.parse(consentPage).getElementsByTag("form")) {
        String action = form.attr("action");

        for (Element input : form.getElementsByTag("input")) {
            String id = input.id();
            boolean isAccept = Objects.equals(id, "kc-login");
            boolean isReject = Objects.equals(id, "kc-cancel");

            if (!isAccept && !isReject) {
                parameters.add(new BasicNameValuePair(input.attr("name"), input.val()));
                continue;
            }

            // Send the accept button when approving, the reject button otherwise.
            if (isAccept == approveConsent) {
                parameters.add(new BasicNameValuePair(input.attr("name"), input.attr("value")));
            }
        }

        if (!"post".equalsIgnoreCase(form.attr("method"))) {
            UriBuilder query = UriBuilder.fromPath(action);
            for (NameValuePair pair : parameters) {
                query.queryParam(pair.getName(), pair.getValue());
            }
            return new HttpGet(query.build());
        }

        HttpPost post = new HttpPost(currentURI.resolve(action));
        try {
            post.setEntity(new UrlEncodedFormEntity(parameters, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        return post;
    }

    throw new IllegalArgumentException("Invalid consent page: " + consentPage);
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
     * Decides whether a row looks like the first row of a table that can be parsed.
     * <p>
     * The row must have exactly four "td" cells. It is accepted when the first cell
     * is within Levenshtein distance 3 of the expected header text, or when no cell
     * is empty and the first cell parses as an integer.
     *
     * @param row the candidate first row of a table
     * @return {@code true} if the row matches one of the accepted shapes
     */
    private boolean isParsableTable(Element row) {
        Elements cells = row.getElementsByTag("td");
        if (cells.size() != 4) {
            return false;
        }

        /* a header-like first cell is enough on its own */
        String firstCellText = cleanupUNICODE(cells.first().text());
        if (StringUtils.getLevenshteinDistance(firstCellText, " . -") < 3) {
            return true;
        }

        /* a data row must not contain any empty cell */
        for (Element cell : cells) {
            if (cleanupUNICODE(cell.text()).isEmpty()) {
                return false;
            }
        }

        /* ... and its first column must hold a number */
        try {
            Integer.parseInt(firstCellText.trim());
        } catch (NumberFormatException e) {
            return false;
        }
        return true;
    }

From source file:com.github.jrrdev.mantisbtsync.core.common.auth.request.AuthHttpPost.java

/**
 * {@inheritDoc}/*w  w  w . j  ava  2  s  . c o m*/
 *
 */
@Override
public void configFromPreviousResponse(final HttpEntity entity) throws ParseException, IOException {
    if (formAction == null || entity == null) {
        return;
    }

    final String content = EntityUtils.toString(entity);
    final Elements forms = Jsoup.parse(content).getElementsByTag(HTML_FORM);

    for (final Element form : forms) {
        // Get the form
        if (form.hasAttr(HTML_ACTION) && formAction.equalsIgnoreCase(form.attr(HTML_ACTION))) {

            // Parsing of hidden inputs
            final Elements inputs = form.getElementsByTag(HTML_INPUT);
            for (final Element input : inputs) {
                if (input.hasAttr(HTML_TYPE) && HTML_HIDDEN.equalsIgnoreCase(input.attr(HTML_TYPE))) {
                    final String value = input.attr(HTML_VALUE);
                    final String name = input.attr(HTML_NAME);
                    builder = builder.addParameter(name, value);
                }
            }
            break;
        }
    }
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Removes every element returned by {@code topNode.getAllElements()} whose text
 * contains fewer than five stop words, unless the element contains an "object"
 * or "embed" tag. So little text suggests the element is some sort of link
 * rather than article content.
 */
private void removeParagraphsWithFewWords() {
    if (logger.isDebugEnabled()) {
        logger.debug("removeParagraphsWithFewWords starting...");
    }

    for (Element candidate : this.topNode.getAllElements()) {
        try {
            // count the stop words that appear in this node's text
            WordStats stats = StopWords.getStopWordCount(candidate.text());
            if (stats.getStopWordCount() >= 5) {
                continue;
            }
            if (!candidate.getElementsByTag("object").isEmpty()) {
                continue;
            }
            if (!candidate.getElementsByTag("embed").isEmpty()) {
                continue;
            }
            candidate.remove();
        } catch (IllegalArgumentException e) {
            logger.error(e.getMessage());
        }
    }
}

From source file:com.dajodi.scandic.JSoupScraper.java

/**
 * Reads the stays listed in the transactions table of the account overview.
 * <p>
 * Rows containing a "th" are skipped. Every other row must have exactly three
 * "td" cells: hotel name, date range and points.
 *
 * @param accountOverview element containing the transactions table
 * @return the stays in document order; an empty list when the table is absent
 * @throws ScandicHtmlException if a data row does not have three cells
 */
private List<ScandicStay> getStays(Element accountOverview) {
    Element table = accountOverview
            .getElementById("ctl00_MainBodyRegion_AccountOverview1_tableTransactions");
    if (table == null) {
        return Collections.emptyList();
    }

    List<ScandicStay> stays = new ArrayList<ScandicStay>();
    for (Element row : table.getElementsByTag("tr")) {
        if (!row.getElementsByTag("th").isEmpty()) {
            continue;
        }

        Elements columns = row.getElementsByTag("td");
        if (columns.size() != 3) {
            throw new ScandicHtmlException(
                    "unknown table node, html is funky.  could hide row if this is a serious problem.");
        }

        String hotelName = Util.trimIfNonNull(columns.get(0).text());
        String dateRange = Util.trimIfNonNull(columns.get(1).text());
        String points = Util.trimIfNonNull(columns.get(2).text());

        ScandicStay stay = new ScandicStay();
        Date[] fromTo = Util.parseDates(dateRange);
        int nights = Util.daysBetween(fromTo[0], fromTo[1]);

        stay.setHotelName(hotelName);
        stay.setNumPoints(Integer.parseInt(points));
        stay.setFromDate(fromTo[0]);
        stay.setToDate(fromTo[1]);
        stay.setNumNights(nights);
        // The position in the result list doubles as the HTML order.
        stay.setHtmlOrder(stays.size());
        stays.add(stay);
    }

    return stays;
}

From source file:nl.phanos.liteliveresultsclient.LoginHandler.java

/**
 * Downloads the event search page and turns each clickable row of the
 * "overview" table body into a {@code Wedstrijd}.
 * <p>
 * The id is the second-to-last "/"-separated part of the row's onclick
 * attribute. The date, name and club come from the first, second and fourth
 * cell. A row that fails to parse is printed to standard output and skipped.
 *
 * @return the parsed events as an array
 * @throws Exception if the page cannot be fetched or has no overview table
 */
public Object[] getEigenWedstrijden() throws Exception {
    ArrayList<Wedstrijd> wedstrijden = new ArrayList<Wedstrijd>();
    String html = GetPageContent(
            "https://www.atletiek.nu/feeder.php?page=search&do=events&search=&predefinedSearchTemplate=3");
    Element tableBody = Jsoup.parse(html).getElementById("overview").getElementsByTag("tbody").first();

    for (Element row : tableBody.getElementsByTag("tr")) {
        if (!row.hasAttr("onclick")) {
            continue;
        }
        try {
            Wedstrijd w = new Wedstrijd();
            String[] parts = row.attr("onclick").split("/");
            w.id = parts[parts.length - 2];

            Elements cells = row.getElementsByTag("td");
            w.date = cells.first().text();
            w.club = cells.get(3).text().replace(" ", "").replace(",", "");
            w.name = w.date + " - " + cells.get(1).getElementsByClass("hidden-xs").first().text();
            wedstrijden.add(w);
        } catch (Exception e) {
            System.out.println(e);
        }
    }
    return wedstrijden.toArray();
}

From source file:org.keycloak.testsuite.util.saml.UpdateProfileBuilder.java

/**
 * Builds the HTTP request that submits the first form found on an Update
 * Profile page.
 * <p>
 * Only inputs whose name is a key of {@code this.parameters} are submitted,
 * each with the configured value. Configured names that match no input are
 * reported in a warning.
 *
 * @param loginPage HTML source of the Update Profile page
 * @param currentURI URI of the page; the form action of a POST is resolved
 *        against it so that relative and empty actions work. When
 *        {@code null}, the action is used as is.
 * @return a POST request when the form method is "post" (case-insensitive),
 *         otherwise a GET request carrying the values as query parameters
 * @throws IllegalArgumentException if the page contains no form element
 */
public HttpUriRequest handleUpdateProfile(String loginPage, URI currentURI) {
    org.jsoup.nodes.Document theUpdateProfilePage = Jsoup.parse(loginPage);
    Set<String> unusedParams = new HashSet<>(this.parameters.keySet());

    List<NameValuePair> parameters = new LinkedList<>();
    // Only the first form is ever processed: both branches below return.
    for (Element form : theUpdateProfilePage.getElementsByTag("form")) {
        String method = form.attr("method");
        String action = form.attr("action");
        boolean isPost = "post".equalsIgnoreCase(method);

        for (Element input : form.getElementsByTag("input")) {
            String name = input.attr("name");
            if (this.parameters.containsKey(name)) {
                parameters.add(new BasicNameValuePair(name, this.parameters.get(name)));
                unusedParams.remove(name);
            }
        }

        if (!unusedParams.isEmpty()) {
            LOG.warnf("Unused parameter names at Update Profile page: %s", unusedParams);
        }

        if (isPost) {
            // Resolve against the current page. An absolute action is returned
            // unchanged by URI.resolve, so existing callers are unaffected.
            URI target = currentURI == null ? URI.create(action) : currentURI.resolve(action);
            HttpPost res = new HttpPost(target);

            UrlEncodedFormEntity formEntity;
            try {
                formEntity = new UrlEncodedFormEntity(parameters, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException(e);
            }
            res.setEntity(formEntity);

            return res;
        } else {
            UriBuilder b = UriBuilder.fromPath(action);
            for (NameValuePair parameter : parameters) {
                b.queryParam(parameter.getName(), parameter.getValue());
            }
            return new HttpGet(b.build());
        }
    }

    throw new IllegalArgumentException("Invalid update profile form: " + loginPage);
}

From source file:net.parser.JobParser.java

/**
 * Returns the text of the element that follows a labelled element.
 * <p>
 * Among the elements selected by {@code cssQuery}, the first one whose
 * combined "strong" text equals {@code helpStr} is the label; the value is the
 * text of its next element sibling.
 *
 * @param helpStr label text to look for
 * @param cssQuery CSS selector for the candidate label elements
 * @return the value text, or {@code null} when no label matches or the
 *         matching label has no following sibling element
 */
private String getPropertyString(String helpStr, String cssQuery) {
    for (Element element : doc.select(cssQuery)) {
        if (element.getElementsByTag("strong").text().equals(helpStr)) {
            // nextElementSibling() is null when the label is the last child;
            // report that as "not found" instead of failing.
            Element valueElement = element.nextElementSibling();
            return valueElement == null ? null : valueElement.text();
        }
    }

    return null;
}

From source file:com.screenslicer.core.util.BrowserUtil.java

/**
 * Finds the element of the page that best matches the description in
 * {@code htmlNode} and converts it with the
 * {@code toElement(browser, element, htmlNode, recurse)} overload.
 * <p>
 * A unique id match is tried first. Otherwise every populated attribute of
 * {@code htmlNode} contributes a set of candidate elements, each appearance of
 * an element in a set counts as votes, and the element with the most votes wins.
 *
 * @param browser browser used to obtain the page and the current URL
 * @param htmlNode description of the element to look for
 * @param body parsed page to search; when {@code null} it is obtained from the browser
 * @param recurse passed through unchanged to the overload
 * @return the result of the overload for the best candidate
 * @throws ActionFailed declared by this method; presumably raised by the helpers it calls
 */
private static WebElement toElement(Browser browser, HtmlNode htmlNode, Element body, boolean recurse)
        throws ActionFailed {
    if (body == null) {
        body = BrowserUtil.openElement(browser, true, null, null, null, null);
    }
    // Fast path: an id that identifies exactly one element is tried before any voting.
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        Elements elements = body.getElementsByAttributeValue("id", htmlNode.id);
        if (elements.size() == 1) {
            WebElement element = toElement(browser, elements.get(0), htmlNode, recurse);
            if (element != null) {
                return element;
            }
        }
    }
    // Each entry is one candidate set; an element gains votes for every appearance.
    List<Elements> selected = new ArrayList<Elements>();
    // Tag name if given; otherwise a node described by an href is looked for among anchors.
    if (!CommonUtil.isEmpty(htmlNode.tagName)) {
        selected.add(body.getElementsByTag(htmlNode.tagName));
    } else if (!CommonUtil.isEmpty(htmlNode.href)) {
        selected.add(body.getElementsByTag("a"));
    }
    // One candidate set per attribute that must match exactly.
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        selected.add(body.getElementsByAttributeValue("id", htmlNode.id));
    }
    if (!CommonUtil.isEmpty(htmlNode.name)) {
        selected.add(body.getElementsByAttributeValue("name", htmlNode.name));
    }
    if (!CommonUtil.isEmpty(htmlNode.type)) {
        selected.add(body.getElementsByAttributeValue("type", htmlNode.type));
    }
    if (!CommonUtil.isEmpty(htmlNode.value)) {
        selected.add(body.getElementsByAttributeValue("value", htmlNode.value));
    }
    if (!CommonUtil.isEmpty(htmlNode.title)) {
        selected.add(body.getElementsByAttributeValue("title", htmlNode.title));
    }
    if (!CommonUtil.isEmpty(htmlNode.role)) {
        selected.add(body.getElementsByAttributeValue("role", htmlNode.role));
    }
    if (!CommonUtil.isEmpty(htmlNode.alt)) {
        selected.add(body.getElementsByAttributeValue("alt", htmlNode.alt));
    }
    if (htmlNode.classes != null && htmlNode.classes.length > 0) {
        // Count, per element, how many of the requested classes it carries.
        Map<Element, Integer> found = new HashMap<Element, Integer>();
        for (int i = 0; i < htmlNode.classes.length; i++) {
            Elements elements = body.getElementsByClass(htmlNode.classes[i]);
            for (Element element : elements) {
                if (!found.containsKey(element)) {
                    found.put(element, 0);
                }
                found.put(element, found.get(element) + 1);
            }
        }
        // Keep only the elements with the highest class count that occurs at all.
        Elements elements = new Elements();
        for (int i = htmlNode.classes.length; i > 0; i--) {
            for (Map.Entry<Element, Integer> entry : found.entrySet()) {
                if (entry.getValue() == i) {
                    elements.add(entry.getKey());
                }
            }
            if (!elements.isEmpty()) {
                break;
            }
        }
        selected.add(elements);
    }
    if (!CommonUtil.isEmpty(htmlNode.href)) {
        Elements hrefs = body.getElementsByAttribute("href");
        Elements toAdd = new Elements();
        String currentUrl = browser.getCurrentUrl();
        String hrefGiven = htmlNode.href;
        // An element is added several times to weight the quality of its href match:
        // exact (ignoring case) x3, fuzzy suffix x2, fuzzy substring x1, equal canonical URI x1.
        for (Element href : hrefs) {
            String hrefFound = href.attr("href");
            if (hrefGiven.equalsIgnoreCase(hrefFound)) {
                toAdd.add(href);
                toAdd.add(href);
                toAdd.add(href);
            } else if (htmlNode.fuzzy && hrefFound != null && hrefFound.endsWith(hrefGiven)) {
                toAdd.add(href);
                toAdd.add(href);
            } else if (htmlNode.fuzzy && hrefFound != null && hrefFound.contains(hrefGiven)) {
                toAdd.add(href);
            } else {
                String uriGiven = UrlUtil.toCanonicalUri(currentUrl, hrefGiven);
                String uriFound = UrlUtil.toCanonicalUri(currentUrl, hrefFound);
                if (uriGiven.equalsIgnoreCase(uriFound)) {
                    toAdd.add(href);
                }
            }
        }
        selected.add(toAdd);
    }
    if (!CommonUtil.isEmpty(htmlNode.innerText)) {
        // Two sets: text containing the literal, and text equal to it apart from
        // surrounding whitespace. An element matching the second also matches the first.
        selected.add(body.getElementsMatchingText(Pattern.quote(htmlNode.innerText)));
        selected.add(body.getElementsMatchingText("^\\s*" + Pattern.quote(htmlNode.innerText) + "\\s*$"));
    }
    if (htmlNode.multiple != null) {
        selected.add(body.getElementsByAttribute("multiple"));
    }
    // Tally: 2 votes per appearance, plus 1 when NodeUtil.isHidden reports false.
    Map<Element, Integer> votes = new HashMap<Element, Integer>();
    for (Elements elements : selected) {
        for (Element element : elements) {
            if (!votes.containsKey(element)) {
                votes.put(element, 0);
            }
            votes.put(element, votes.get(element) + 2);
            if (!NodeUtil.isHidden(element)) {
                votes.put(element, votes.get(element) + 1);
            }
        }
    }
    // Strictly-greater comparison: on a tie the element met first in map iteration wins.
    int maxVote = 0;
    Element maxElement = null;
    for (Map.Entry<Element, Integer> entry : votes.entrySet()) {
        if (entry.getValue() > maxVote) {
            maxVote = entry.getValue();
            maxElement = entry.getKey();
        }
    }
    // NOTE(review): maxElement is null when nothing matched; presumably the overload
    // handles a null element - confirm.
    return toElement(browser, maxElement, htmlNode, recurse);
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
     * Extracts polling-station addresses from every parsable table of an HTML file.
     * <p>
     * A table is processed only when its first row passes
     * {@link #isParsableTable(Element)}. Within such a table, rows without cells,
     * header-like rows and rows whose cells are all empty are skipped. For every
     * remaining row the station id (1st cell) is parsed as an integer and the
     * 2nd to 4th cells are handed over as text to the address extraction.
     *
     * @param input_html the HTML file to process, read as UTF-8
     * @throws IOException if the file cannot be read
     * @throws TableExtractorException if a data row has no address cell
     * @throws CloneNotSupportedException declared for the helpers called here
     * @throws SQLException declared for the helpers called here
     * @throws ResultSinkException declared for the helpers called here
     */
    public void processHTMLfile(File input_html) throws IOException, TableExtractorException,
            CloneNotSupportedException, SQLException, ResultSinkException {

        logger.info("Start processing " + input_html);

        Document doc = Jsoup.parse(input_html, "UTF-8");
        Elements tables = doc.getElementsByTag("table");

        /* count of parseable tables found */
        int tablesFound = 0;

        /* determine raion name */
        String raionName = extractRaionFromFileName(input_html.getName());

        // TODO: inflect raion name

        for (Element table : tables) {
            boolean firstRow = true;

            for (Element row : table.getElementsByTag("tr")) {
                Elements cells = row.getElementsByTag("td");

                if (firstRow) {
                    /* the 1st row decides whether the whole table is processed */
                    if (!isParsableTable(row)) {
                        break;
                    }
                    firstRow = false;
                    logger.info("Processing table #" + ++tablesFound + " in " + input_html);
                }

                /* a row without any td (empty tr, th-only row) has nothing to extract;
                 * without this guard cells.first() below would be null */
                if (cells.isEmpty()) {
                    continue;
                }

                /* skip the row if it looks like a table header */
                if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                        " . -") < 3) {
                    continue;
                }

                /* skip rows with all cells empty */
                boolean emptyRow = true;
                for (Element cell : cells) {
                    emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty();
                }
                if (emptyRow) {
                    continue;
                }

                /* pick the cells by position; missing trailing cells stay null */
                int cellCount = cells.size();
                Element stationId = cells.get(0);
                Element addressField = cellCount > 1 ? cells.get(1) : null;
                /* NOTE(review): 3rd column, presumably the address of an organisation - confirm */
                Element orgAddress = cellCount > 2 ? cells.get(2) : null;
                Element stationAddress = cellCount > 3 ? cells.get(3) : null;

                if (addressField == null) {
                    throw new TableExtractorException("Address list not found", row, input_html);
                }

                /* extract int from poll station id */
                int psid;
                try {
                    psid = Integer.parseInt(cleanupUNICODE(stationId.text()).trim().replaceAll("[^\\d]", ""));
                } catch (NumberFormatException e) {
                    /* the exception object is built only for its formatted message */
                    Exception te = new TableExtractorException("Failed to parse polling station ID >"
                            + cleanupUNICODE(stationId.text()).trim() + "<: ", stationId, input_html);
                    logger.severe(te.getMessage() + "; rest of " + input_html + " ignored.");
                    /* NOTE(review): returns without resultSink.commit() - confirm this is intended */
                    return;
                }

                /* extraction from HTML completely finished, now we work only with the addresses in the text form */
                extractAddressesFromText(raionName.trim(), psid, cleanLeftoverHTML(addressField),
                        cleanLeftoverHTML(orgAddress), cleanLeftoverHTML(stationAddress));
            }
        }

        if (tablesFound == 0) {
            logger.severe("No parsable tables found in " + input_html);
        }
        resultSink.commit();

        logger.info("" + tablesFound + " table(s) processed in " + input_html);
    }