Example usage for org.jsoup.nodes Element getElementsByTag

List of usage examples for org.jsoup.nodes Element getElementsByTag

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:eu.masconsult.bgbanking.banks.dskbank.DskClient.java

/**
 * Fetches the account list page and converts its first table into bank accounts.
 * <p>
 * The request URI is {@code BASE_URL} plus the encoded {@code XML_ID} parameter
 * followed by the raw {@code authToken}. Each row of the first table found under
 * the element with id {@code PageContent} is passed to
 * {@code obtainBankAccountFromHtmlTableRow}; rows for which that helper returns
 * {@code null} are skipped. An account without a currency receives the currency
 * of the closest preceding account that had one.
 *
 * @param authToken query-string fragment appended verbatim to the request URI
 * @return the accounts extracted from the page, in table order
 * @throws IOException if the HTTP exchange or reading the response fails
 * @throws ParseException if the status is not 200 or the expected markup is missing
 * @throws AuthenticationException if {@code checkLoggedIn} rejects the returned page
 */
@Override
public List<RawBankAccount> getBankAccounts(String authToken)
        throws IOException, ParseException, AuthenticationException {
    final String query = URLEncodedUtils.format(
            Arrays.asList(new BasicNameValuePair(XML_ID, LIST_ACCOUNTS_XML_ID)), ENCODING);
    final String uri = BASE_URL + "?" + query + "&" + authToken;

    // Request the accounts list page
    Log.i(TAG, "Getting from: " + uri);
    final HttpGet request = new HttpGet(uri);
    request.setHeader("Accept", "*/*");

    final DefaultHttpClient client = getHttpClient();

    Log.v(TAG, "sending " + request.toString());
    final HttpResponse response = client.execute(request);

    final int statusCode = response.getStatusLine().getStatusCode();
    if (statusCode != HttpStatus.SC_OK) {
        throw new ParseException("getBankAccounts: unhandled http status "
                + statusCode + " " + response.getStatusLine().getReasonPhrase());
    }

    final Document page = Jsoup.parse(response.getEntity().getContent(), "utf-8", BASE_URL);
    if (!checkLoggedIn(page)) {
        throw new AuthenticationException("session expired!");
    }

    final Element pageContent = page.getElementById("PageContent");
    if (pageContent == null) {
        throw new ParseException("getBankAccounts: can't find PageContent");
    }

    final Elements allTables = pageContent.getElementsByTag("table");
    if (allTables == null || allTables.size() == 0) {
        throw new ParseException("getBankAccounts: can't find table in PageContent");
    }

    final Elements tableRows = allTables.first().getElementsByTag("tr");
    if (tableRows == null || tableRows.size() == 0) {
        throw new ParseException("getBankAccounts: first table is empty in PageContent");
    }

    final List<RawBankAccount> accounts = new ArrayList<RawBankAccount>(tableRows.size());

    // Currency carried over from the most recent account that declared one
    String inheritedCurrency = null;
    for (Element tableRow : tableRows) {
        final RawBankAccount account = obtainBankAccountFromHtmlTableRow(tableRow);
        if (account == null) {
            continue;
        }
        final String currency = account.getCurrency();
        if (currency != null) {
            inheritedCurrency = currency;
        } else {
            account.setCurrency(inheritedCurrency);
        }
        accounts.add(account);
    }

    return accounts;
}

From source file:com.mythesis.userbehaviouranalysis.WebParser.java

/**
 * Parse the url and get all the content
 * @param link the url to parse/* ww w  .  jav  a  2 s  .c o  m*/
 * @return The content parsed
 */
private String cleanhtml(String link) {
    try {
        Document doc = Jsoup.connect(link).timeout(10 * 1000).get();
        String title = doc.title();
        String mainbody = doc.body().text();
        Elements links = doc.select("a[href]");
        Elements media = doc.select("[src]");
        //fix link html to remove https:// or http:// and simple /
        if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) {
            link = link.substring(0, link.length() - 1);
        }
        if (link.substring(0, 5).equalsIgnoreCase("https")) {
            link = link.substring(8);
        } else if (link.substring(0, 4).equalsIgnoreCase("http")) {
            link = link.substring(7);
        }
        String anchortext = "";
        String alttext = "";
        //-----get the anchor text of internal links
        for (Element el : links) {
            String str_check = el.attr("abs:href");
            if (el.attr("abs:href").contains(link) && el.text().length() > 1) {
                anchortext = anchortext + el.text() + " ";
            }
        }
        //-------get alt text to internal images links
        for (Element medi : media) {
            if (medi.getElementsByTag("img").attr("src").contains(link)) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
            }
            if (medi.getElementsByTag("img").attr("src").startsWith("/")) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
            }
        }
        String content = mainbody + title + anchortext + alttext;

        return content;

    } catch (IOException ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (NullPointerException ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (Exception ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    }

}

From source file:eu.masconsult.bgbanking.banks.fibank.my.MyFIBankClient.java

/**
 * Logs in with the given token, posts a request to {@code SUMMARY_URL} and
 * converts the rows of the element with id {@code AvailableAmt} into bank accounts.
 * <p>
 * Rows for which {@code obtainBankAccountFromHtmlTableRow} returns {@code null}
 * are skipped.
 *
 * @param authtoken JSON form of the {@code AuthToken} to log in with
 * @return the accounts extracted from the summary page, in row order
 * @throws IOException if the HTTP exchange or reading the response fails
 * @throws ParseException if the status is not 200 or {@code AvailableAmt} is missing
 * @throws AuthenticationException declared by the interface; presumably raised
 *         by {@code performLogin} (not visible here)
 */
@Override
public List<RawBankAccount> getBankAccounts(String authtoken)
        throws IOException, ParseException, AuthenticationException {
    Log.v(TAG, "getBankAccounts: " + authtoken);
    final AuthToken token = AuthToken.fromJson(authtoken);
    final DefaultHttpClient client = getHttpClient();

    performLogin(client, token);

    final ArrayList<NameValuePair> formFields = new ArrayList<NameValuePair>();
    formFields.add(new BasicNameValuePair("request_type", "open"));
    formFields.add(new BasicNameValuePair("open_tab", "home"));
    formFields.add(new BasicNameValuePair("LogSesID", token.sessionId));

    final HttpEntity formEntity;
    try {
        formEntity = new UrlEncodedFormEntity(formFields);
    } catch (final UnsupportedEncodingException e) {
        // the default encoding is always available, so this cannot occur in practice
        throw new IllegalStateException(e);
    }

    // Equivalent request on the command line:
    //   curl -b 'ASP.NET_SessionId=afmrm5b0eiesmhha14ml2xml'
    //        -d request_type=open -d open_tab=home
    //        -d LogSesID=80e46fac-e188-4055-93de-137bac9db9a3
    //        https://my.fibank.bg/lAccSummary
    final HttpPost summaryRequest = new HttpPost(SUMMARY_URL);
    summaryRequest.addHeader(formEntity.getContentType());
    summaryRequest.setHeader("Accept", "*/*");
    summaryRequest.setEntity(formEntity);

    final HttpResponse httpResponse = client.execute(summaryRequest);

    final int statusCode = httpResponse.getStatusLine().getStatusCode();
    if (statusCode != HttpStatus.SC_OK) {
        throw new ParseException("getBankAccounts: unhandled http status "
                + statusCode + " " + httpResponse.getStatusLine().getReasonPhrase());
    }

    final String body = EntityUtils.toString(httpResponse.getEntity());
    Log.v(TAG, "response = " + body);

    final Element amountsTable = Jsoup.parse(body, BASE_URL).getElementById("AvailableAmt");
    if (amountsTable == null) {
        throw new ParseException("can't find @AvailableAmt");
    }

    final List<RawBankAccount> accounts = new LinkedList<RawBankAccount>();
    for (Element tableRow : amountsTable.getElementsByTag("tr")) {
        final RawBankAccount account = obtainBankAccountFromHtmlTableRow(tableRow);
        if (account == null) {
            continue;
        }
        accounts.add(account);
    }

    return accounts;
}

From source file:com.webbfontaine.valuewebb.timer.RatesUpdater.java

/**
 * Extracts exchange rates from the page returned by {@code getPage()}.
 * <p>
 * The first table whose text contains "Dollar" is used. The columns headed
 * "Pairs Code" and "Selling" are located in the first row's {@code th} cells
 * (both default to column 0 when no such header is found). For a pair code
 * starting with "GHS" the remainder is the currency and the rate is inverted
 * (1 / rate, at scale {@code Constants.FRACTION_DIGITS_NUMBER_4}); otherwise
 * the currency is the part of the pair code before the last "GHS".
 * <p>
 * Rows that cannot be interpreted (too few cells, non-numeric or zero rate to
 * invert, pair code without "GHS") are logged and skipped, so one malformed
 * row does not discard the remaining rates.
 *
 * @return currency code to rate; empty when no usable rates table is found
 */
public HashMap<String, BigDecimal> ratesFromBank() {
    HashMap<String, BigDecimal> rates = new HashMap<String, BigDecimal>();

    Document doc = getPage();
    Element tableOfRates = null;
    for (Element table : doc.getElementsByTag("table")) {
        if (table.text().contains("Dollar")) {
            tableOfRates = table;
            break;
        }
    }

    if (tableOfRates == null) {
        LOGGER.error("Error reading rates from URL");
        return rates;
    }

    Elements trs = tableOfRates.getElementsByTag("tr");
    if (trs.size() == 0) {
        // nothing to read; also protects the header lookup below
        LOGGER.error("Error reading rates from URL: rates table has no rows");
        return rates;
    }

    int pairsCodeIndex = 0;
    int sellingIndex = 0;
    Elements columns = trs.get(0).getElementsByTag("th");
    for (int i = 0; i < columns.size(); ++i) {
        String header = columns.get(i).text();
        if (header.equalsIgnoreCase("Pairs Code")) {
            pairsCodeIndex = i;
        }
        if (header.equalsIgnoreCase("Selling")) {
            sellingIndex = i;
        }
    }

    int requiredCells = Math.max(pairsCodeIndex, sellingIndex) + 1;
    for (Element tr : trs) {
        Elements tds = tr.getElementsByTag("td");
        if (tds.size() == 0) {
            // header row (th cells only)
            continue;
        }
        if (tds.size() < requiredCells) {
            LOGGER.error("Skipping rates row with too few cells: " + tr.text());
            continue;
        }

        String currPair = tds.get(pairsCodeIndex).text().trim();
        String rateText = tds.get(sellingIndex).text().trim().replace(",", "");

        BigDecimal rate;
        try {
            rate = new BigDecimal(rateText);
        } catch (NumberFormatException e) {
            LOGGER.error("Skipping rates row with unparsable rate '" + rateText + "' for " + currPair);
            continue;
        }

        String curr;
        if (currPair.startsWith("GHS")) {
            if (rate.signum() == 0) {
                LOGGER.error("Skipping rates row with zero rate for " + currPair);
                continue;
            }
            curr = currPair.substring(3);
            rate = BigDecimal.ONE.divide(rate, Constants.FRACTION_DIGITS_NUMBER_4,
                    Utils.getRoundingMode());
        } else {
            int ghsAt = currPair.lastIndexOf("GHS");
            if (ghsAt < 0) {
                LOGGER.error("Skipping rates row with unexpected pair code '" + currPair + "'");
                continue;
            }
            curr = currPair.substring(0, ghsAt);
        }

        rates.put(curr, rate);
    }
    return rates;
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
     * Cleaning up leftover of HTML code from the cell content.
     * /*from  w  w w  .ja  v  a 2  s  .  com*/
     * @param cell_content HTML code contains in the table cell 
     * @return an array list containing each line of the cell_content withh all HTML markup removed
     */
    private ArrayList<String> cleanLeftoverHTML(Element cell_content) {

        ArrayList<String> streets_and_numbers = new ArrayList<String>();

        /* <div>s designate separate lines inside the table cell */
        for (Element addr_line : cell_content.getElementsByTag("div")) {

            /* skip empty address lines */
            String addr_line_text = cleanupUNICODE(addr_line.text());
            if (StringUtils.isBlank(addr_line_text))
                continue;

            /* <strong> is not particularly useful, but can designate placement of simple separators like space */
            Elements streets = addr_line.getElementsByTag("strong");
            if (!streets.isEmpty()) {
                addr_line_text = addr_line_text.replaceFirst(Pattern.quote(streets.text()),
                        " " + streets.text() + " ");
            }

            streets_and_numbers.add(addr_line_text);
        }
        return streets_and_numbers;
    }

From source file:com.thesmartweb.swebrank.WebParser.java

/**
 * Parse the url and get all the content
 * @param link_html the url to parse/*w  w w . j  ava 2 s.  c  om*/
 * @return The content parsed
 */
public String cleanhtml(String link_html) {
    try {
        Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get();
        String title = doc.title();
        String mainbody = doc.body().text();
        Elements links = doc.select("a[href]");
        Elements media = doc.select("[src]");
        //fix link html to remove https:// or http:// and simple /
        if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) {
            link_html = link_html.substring(0, link_html.length() - 1);
        }
        if (link_html.substring(0, 5).equalsIgnoreCase("https")) {
            link_html = link_html.substring(8);
        } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) {
            link_html = link_html.substring(7);
        }
        String anchortext = "";
        String alttext = "";
        //-----get the anchor text of internal links
        for (Element link : links) {
            String str_check = link.attr("abs:href").toString();
            if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) {
                anchortext = anchortext + link.text() + " ";
            }
        }
        //-------get alt text to internal images links
        for (Element medi : media) {
            if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString();
            }
            if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString();
            }
        }
        String content = mainbody + title + anchortext + alttext;

        return content;
    } catch (IOException ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (NullPointerException ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (Exception ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    }

}

From source file:com.serphacker.serposcope.scraper.google.scraper.GoogleScraper.java

/**
 * Parses the most recently fetched result page and collects its result links.
 * <p>
 * The parsed document is stored in {@code lastSerpHtml}. For every {@code h3}
 * element not recognised by {@code isSiteLinkElement}, the first {@code a}
 * element under it is handed to {@code extractLink}; non-null results are
 * appended to {@code urls}.
 *
 * @param urls list that receives the extracted links, in document order
 * @return {@code Status.ERROR_NETWORK} when there is no content to parse,
 *         {@code Status.OK} otherwise
 */
protected Status parseSerp(List<String> urls) {
    final String pageSource = http.getContentAsString();
    if (pageSource == null || pageSource.isEmpty()) {
        return Status.ERROR_NETWORK;
    }

    lastSerpHtml = Jsoup.parse(pageSource);
    if (lastSerpHtml == null) {
        return Status.ERROR_NETWORK;
    }

    for (Element heading : lastSerpHtml.getElementsByTag("h3")) {
        if (!isSiteLinkElement(heading)) {
            final String resultUrl = extractLink(heading.getElementsByTag("a").first());
            if (resultUrl != null) {
                urls.add(resultUrl);
            }
        }
    }

    return Status.OK;
}

From source file:com.gote.downloader.kgs.KGSDownloader.java

/**
 * Try to found out if a game has been already played or by looking into archives page by page.
 * /*from w w w .j  a  v  a  2s  .co  m*/
 * @param pGame Game to found and update
 * @param pPlayerArchivePages List of archive pages
 */
private void retrieveAndUpdateGame(Game pGame, List<Document> pPlayerArchivePages) {
    stage = "Etape 3/3 - Rcupration de la partie";
    for (Document playerArchivePage : pPlayerArchivePages) {
        Elements tableRows = playerArchivePage.select("tr");

        for (Element row : tableRows) {
            if (Pattern.matches(regexGame, row.toString())) {
                // LOGGER.log(Level.INFO, "[TRACE] New row checked " + row.toString());

                // "Visible", "Blanc", "Noir", "Genre", "Debutee le", "Type", "Resultat"
                Elements tableCells = row.getElementsByTag("td");

                String gameUrl = isPublicGame(tableCells.get(GAMEURL));

                // May check with time if you can leave or continue
                if (gameUrl != null && !gameUrl.isEmpty()) {
                    if (gameUrl.toLowerCase().contains(pGame.getBlack().getPseudo().toLowerCase())
                            && gameUrl.toLowerCase().contains(pGame.getWhite().getPseudo().toLowerCase())) {
                        pGame.setGameUrl(gameUrl);
                        pGame.setResult(getStdResultFromKGSResult(tableCells.get(RESULT).text()));
                        File sgf = new File(AppUtil.PATH_TO_TOURNAMENTS + tournament.getTitle() + "/"
                                + AppUtil.PATH_TO_SGFS + tournament.getTitle().trim() + "_round"
                                + pGame.getBlack().getPseudo() + "_" + pGame.getWhite().getPseudo() + ".sgf");
                        try {
                            URL url = new URL(gameUrl);
                            FileUtils.copyURLToFile(url, sgf);
                        } catch (MalformedURLException e) {
                            log(Level.WARNING, "URL " + gameUrl + " malformee", e);
                        } catch (IOException e) {
                            log(Level.WARNING, "Erreur lors de l'ecriture du fichier", e);
                        }

                        // Leave the process
                        return;
                    }
                } else {
                    log(Level.INFO, "La partie " + tableCells
                            + " n'est pas visible ou un probleme a eu lieu lors de la recuperation de l'url");
                }
            }
        }
    }
}

From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSPlatformStatusHtmlParser.java

/**
 * Parses a platform-status HTML page and records the extracted values in
 * {@code summaryInfoMap} and {@code boxInfoMap}.
 * <p>
 * Three sources on the page are used: the text around the {@code <b>} element
 * "Daemon Status" (current time and uptime, via {@code currentTimeStampPattern}),
 * the element following the parent of each {@code <b>} whose text is in
 * {@code fieldNameSet} (key/value pairs), and {@code <center>} elements whose
 * first child is a {@code <font>} (daemon version and platform). An
 * {@link IOException} while loading the document is logged, not rethrown.
 *
 * @param is stream containing the HTML page, read as UTF-8
 */
@Override
public void getPlatformStatusData(InputStream is) {

    try {

        Document doc = DataUtil.load(is, "UTF-8", "");
        Element body = doc.body();

        // most of the target items are sandwiched by <b> tags;
        // this can be used to reach each target item.
        String tmpCurrentTime = null;
        String tmpUpTime = null;
        String currentTime = null;
        Elements tags = body.getElementsByTag("b");

        for (Element tag : tags) {

            // get the current-time string: for 1.52.3 or older daemons
            // this is the only place to get it.
            String tagText = tag.text();
            logger.log(Level.FINE, "working on tagText={0}", tagText);

            if (tagText.equals("Daemon Status")) {
                // find current time and up running
                currentTime = tag.parent().parent().text();
                logger.log(Level.INFO, "currentTime text=[{0}]", currentTime);
                // "currentTime =Daemon Status lockss.statelib.lib.in.us (usdocspln group) 01:25:55 03/01/12, up 7d5h21m"
                tmstmpMatcher = currentTimeStampPattern.matcher(currentTime);

                if (tmstmpMatcher.find()) {
                    logger.log(Level.INFO, "group 0={0}", tmstmpMatcher.group(0));
                    tmpCurrentTime = tmstmpMatcher.group(1);
                    logger.log(Level.INFO, "Current Time:group 1={0}", tmpCurrentTime);
                    tmpUpTime = tmstmpMatcher.group(2);
                    logger.log(Level.INFO, "UpTime:group 2={0}", tmpUpTime);
                }
            }

            // get the remaining key-value sets
            if (fieldNameSet.contains(tagText)) {

                Element parent = tag.parent();
                // the value is expected in the element right after the label's parent;
                // it is missing when the label is the last element of its container
                Element valueElement = parent.nextElementSibling();
                if (valueElement == null) {
                    logger.log(Level.WARNING, "no value element found for field {0}", tagText);
                } else {
                    String fieldValue = valueElement.text();
                    logger.log(Level.FINE, "{0}={1}", new Object[] { tagText, fieldValue });
                    summaryInfoMap.put(tagText, fieldValue);
                }
            }
        }

        // extract the daemon version and platform info that are located
        // at the bottom
        // these data are sandwiched by a <center> tag
        Elements ctags = body.getElementsByTag("center");
        String version = null;
        String platform = null;
        for (Element ctag : ctags) {
            String cText = ctag.text();
            logger.log(Level.FINE, "center tag Text={0}", cText);
            // cText is like this:
            // Daemon 1.53.3 built 28-Jan-12 01:06:36 on build7.lockss.org, Linux RPM 1
            // child(0) throws on an element without child elements, so check first
            if (StringUtils.isNotBlank(cText) && !ctag.children().isEmpty()
                    && ctag.child(0).nodeName().equals("font")) {
                String[] versionPlatform = cText.split(", ");
                if (versionPlatform.length == 2) {
                    logger.log(Level.INFO, "daemon version={0};platform={1}", versionPlatform);
                    version = DaemonStatusDataUtil.getDaemonVersion(versionPlatform[0]);
                    platform = versionPlatform[1];
                } else {
                    // the split did not yield exactly two parts
                    logger.log(Level.WARNING, "String-formatting differs; use pattern matching");
                    version = DaemonStatusDataUtil.getDaemonVersion(cText);
                    int platformOffset = cText.lastIndexOf(", ") + 2;
                    platform = cText.substring(platformOffset);
                    logger.log(Level.INFO, "platform={0}", platform);

                }
            }
        }

        if (summaryInfoMap.containsKey("V3 Identity")) {
            String ipAddress = DaemonStatusDataUtil.getPeerIpAddress(summaryInfoMap.get("V3 Identity"));
            logger.log(Level.INFO, "ipAddress={0}", ipAddress);

            if (StringUtils.isNotBlank(ipAddress)) {
                boxInfoMap.put("host", ipAddress);
                if (!ipAddress.equals(summaryInfoMap.get("IP Address"))) {
                    summaryInfoMap.put("IP Address", ipAddress);
                }
            } else {
                logger.log(Level.WARNING, "host token is blank or null: use IP Address instead");
                logger.log(Level.INFO, "IP Address={0}", summaryInfoMap.get("IP Address"));
                boxInfoMap.put("host", summaryInfoMap.get("IP Address"));
            }
        }

        // for pre-1.53.3 versions: fill in values the summary table did not provide
        boxInfoMap.put("time", tmpCurrentTime);
        if (!summaryInfoMap.containsKey("Current Time")) {
            summaryInfoMap.put("Current Time", tmpCurrentTime);
        }

        boxInfoMap.put("up", tmpUpTime);
        if (!summaryInfoMap.containsKey("Uptime")) {
            summaryInfoMap.put("Uptime", tmpUpTime);
        }

        boxInfoMap.put("version", version);
        if (!summaryInfoMap.containsKey("Daemon Version")) {
            summaryInfoMap.put("Daemon Version", version);
        }

        boxInfoMap.put("platform", platform);
        if (!summaryInfoMap.containsKey("Platform")) {
            summaryInfoMap.put("Platform", platform);
        }

    } catch (IOException ex) {
        logger.log(Level.SEVERE, "IO error", ex);
    }

    logger.log(Level.INFO, "boxInfoMap={0}", boxInfoMap);
    logger.log(Level.INFO, "summaryInfo={0}", summaryInfoMap);
}

From source file:org.keycloak.testsuite.admin.concurrency.ConcurrentLoginTest.java

/**
 * Builds the HTTP request that submits the login form found in the given page.
 * <p>
 * The form is looked up by the id {@code kc-form-login}. Of its {@code input}
 * elements only those named "username" and "password" are submitted, filled
 * with the supplied values; the request targets the form's {@code action}.
 *
 * @param html     markup of the page containing the login form
 * @param username value for the "username" input
 * @param password value for the "password" input
 * @return a POST request carrying the credentials as a UTF-8 form entity
 * @throws UnsupportedEncodingException declared for callers; an encoding
 *         failure inside this method is rethrown as {@link RuntimeException}
 * @throws UnsupportedOperationException if the form's method is not "post"
 */
protected HttpUriRequest handleLogin(String html, String username, String password)
        throws UnsupportedEncodingException {
    log.debug("Extracting form's data...");

    // Keycloak form id
    final Element form = Jsoup.parse(html).getElementById("kc-form-login");
    final String formMethod = form.attr("method");
    final String formAction = form.attr("action");

    final List<NameValuePair> credentials = new ArrayList<>();
    for (Element input : form.getElementsByTag("input")) {
        final String fieldName = input.attr("name");
        switch (fieldName) {
        case "username":
            credentials.add(new BasicNameValuePair(fieldName, username));
            break;
        case "password":
            credentials.add(new BasicNameValuePair(fieldName, password));
            break;
        default:
            // other inputs are not submitted
            break;
        }
    }

    // equalsIgnoreCase(null) is false, so no separate null check is needed
    if (!"post".equalsIgnoreCase(formMethod)) {
        throw new UnsupportedOperationException("not supported yet!");
    }

    final HttpPost request = new HttpPost(formAction);
    final UrlEncodedFormEntity body;
    try {
        body = new UrlEncodedFormEntity(credentials, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
    }
    request.setEntity(body);

    return request;
}