List of usage examples for org.jsoup.nodes.Element.getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:eu.masconsult.bgbanking.banks.dskbank.DskClient.java
@Override public List<RawBankAccount> getBankAccounts(String authToken) throws IOException, ParseException, AuthenticationException { String uri = BASE_URL + "?" + URLEncodedUtils.format( Arrays.asList(new BasicNameValuePair(XML_ID, LIST_ACCOUNTS_XML_ID)), ENCODING) + "&" + authToken; // Get the accounts list Log.i(TAG, "Getting from: " + uri); final HttpGet get = new HttpGet(uri); get.setHeader("Accept", "*/*"); DefaultHttpClient httpClient = getHttpClient(); Log.v(TAG, "sending " + get.toString()); final HttpResponse resp = httpClient.execute(get); if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { throw new ParseException("getBankAccounts: unhandled http status " + resp.getStatusLine().getStatusCode() + " " + resp.getStatusLine().getReasonPhrase()); }/* w ww . ja va 2 s . co m*/ HttpEntity entity = resp.getEntity(); Document doc = Jsoup.parse(entity.getContent(), "utf-8", BASE_URL); if (!checkLoggedIn(doc)) { throw new AuthenticationException("session expired!"); } Element content = doc.getElementById("PageContent"); if (content == null) { throw new ParseException("getBankAccounts: can't find PageContent"); } Elements tables = content.getElementsByTag("table"); if (tables == null || tables.size() == 0) { throw new ParseException("getBankAccounts: can't find table in PageContent"); } Elements rows = tables.first().getElementsByTag("tr"); if (rows == null || rows.size() == 0) { throw new ParseException("getBankAccounts: first table is empty in PageContent"); } ArrayList<RawBankAccount> bankAccounts = new ArrayList<RawBankAccount>(rows.size()); String lastCurrency = null; for (Element row : rows) { RawBankAccount bankAccount = obtainBankAccountFromHtmlTableRow(row); if (bankAccount != null) { if (bankAccount.getCurrency() == null) { bankAccount.setCurrency(lastCurrency); } else { lastCurrency = bankAccount.getCurrency(); } bankAccounts.add(bankAccount); } } return bankAccounts; }
From source file:com.mythesis.userbehaviouranalysis.WebParser.java
/** * Parse the url and get all the content * @param link the url to parse/* ww w . jav a 2 s .c o m*/ * @return The content parsed */ private String cleanhtml(String link) { try { Document doc = Jsoup.connect(link).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) { link = link.substring(0, link.length() - 1); } if (link.substring(0, 5).equalsIgnoreCase("https")) { link = link.substring(8); } else if (link.substring(0, 4).equalsIgnoreCase("http")) { link = link.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element el : links) { String str_check = el.attr("abs:href"); if (el.attr("abs:href").contains(link) && el.text().length() > 1) { anchortext = anchortext + el.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").contains(link)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } if (medi.getElementsByTag("img").attr("src").startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:eu.masconsult.bgbanking.banks.fibank.my.MyFIBankClient.java
@Override public List<RawBankAccount> getBankAccounts(String authtoken) throws IOException, ParseException, AuthenticationException { Log.v(TAG, "getBankAccounts: " + authtoken); AuthToken authToken = AuthToken.fromJson(authtoken); DefaultHttpClient httpClient = getHttpClient(); performLogin(httpClient, authToken); final ArrayList<NameValuePair> params = new ArrayList<NameValuePair>(); params.add(new BasicNameValuePair("request_type", "open")); params.add(new BasicNameValuePair("open_tab", "home")); params.add(new BasicNameValuePair("LogSesID", authToken.sessionId)); final HttpEntity entity; try {/*from ww w .jav a 2s . c o m*/ entity = new UrlEncodedFormEntity(params); } catch (final UnsupportedEncodingException e) { // this should never happen. throw new IllegalStateException(e); } HttpPost post = new HttpPost(SUMMARY_URL); post.addHeader(entity.getContentType()); post.setHeader("Accept", "*/*"); post.setEntity(entity); /* * curl -b 'ASP.NET_SessionId=afmrm5b0eiesmhha14ml2xml' -d * request_type=open -d open_tab=home -d * LogSesID=80e46fac-e188-4055-93de-137bac9db9a3 * https://my.fibank.bg/lAccSummary */ HttpResponse resp = httpClient.execute(post); if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { throw new ParseException("getBankAccounts: unhandled http status " + resp.getStatusLine().getStatusCode() + " " + resp.getStatusLine().getReasonPhrase()); } String response = EntityUtils.toString(resp.getEntity()); Log.v(TAG, "response = " + response); Document doc = Jsoup.parse(response, BASE_URL); Element table = doc.getElementById("AvailableAmt"); if (table == null) { throw new ParseException("can't find @AvailableAmt"); } List<RawBankAccount> bankAccounts = new LinkedList<RawBankAccount>(); for (Element row : table.getElementsByTag("tr")) { RawBankAccount bankAccount = obtainBankAccountFromHtmlTableRow(row); if (bankAccount != null) { bankAccounts.add(bankAccount); } } return bankAccounts; }
From source file:com.webbfontaine.valuewebb.timer.RatesUpdater.java
/**
 * Reads currency rates from the page returned by {@code getPage()}.
 * <p>
 * The first table whose text contains "Dollar" is used. The positions of the
 * "Pairs Code" and "Selling" columns are taken from the {@code th} cells of its first row
 * (both default to column 0 when the header is not found). For each data row the pair code
 * is reduced to the non-GHS currency: a pair starting with "GHS" is stored as the inverse
 * of its selling rate, any other pair is stored with the selling rate as given.
 * <p>
 * Rows that cannot be interpreted (too few cells, non-numeric or zero rate, pair code
 * without "GHS") are logged and skipped so that one bad row does not discard the others.
 *
 * @return currency code to rate; empty if no suitable table was found
 */
public HashMap<String, BigDecimal> ratesFromBank() {
    HashMap<String, BigDecimal> rates = new HashMap<String, BigDecimal>();
    Document doc = getPage();

    Element tableOfRates = null;
    for (Element table : doc.getElementsByTag("table")) {
        if (table.text().contains("Dollar")) {
            tableOfRates = table;
            break;
        }
    }
    if (tableOfRates == null) {
        LOGGER.error("Error reading rates from URL");
        return rates;
    }

    Elements trs = tableOfRates.getElementsByTag("tr");
    if (trs.isEmpty()) {
        // A matching table without rows has nothing to read; avoid trs.get(0) below.
        LOGGER.error("Error reading rates from URL");
        return rates;
    }

    int pairsCodeIndex = 0;
    int sellingIndex = 0;
    Elements columns = trs.get(0).getElementsByTag("th");
    for (int i = 0; i < columns.size(); ++i) {
        String header = columns.get(i).text();
        if (header.equalsIgnoreCase("Pairs Code")) {
            pairsCodeIndex = i;
        }
        if (header.equalsIgnoreCase("Selling")) {
            sellingIndex = i;
        }
    }

    int lastNeededCell = Math.max(pairsCodeIndex, sellingIndex);
    for (Element tr : trs) {
        Elements tds = tr.getElementsByTag("td");
        if (tds.size() <= lastNeededCell) {
            // Header rows (no td) and rows too short to hold both columns.
            continue;
        }

        String currPair = tds.get(pairsCodeIndex).text().trim();
        String rateText = tds.get(sellingIndex).text().trim().replace(",", "");

        BigDecimal rate;
        try {
            rate = new BigDecimal(rateText);
        } catch (NumberFormatException e) {
            LOGGER.error("Skipping rate row '" + currPair + "': selling rate is not a number: '"
                    + rateText + "'");
            continue;
        }

        String curr;
        if (currPair.startsWith("GHS")) {
            if (rate.signum() == 0) {
                LOGGER.error("Skipping rate row '" + currPair + "': selling rate is zero");
                continue;
            }
            curr = currPair.substring(3);
            rate = BigDecimal.ONE.divide(rate, Constants.FRACTION_DIGITS_NUMBER_4,
                    Utils.getRoundingMode());
        } else {
            int ghsAt = currPair.lastIndexOf("GHS");
            if (ghsAt < 0) {
                LOGGER.error("Skipping rate row '" + currPair + "': pair code does not contain GHS");
                continue;
            }
            curr = currPair.substring(0, ghsAt);
        }
        rates.put(curr, rate);
    }
    return rates;
}
From source file:GIST.IzbirkomExtractor.TableExtractor.java
/** * Cleaning up leftover of HTML code from the cell content. * /*from w w w .ja v a 2 s . com*/ * @param cell_content HTML code contains in the table cell * @return an array list containing each line of the cell_content withh all HTML markup removed */ private ArrayList<String> cleanLeftoverHTML(Element cell_content) { ArrayList<String> streets_and_numbers = new ArrayList<String>(); /* <div>s designate separate lines inside the table cell */ for (Element addr_line : cell_content.getElementsByTag("div")) { /* skip empty address lines */ String addr_line_text = cleanupUNICODE(addr_line.text()); if (StringUtils.isBlank(addr_line_text)) continue; /* <strong> is not particularly useful, but can designate placement of simple separators like space */ Elements streets = addr_line.getElementsByTag("strong"); if (!streets.isEmpty()) { addr_line_text = addr_line_text.replaceFirst(Pattern.quote(streets.text()), " " + streets.text() + " "); } streets_and_numbers.add(addr_line_text); } return streets_and_numbers; }
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Parse the url and get all the content * @param link_html the url to parse/*w w w . j ava 2 s. c om*/ * @return The content parsed */ public String cleanhtml(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) { link_html = link_html.substring(0, link_html.length() - 1); } if (link_html.substring(0, 5).equalsIgnoreCase("https")) { link_html = link_html.substring(8); } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) { link_html = link_html.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element link : links) { String str_check = link.attr("abs:href").toString(); if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) { anchortext = anchortext + link.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { 
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:com.serphacker.serposcope.scraper.google.scraper.GoogleScraper.java
protected Status parseSerp(List<String> urls) { String html = http.getContentAsString(); if (html == null || html.isEmpty()) { return Status.ERROR_NETWORK; }//from w w w. j a va2s . com lastSerpHtml = Jsoup.parse(html); if (lastSerpHtml == null) { return Status.ERROR_NETWORK; } Elements h3Elts = lastSerpHtml.getElementsByTag("h3"); for (Element h3Elt : h3Elts) { if (isSiteLinkElement(h3Elt)) { continue; } String link = extractLink(h3Elt.getElementsByTag("a").first()); if (link != null) { urls.add(link); } } return Status.OK; }
From source file:com.gote.downloader.kgs.KGSDownloader.java
/** * Try to found out if a game has been already played or by looking into archives page by page. * /*from w w w .j a v a 2s .co m*/ * @param pGame Game to found and update * @param pPlayerArchivePages List of archive pages */ private void retrieveAndUpdateGame(Game pGame, List<Document> pPlayerArchivePages) { stage = "Etape 3/3 - Rcupration de la partie"; for (Document playerArchivePage : pPlayerArchivePages) { Elements tableRows = playerArchivePage.select("tr"); for (Element row : tableRows) { if (Pattern.matches(regexGame, row.toString())) { // LOGGER.log(Level.INFO, "[TRACE] New row checked " + row.toString()); // "Visible", "Blanc", "Noir", "Genre", "Debutee le", "Type", "Resultat" Elements tableCells = row.getElementsByTag("td"); String gameUrl = isPublicGame(tableCells.get(GAMEURL)); // May check with time if you can leave or continue if (gameUrl != null && !gameUrl.isEmpty()) { if (gameUrl.toLowerCase().contains(pGame.getBlack().getPseudo().toLowerCase()) && gameUrl.toLowerCase().contains(pGame.getWhite().getPseudo().toLowerCase())) { pGame.setGameUrl(gameUrl); pGame.setResult(getStdResultFromKGSResult(tableCells.get(RESULT).text())); File sgf = new File(AppUtil.PATH_TO_TOURNAMENTS + tournament.getTitle() + "/" + AppUtil.PATH_TO_SGFS + tournament.getTitle().trim() + "_round" + pGame.getBlack().getPseudo() + "_" + pGame.getWhite().getPseudo() + ".sgf"); try { URL url = new URL(gameUrl); FileUtils.copyURLToFile(url, sgf); } catch (MalformedURLException e) { log(Level.WARNING, "URL " + gameUrl + " malformee", e); } catch (IOException e) { log(Level.WARNING, "Erreur lors de l'ecriture du fichier", e); } // Leave the process return; } } else { log(Level.INFO, "La partie " + tableCells + " n'est pas visible ou un probleme a eu lieu lors de la recuperation de l'url"); } } } } }
From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSPlatformStatusHtmlParser.java
/** * * @param is//from www . j a v a 2 s . c o m */ @Override public void getPlatformStatusData(InputStream is) { try { Document doc = DataUtil.load(is, "UTF-8", ""); Element body = doc.body(); // most of the target items are sandwitched by <b> tag // this can be used to reach each target item. String tmpCurrentTime = null; String tmpUpTime = null; String currentTime = null; Elements tags = body.getElementsByTag("b"); for (Element tag : tags) { // get the current-time string: for 1.52.3 or older daemons // this is the ony place to get it. String tagText = tag.text(); logger.log(Level.FINE, "working on tagText={0}", tagText); if (tagText.equals("Daemon Status")) { // find current time and up running currentTime = tag.parent().parent().text(); logger.log(Level.INFO, "currentTime text=[{0}]", currentTime); // "currentTime =Daemon Status lockss.statelib.lib.in.us (usdocspln group) 01:25:55 03/01/12, up 7d5h21m" tmstmpMatcher = currentTimeStampPattern.matcher(currentTime); if (tmstmpMatcher.find()) { logger.log(Level.INFO, "group 0={0}", tmstmpMatcher.group(0)); tmpCurrentTime = tmstmpMatcher.group(1); logger.log(Level.INFO, "Current Time:group 1={0}", tmpCurrentTime); tmpUpTime = tmstmpMatcher.group(2); logger.log(Level.INFO, "UpTime:group 2={0}", tmpUpTime); } } // get the remaining key-value sets if (fieldNameSet.contains(tagText)) { Element parent = tag.parent(); String fieldValue = parent.nextElementSibling().text(); logger.log(Level.FINE, "{0}={1}", new Object[] { tagText, fieldValue }); summaryInfoMap.put(tagText, fieldValue); } } // extract the daemon version and platform info that are located // at the bottom // these data are sandwitched by a <center> tag Elements ctags = body.getElementsByTag("center"); String version = null; String platform = null; for (Element ctag : ctags) { String cText = ctag.text(); logger.log(Level.FINE, "center tag Text={0}", cText); // cText is like this: // Daemon 1.53.3 built 28-Jan-12 01:06:36 on build7.lockss.org, Linux RPM 1 if 
(StringUtils.isNotBlank(cText) && ctag.child(0).nodeName().equals("font")) { String[] versionPlatform = cText.split(", "); if (versionPlatform.length == 2) { logger.log(Level.INFO, "daemon version={0};platform={1}", versionPlatform); version = DaemonStatusDataUtil.getDaemonVersion(versionPlatform[0]); platform = versionPlatform[1]; } else { // the above regex failed logger.log(Level.WARNING, "String-formatting differs; use pattern matching"); version = DaemonStatusDataUtil.getDaemonVersion(cText); int platformOffset = cText.lastIndexOf(", ") + 2; platform = cText.substring(platformOffset); logger.log(Level.INFO, "platform={0}", platform); } } } if (summaryInfoMap.containsKey("V3 Identity")) { String ipAddress = DaemonStatusDataUtil.getPeerIpAddress(summaryInfoMap.get("V3 Identity")); logger.log(Level.INFO, "ipAddress={0}", ipAddress); if (StringUtils.isNotBlank(ipAddress)) { boxInfoMap.put("host", ipAddress); if (!ipAddress.equals(summaryInfoMap.get("IP Address"))) { summaryInfoMap.put("IP Address", ipAddress); } } else { logger.log(Level.WARNING, "host token is blank or null: use IP Address instead"); logger.log(Level.INFO, "IP Address={0}", summaryInfoMap.get("IP Address")); boxInfoMap.put("host", summaryInfoMap.get("IP Address")); } } // for pre-1.53.3 versions boxInfoMap.put("time", tmpCurrentTime); if (!summaryInfoMap.containsKey("Current Time")) { summaryInfoMap.put("Current Time", tmpCurrentTime); } boxInfoMap.put("up", tmpUpTime); if (!summaryInfoMap.containsKey("Uptime")) { summaryInfoMap.put("Uptime", tmpUpTime); } boxInfoMap.put("version", version); if (!summaryInfoMap.containsKey("Daemon Version")) { summaryInfoMap.put("Daemon Version", version); } boxInfoMap.put("platform", platform); if (!summaryInfoMap.containsKey("Platform")) { summaryInfoMap.put("Platform", platform); } } catch (IOException ex) { logger.log(Level.SEVERE, "IO error", ex); } logger.log(Level.INFO, "boxInfoMap={0}", boxInfoMap); logger.log(Level.INFO, "summaryInfo={0}", 
summaryInfoMap); }
From source file:org.keycloak.testsuite.admin.concurrency.ConcurrentLoginTest.java
protected HttpUriRequest handleLogin(String html, String username, String password) throws UnsupportedEncodingException { log.debug("Extracting form's data..."); // Keycloak form id Element loginform = Jsoup.parse(html).getElementById("kc-form-login"); String method = loginform.attr("method"); String action = loginform.attr("action"); List<NameValuePair> paramList = new ArrayList<>(); for (Element inputElement : loginform.getElementsByTag("input")) { String key = inputElement.attr("name"); if (key.equals("username")) { paramList.add(new BasicNameValuePair(key, username)); } else if (key.equals("password")) { paramList.add(new BasicNameValuePair(key, password)); }/*from w w w . j a v a 2s . c om*/ } boolean isPost = method != null && "post".equalsIgnoreCase(method); if (isPost) { HttpPost req = new HttpPost(action); UrlEncodedFormEntity formEntity; try { formEntity = new UrlEncodedFormEntity(paramList, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } req.setEntity(formEntity); return req; } else { throw new UnsupportedOperationException("not supported yet!"); } }