List of usage examples for org.jsoup.nodes.Element.getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:org.keycloak.testsuite.util.saml.RequiredConsentBuilder.java
/**
 * Builds the request that submits the consent form of the given page.
 * <p>
 * Only the first {@code <form>} element is used. The input with id
 * "kc-login" is submitted when {@code approveConsent} is set, the input with
 * id "kc-cancel" when it is not; every other input is always submitted.
 *
 * @param consentPage HTML of the consent page
 * @param currentURI URI against which the form action is resolved for a POST form
 * @return a POST request carrying the inputs as a URL-encoded entity when the
 *         form method is "post" (case-insensitive), otherwise a GET request
 *         carrying the inputs as query parameters
 * @throws IllegalArgumentException if the page contains no form
 */
public HttpUriRequest handleConsentPage(String consentPage, URI currentURI) {
    final org.jsoup.nodes.Document parsedPage = Jsoup.parse(consentPage);
    final List<NameValuePair> formData = new LinkedList<>();

    for (Element form : parsedPage.getElementsByTag("form")) {
        // equalsIgnoreCase(null) is false, so no separate null check is required.
        final boolean usePost = "post".equalsIgnoreCase(form.attr("method"));
        final String target = form.attr("action");

        for (Element field : form.getElementsByTag("input")) {
            final boolean approveButton = Objects.equals(field.id(), "kc-login");
            final boolean cancelButton = !approveButton && Objects.equals(field.id(), "kc-cancel");

            if (!approveButton && !cancelButton) {
                // Ordinary input: always part of the submission.
                formData.add(new BasicNameValuePair(field.attr("name"), field.val()));
            } else if (approveButton == approveConsent) {
                // Of the two buttons, only the one matching the decision is sent.
                formData.add(new BasicNameValuePair(field.attr("name"), field.attr("value")));
            }
        }

        if (!usePost) {
            final UriBuilder query = UriBuilder.fromPath(target);
            for (NameValuePair pair : formData) {
                query.queryParam(pair.getName(), pair.getValue());
            }
            return new HttpGet(query.build());
        }

        final HttpPost post = new HttpPost(currentURI.resolve(target));
        try {
            post.setEntity(new UrlEncodedFormEntity(formData, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        return post;
    }

    throw new IllegalArgumentException("Invalid consent page: " + consentPage);
}
From source file:GIST.IzbirkomExtractor.TableExtractor.java
/**
 * Decides whether a row looks like the first row of a parsable table.
 * <p>
 * The row must have exactly 4 {@code td} cells. It qualifies when its 1st
 * cell (after {@code cleanupUNICODE}) is within Levenshtein distance 2 of
 * the header marker, or when no cell is empty and the 1st cell parses as an
 * integer.
 *
 * @param row the table row to test
 * @return {@code true} if the row qualifies, {@code false} otherwise
 */
private boolean isParsableTable(Element row) {
    final Elements columns = row.getElementsByTag("td");

    /* anything but 4 columns disqualifies the table */
    if (columns.size() != 4) {
        return false;
    }

    /* a 1st cell close enough to the number-sign header qualifies right away */
    final int headerDistance = StringUtils.getLevenshteinDistance(cleanupUNICODE(columns.first().text()), " . -");
    if (headerDistance <= 2) {
        return true;
    }

    /* an empty cell anywhere disqualifies the table */
    for (int i = 0; i < columns.size(); i++) {
        if (cleanupUNICODE(columns.get(i).text()).isEmpty()) {
            return false;
        }
    }

    /* otherwise the 1st column has to hold a number */
    boolean numeric;
    try {
        Integer.parseInt(cleanupUNICODE(columns.first().text()).trim());
        numeric = true;
    } catch (NumberFormatException notANumber) {
        numeric = false;
    }
    return numeric;
}
From source file:com.github.jrrdev.mantisbtsync.core.common.auth.request.AuthHttpPost.java
/** * {@inheritDoc}/*w w w . j ava 2 s . c o m*/ * */ @Override public void configFromPreviousResponse(final HttpEntity entity) throws ParseException, IOException { if (formAction == null || entity == null) { return; } final String content = EntityUtils.toString(entity); final Elements forms = Jsoup.parse(content).getElementsByTag(HTML_FORM); for (final Element form : forms) { // Get the form if (form.hasAttr(HTML_ACTION) && formAction.equalsIgnoreCase(form.attr(HTML_ACTION))) { // Parsing of hidden inputs final Elements inputs = form.getElementsByTag(HTML_INPUT); for (final Element input : inputs) { if (input.hasAttr(HTML_TYPE) && HTML_HIDDEN.equalsIgnoreCase(input.attr(HTML_TYPE))) { final String value = input.attr(HTML_VALUE); final String name = input.attr(HTML_NAME); builder = builder.addParameter(name, value); } } break; } } }
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/** * remove paragraphs that have less than x number of words, would indicate that it's some sort of link *///from w w w .ja v a 2s . c o m private void removeParagraphsWithFewWords() { if (logger.isDebugEnabled()) { logger.debug("removeParagraphsWithFewWords starting..."); } Elements allNodes = this.topNode.getAllElements(); for (Element el : allNodes) { try { // get stop words that appear in each node WordStats stopWords = StopWords.getStopWordCount(el.text()); if (stopWords.getStopWordCount() < 5 && el.getElementsByTag("object").size() == 0 && el.getElementsByTag("embed").size() == 0) { el.remove(); } } catch (IllegalArgumentException e) { logger.error(e.getMessage()); } //} } }
From source file:com.dajodi.scandic.JSoupScraper.java
private List<ScandicStay> getStays(Element accountOverview) { Element tableNode = accountOverview .getElementById("ctl00_MainBodyRegion_AccountOverview1_tableTransactions"); if (tableNode == null) { return Collections.emptyList(); }//from ww w.ja v a 2s. co m Elements trs = tableNode.getElementsByTag("tr"); List<ScandicStay> stays = new ArrayList<ScandicStay>(); int order = 0; for (Element tr : trs) { if (tr.getElementsByTag("th").isEmpty()) { Elements tds = tr.getElementsByTag("td"); if (tds.size() == 3) { String location = Util.trimIfNonNull(tds.get(0).text()); String date = Util.trimIfNonNull(tds.get(1).text()); String stayPoints = Util.trimIfNonNull(tds.get(2).text()); ScandicStay stay = new ScandicStay(); Date[] dates = Util.parseDates(date); int numNights = Util.daysBetween(dates[0], dates[1]); stay.setHotelName(location); stay.setNumPoints(Integer.parseInt(stayPoints)); stay.setFromDate(dates[0]); stay.setToDate(dates[1]); stay.setNumNights(numNights); stay.setHtmlOrder(order); stays.add(stay); order++; } else { throw new ScandicHtmlException( "unknown table node, html is funky. could hide row if this is a serious problem."); } } } return stays; }
From source file:nl.phanos.liteliveresultsclient.LoginHandler.java
public Object[] getEigenWedstrijden() throws Exception { ArrayList<Wedstrijd> wedstrijden = new ArrayList<Wedstrijd>(); String content = GetPageContent( "https://www.atletiek.nu/feeder.php?page=search&do=events&search=&predefinedSearchTemplate=3"); Element overview = Jsoup.parse(content).getElementById("overview").getElementsByTag("tbody").first(); Elements rows = overview.getElementsByTag("tr"); for (Element row : rows) { if (row.hasAttr("onclick")) { try { Wedstrijd w = new Wedstrijd(); String[] split = row.attr("onclick").split("/"); w.id = split[split.length - 2]; w.date = row.getElementsByTag("td").first().text(); w.club = row.getElementsByTag("td").get(3).text().replace(" ", "").replace(",", ""); w.name = w.date + " - " + row.getElementsByTag("td").get(1).getElementsByClass("hidden-xs").first().text(); wedstrijden.add(w);//from www.j a va 2 s . c o m } catch (Exception e) { System.out.println(e); } } } return wedstrijden.toArray(); }
From source file:org.keycloak.testsuite.util.saml.UpdateProfileBuilder.java
/**
 * Builds the request that submits the update-profile form of the given page.
 * <p>
 * Only the first {@code <form>} element is used. An input is submitted only
 * when its name is a key of the configured {@code parameters}; the configured
 * value is sent, not the one found in the page. Configured names that do not
 * appear in the form are logged as a warning.
 *
 * @param loginPage HTML of the update-profile page
 * @param currentURI URI of the page; a POST form action is resolved against
 *        it. May be {@code null}, in which case the action is used as is.
 * @return a POST request carrying the parameters as a URL-encoded entity when
 *         the form method is "post" (case-insensitive), otherwise a GET
 *         request carrying them as query parameters
 * @throws IllegalArgumentException if the page contains no form
 */
public HttpUriRequest handleUpdateProfile(String loginPage, URI currentURI) {
    org.jsoup.nodes.Document theUpdateProfilePage = Jsoup.parse(loginPage);
    Set<String> unusedParams = new HashSet<>(this.parameters.keySet());
    // Named differently from the field so the two are not confused.
    List<NameValuePair> formParameters = new LinkedList<>();

    for (Element form : theUpdateProfilePage.getElementsByTag("form")) {
        String method = form.attr("method");
        String action = form.attr("action");
        // equalsIgnoreCase(null) is false, so no separate null check is required.
        boolean isPost = "post".equalsIgnoreCase(method);

        for (Element input : form.getElementsByTag("input")) {
            String name = input.attr("name");
            if (this.parameters.containsKey(name)) {
                formParameters.add(new BasicNameValuePair(name, this.parameters.get(name)));
                unusedParams.remove(name);
            }
        }

        if (!unusedParams.isEmpty()) {
            LOG.warnf("Unused parameter names at Update Profile page: %s", unusedParams);
        }

        if (isPost) {
            // Resolve the action against the page URI so that a relative action
            // yields a usable target; an absolute action is left unchanged.
            URI target = currentURI != null ? currentURI.resolve(action) : URI.create(action);
            HttpPost res = new HttpPost(target);

            UrlEncodedFormEntity formEntity;
            try {
                formEntity = new UrlEncodedFormEntity(formParameters, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException(e);
            }
            res.setEntity(formEntity);

            return res;
        } else {
            // NOTE(review): the GET branch still builds the URI from the raw
            // action without resolving it against currentURI — confirm intent.
            UriBuilder b = UriBuilder.fromPath(action);
            for (NameValuePair parameter : formParameters) {
                b.queryParam(parameter.getName(), parameter.getValue());
            }
            return new HttpGet(b.build());
        }
    }

    throw new IllegalArgumentException("Invalid update profile form: " + loginPage);
}
From source file:net.parser.JobParser.java
private String getPropertyString(String helpStr, String cssQuery) { Elements elements = doc.select(cssQuery); String propertyString = null; for (Element element : elements) { if (element.getElementsByTag("strong").text().equals(helpStr)) { propertyString = element.nextElementSibling().text(); break; }//from w w w .ja va 2s.c om } return propertyString; }
From source file:com.screenslicer.core.util.BrowserUtil.java
/**
 * Finds the element of the page that best matches the given node description
 * and converts it through the {@code toElement} overload that takes an
 * {@code Element}.
 * <p>
 * A unique id match is tried first. Otherwise one candidate list is gathered
 * per populated attribute of {@code htmlNode}, every occurrence of an element
 * in those lists counts as a vote, and the element with the most votes wins.
 *
 * @param browser browser used to open the page body and to read the current URL
 * @param htmlNode description of the element that is looked for
 * @param body element to search in; when {@code null} it is obtained from
 *        {@code BrowserUtil.openElement}
 * @param recurse passed through unchanged to the {@code toElement} overload
 * @return the result of the {@code toElement} overload for the chosen element
 * @throws ActionFailed declared by this method; not thrown directly here
 */
private static WebElement toElement(Browser browser, HtmlNode htmlNode, Element body, boolean recurse)
        throws ActionFailed {
    if (body == null) {
        body = BrowserUtil.openElement(browser, true, null, null, null, null);
    }
    // Shortcut: an id that matches exactly one element is tried on its own first.
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        Elements elements = body.getElementsByAttributeValue("id", htmlNode.id);
        if (elements.size() == 1) {
            WebElement element = toElement(browser, elements.get(0), htmlNode, recurse);
            if (element != null) {
                return element;
            }
        }
    }
    // One candidate list per populated attribute of the description.
    List<Elements> selected = new ArrayList<Elements>();
    if (!CommonUtil.isEmpty(htmlNode.tagName)) {
        selected.add(body.getElementsByTag(htmlNode.tagName));
    } else if (!CommonUtil.isEmpty(htmlNode.href)) {
        // No tag name given but an href is: consider anchors.
        selected.add(body.getElementsByTag("a"));
    }
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        selected.add(body.getElementsByAttributeValue("id", htmlNode.id));
    }
    if (!CommonUtil.isEmpty(htmlNode.name)) {
        selected.add(body.getElementsByAttributeValue("name", htmlNode.name));
    }
    if (!CommonUtil.isEmpty(htmlNode.type)) {
        selected.add(body.getElementsByAttributeValue("type", htmlNode.type));
    }
    if (!CommonUtil.isEmpty(htmlNode.value)) {
        selected.add(body.getElementsByAttributeValue("value", htmlNode.value));
    }
    if (!CommonUtil.isEmpty(htmlNode.title)) {
        selected.add(body.getElementsByAttributeValue("title", htmlNode.title));
    }
    if (!CommonUtil.isEmpty(htmlNode.role)) {
        selected.add(body.getElementsByAttributeValue("role", htmlNode.role));
    }
    if (!CommonUtil.isEmpty(htmlNode.alt)) {
        selected.add(body.getElementsByAttributeValue("alt", htmlNode.alt));
    }
    if (htmlNode.classes != null && htmlNode.classes.length > 0) {
        // Count, per element, how many of the requested classes it carries.
        Map<Element, Integer> found = new HashMap<Element, Integer>();
        for (int i = 0; i < htmlNode.classes.length; i++) {
            Elements elements = body.getElementsByClass(htmlNode.classes[i]);
            for (Element element : elements) {
                if (!found.containsKey(element)) {
                    found.put(element, 0);
                }
                found.put(element, found.get(element) + 1);
            }
        }
        // Keep only the elements with the highest class-match count: walk the
        // count down from "all classes" and stop at the first non-empty level.
        Elements elements = new Elements();
        for (int i = htmlNode.classes.length; i > 0; i--) {
            for (Map.Entry<Element, Integer> entry : found.entrySet()) {
                if (entry.getValue() == i) {
                    elements.add(entry.getKey());
                }
            }
            if (!elements.isEmpty()) {
                break;
            }
        }
        selected.add(elements);
    }
    if (!CommonUtil.isEmpty(htmlNode.href)) {
        Elements hrefs = body.getElementsByAttribute("href");
        Elements toAdd = new Elements();
        String currentUrl = browser.getCurrentUrl();
        String hrefGiven = htmlNode.href;
        // An element is added several times to weight the match: each
        // occurrence is counted separately in the voting loop below.
        for (Element href : hrefs) {
            String hrefFound = href.attr("href");
            if (hrefGiven.equalsIgnoreCase(hrefFound)) {
                // exact (case-insensitive) match: three occurrences
                toAdd.add(href);
                toAdd.add(href);
                toAdd.add(href);
            } else if (htmlNode.fuzzy && hrefFound != null && hrefFound.endsWith(hrefGiven)) {
                // fuzzy suffix match: two occurrences
                toAdd.add(href);
                toAdd.add(href);
            } else if (htmlNode.fuzzy && hrefFound != null && hrefFound.contains(hrefGiven)) {
                // fuzzy substring match: one occurrence
                toAdd.add(href);
            } else {
                // last resort: compare both hrefs after canonicalising them
                // relative to the current URL
                String uriGiven = UrlUtil.toCanonicalUri(currentUrl, hrefGiven);
                String uriFound = UrlUtil.toCanonicalUri(currentUrl, hrefFound);
                if (uriGiven.equalsIgnoreCase(uriFound)) {
                    toAdd.add(href);
                }
            }
        }
        selected.add(toAdd);
    }
    if (!CommonUtil.isEmpty(htmlNode.innerText)) {
        // Two lists: text containing the literal, and text that consists of the
        // literal apart from surrounding whitespace (which thus counts twice).
        selected.add(body.getElementsMatchingText(Pattern.quote(htmlNode.innerText)));
        selected.add(body.getElementsMatchingText("^\\s*" + Pattern.quote(htmlNode.innerText) + "\\s*$"));
    }
    if (htmlNode.multiple != null) {
        selected.add(body.getElementsByAttribute("multiple"));
    }
    // Voting: every occurrence of an element in a candidate list is worth 2,
    // plus 1 more when NodeUtil.isHidden reports the element as not hidden.
    Map<Element, Integer> votes = new HashMap<Element, Integer>();
    for (Elements elements : selected) {
        for (Element element : elements) {
            if (!votes.containsKey(element)) {
                votes.put(element, 0);
            }
            votes.put(element, votes.get(element) + 2);
            if (!NodeUtil.isHidden(element)) {
                votes.put(element, votes.get(element) + 1);
            }
        }
    }
    // Pick the element with the strictly highest vote; on a tie the first one
    // encountered in the map's iteration order is kept.
    int maxVote = 0;
    Element maxElement = null;
    for (Map.Entry<Element, Integer> entry : votes.entrySet()) {
        if (entry.getValue() > maxVote) {
            maxVote = entry.getValue();
            maxElement = entry.getKey();
        }
    }
    // NOTE(review): maxElement is null when no candidate received a vote —
    // confirm that the overload accepts a null element.
    return toElement(browser, maxElement, htmlNode, recurse);
}
From source file:GIST.IzbirkomExtractor.TableExtractor.java
public void processHTMLfile(File input_html) throws IOException, TableExtractorException, CloneNotSupportedException, SQLException, ResultSinkException { logger.info("Start processing " + input_html); Document doc = Jsoup.parse(input_html, "UTF-8"); Elements tables = doc.getElementsByTag("table"); /* count of parseable tables found */ int tables_found = 0; /* determine raion name */ String raion_name = extractRaionFromFileName(input_html.getName()); //System.err.println(raion_name); // TODO: inflect raion name in case /* searches for a table that has " . -" in its very 1st cell */ for (Element table : tables) { Elements rows = table.getElementsByTag("tr"); boolean firstRow = true; row_loop: for (Element row : rows) { Elements cells = row.getElementsByTag("td"); if (firstRow) { //System.err.println(row.text()); if (isParsableTable(row)) { firstRow = false; logger.info("Processing table #" + ++tables_found + " in " + input_html); } else break row_loop; }//from w w w . jav a 2s . co m if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()), " . -") < 3) continue row_loop; /* skip the row if it looks like a table header */ /* skip rows with all cells empty */ boolean emptyRow = true; for (Element cell : cells) emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty(); if (emptyRow) continue; int i_cell = 0; Element station_id = null; Element address_field = null; Element org_address = null; /* address of the ??? 
*/ Element station_address = null; for (Element cell : cells) { switch (i_cell) { case 0: station_id = cell; break; case 1: address_field = cell; break; case 2: org_address = cell; break; case 3: station_address = cell; default: break; } i_cell++; } if (station_id == null) throw new TableExtractorException("Polling station ID not found", row, input_html); if (address_field == null) throw new TableExtractorException("Address list not found", row, input_html); /* extract int from poll station id */ int psid; try { psid = Integer.valueOf(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", "")); } catch (NumberFormatException e) { Exception te = new TableExtractorException("Failed to parse polling station ID >" + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html); logger.severe(te.getMessage() + "; rest of " + input_html + " ignored."); return; } /* extraction from HTML completely finished, now we work only with the addresses in the text form */ extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field), cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address)); } } if (tables_found == 0) logger.severe("No parsable tables found in " + input_html); resultSink.commit(); logger.info("" + tables_found + " table(s) processed in " + input_html); }