List of usage examples for org.jsoup.select.Elements.first()
public Element first()
From source file:GIST.IzbirkomExtractor.TableExtractor.java
/** * Tests the row if it looks like the 1st row of a parsable table * @param row/*from w ww. j a v a 2s .c om*/ * @return */ private boolean isParsableTable(Element row) { Elements cells = row.getElementsByTag("td"); /* number of columns should be 4 */ if (cells.size() != 4) return false; /* look for number signs in 1st cell*/ if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()), " . -") < 3) return true; /* discard the table if any of the cells is empty */ for (Element cell : cells) { if (cleanupUNICODE(cell.text()).isEmpty()) return false; } /* 1st column should be a number */ try { Integer.parseInt(cleanupUNICODE(cells.first().text()).trim()); return true; } catch (NumberFormatException e) { return false; } }
From source file:net.acesinc.convergentui.ConvergentUIResponseFilter.java
@Override public Object run() { String origBody = contentManager.getDownstreamResponse(); if (origBody == null || origBody.isEmpty()) { return null; }/*from w w w . j av a 2s .c o m*/ String composedBody = null; log.trace("Response from downstream server: " + origBody); Document doc = Jsoup.parse(origBody); if (hasReplaceableElements(doc)) { log.debug("We have replaceable elements. Let's get em!"); Elements elementsToUpdate = doc.select("div[data-loc]"); for (Element e : elementsToUpdate) { StringBuilder content = new StringBuilder(); String location = e.dataset().get("loc"); String fragmentName = e.dataset().get("fragment-name"); String cacheName = e.dataset().get("cache-name"); boolean useCaching = !Boolean.valueOf(e.dataset().get("disable-caching")); boolean failQuietly = Boolean.valueOf(e.dataset().get("fail-quietly")); URL url = null; try { url = new URL(location); String protocol = url.getProtocol(); String service = url.getHost(); log.debug("Fetching content at location [ " + location + " ] with cacheName = [ " + cacheName + " ]"); try { RequestContext context = RequestContext.getCurrentContext(); ContentResponse response = contentManager.getContentFromService(location, cacheName, useCaching, context); log.trace(response.toString()); if (!response.isError()) { Object resp = response.getContent(); if (String.class.isAssignableFrom(resp.getClass())) { String subContentResponse = (String) resp; //TODO You better trust the source of your downstream HTML! // String cleanedContent = Jsoup.clean(subContentResponse, Whitelist.basic()); //this totally stripped the html out... 
Document subDocument = Jsoup.parse(subContentResponse); if (fragmentName != null) { Elements fragments = subDocument .select("div[data-fragment-name=\"" + fragmentName + "\"]"); if (fragments != null && fragments.size() > 0) { if (fragments.size() == 1) { Element frag = fragments.first(); //need to see if there are images that we need to replace the urls on Elements images = frag.select("img"); for (Element i : images) { String src = i.attr("src"); if (src.startsWith("/") && !src.startsWith("//")) { i.attr("src", "/cui-req://" + protocol + "://" + service + src); } //else what do we do about relative urls? } content.append(frag.toString()); } else { for (Element frag : fragments) { content.append(frag.toString()).append("\n\n"); } } } else { log.debug("Found no matching fragments for [ " + fragmentName + " ]"); if (failQuietly) { content.append("<div class='cui-error'></div>"); } else { content.append( "<span class='cui-error'>Failed getting content from remote service. Possible reason in reponse below</span>"); content.append(subDocument.toString()); } } } else { //take the whole thing and cram it in there! content.append(subDocument.toString()); } } else { //not text... if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: content was not text</span>"); } else { content.append("<div class='cui-error'></div>"); } } } else { if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: " + response.getMessage() + "</span>"); } else { content.append("<div class='cui-error'></div>"); } } //now append it to the page if (!content.toString().isEmpty()) { e.html(content.toString()); } } catch (Throwable t) { if (!failQuietly) { e.html("<span class='cui-error'>Failed getting content from remote service. 
Reason: " + t.getMessage() + "</span>"); } log.warn("Failed replacing content", t); } } catch (MalformedURLException ex) { log.warn("location was invalid: [ " + location + " ]", ex); if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: data-loc was an invalid location.</span>"); } else { content.append("<div class='cui-error'></div>"); } } } composedBody = doc.toString(); } else { log.debug("Document has no replaeable elements. Skipping"); } try { addResponseHeaders(); if (composedBody != null && !composedBody.isEmpty()) { writeResponse(composedBody, getMimeType(RequestContext.getCurrentContext())); } else { writeResponse(origBody, getMimeType(RequestContext.getCurrentContext())); } } catch (Exception ex) { log.error("Error sending response", ex); } return null; }
From source file:com.quarterfull.newsAndroid.NewsDetailFragment.java
public void onCreateContextMenu(ContextMenu menu, View v, ContextMenu.ContextMenuInfo menuInfo) { if (v instanceof WebView) { WebView.HitTestResult result = ((WebView) v).getHitTestResult(); if (result != null) { int type = result.getType(); Document htmldoc = Jsoup.parse(html); FragmentTransaction ft = getFragmentManager().beginTransaction(); if (type == WebView.HitTestResult.IMAGE_TYPE || type == WebView.HitTestResult.SRC_IMAGE_ANCHOR_TYPE) { String imageUrl = result.getExtra(); if (imageUrl.startsWith("http") || imageUrl.startsWith("file")) { URL mImageUrl; String imgtitle; String imgaltval; String imgsrcval; imgsrcval = imageUrl.substring(imageUrl.lastIndexOf('/') + 1, imageUrl.length()); Elements imgtag = htmldoc.getElementsByAttributeValueContaining("src", imageUrl); try { imgtitle = imgtag.first().attr("title"); } catch (NullPointerException e) { imgtitle = ""; }// w w w . j a v a2s.co m try { imgaltval = imgtag.first().attr("alt"); } catch (NullPointerException e) { imgaltval = ""; } try { mImageUrl = new URL(imageUrl); } catch (MalformedURLException e) { return; } String title = imgsrcval; int titleIcon = android.R.drawable.ic_menu_gallery; String text = (imgtitle.isEmpty()) ? imgaltval : imgtitle; // Create and show the dialog. DialogFragment newFragment = NewsDetailImageDialogFragment.newInstanceImage(title, titleIcon, text, mImageUrl); newFragment.show(ft, "menu_fragment_dialog"); } } else if (type == WebView.HitTestResult.SRC_ANCHOR_TYPE) { String url = result.getExtra(); URL mUrl; String text; try { Elements urltag = htmldoc.getElementsByAttributeValueContaining("href", url); text = urltag.text(); mUrl = new URL(url); } catch (MalformedURLException e) { return; } // Create and show the dialog. 
DialogFragment newFragment = NewsDetailImageDialogFragment.newInstanceUrl(text, mUrl.toString()); newFragment.show(ft, "menu_fragment_dialog"); } //else if (type == WebView.HitTestResult.EMAIL_TYPE) { } //else if (type == WebView.HitTestResult.GEO_TYPE) { } //else if (type == WebView.HitTestResult.PHONE_TYPE) { } //else if (type == WebView.HitTestResult.EDIT_TEXT_TYPE) { } } } }
From source file:com.gumtreescraper.scraper.GumtreeScraper.java
public void scrapeWithJSoup(List<Gumtree> gumtrees, String url) throws IOException { // openSite(url); // waitForPageToLoad(); String nextPageUrl = url;//from w w w .java2 s . c o m boolean needContinue = true; do { try { Document doc = Jsoup.connect(nextPageUrl).timeout(getTimeout() * 1000).userAgent("Mozilla") // .userAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36") .get(); Elements adElements = doc.select("#srchrslt-adtable > li"); int size = adElements.size(); for (int i = 0; i < size; i++) { Element ad = adElements.get(i); if (!isOwner(ad)) { continue; } Element linkElement = ad.select("h6.rs-ad-title > a").first(); if (linkElement == null) { System.out.print(ad); continue; } String adUrl = linkElement.attr("href"); Gumtree gumtree = new Gumtree(); gumtree.setUrl(BASE_URL + adUrl); gumtrees.add(gumtree); if (i == size - 1) { // last element Elements adDateElements = ad.select("div.rs-ad-date"); if (adDateElements.isEmpty()) { continue; } if (!needToScrapeNextPage(adDateElements.first().text().trim())) { needContinue = false; } } } Elements nextElements = doc.select("a.rs-paginator-btn.next"); if (nextElements.isEmpty()) { break; } nextPageUrl = BASE_URL + nextElements.first().attr("href"); System.out.println("next page: " + nextPageUrl); } catch (Exception oex) { System.out.println(oex); } } while (true && needContinue); }
From source file:com.liato.bankdroid.banking.banks.coop.Coop.java
/**
 * Logs in and rebuilds the account list.
 *
 * <p>For each {@code AccountType} the type's page is fetched and parsed: a page guid and the
 * min/max dates are stored in a fresh {@code TransactionParams}, and every dt/dd pair of the
 * first ".List" element containing "Saldo" becomes an account. Afterwards the refund summary
 * service is queried and, if it answers, two more accounts are appended.
 *
 * @throws LoginException      if username or password is missing or empty
 * @throws BankException       on I/O or JSON failures, or when no account was found
 * @throws BankChoiceException declared by the overridden method
 */
@Override
public void update() throws BankException, LoginException, BankChoiceException {
    super.update();

    boolean credentialsMissing = username == null || password == null
            || username.length() == 0 || password.length() == 0;
    if (credentialsMissing) {
        throw new LoginException(res.getText(R.string.invalid_username_password).toString());
    }

    login();

    try {
        for (AccountType accountType : AccountType.values()) {
            response = urlopen.open(accountType.getUrl());
            Document page = Jsoup.parse(response);
            Elements historik = page.select("#historik section");

            TransactionParams params = new TransactionParams();
            mTransactionParams.put(accountType, params);

            // first() is null for an empty selection, so one null test covers both cases.
            Element historySection = (historik == null) ? null : historik.first();
            if (historySection != null) {
                Matcher guidMatcher = rePageGuid.matcher(historySection.attr("data-controller"));
                if (guidMatcher.find()) {
                    params.setPageGuid(guidMatcher.group(1));
                }
            }

            Element dateInput = page.getElementById("dateFrom");
            if (dateInput != null) {
                String minDate = null;
                if (dateInput.hasAttr("min")) {
                    minDate = dateInput.attr("min");
                }
                params.setMinDate(minDate);

                String maxDate = null;
                if (dateInput.hasAttr("max")) {
                    maxDate = dateInput.attr("max");
                }
                params.setMaxDate(maxDate);
            }

            Elements balanceLists = page.select(".List:contains(Saldo)");
            Element balanceList = (balanceLists == null) ? null : balanceLists.first();
            if (balanceList == null) {
                continue;
            }

            // dt elements carry the labels, dd elements the matching amounts.
            List<String> labels = new ArrayList<String>();
            for (Element term : balanceList.select("dt")) {
                labels.add(term.text().replaceAll(":", "").trim());
            }
            List<String> amounts = new ArrayList<String>();
            for (Element definition : balanceList.select("dd")) {
                amounts.add(definition.text().trim());
            }

            int pairCount = Math.min(labels.size(), amounts.size());
            for (int idx = 0; idx < pairCount; idx++) {
                String amount = amounts.get(idx);
                Account account = new Account(labels.get(idx), Helpers.parseBalance(amount),
                        String.format("%s%d", accountType.getPrefix(), idx));
                account.setCurrency(Helpers.parseCurrency(amount, "SEK"));

                boolean isDisposable = account.getName().toLowerCase().contains("disponibelt");
                if (isDisposable) {
                    account.setType(Account.REGULAR);
                    balance = account.getBalance();
                    setCurrency(account.getCurrency());
                } else {
                    account.setType(Account.OTHER);
                }

                // Every entry after the first is an alias of the first one of this type.
                if (idx > 0) {
                    account.setAliasfor(String.format("%s%d", accountType.getPrefix(), 0));
                }
                accounts.add(account);
            }
        }
    } catch (ClientProtocolException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    } catch (IOException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    }

    try {
        RefundSummaryRequest refsumReq = new RefundSummaryRequest(mUserId, mToken, APPLICATION_ID);
        HttpEntity requestBody = new StringEntity(getObjectmapper().writeValueAsString(refsumReq));
        InputStream is = urlopen.openStream(
                "https://www.coop.se/ExternalServices/RefundService.svc/RefundSummary", requestBody, true);
        RefundSummaryResponse refsumResp = readJsonValue(is, RefundSummaryResponse.class);

        if (refsumResp != null && refsumResp.getRefundSummaryResult() != null) {
            // NOTE(review): the two account names below look like they lost non-ASCII
            // characters somewhere - confirm against the upstream source.
            Account cardRefund = new Account("terbring p ditt kort",
                    BigDecimal.valueOf(refsumResp.getRefundSummaryResult().getAccountBalance()),
                    "refsummary");
            cardRefund.setCurrency("SEK");
            if (accounts.isEmpty()) {
                balance = cardRefund.getBalance();
                setCurrency(cardRefund.getCurrency());
            }
            accounts.add(cardRefund);

            Account monthRefund = new Account(
                    String.format("terbring fr %s", refsumResp.getRefundSummaryResult().getMonthName()),
                    BigDecimal.valueOf(refsumResp.getRefundSummaryResult().getTotalRefund()),
                    "refsummary_month");
            accounts.add(monthRefund);
        }
    } catch (JsonParseException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    } catch (ClientProtocolException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    } catch (IOException e) {
        e.printStackTrace();
        throw new BankException(e.getMessage());
    }

    if (accounts.isEmpty()) {
        throw new BankException(res.getText(R.string.no_accounts_found).toString());
    }
    super.updateComplete();
}
From source file:org.keycloak.testsuite.util.saml.ModifySamlResponseStepBuilder.java
/**
 * Reads the SAML document from a POST-binding page, runs it through the transformer and
 * builds the request that submits the transformed document.
 *
 * <p>The page must contain exactly one {@code SAMLResponse} or {@code SAMLRequest} input and
 * exactly one form. The request targets {@code targetUri} when set, otherwise the form's
 * {@code action}; the document is sent under {@code targetAttribute} when set, otherwise
 * under the name of the input it was read from. A {@code RelayState} input, if present, is
 * passed along.
 *
 * @param currentResponse the HTTP response carrying the POST-binding HTML page
 * @return the follow-up request, or {@code null} when the transformer returns {@code null}
 * @throws Exception if reading, decoding or transforming the document fails
 */
private HttpUriRequest handlePostBinding(CloseableHttpResponse currentResponse) throws Exception {
    assertThat(currentResponse, statusCodeIsHC(Status.OK));

    final String htmlBody = EntityUtils.toString(currentResponse.getEntity());
    assertThat(htmlBody, Matchers.containsString("SAML"));
    org.jsoup.nodes.Document theResponsePage = Jsoup.parse(htmlBody);
    Elements samlResponses = theResponsePage.select("input[name=SAMLResponse]");
    Elements samlRequests = theResponsePage.select("input[name=SAMLRequest]");
    Elements forms = theResponsePage.select("form");
    Elements relayStates = theResponsePage.select("input[name=RelayState]");
    int size = samlResponses.size() + samlRequests.size();
    assertThat("Checking uniqueness of SAMLResponse/SAMLRequest input field in the page", size, is(1));
    assertThat("Checking uniqueness of forms in the page", forms, hasSize(1));

    Element respElement = samlResponses.isEmpty() ? samlRequests.first() : samlResponses.first();
    Element form = forms.first();

    String base64EncodedSamlDoc = respElement.val();
    String samlDoc;
    // try-with-resources closes the stream even when reading it throws.
    try (InputStream decoded = PostBindingUtil.base64DecodeAsStream(base64EncodedSamlDoc)) {
        samlDoc = IOUtils.toString(decoded, GeneralConstants.SAML_CHARSET);
    }

    String transformed = getTransformer().transform(samlDoc);
    if (transformed == null) {
        return null;
    }

    final String attributeName = this.targetAttribute != null ? this.targetAttribute
            : respElement.attr("name");
    List<NameValuePair> parameters = new LinkedList<>();

    if (!relayStates.isEmpty()) {
        parameters.add(new BasicNameValuePair(GeneralConstants.RELAY_STATE, relayStates.first().val()));
    }
    URI locationUri = this.targetUri != null ? this.targetUri : URI.create(form.attr("action"));

    return createRequest(locationUri, attributeName, transformed, parameters);
}
From source file:fr.eolya.extraction.tika.TikaWrapper.java
/**
 * Looks up the {@code content} attribute of the first {@code meta} element whose
 * {@code name} attribute matches {@code metaName}.
 *
 * @param doc      the parsed document to search
 * @param metaName the value to match against the {@code name} attribute
 * @return the {@code content} attribute of the first match, or {@code null} when nothing
 *         matches
 */
private String getMetaContent(Document doc, String metaName) {
    Elements matches = doc.select("meta[name=" + metaName + "]");
    if (matches == null) {
        return null;
    }
    Element firstMatch = matches.first();
    return (firstMatch == null) ? null : firstMatch.attr("content");
}
From source file:net.kevxu.purdueassist.course.CatalogDetail.java
/**
 * Builds a {@code CatalogDetailEntry} for the current {@code subject} and {@code cnbr} from
 * the course-detail table of the given document.
 *
 * <p>The detail text is sliced by searching for fixed labels ("Levels:", "Schedule Types:",
 * "Offered By:", ...) and taking substrings at hard-coded offsets from them. A
 * {@code StringIndexOutOfBoundsException} raised by any of those slices is swallowed: the
 * fields set before the failing slice are kept, the remaining ones are skipped, and the
 * partially filled entry is returned.
 *
 * @param document the parsed catalog detail page
 * @return the entry, possibly only partially populated (see above)
 * @throws CourseNotFoundException if no element carries the expected {@code summary}
 *                                 attribute value
 * @throws HtmlParseException      if the text of a schedule-type link is rejected by
 *                                 {@code Type.valueOf}
 * @throws IOException             declared, but nothing visible in this method throws it
 */
private CatalogDetailEntry parseDocument(Document document)
        throws HtmlParseException, CourseNotFoundException, IOException {
    CatalogDetailEntry entry = new CatalogDetailEntry(subject, cnbr);
    Elements tableElements = document.getElementsByAttributeValue("summary",
            "This table lists the course detail for the selected term.");
    if (tableElements.isEmpty() != true) {
        // get name
        try {
            Element body = tableElements.first().select("tbody").first();
            String nameBlock = body.select("tr td.nttitle").first().text();
            // The name is whatever follows "<subject> <cnbr>" in the title cell, minus the
            // first 3 characters (presumably a " - " separator - TODO confirm).
            String[] temp = nameBlock.split(subject.name() + " " + String.valueOf(cnbr));
            String name = temp[temp.length - 1].substring(3);
            entry.setName(name);

            // get description
            // From here on "body" is the first ".ntdefault" element and "text" its full text.
            body = body.select(".ntdefault").first();
            String text = body.text();
            int split = text.indexOf("Levels:");
            // Everything before "Levels:", minus a fixed 20-character prefix.
            String description = text.substring(0, split);
            description = description.substring(20);
            entry.setDescription(description);

            // get levels
            // Text between "Levels: " (8 characters) and "Schedule Types:", split on spaces
            // and commas, empty pieces dropped.
            int begin = split;
            int end = text.indexOf("Schedule Types:");
            String levels = text.substring(begin + 8, end);
            temp = levels.split("[ ,]");
            List<String> lvs = new ArrayList<String>();
            for (String s : temp)
                if (!s.equals("")) {
                    lvs.add(s);
                }
            entry.setLevels(lvs);

            // get type and prerequisites
            // Links are classified by href: "schd_in" without '%' is a schedule type,
            // "sel_attr=" is collected into the prerequisites list.
            List<Type> types = new ArrayList<Type>();
            List<String> preq = new ArrayList<String>();
            Elements parsing_A = body.select("a");
            for (Element e : parsing_A) {
                if (e.attr("href").contains("schd_in") && !(e.attr("href").contains("%"))) {
                    try {
                        types.add(Type.valueOf(e.text().replace(" ", "")));
                    } catch (Exception exception) {
                        throw new HtmlParseException();
                    }
                } else if (e.attr("href").contains("sel_attr=")) {
                    preq.add(e.text());
                }
            }
            if (types.size() > 0)
                entry.setType(types);
            if (preq.size() > 0)
                entry.setPrerequisites(preq);

            // get offered by
            // Ends at "Department:" or, failing that, at "Course Attributes:".
            // NOTE(review): "begin" is not checked for -1 here.
            begin = text.indexOf("Offered By:");
            end = text.indexOf("Department:");
            if (end < 0)
                end = text.indexOf("Course Attributes:");
            if (end > 0) {
                entry.setOfferedBy(text.substring(begin + 12, end - 1));
            }

            // get department
            // NOTE(review): "end" may be -1 here; the resulting exception is swallowed by the
            // catch below and skips every later field.
            begin = text.indexOf("Department:");
            if (begin > 0) {
                end = text.indexOf("Course Attributes:");
                entry.setDepartment((text.substring(begin + 12, end - 1)));
            }

            // get campus
            // The campus list ends at the first of these labels that is present, tried in
            // this order; with none present it runs to the end of the text.
            begin = text.indexOf("May be offered at any of the following campuses:");
            String campuses;
            end = text.indexOf("Repeatable for Additional Credit:");
            if (end < 0)
                end = text.indexOf("Learning Objectives:");
            if (end < 0)
                end = text.indexOf("Restrictions:");
            if (end < 0)
                end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            if (end < 0) {
                campuses = text
                        .substring(begin + "May be offered at any of the following campuses:".length() + 5);
            } else {
                campuses = text.substring(
                        begin + "May be offered at any of the following campuses:".length() + 5, end - 1);
            }
            // Pieces of length 1 or less are discarded.
            temp = campuses.replace(" ", "#").split("#");
            List<String> camps = new ArrayList<String>();
            for (String s : temp) {
                if (s.length() > 1) {
                    camps.add(s);
                }
            }
            entry.setCampuses(camps);

            // get restrictions
            // Runs from after "Restrictions:" to "Corequisites:" / "Prerequisites:", or to the
            // end of the text when neither label is present.
            begin = text.indexOf("Restrictions:");
            end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            if (begin > 0 && end < 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length()).replace(" ", "\n"));
            } else if (begin > 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length(), end).replace(" ", "\n"));
            }
        } catch (StringIndexOutOfBoundsException e) {
            // no type, not available
            // Deliberately ignored: the entry keeps whatever was set before the failing slice.
        }
    } else {
        throw new CourseNotFoundException();
    }
    return entry;
}
From source file:com.adarshahd.indianrailinfo.donate.PNRStat.java
/**
 * Builds the one-row train details table ({@code mTableLayoutTrn}) and the matching
 * space-separated summary string ({@code mStrTrainDetails}).
 *
 * <p>If the page reports an invalid/flushed PNR or a connectivity problem, a status message
 * is shown in {@code mFrameLayout} instead. Otherwise the details are taken from the cached
 * {@code mTrainDetails} when it belongs to the current PNR, or parsed from the page: the
 * third row of the table containing the "Train Number" cell supplies columns 0, 1, 2, 5, 6
 * and 7. If that table cannot be located the page is logged and nothing is built; if it has
 * fewer than three rows or fewer than eight columns the details list stays empty.
 */
private void createTableLayoutTrnDtls() {
    if (mPageResult.contains("FLUSHED PNR / ") || mPageResult.contains("Invalid PNR")) {
        mTextViewPNRSts.setText("The PNR entered is either invalid or expired! Please check.");
        mFrameLayout.removeAllViews();
        mFrameLayout.addView(mTextViewPNRSts);
        return;
    }
    if (mPageResult.contains("Connectivity Failure") || mPageResult.contains("try again")) {
        mTextViewPNRSts.setText("Looks like server is busy or currently unavailable. Please try again later!");
        mFrameLayout.removeAllViews();
        mFrameLayout.addView(mTextViewPNRSts);
        return;
    }

    List<String> trainList;
    // NOTE(review): if getPNR() and mPNRNumber are Strings this compares references, not
    // contents - confirm the types and switch to equals() if so.
    if (mTrainDetails == null || mTrainDetails.getPNR() != mPNRNumber) {
        Elements eleTrain = Jsoup.parse(mPageResult).select("table tr tr td:containsOwn(Train Number)");

        // Walk three levels up from the "Train Number" cell to reach the enclosing table.
        Element table = eleTrain.first();
        for (int level = 0; level < 3 && table != null; level++) {
            table = table.parent();
        }
        if (table == null) {
            Log.i("PNRStat", mPageResult);
            return;
        }

        Iterator<Element> iteTrain = table.getElementsByTag("tr").iterator();
        trainList = new ArrayList<String>();

        // Get the third row for train details; skip the first two only if they exist.
        for (int skipped = 0; skipped < 2 && iteTrain.hasNext(); skipped++) {
            iteTrain.next();
        }
        if (iteTrain.hasNext()) {
            Elements cells = iteTrain.next().select("td");
            if (cells.size() > 7) {
                trainList.add(cells.get(0).text());
                trainList.add(cells.get(1).text());
                trainList.add(cells.get(2).text());
                trainList.add(cells.get(5).text());
                trainList.add(cells.get(6).text());
                trainList.add(cells.get(7).text());
            }
        }
        mTrainDetails = new TrainDetails(trainList, mPNRNumber);
    } else {
        trainList = mTrainDetails.getTrainDetails();
    }

    mTableLayoutTrn = new TableLayout(mActivity);
    mTableLayoutTrn.setLayoutParams(new FrameLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT,
            ViewGroup.LayoutParams.WRAP_CONTENT));
    TableRow row = new TableRow(mActivity);
    row.setLayoutParams(new FrameLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT,
            ViewGroup.LayoutParams.WRAP_CONTENT));

    StringBuilder details = new StringBuilder();
    for (String detail : trainList) {
        TextView tv = new TextView(mActivity);
        tv.setText(detail);
        tv.setPadding(10, 10, 10, 10);
        tv.setTextAppearance(mActivity, android.R.style.TextAppearance_DeviceDefault_Small);
        row.addView(tv);
        details.append(detail).append(" ");
    }
    mStrTrainDetails = details.toString();

    row.setBackgroundResource(R.drawable.card_background);
    row.setGravity(Gravity.CENTER_HORIZONTAL | Gravity.CENTER_VERTICAL);
    mTableLayoutTrn.addView(row);
}
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/** * metaTitle?metaTitle,metaTitle??????title * * @param contentElement/*from w w w . j a v a 2 s. co m*/ * @return * @throws Exception */ protected String getTitle(final Element contentElement) throws Exception { final ArrayList<Element> titleList = new ArrayList<Element>(); final ArrayList<Double> titleSim = new ArrayList<Double>(); final String metaTitle = getText(doc.title().trim()); if (!metaTitle.isEmpty()) { doc.body().traverse(new NodeVisitor() { @Override public void head(Node node, int i) { if (node instanceof Element) { Element tag = (Element) node; String tagName = tag.tagName(); if (Pattern.matches("h[1-6]", tagName)) { String title = tag.text().trim(); double sim = strSim(title, metaTitle); titleSim.add(sim); titleList.add(tag); } } } @Override public void tail(Node node, int i) { } }); int index = titleSim.size(); if (index >= 0) { double maxScore = 0; int maxIndex = -1; for (int i = 0; i < index; i++) { double score = (i + 1) * titleSim.get(i); if (score > maxScore) { maxScore = score; maxIndex = i; } } if (maxIndex == -1 || titleSim.get(maxIndex) < 0.3) { String title = getText(metaTitle); if (!title.endsWith("") && title.length() > 7) { return title; } Collections.sort(titleList, new Comparator<Element>() { @Override public int compare(Element o1, Element o2) { int len1 = 1; int len2 = 1; if (o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() > 26 || o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() < 7) { len1 = 0; } if (o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() > 26 || o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length() < 7) { len2 = 0; } if (len1 == len2) { return o1.tagName().charAt(1) - o2.tagName().charAt(1); } return len2 - len1; } }); return getText(titleList.get(0).text()); } return titleList.get(maxIndex).text(); } } /** * ? 
*/ Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]"); if (titles.size() > 0) { String title = titles.first().text(); if (title.length() > 5 && title.length() < 40) { return titles.first().text(); } } try { return getTitleByEditDistance(contentElement); } catch (Exception ex) { throw new Exception("title not found"); } }