List of usage examples for org.jsoup.select.Elements.first()
public Element first()
From source file:com.gumtreescraper.scraper.GumtreeScraper.java
private boolean isOwner(Element adElement) { Elements forSaleByElements = adElement.select("span.rs-ad-attributes-forsaleby_s"); Elements forRentByElements = adElement.select("span.rs-ad-attributes-forrentby_s"); // sometime if ads is owner then it does not display if (forSaleByElements.isEmpty() && forRentByElements.isEmpty()) { return true; }//from w ww. ja va2 s. c o m if (!forSaleByElements.isEmpty() && ("agency".equalsIgnoreCase(forSaleByElements.first().text().trim()) || "agent".equalsIgnoreCase(forSaleByElements.first().text().trim()))) { return false; } if (!forRentByElements.isEmpty() && ("agency".equalsIgnoreCase(forRentByElements.first().text().trim()) || "agent".equalsIgnoreCase(forRentByElements.first().text().trim()))) { return false; } return true; }
From source file:com.adarshahd.indianrailinfo.donate.PNRStat.java
private void createTableLayoutPsnDtls() { if (mPageResult.contains("FLUSHED PNR / ") || mPageResult.contains("Invalid PNR")) { mTextViewPNRSts.setText("The PNR entered is either invalid or expired! Please check."); mFrameLayout.removeAllViews();// w w w . j av a 2s .c o m mFrameLayout.addView(mTextViewPNRSts); mStrPassengerDetails = null; return; } if (mPageResult.contains("Connectivity Failure") || mPageResult.contains("try again")) { mTextViewPNRSts.setText("Looks like server is busy or currently unavailable. Please try again later!"); mFrameLayout.removeAllViews(); mFrameLayout.addView(mTextViewPNRSts); mStrPassengerDetails = null; return; } List<List<String>> passengersList; if (mPassengerDetails == null || mPassengerDetails.getPNR() != mPNRNumber) { Elements elements = Jsoup.parse(mPageResult).select("table tr td:containsOwn(S. No.)"); Iterator iterator = null; try { iterator = elements.first().parent().parent().getElementsByTag("tr").iterator(); } catch (Exception e) { Log.i("PNRStat", mPageResult); return; } passengersList = new ArrayList<List<String>>(); List<String> list; Element tmp; while (iterator.hasNext()) { tmp = (Element) iterator.next(); if (tmp.toString().contains("Passenger")) { list = new ArrayList<String>(); list.add(tmp.select("td").get(0).text()); list.add(tmp.select("td").get(1).text()); list.add(tmp.select("td").get(2).text()); if (!tmp.select("td").get(2).text().toUpperCase().contains("CNF") && !tmp.select("td").get(2).text().toUpperCase().contains("CAN")) { isWaitingList = true; } passengersList.add(list); } } mPassengerDetails = new PassengerDetails(passengersList, mPNRNumber); } else { passengersList = mPassengerDetails.getPassengerList(); } mTableLayoutPsn = new TableLayout(mActivity); TableRow row; TextView tv1, tv2, tv3, tv4; mStrPassengerDetails = new ArrayList<String>(); int current; mTableLayoutPsn.setLayoutParams(new FrameLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT, ViewGroup.LayoutParams.WRAP_CONTENT)); for (int i = 
0; i < passengersList.size(); ++i) { current = i + 1; row = new TableRow(mActivity); row.setLayoutParams(new FrameLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT, ViewGroup.LayoutParams.WRAP_CONTENT)); tv1 = new TextView(mActivity); tv2 = new TextView(mActivity); tv3 = new TextView(mActivity); tv4 = new TextView(mActivity); tv1.setText("" + (i + 1) + "."); tv2.setText(" " + passengersList.get(i).get(0)); tv3.setText(" " + passengersList.get(i).get(1)); tv4.setText(" " + passengersList.get(i).get(2)); tv1.setTextAppearance(mActivity, android.R.style.TextAppearance_DeviceDefault_Medium); tv2.setTextAppearance(mActivity, android.R.style.TextAppearance_DeviceDefault_Medium); tv3.setTextAppearance(mActivity, android.R.style.TextAppearance_DeviceDefault_Medium); tv4.setTextAppearance(mActivity, android.R.style.TextAppearance_DeviceDefault_Medium); tv1.setPadding(10, 10, 10, 10); tv2.setPadding(10, 10, 10, 10); tv3.setPadding(10, 10, 10, 10); tv4.setPadding(10, 10, 10, 10); row.addView(tv1); row.addView(tv2); row.addView(tv3); row.addView(tv4); row.setBackgroundResource(R.drawable.card_background); row.setGravity(Gravity.CENTER_HORIZONTAL | Gravity.CENTER_VERTICAL); mTableLayoutPsn.addView(row); String strPsn = "" + current + ". " + passengersList.get(i).get(0) + " " + passengersList.get(i).get(1) + " " + passengersList.get(i).get(2); mStrPassengerDetails.add(strPsn); } }
From source file:de.geeksfactory.opacclient.apis.Open.java
protected DetailledItem parse_result(Document doc) { DetailledItem item = new DetailledItem(); // Title and Subtitle item.setTitle(doc.select("span[id$=LblShortDescriptionValue]").text()); String subtitle = doc.select("span[id$=LblSubTitleValue]").text(); if (!subtitle.equals("")) { item.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle)); }/*from w ww. j a v a2s.com*/ // Cover if (doc.select("input[id$=mediumImage]").size() > 0) { item.setCover(doc.select("input[id$=mediumImage]").attr("src")); } else if (doc.select("img[id$=CoverView_Image]").size() > 0) { item.setCover(getCoverUrl(doc.select("img[id$=CoverView_Image]").first())); } // ID item.setId(doc.select("input[id$=regionmednr]").val()); // Description if (doc.select("span[id$=ucCatalogueContent_LblAnnotation]").size() > 0) { String name = doc.select("span[id$=lblCatalogueContent]").text(); String value = doc.select("span[id$=ucCatalogueContent_LblAnnotation]").text(); item.addDetail(new Detail(name, value)); } // Details for (Element detail : doc.select("div[id$=CatalogueDetailView] .spacingBottomSmall:has(span+span)")) { String name = detail.select("span").get(0).text().replace(": ", ""); String value = detail.select("span").get(1).text(); item.addDetail(new Detail(name, value)); } // Copies Element table = doc.select("table[id$=grdViewMediumCopies]").first(); Elements trs = table.select("tr"); List<String> columnmap = new ArrayList<>(); for (Element th : trs.first().select("th")) { columnmap.add(getCopyColumnKey(th.text())); } DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (int i = 1; i < trs.size(); i++) { Elements tds = trs.get(i).select("td"); Copy copy = new Copy(); for (int j = 0; j < tds.size(); j++) { if (columnmap.get(j) == null) continue; String text = tds.get(j).text().replace("\u00a0", ""); if (text.equals("")) continue; copy.set(columnmap.get(j), text, fmt); } item.addCopy(copy); } return item; }
From source file:GIST.IzbirkomExtractor.TableExtractor.java
public void processHTMLfile(File input_html) throws IOException, TableExtractorException, CloneNotSupportedException, SQLException, ResultSinkException { logger.info("Start processing " + input_html); Document doc = Jsoup.parse(input_html, "UTF-8"); Elements tables = doc.getElementsByTag("table"); /* count of parseable tables found */ int tables_found = 0; /* determine raion name */ String raion_name = extractRaionFromFileName(input_html.getName()); //System.err.println(raion_name); // TODO: inflect raion name in case /* searches for a table that has " . -" in its very 1st cell */ for (Element table : tables) { Elements rows = table.getElementsByTag("tr"); boolean firstRow = true; row_loop: for (Element row : rows) { Elements cells = row.getElementsByTag("td"); if (firstRow) { //System.err.println(row.text()); if (isParsableTable(row)) { firstRow = false; logger.info("Processing table #" + ++tables_found + " in " + input_html); } else break row_loop; }//from w w w .j a va 2s . c om if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()), " . -") < 3) continue row_loop; /* skip the row if it looks like a table header */ /* skip rows with all cells empty */ boolean emptyRow = true; for (Element cell : cells) emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty(); if (emptyRow) continue; int i_cell = 0; Element station_id = null; Element address_field = null; Element org_address = null; /* address of the ??? 
*/ Element station_address = null; for (Element cell : cells) { switch (i_cell) { case 0: station_id = cell; break; case 1: address_field = cell; break; case 2: org_address = cell; break; case 3: station_address = cell; default: break; } i_cell++; } if (station_id == null) throw new TableExtractorException("Polling station ID not found", row, input_html); if (address_field == null) throw new TableExtractorException("Address list not found", row, input_html); /* extract int from poll station id */ int psid; try { psid = Integer.valueOf(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", "")); } catch (NumberFormatException e) { Exception te = new TableExtractorException("Failed to parse polling station ID >" + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html); logger.severe(te.getMessage() + "; rest of " + input_html + " ignored."); return; } /* extraction from HTML completely finished, now we work only with the addresses in the text form */ extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field), cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address)); } } if (tables_found == 0) logger.severe("No parsable tables found in " + input_html); resultSink.commit(); logger.info("" + tables_found + " table(s) processed in " + input_html); }
From source file:de.geeksfactory.opacclient.apis.Open.java
@Override public SearchRequestResult searchGetPage(int page) throws IOException, OpacErrorException, JSONException { /*// w w w.j a v a 2 s .co m When there are many pages of results, there will only be links to the next 4 and previous 4 pages, so we will click links until it gets to the correct page. */ if (searchResultDoc == null) throw new NotReachableException(); Document doc = searchResultDoc; Elements pageLinks = doc.select("span[id$=DataPager1]").first().select("a[id*=LinkButtonPageN"); int from = Integer.valueOf(pageLinks.first().text()); int to = Integer.valueOf(pageLinks.last().text()); Element linkToClick; boolean willBeCorrectPage; if (page < from) { linkToClick = pageLinks.first(); willBeCorrectPage = false; } else if (page > to) { linkToClick = pageLinks.last(); willBeCorrectPage = false; } else { linkToClick = pageLinks.get(page - from); willBeCorrectPage = true; } Pattern pattern = Pattern.compile("javascript:__doPostBack\\('([^,]*)','([^\\)]*)'\\)"); Matcher matcher = pattern.matcher(linkToClick.attr("href")); if (!matcher.find()) throw new OpacErrorException(StringProvider.INTERNAL_ERROR); FormElement form = (FormElement) doc.select("form").first(); HttpEntity data = formData(form, null).addTextBody("__EVENTTARGET", matcher.group(1)) .addTextBody("__EVENTARGUMENT", matcher.group(2)).build(); ByteArrayOutputStream stream = new ByteArrayOutputStream(); data.writeTo(stream); String postUrl = form.attr("abs:action"); String html = httpPost(postUrl, data, "UTF-8"); if (willBeCorrectPage) { // We clicked on the correct link Document doc2 = Jsoup.parse(html); doc2.setBaseUri(postUrl); return parse_search(doc2, page); } else { // There was no correct link, so try to find one again searchResultDoc = Jsoup.parse(html); searchResultDoc.setBaseUri(postUrl); return searchGetPage(page); } }
From source file:net.kevxu.purdueassist.course.ScheduleDetail.java
private ScheduleDetailEntry parseDocument(Document document) throws HtmlParseException, CourseNotFoundException, ResultNotMatchException { ScheduleDetailEntry entry = new ScheduleDetailEntry(term, crn); Elements tableElements = document.getElementsByAttributeValue("summary", "This table is used to present the detailed class information."); if (!tableElements.isEmpty()) { for (Element tableElement : tableElements) { // get basic info for selected course Element tableBasicInfoElement = tableElement.getElementsByClass("ddlabel").first(); if (tableBasicInfoElement != null) { setBasicInfo(entry, tableBasicInfoElement.text()); } else { throw new HtmlParseException("Basic info element empty."); }/* www.j a v a2 s . c o m*/ // get detailed course info Element tableDetailedInfoElement = tableElement.getElementsByClass("dddefault").first(); if (tableDetailedInfoElement != null) { // process seat info Elements tableSeatDetailElements = tableDetailedInfoElement.getElementsByAttributeValue( "summary", "This layout table is used to present the seating numbers."); if (tableSeatDetailElements.size() == 1) { Element tableSeatDetailElement = tableSeatDetailElements.first(); Elements tableSeatDetailEntryElements = tableSeatDetailElement.getElementsByTag("tbody") .first().children(); if (tableSeatDetailEntryElements.size() == 3 || tableSeatDetailEntryElements.size() == 4) { setSeats(entry, tableSeatDetailEntryElements.get(1).text()); setWaitlistSeats(entry, tableSeatDetailEntryElements.get(2).text()); if (tableSeatDetailEntryElements.size() == 4) { setCrosslistSeats(entry, tableSeatDetailEntryElements.get(3).text()); } } else { throw new HtmlParseException("Seat detail entry elements size not 3. We have " + tableSeatDetailEntryElements.size() + "."); } } else { throw new HtmlParseException( "Seat detail elements size not 1. 
We have " + tableSeatDetailElements.size() + "."); } // remove the seat info from detailed info tableSeatDetailElements.remove(); // remaining information setRemainingInfo(entry, tableDetailedInfoElement.html()); } else { throw new HtmlParseException("Detailed info element empty."); } } } else { // test empty Elements informationElements = document.getElementsByAttributeValue("summary", "This layout table holds message information"); if (!informationElements.isEmpty() && informationElements.text().contains("No detailed class information found")) { throw new CourseNotFoundException(informationElements.text()); } else { throw new HtmlParseException( "Course table not found, but page does not contain message stating no course found."); } } return entry; }
From source file:de.geeksfactory.opacclient.apis.Pica.java
/**
 * Reads the advanced search form of the catalogue and turns it into search fields.
 *
 * <p>Produces, in this order: one text field per {@code IKT0} option (a bracketed or
 * parenthesised 2-3 letter code in the label is moved into the field's "meaning" data
 * and stripped from the display name), the {@code SRT} sort dropdown, the {@code ADI*}
 * text inputs and dropdowns, the {@code FUZZY} checkbox, and a media type dropdown built
 * from the {@code ADI_MAT} inputs. A missing label element results in an empty display
 * text rather than a failure.
 *
 * @return the search fields found on the form
 */
@Override
public List<SearchField> getSearchFields() throws IOException, JSONException {
    if (!initialised) {
        start();
    }

    String html = httpGet(opac_url + "/LNG=" + getLang() + "/DB=" + db + "/ADVANCED_SEARCHFILTER",
            getDefaultEncoding());
    Document doc = Jsoup.parse(html);
    List<SearchField> fields = new ArrayList<>();

    // Compiled once; it is the same for every option.
    Pattern meaningPattern = Pattern.compile("\\[X?[A-Za-z]{2,3}:?\\]|\\(X?[A-Za-z]{2,3}:?\\)");

    Elements options = doc.select("select[name=IKT0] option");
    for (Element option : options) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(option.text());
        field.setId(option.attr("value"));
        field.setHint("");
        field.setData(new JSONObject("{\"ADI\": false}"));

        Matcher matcher = meaningPattern.matcher(field.getDisplayName());
        if (matcher.find()) {
            field.getData().put("meaning", matcher.group().replace(":", "").toUpperCase());
            field.setDisplayName(matcher.replaceFirst("").trim());
        }

        fields.add(field);
    }

    Elements sort = doc.select("select[name=SRT]");
    if (sort.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName(textOrEmpty(sort.first().parent().parent().select(".longval").first()));
        field.setId("SRT");
        for (Element option : sort.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    for (Element input : doc.select("input[type=text][name^=ADI]")) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(input.parent().parent().select(".longkey").text());
        field.setId(input.attr("name"));
        field.setHint(input.parent().select("span").text());
        field.setData(new JSONObject("{\"ADI\": true}"));
        fields.add(field);
    }

    for (Element dropdown : doc.select("select[name^=ADI]")) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName(dropdown.parent().parent().select(".longkey").text());
        field.setId(dropdown.attr("name"));
        for (Element option : dropdown.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    Elements fuzzy = doc.select("input[name=FUZZY]");
    if (fuzzy.size() > 0) {
        CheckboxSearchField field = new CheckboxSearchField();
        field.setDisplayName(textOrEmpty(fuzzy.first().parent().parent().select(".longkey").first()));
        field.setId("FUZZY");
        fields.add(field);
    }

    Elements mediatypes = doc.select("input[name=ADI_MAT]");
    if (mediatypes.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName("Materialart");
        field.setId("ADI_MAT");

        field.addDropdownValue("", "Alle");
        for (Element mt : mediatypes) {
            // The label is taken from the element following the input's parent.
            String label = textOrEmpty(mt.parent().nextElementSibling()).replace("\u00a0", "");
            field.addDropdownValue(mt.attr("value"), label);
        }
        fields.add(field);
    }

    return fields;
}

/** Returns the element's text, or an empty string when the element is absent. */
private String textOrEmpty(Element element) {
    return element == null ? "" : element.text();
}
From source file:hu.tbognar76.apking.ApKing.java
private boolean isLocalVersionGooglePlayImage(String packageName) { String fname = this.init.catalogHtml + "/" + this.init.catalogPic + "/" + packageName + ".png"; if (!this.init.isCatalogPicForced) { File t = new File(fname); if (t.exists()) { // System.out.println("------"+fname); if (t.length() > 1) { return true; }//from w ww . ja v a 2 s .co m return false; } else { // WORK TO DO BELOW } } Document doc = null; try { doc = Jsoup .connect("https://play.google.com/store/apps/details?id=" + URI.create(packageName) + "&hl=en") .get(); // Joni j, de nha nem // Elements img = doc.getElementsByClass("cover-image"); Elements img = doc.select("div.cover-container img"); String uu = "http:" + img.first().attr("src"); uu = uu.replace("http:https:", "https:"); uu = uu.replace("=w300", "=w120"); URL url = new URL(uu); FileUtils.copyURLToFile(url, new File(fname)); } catch (Exception e) { try { FileUtils.write(new File(fname), "-"); return false; } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } return true; }
From source file:com.jimplush.goose.ContentExtractor.java
private String getMetaContent(Document doc, String metaName) { Elements meta = doc.select(metaName); if (meta.size() > 0) { String content = meta.first().attr("content"); return string.isNullOrEmpty(content) ? string.empty : content.trim(); }//from w w w . j a va2s. c o m return string.empty; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * adds any siblings that may have a decent score to this node * * @param node/*from www . jav a 2 s . co m*/ * @return */ private Element addSiblings(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting to add siblings"); } int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node); Element currentSibling = node.previousElementSibling(); while (currentSibling != null) { if (logger.isDebugEnabled()) { logger.debug("SIBLINGCHECK: " + debugNode(currentSibling)); } if (currentSibling.tagName().equals("p")) { node.child(0).before(currentSibling.outerHtml()); currentSibling = currentSibling.previousElementSibling(); continue; } // check for a paraph embedded in a containing element int insertedSiblings = 0; Elements potentialParagraphs = currentSibling.getElementsByTag("p"); if (potentialParagraphs.first() == null) { currentSibling = currentSibling.previousElementSibling(); continue; } for (Element firstParagraph : potentialParagraphs) { WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text()); int paragraphScore = wordStats.getStopWordCount(); if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) { if (logger.isDebugEnabled()) { logger.debug("This node looks like a good sibling, adding it"); } node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>"); insertedSiblings++; } } currentSibling = currentSibling.previousElementSibling(); } return node; }