Example usage for org.jsoup.select Elements first

List of usage examples for org.jsoup.select Elements first

Introduction

On this page you can find example usages of the first() method of org.jsoup.select.Elements.

Prototype

public Element first() 

Source Link

Document

Get the first matched element.

Usage

From source file:com.gumtreescraper.scraper.GumtreeScraper.java

/**
 * Tells whether an ad appears to be posted by its owner.
 *
 * The ad is treated as owner-posted unless the first "for sale by" or the first
 * "for rent by" attribute span reads "agency" or "agent" (case-insensitive, trimmed).
 * Ads that show neither span are also treated as owner-posted.
 *
 * @param adElement the element that contains the ad markup
 * @return {@code false} when either attribute names an agency/agent, {@code true} otherwise
 */
private boolean isOwner(Element adElement) {
    Elements[] postedByGroups = {
        adElement.select("span.rs-ad-attributes-forsaleby_s"),
        adElement.select("span.rs-ad-attributes-forrentby_s")
    };

    for (Elements group : postedByGroups) {
        // a missing attribute says nothing about the poster
        if (group.isEmpty()) {
            continue;
        }
        String postedBy = group.first().text().trim();
        if ("agency".equalsIgnoreCase(postedBy) || "agent".equalsIgnoreCase(postedBy)) {
            return false;
        }
    }

    // neither attribute present, or neither names an agency/agent
    return true;
}

From source file:com.adarshahd.indianrailinfo.donate.PNRStat.java

/**
 * Builds the passenger table ({@code mTableLayoutPsn}) and the plain-text passenger
 * summary ({@code mStrPassengerDetails}) from the fetched PNR status page.
 *
 * If the page reports an invalid/flushed PNR or a connectivity problem, a status message
 * is shown in {@code mFrameLayout} instead and {@code mStrPassengerDetails} is set to null.
 * Passenger rows are parsed from {@code mPageResult} unless {@code mPassengerDetails}
 * already holds data for the current PNR. If the passenger table cannot be located the
 * page is logged and the method returns without building anything.
 */
private void createTableLayoutPsnDtls() {
    if (mPageResult.contains("FLUSHED PNR / ") || mPageResult.contains("Invalid PNR")) {
        mTextViewPNRSts.setText("The PNR entered is either invalid or expired! Please check.");
        mFrameLayout.removeAllViews();
        mFrameLayout.addView(mTextViewPNRSts);
        mStrPassengerDetails = null;
        return;
    }
    if (mPageResult.contains("Connectivity Failure") || mPageResult.contains("try again")) {
        mTextViewPNRSts.setText("Looks like server is busy or currently unavailable. Please try again later!");
        mFrameLayout.removeAllViews();
        mFrameLayout.addView(mTextViewPNRSts);
        mStrPassengerDetails = null;
        return;
    }

    List<List<String>> passengersList;
    // NOTE(review): '!=' is an identity comparison if getPNR()/mPNRNumber are Strings;
    // their declarations are not visible here - confirm and switch to equals() if so.
    if (mPassengerDetails == null || mPassengerDetails.getPNR() != mPNRNumber) {
        // The passenger table is located through its "S. No." header cell:
        // cell -> header row -> enclosing table section.
        Element headerCell = Jsoup.parse(mPageResult).select("table tr td:containsOwn(S. No.)").first();
        Element headerRow = headerCell == null ? null : headerCell.parent();
        Element tableSection = headerRow == null ? null : headerRow.parent();
        if (tableSection == null) {
            // unexpected page layout: keep the page in the log for diagnosis
            Log.i("PNRStat", mPageResult);
            return;
        }

        passengersList = new ArrayList<List<String>>();
        for (Element tableRow : tableSection.getElementsByTag("tr")) {
            if (!tableRow.toString().contains("Passenger")) {
                continue;
            }
            Elements cells = tableRow.select("td");
            if (cells.size() < 3) {
                // not a complete passenger row; skip it instead of failing on get(2)
                continue;
            }
            String currentStatus = cells.get(2).text();

            List<String> passenger = new ArrayList<String>();
            passenger.add(cells.get(0).text());
            passenger.add(cells.get(1).text());
            passenger.add(currentStatus);

            // any status that is neither confirmed (CNF) nor cancelled (CAN) marks a waiting list
            String upperStatus = currentStatus.toUpperCase();
            if (!upperStatus.contains("CNF") && !upperStatus.contains("CAN")) {
                isWaitingList = true;
            }
            passengersList.add(passenger);
        }
        mPassengerDetails = new PassengerDetails(passengersList, mPNRNumber);
    } else {
        passengersList = mPassengerDetails.getPassengerList();
    }

    mTableLayoutPsn = new TableLayout(mActivity);
    mStrPassengerDetails = new ArrayList<String>();
    mTableLayoutPsn.setLayoutParams(new FrameLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT,
            ViewGroup.LayoutParams.WRAP_CONTENT));

    for (int i = 0; i < passengersList.size(); ++i) {
        List<String> passenger = passengersList.get(i);
        int current = i + 1;

        TableRow row = new TableRow(mActivity);
        row.setLayoutParams(new FrameLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT,
                ViewGroup.LayoutParams.WRAP_CONTENT));

        // one cell per column: running number followed by the three parsed fields
        String[] columnTexts = {
            "" + current + ".",
            "   " + passenger.get(0),
            "   " + passenger.get(1),
            "   " + passenger.get(2)
        };
        for (String columnText : columnTexts) {
            TextView cell = new TextView(mActivity);
            cell.setText(columnText);
            cell.setTextAppearance(mActivity, android.R.style.TextAppearance_DeviceDefault_Medium);
            cell.setPadding(10, 10, 10, 10);
            row.addView(cell);
        }

        row.setBackgroundResource(R.drawable.card_background);
        row.setGravity(Gravity.CENTER_HORIZONTAL | Gravity.CENTER_VERTICAL);
        mTableLayoutPsn.addView(row);

        String strPsn = "" + current + ". " + passenger.get(0) + "   "
                + passenger.get(1) + "   " + passenger.get(2);
        mStrPassengerDetails.add(strPsn);
    }
}

From source file:de.geeksfactory.opacclient.apis.Open.java

/**
 * Extracts a {@link DetailledItem} from a detail page.
 *
 * Reads title, optional subtitle, cover, id, annotation, the name/value detail rows and
 * the copies table. A page without a copies table (or with an empty one) yields an item
 * without copies instead of failing.
 *
 * @param doc the parsed detail page
 * @return the populated item
 */
protected DetailledItem parse_result(Document doc) {
    DetailledItem item = new DetailledItem();

    // Title and Subtitle
    item.setTitle(doc.select("span[id$=LblShortDescriptionValue]").text());
    String subtitle = doc.select("span[id$=LblSubTitleValue]").text();
    if (!subtitle.isEmpty()) {
        item.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle));
    }

    // Cover: prefer the "mediumImage" input, fall back to the cover view image
    Elements mediumImage = doc.select("input[id$=mediumImage]");
    if (!mediumImage.isEmpty()) {
        item.setCover(mediumImage.attr("src"));
    } else {
        Element coverImage = doc.select("img[id$=CoverView_Image]").first();
        if (coverImage != null) {
            item.setCover(getCoverUrl(coverImage));
        }
    }

    // ID
    item.setId(doc.select("input[id$=regionmednr]").val());

    // Description
    Elements annotation = doc.select("span[id$=ucCatalogueContent_LblAnnotation]");
    if (!annotation.isEmpty()) {
        String name = doc.select("span[id$=lblCatalogueContent]").text();
        item.addDetail(new Detail(name, annotation.text()));
    }

    // Details: the selector guarantees at least two spans (label, value) per row
    for (Element detail : doc.select("div[id$=CatalogueDetailView] .spacingBottomSmall:has(span+span)")) {
        Elements spans = detail.select("span");
        String name = spans.get(0).text().replace(": ", "");
        String value = spans.get(1).text();
        item.addDetail(new Detail(name, value));
    }

    // Copies
    Element table = doc.select("table[id$=grdViewMediumCopies]").first();
    if (table == null) {
        // no copies table on this page
        return item;
    }
    Elements trs = table.select("tr");
    if (trs.isEmpty()) {
        return item;
    }

    // The header row maps each column position to a copy key (null = column is ignored)
    List<String> columnmap = new ArrayList<>();
    for (Element th : trs.first().select("th")) {
        columnmap.add(getCopyColumnKey(th.text()));
    }

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    for (int i = 1; i < trs.size(); i++) {
        Elements tds = trs.get(i).select("td");
        Copy copy = new Copy();
        // cells beyond the header width have no key and are ignored
        int columns = Math.min(tds.size(), columnmap.size());
        for (int j = 0; j < columns; j++) {
            String key = columnmap.get(j);
            if (key == null) {
                continue;
            }
            String text = tds.get(j).text().replace("\u00a0", "");
            if (text.isEmpty()) {
                continue;
            }
            copy.set(key, text, fmt);
        }
        item.addCopy(copy);
    }

    return item;
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
 * Parses one HTML file and passes every data row of each parsable table to
 * {@code extractAddressesFromText}.
 *
 * A table is processed only if its first row is accepted by {@code isParsableTable}.
 * Rows without {@code <td>} cells, header-like rows and rows whose cells are all empty
 * are skipped. If a polling station id cannot be parsed as an integer, the error is
 * logged and the rest of the file is ignored (without committing the result sink).
 *
 * @param input_html the HTML file to process (read as UTF-8)
 * @throws TableExtractorException if a data row has no address cell
 */
public void processHTMLfile(File input_html) throws IOException, TableExtractorException,
        CloneNotSupportedException, SQLException, ResultSinkException {

    logger.info("Start processing " + input_html);

    Document doc = Jsoup.parse(input_html, "UTF-8");
    Elements tables = doc.getElementsByTag("table");

    /* count of parseable tables found */
    int tables_found = 0;

    /* determine raion name */
    String raion_name = extractRaionFromFileName(input_html.getName());

    // TODO: inflect raion name in  case

    /* searches for a table that has " . -" in its very 1st cell */
    for (Element table : tables) {
        boolean firstRow = true;

        for (Element row : table.getElementsByTag("tr")) {
            Elements cells = row.getElementsByTag("td");

            if (firstRow) {
                if (!isParsableTable(row)) {
                    break; /* not a table we understand, try the next one */
                }
                firstRow = false;
                tables_found++;
                logger.info("Processing table #" + tables_found + " in " + input_html);
            }

            /* a row without <td> cells (e.g. <th>-only) carries no data; first() would be null */
            if (cells.isEmpty()) {
                continue;
            }

            /* skip the row if it looks like a table header */
            if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                    " . -") < 3) {
                continue;
            }

            /* skip rows with all cells empty */
            boolean emptyRow = true;
            for (Element cell : cells) {
                if (!cleanupUNICODE(cell.text()).isEmpty()) {
                    emptyRow = false;
                    break;
                }
            }
            if (emptyRow) {
                continue;
            }

            /* columns by position: id, address list, organisation address, station address */
            Element station_id = cells.get(0);
            Element address_field = cells.size() > 1 ? cells.get(1) : null;
            Element org_address = cells.size() > 2 ? cells.get(2) : null;
            Element station_address = cells.size() > 3 ? cells.get(3) : null;

            if (address_field == null) {
                throw new TableExtractorException("Address list not found", row, input_html);
            }

            /* extract int from poll station id */
            String stationIdText = cleanupUNICODE(station_id.text()).trim();
            int psid;
            try {
                psid = Integer.parseInt(stationIdText.replaceAll("[^\\d]", ""));
            } catch (NumberFormatException e) {
                TableExtractorException te = new TableExtractorException(
                        "Failed to parse polling station ID >" + stationIdText + "<: ", station_id,
                        input_html);
                logger.severe(te.getMessage() + "; rest of " + input_html + " ignored.");
                return;
            }

            /* extraction from HTML completely finished, now we work only with the addresses in the text form */
            extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field),
                    cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address));
        }
    }

    if (tables_found == 0) {
        logger.severe("No parsable tables found in " + input_html);
    }
    resultSink.commit();

    logger.info("" + tables_found + " table(s) processed in " + input_html);
}

From source file:de.geeksfactory.opacclient.apis.Open.java

/**
 * Loads the given page of the current search result.
 *
 * When there are many pages of results, there will only be links to the next 4 and
 * previous 4 pages, so this method clicks pager links (and re-invokes itself on the
 * resulting page) until it gets to the correct page.
 *
 * @param page the result page to load
 * @return the parsed result page
 * @throws NotReachableException if no search result page has been stored yet
 * @throws OpacErrorException if the pager or its postback link cannot be found or parsed
 */
@Override
public SearchRequestResult searchGetPage(int page) throws IOException, OpacErrorException, JSONException {
    if (searchResultDoc == null) {
        throw new NotReachableException();
    }

    Document doc = searchResultDoc;

    Element pager = doc.select("span[id$=DataPager1]").first();
    if (pager == null) {
        throw new OpacErrorException(StringProvider.INTERNAL_ERROR);
    }
    // The attribute selector must be closed with ']'; an unbalanced selector is not
    // valid jsoup syntax and may be rejected by the query parser.
    Elements pageLinks = pager.select("a[id*=LinkButtonPageN]");
    if (pageLinks.isEmpty()) {
        throw new OpacErrorException(StringProvider.INTERNAL_ERROR);
    }

    // first/last visible page numbers in the pager
    int from = Integer.parseInt(pageLinks.first().text());
    int to = Integer.parseInt(pageLinks.last().text());
    Element linkToClick;
    boolean willBeCorrectPage;

    if (page < from) {
        // target is before the visible window: jump as far back as possible
        linkToClick = pageLinks.first();
        willBeCorrectPage = false;
    } else if (page > to) {
        // target is after the visible window: jump as far forward as possible
        linkToClick = pageLinks.last();
        willBeCorrectPage = false;
    } else {
        linkToClick = pageLinks.get(page - from);
        willBeCorrectPage = true;
    }

    // pager links are ASP.NET postbacks: javascript:__doPostBack('<target>','<argument>')
    Pattern pattern = Pattern.compile("javascript:__doPostBack\\('([^,]*)','([^\\)]*)'\\)");
    Matcher matcher = pattern.matcher(linkToClick.attr("href"));
    if (!matcher.find()) {
        throw new OpacErrorException(StringProvider.INTERNAL_ERROR);
    }

    FormElement form = (FormElement) doc.select("form").first();
    HttpEntity data = formData(form, null).addTextBody("__EVENTTARGET", matcher.group(1))
            .addTextBody("__EVENTARGUMENT", matcher.group(2)).build();

    String postUrl = form.attr("abs:action");

    String html = httpPost(postUrl, data, "UTF-8");
    if (willBeCorrectPage) {
        // We clicked on the correct link
        Document doc2 = Jsoup.parse(html);
        doc2.setBaseUri(postUrl);
        return parse_search(doc2, page);
    } else {
        // There was no correct link, so try to find one again
        searchResultDoc = Jsoup.parse(html);
        searchResultDoc.setBaseUri(postUrl);
        return searchGetPage(page);
    }
}

From source file:net.kevxu.purdueassist.course.ScheduleDetail.java

/**
 * Builds a {@link ScheduleDetailEntry} from a class detail page.
 *
 * @param document the parsed detail page
 * @return the entry filled with basic info, seat numbers and the remaining details
 * @throws HtmlParseException if the page does not have the expected structure
 * @throws CourseNotFoundException if the page states that no detailed class information was found
 */
private ScheduleDetailEntry parseDocument(Document document)
        throws HtmlParseException, CourseNotFoundException, ResultNotMatchException {
    ScheduleDetailEntry entry = new ScheduleDetailEntry(term, crn);
    Elements tableElements = document.getElementsByAttributeValue("summary",
            "This table is used to present the detailed class information.");

    // No detail table: decide between "course not found" and an unexpected page
    if (tableElements.isEmpty()) {
        Elements informationElements = document.getElementsByAttributeValue("summary",
                "This layout table holds message information");
        boolean saysNotFound = !informationElements.isEmpty()
                && informationElements.text().contains("No detailed class information found");
        if (saysNotFound) {
            throw new CourseNotFoundException(informationElements.text());
        }
        throw new HtmlParseException(
                "Course table not found, but page does not contain message stating no course found.");
    }

    for (Element tableElement : tableElements) {
        // basic info for the selected course
        Element basicInfo = tableElement.getElementsByClass("ddlabel").first();
        if (basicInfo == null) {
            throw new HtmlParseException("Basic info element empty.");
        }
        setBasicInfo(entry, basicInfo.text());

        // detailed course info
        Element detailedInfo = tableElement.getElementsByClass("dddefault").first();
        if (detailedInfo == null) {
            throw new HtmlParseException("Detailed info element empty.");
        }

        // seat info lives in exactly one nested layout table
        Elements seatTables = detailedInfo.getElementsByAttributeValue("summary",
                "This layout table is used to present the seating numbers.");
        if (seatTables.size() != 1) {
            throw new HtmlParseException(
                    "Seat detail elements size not 1. We have " + seatTables.size() + ".");
        }

        Elements seatRows = seatTables.first().getElementsByTag("tbody").first().children();
        int seatRowCount = seatRows.size();
        if (seatRowCount != 3 && seatRowCount != 4) {
            throw new HtmlParseException(
                    "Seat detail entry elements size not 3. We have " + seatRows.size() + ".");
        }
        // row 0 is skipped; rows 1..3 are seats, waitlist and (optionally) cross-list
        setSeats(entry, seatRows.get(1).text());
        setWaitlistSeats(entry, seatRows.get(2).text());
        if (seatRowCount == 4) {
            setCrosslistSeats(entry, seatRows.get(3).text());
        }

        // drop the seat table so it is not part of the remaining info
        seatTables.remove();

        setRemainingInfo(entry, detailedInfo.html());
    }

    return entry;
}

From source file:de.geeksfactory.opacclient.apis.Pica.java

/**
 * Reads the available search fields from the advanced search form.
 *
 * Produces, in this order: one text field per option of the {@code IKT0} select, a sort
 * dropdown ({@code SRT}) if present, text and dropdown fields whose name starts with
 * {@code ADI}, a {@code FUZZY} checkbox if present, and a media type dropdown
 * ({@code ADI_MAT}) if present.
 *
 * @return the search fields found on the form
 */
@Override
public List<SearchField> getSearchFields() throws IOException, JSONException {
    if (!initialised) {
        start();
    }

    String html = httpGet(opac_url + "/LNG=" + getLang() + "/DB=" + db + "/ADVANCED_SEARCHFILTER",
            getDefaultEncoding());
    Document doc = Jsoup.parse(html);
    List<SearchField> fields = new ArrayList<>();

    // Matches a bracketed or parenthesised 2-3 letter code such as "[TIT]" or "(XPER:)";
    // compiled once instead of once per option.
    Pattern meaningPattern = Pattern.compile("\\[X?[A-Za-z]{2,3}:?\\]|\\(X?[A-Za-z]{2,3}:?\\)");

    for (Element option : doc.select("select[name=IKT0] option")) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(option.text());
        field.setId(option.attr("value"));
        field.setHint("");
        field.setData(new JSONObject("{\"ADI\": false}"));

        // move the code out of the display name into the field's "meaning"
        Matcher matcher = meaningPattern.matcher(field.getDisplayName());
        if (matcher.find()) {
            field.getData().put("meaning", matcher.group().replace(":", "").toUpperCase());
            field.setDisplayName(matcher.replaceFirst("").trim());
        }

        fields.add(field);
    }

    Elements sort = doc.select("select[name=SRT]");
    if (sort.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        // the label sits two levels above the select; fall back to an empty name if it is missing
        Element label = sort.first().parent().parent().select(".longval").first();
        field.setDisplayName(label != null ? label.text() : "");
        field.setId("SRT");
        for (Element option : sort.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    for (Element input : doc.select("input[type=text][name^=ADI]")) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(input.parent().parent().select(".longkey").text());
        field.setId(input.attr("name"));
        field.setHint(input.parent().select("span").text());
        field.setData(new JSONObject("{\"ADI\": true}"));
        fields.add(field);
    }

    for (Element dropdown : doc.select("select[name^=ADI]")) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName(dropdown.parent().parent().select(".longkey").text());
        field.setId(dropdown.attr("name"));
        for (Element option : dropdown.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    Elements fuzzy = doc.select("input[name=FUZZY]");
    if (fuzzy.size() > 0) {
        CheckboxSearchField field = new CheckboxSearchField();
        Element label = fuzzy.first().parent().parent().select(".longkey").first();
        field.setDisplayName(label != null ? label.text() : "");
        field.setId("FUZZY");
        fields.add(field);
    }

    Elements mediatypes = doc.select("input[name=ADI_MAT]");
    if (mediatypes.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName("Materialart");
        field.setId("ADI_MAT");

        field.addDropdownValue("", "Alle");
        for (Element mt : mediatypes) {
            // the caption is the element following the input's parent
            field.addDropdownValue(mt.attr("value"),
                    mt.parent().nextElementSibling().text().replace("\u00a0", ""));
        }
        fields.add(field);
    }

    return fields;
}

From source file:hu.tbognar76.apking.ApKing.java

/**
 * Makes sure a Google Play cover image for the package exists in the catalog picture folder.
 *
 * Unless {@code init.isCatalogPicForced} is set, an existing file is reused: it counts as
 * an image when it is longer than one byte (a one-character "-" placeholder marks an
 * earlier failed download). Otherwise the cover is downloaded from the package's Google
 * Play page. On any failure the placeholder is written and {@code false} is returned.
 *
 * @param packageName the application package name
 * @return {@code true} if an image file is available, {@code false} otherwise
 */
private boolean isLocalVersionGooglePlayImage(String packageName) {

    String fname = this.init.catalogHtml + "/" + this.init.catalogPic + "/" + packageName + ".png";
    File imageFile = new File(fname);

    if (!this.init.isCatalogPicForced && imageFile.exists()) {
        // a file of at most one byte is the failure placeholder, not an image
        return imageFile.length() > 1;
    }

    try {
        Document doc = Jsoup
                .connect("https://play.google.com/store/apps/details?id=" + URI.create(packageName) + "&hl=en")
                .get();

        Elements img = doc.select("div.cover-container img");
        if (img.isEmpty()) {
            throw new IOException("No cover image found for " + packageName);
        }

        // the src may be protocol-relative ("//...") or already absolute ("https:...")
        String uu = "http:" + img.first().attr("src");
        uu = uu.replace("http:https:", "https:");
        uu = uu.replace("=w300", "=w120");

        FileUtils.copyURLToFile(new URL(uu), imageFile);
        return true;
    } catch (Exception e) {
        // best effort: remember the failure so the download is not retried every time
        try {
            FileUtils.write(imageFile, "-");
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        // no image is available whether or not the placeholder could be written
        return false;
    }
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * Returns the trimmed {@code content} attribute of the first element matching the selector.
 *
 * @param doc the document to search
 * @param metaName the selector used to find the meta element
 * @return the trimmed content, or {@code string.empty} if nothing matches or the attribute is empty
 */
private String getMetaContent(Document doc, String metaName) {
    Elements matches = doc.select(metaName);
    if (matches.size() == 0) {
        return string.empty;
    }

    String rawContent = matches.first().attr("content");
    if (string.isNullOrEmpty(rawContent)) {
        return string.empty;
    }
    return rawContent.trim();
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * adds any siblings that may have a decent score to this node
 *
 * @param node/*from  www . jav a  2 s . co  m*/
 * @return
 */
private Element addSiblings(Element node) {
    if (logger.isDebugEnabled()) {
        logger.debug("Starting to add siblings");
    }
    int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node);

    Element currentSibling = node.previousElementSibling();
    while (currentSibling != null) {
        if (logger.isDebugEnabled()) {
            logger.debug("SIBLINGCHECK: " + debugNode(currentSibling));
        }

        if (currentSibling.tagName().equals("p")) {

            node.child(0).before(currentSibling.outerHtml());
            currentSibling = currentSibling.previousElementSibling();
            continue;
        }

        // check for a paraph embedded in a containing element
        int insertedSiblings = 0;
        Elements potentialParagraphs = currentSibling.getElementsByTag("p");
        if (potentialParagraphs.first() == null) {
            currentSibling = currentSibling.previousElementSibling();
            continue;
        }
        for (Element firstParagraph : potentialParagraphs) {
            WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text());

            int paragraphScore = wordStats.getStopWordCount();

            if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) {
                if (logger.isDebugEnabled()) {
                    logger.debug("This node looks like a good sibling, adding it");
                }
                node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>");
                insertedSiblings++;
            }

        }

        currentSibling = currentSibling.previousElementSibling();
    }
    return node;

}