Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes Document select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Collects the albums referenced by the album links of a page.
 *
 * Each anchor matched by {@code .panel .fm-little li:has(a) a} is considered; anchors whose
 * href does not contain "collection" or whose text is blank are skipped. The remaining anchor
 * texts are handed to {@code createOrQueryAlbum}, and every non-null result is collected.
 *
 * @param doc the parsed page to scan
 * @return the collected albums, in the order their anchors were visited; never null
 */
private List<Album> createOrGetAlbumList(Document doc) {
    List<Album> albums = Lists.newArrayList();
    Elements anchors = doc.select(".panel .fm-little li:has(a) a");
    if (!CollectionUtils.isNotEmpty(anchors)) {
        return albums;
    }

    for (Element anchor : anchors) {
        String link = anchor.attr("href");
        if (StringUtils.isBlank(link) || !StringUtils.contains(link, "collection")) {
            continue;
        }

        String name = StringUtils.trimToEmpty(anchor.text());
        if (StringUtils.isBlank(name)) {
            continue;
        }

        Album found = createOrQueryAlbum(name);
        if (found != null) {
            albums.add(found);
        }
    }
    return albums;
}

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Determines the URL year for a film.
 *
 * The text of the first element matching {@code .fm-intro span .fm-genre} is tried first.
 * If that yields nothing, the part of {@code value} before its first "-" is used instead;
 * when {@code value} contains no "-", the whole string is used.
 *
 * @param doc   the parsed film page
 * @param value fallback text whose leading part (up to the first "-") holds the year
 * @return the result of {@code covertToUrlYear} for the chosen text, or an empty string
 *         if neither source provides non-blank text
 */
private String getFilmUrlYear(Document doc, String value) {
    String urlYear = StringUtils.EMPTY;

    Elements yearElements = doc.select(".fm-intro span .fm-genre");
    if (CollectionUtils.isNotEmpty(yearElements)) {
        Element yearElement = yearElements.get(0);
        if (null != yearElement) {
            String yearText = StringUtils.trimToEmpty(yearElement.text());
            if (StringUtils.isNotBlank(yearText)) {
                urlYear = covertToUrlYear(yearText);
            }
        }
    }

    if (StringUtils.isBlank(urlYear) && StringUtils.isNotBlank(value)) {
        // substringBefore returns the input unchanged when the separator is absent, whereas
        // substring(0, indexOf("-")) threw StringIndexOutOfBoundsException for such values.
        String yearText = StringUtils.substringBefore(value, "-");
        if (StringUtils.isNotBlank(yearText)) {
            urlYear = covertToUrlYear(yearText);
        }
    }
    return urlYear;
}

From source file:qhindex.controller.SearchAuthorWorksController.java

/**
 * Fetches an author's works from Google Scholar and appends them to {@code results}.
 *
 * Two pages of 100 records each are requested. A page whose document could not be retrieved
 * (null) is skipped. An {@link IOException} raised while extracting a single work is logged
 * and noted in {@code resultsMsg}; processing then continues with the next row.
 *
 * @param authorUrl path of the author's profile, appended to the Scholar host
 * @param results   list receiving one entry per {@code tr.gsc_a_tr} row that could be extracted
 * @return always {@code true}
 */
private boolean searchWebAuthorWorks(String authorUrl, ArrayList<AuthorWork> results) {
    Debug.info("Searching author works");

    // Max number of records allowed to be retrieved at a time by google scholar
    final int recordsPerPage = 100;

    // Search only for the first 200 works since no author has an h-index bigger than 200
    // according to observations
    for (int page = 0; page < 2; page++) {
        String paging = "&cstart=" + (page * recordsPerPage) + "&pagesize=" + recordsPerPage;
        Document authorDoc = requestWebDocFromScholar("https://scholar.google.com.au" + authorUrl + paging);

        AppHelper.waitBeforeNewRequest();

        if (authorDoc == null) {
            continue;
        }

        for (Element workRow : authorDoc.select("tr.gsc_a_tr")) {
            try {
                results.add(extractAuthorWorkData(workRow));
            } catch (IOException ioe) {
                Debug.print("Exception while processing author works: " + ioe.toString());
                resultsMsg += "Exception while processing author works.\n";
            }
        }
    }

    return true;
}

From source file:gui.InboxPanel.java

/**
 * Replaces the body markup of the HTML currently held by {@code BodyTextPane}.
 *
 * The pane's text is parsed as a body fragment, the inner HTML of its body is set to
 * {@code sbody}, and the resulting document is written back to the pane.
 *
 * @param sbody the HTML to place inside the body element
 */
private void setTextBody(String sbody) {
    Document fragment = Jsoup.parseBodyFragment(BodyTextPane.getText());
    fragment.select("body").html(sbody);

    String updatedHtml = fragment.html();
    BodyTextPane.setText(updatedHtml);
}

From source file:gov.medicaid.screening.dao.impl.OptometryLicenseDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * The landing page is fetched first to obtain its {@code __VIEWSTATE}, then the search form is
 * posted. Each result page is parsed for license rows, and as long as a link with the text
 * "Next >>" is present the next page is requested. The HTTP client created for the search is
 * shut down before returning, whether the search succeeds or fails.
 *
 * @param identifier The value to be searched.
 * @return the search result for licenses
 * @throws URISyntaxException When an error occurs while building the URL.
 * @throws ClientProtocolException When client does not support protocol used.
 * @throws IOException When an error occurs while parsing response.
 * @throws ParseException When an error occurs while parsing response.
 * @throws PersistenceException for database related errors
 * @throws ServiceException for any other problems encountered
 */
private SearchResult<License> getAllResults(String identifier) throws URISyntaxException,
        ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        URIBuilder builder = new URIBuilder(getSearchURL()).setPath("/Default.aspx");
        String hostId = builder.build().toString();
        builder.setParameter("tabid", "799");

        HttpGet httpget = new HttpGet(builder.build());
        HttpResponse landing = client.execute(httpget);
        Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity()));

        HttpPost httppost = new HttpPost(builder.build());
        HttpEntity entity = postForm(hostId, client, httppost,
                new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                        { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" },
                        { "__EVENTARGUMENT", "" },
                        { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                true);

        // licenses list
        List<License> licenseList = new ArrayList<License>();
        while (entity != null) {
            String pageHtml = EntityUtils.toString(entity);
            document = Jsoup.parse(pageHtml);

            // Data rows are the non-header rows of the grid that have exactly 8 cells.
            for (Element row : document.select("table.Datagrid tr")) {
                String cssClass = row.attr("class");
                if (!"DatagridHeaderStyle".equals(cssClass.trim()) && row.children().size() == 8) {
                    licenseList.add(parseLicense(row.children()));
                }
            }

            // done, check if there are additional results
            entity = null;
            for (Element anchor : document.getElementsByTag("a")) {
                if (anchor.text().equals("Next >>")) {
                    entity = postForm(hostId, client, httppost,
                            new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                                    { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" },
                                    { "__EVENTARGUMENT", "" },
                                    { "__VIEWSTATE",
                                            document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                            true);
                    break;
                }
            }
        }

        SearchResult<License> result = new SearchResult<License>();
        result.setItems(licenseList);
        return result;
    } finally {
        // The client is local to this call; release its connections so they are not leaked.
        client.getConnectionManager().shutdown();
    }
}

From source file:com.laudandjolynn.mytv.crawler.tvmao.TvMaoCrawler.java

/**
 * Parses the TV stations listed in a channel navigation page.
 *
 * The classification is taken from the first element matching
 * {@code div.chlsnav div.pbar b}. One station is created per {@code div.chlsnav ul.r li}
 * item, named after the text of the item's first child. Every registered listener is
 * notified of each station before it is added to the result.
 *
 * @param city the city assigned to every parsed station
 * @param html the page markup to parse
 * @return the parsed stations
 */
private List<TvStation> parseTvStation(String city, String html) {
    Document page = Jsoup.parse(html);
    String classify = page.select("div.chlsnav div.pbar b").get(0).text().trim();

    List<TvStation> stations = new ArrayList<TvStation>();
    for (Element item : page.select("div.chlsnav ul.r li")) {
        Element nameHolder = item.child(0);

        TvStation station = new TvStation();
        station.setName(nameHolder.text().trim());
        station.setCity(city);
        station.setClassify(classify);
        station.setSequence(SEQUENCE.incrementAndGet());

        for (CrawlEventListener listener : listeners) {
            listener.itemFound(new TvStationFoundEvent(this, station));
        }
        stations.add(station);
    }
    return stations;
}

From source file:me.vertretungsplan.parser.UntisInfoParser.java

/**
 * Parses a weekly timetable table into the days of a substitution schedule.
 *
 * The first table of {@code doc} is read. Its first row determines how many day columns exist;
 * the remaining rows hold the lessons. Cells are converted via {@code parseTimetableCell} and
 * added to the matching day.
 *
 * @param v          schedule that receives the last-change timestamp and the days
 * @param lastChange last-change text, parsed with {@code ParserUtils.parseDateTime}
 * @param doc        document containing the timetable table
 * @param klasse     class name passed through to {@code parseTimetableCell}
 * @param weekName   date of the first day of the week, in {@code d.M.yyyy} format
 * @throws JSONException presumably when the "cellFormat" array cannot be read from {@code data}
 */
private void parseTimetable(SubstitutionSchedule v, String lastChange, Document doc, String klasse,
        String weekName) throws JSONException {
    v.setLastChange(ParserUtils.parseDateTime(lastChange));
    LocalDate weekStart = DateTimeFormat.forPattern("d.M.yyyy").parseLocalDate(weekName);

    Element table = doc.select("table").first();

    // One day per cell after the first in the header row; day i is weekStart + i days.
    List<SubstitutionScheduleDay> days = new ArrayList<>();
    for (int i = 0; i < table.select("tr").first().select("td:gt(0)").size(); i++) {
        LocalDate date = weekStart.plusDays(i);

        // Reuse a day already present in the schedule for this date, otherwise create and add one.
        SubstitutionScheduleDay day = null;
        for (SubstitutionScheduleDay d : v.getDays()) {
            if (d.getDate().equals(date)) {
                day = d;
                break;
            }
        }
        if (day == null) {
            day = new SubstitutionScheduleDay();
            day.setDate(date);
            v.addDay(day);
        }
        days.add(day);
    }

    // All direct body rows except the header row.
    Elements rows = table.select("> tbody > tr:gt(0)");
    // Maps the index of the row where a lesson starts to that lesson's name.
    Map<Integer, String> lessons = new HashMap<>();

    int i = 0;
    int lessonCounter = 1;
    while (i < rows.size()) {
        Element cell = rows.get(i).select("td").first();
        String lessonName = cell.text().trim();
        // Labels longer than three characters are replaced by the running lesson number.
        if (lessonName.length() > 3) {
            lessonName = String.valueOf(lessonCounter);
        }
        lessons.put(i, lessonName);
        // Jump to the row where the next lesson label starts.
        i += getRowspan(cell);
        lessonCounter += 1;
    }

    // counts, per row, the cells that are missing from that row because a cell in an earlier
    // row spans over it (rowspan) in a column that was already processed
    Map<Integer, Integer> columnsToSkip = new HashMap<>();
    for (int j = 0; j < rows.size(); j++) {
        columnsToSkip.put(j, 0);
    }

    // NOTE(review): col stops at days.size() - 1, so days.get(col - 1) never reaches the last
    // entry of days — confirm whether the last day column is skipped intentionally.
    for (int col = 1; col < days.size(); col++) {
        int row = 0;
        while (row < rows.size()) {
            // Shift the cell index left by the number of cells this row lost to earlier rowspans.
            Element cell = rows.get(row).select("> td").get(col - columnsToSkip.get(row));
            String lesson = getTimetableLesson(cell, row, lessons);

            days.get(col - 1).addAllSubstitutions(
                    parseTimetableCell(cell, lesson, klasse, data.getJSONArray("cellFormat"), colorProvider));

            // Rows covered by this cell's rowspan have one cell fewer for the following columns.
            for (int skippedRow = row + 1; skippedRow < row + getRowspan(cell); skippedRow++) {
                columnsToSkip.put(skippedRow, columnsToSkip.get(skippedRow) + 1);
            }

            row += getRowspan(cell);
        }
    }
}

From source file:eu.riscoss.dataproviders.providers.FossologyDataProvider.java

/**
 * Analyses a fossology html overview report and sorts its licenses into buckets.
 *
 * The report is loaded over HTTP when {@code target} starts with "http", otherwise from a
 * local file read as UTF-8. Every data row of the table with id {@code lichistogram} yields
 * one license entry: the occurrence count from the first cell, the license name from the
 * third cell.
 *
 * @param target      URL or file path of the fossology report
 * @param licensesMap license type mapped to the license names belonging to that type
 * @return one bucket per license type (0 when nothing matched) plus the entries
 *         {@code _unknown_} (occurrences of licenses matching no type), {@code _sum_}
 *         (all occurrences) and {@code _count_} (number of distinct license rows)
 * @throws IOException if the report cannot be read or contains no {@code lichistogram} table
 */
private HashMap<String, Integer> analyseOverviewReport(String target,
        HashMap<String, Collection<String>> licensesMap) throws IOException {
    Document document;

    if (target.startsWith("http")) {
        document = Jsoup.connect(target).get();
    } else {
        File file = new File(target);
        document = Jsoup.parse(file, "UTF-8", "http://localhost");
    }

    Element table = document.select("table[id=lichistogram]").first();
    if (table == null) {
        // Fail with a descriptive, declared exception instead of a NullPointerException.
        throw new IOException("No table with id 'lichistogram' found in " + target);
    }
    Elements rows = table.select("tr");

    List<LicenseEntry> llist = new ArrayList<LicenseEntry>(); //list of licenses in the fossology file

    //for each license, parses the number of occurrences (0) and the name (2) and saves it as a LicenseEntry
    for (Element element : rows) {
        Elements col = element.select("td");

        if (col.size() != 0) {
            int c = Integer.parseInt(col.get(0).ownText());//num of occurrences
            String lic = col.get(2).text();
            llist.add(new LicenseEntry(c, lic));
        }
    }

    //get license type buckets

    HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>();
    int total = 0;

    Set<String> licenseTypes = licensesMap.keySet();
    //initialize with 0 to avoid missing types
    for (String licensetype : licenseTypes) {
        licenseBuckets.put(licensetype, 0);
    }

    int numUnknown = 0;
    for (LicenseEntry le : llist) {
        // Must be reset for every entry; otherwise a single earlier match would hide all
        // later unknown licenses.
        boolean matched = false;
        for (String licenseType : licenseTypes) {//cycles on license types from config file
            if (le.matchesOneOf(licensesMap.get(licenseType), licenseType)) {
                Integer currentcount = licenseBuckets.get(le.licensetype);
                if (currentcount == null) { //for safety, but should be initialised
                    currentcount = 0;
                }
                licenseBuckets.put(le.licensetype, currentcount + le.count);
                matched = true;
            }
        }
        total += le.count;
        if (!matched) { //unknown
            numUnknown += le.count;
            System.out.println("Unknown license: " + le.getName());
        }
    }

    licenseBuckets.put("_unknown_", numUnknown);
    licenseBuckets.put("_sum_", total);
    licenseBuckets.put("_count_", llist.size());

    System.out.println("\nLicense Buckets Fossology from HTML overview scanning:");
    System.out.println(licenseBuckets);

    return licenseBuckets;
}

From source file:org.commonjava.maven.galley.transport.htcli.internal.HttpListing.java

/**
 * Retrieves the HTML directory listing at {@code url} and extracts the entry names from it.
 *
 * Every anchor of the returned page is inspected. An anchor is accepted when its href is on
 * the same server and below the listing URL, the href ends with the anchor text (optionally
 * followed by '/'), and the text is not contained in {@code EXCLUDES}. The current thread is
 * renamed for the duration of the call and restored afterwards.
 *
 * @return the listing of accepted entry names, or {@code null} if the request was not
 *         executed successfully or an error was recorded in {@code error}
 */
@Override
public ListingResult call() {
    request = new HttpGet(url);

    ListingResult result = null;
    InputStream in = null;

    String oldName = Thread.currentThread().getName();
    try {
        String newName = oldName + ": LIST " + url;
        Thread.currentThread().setName(newName);

        if (executeHttp()) {
            in = response.getEntity().getContent();
            String listing = IOUtils.toString(in);
            Logger logger = LoggerFactory.getLogger(getClass());
            logger.debug("Got raw listing content:\n\n{}\n\n", listing);

            final ArrayList<String> al = new ArrayList<>();

            // The base URL is the same for every link, so parse it once instead of once per
            // anchor. A malformed URL surfaces as an IOException and is handled below.
            final URL baseUrl = new URL(url);

            // TODO: Charset!!
            Document doc = Jsoup.parse(listing, url);

            // Entries of a directory listing are links, so key in on the anchors.
            for (final Element link : doc.select("a")) {
                String linkText = link.text();
                String linkHref = link.attr("href");

                boolean sameServer = isSameServer(baseUrl, linkHref);
                boolean subpath = isSubpath(baseUrl, linkHref);

                if ((sameServer && subpath)
                        && (linkHref.endsWith(linkText) || linkHref.endsWith(linkText + '/'))
                        && !EXCLUDES.contains(linkText)) {
                    al.add(linkText);
                }
            }

            result = new ListingResult(resource, al.toArray(new String[al.size()]));
        }
    } catch (final TransferException e) {
        this.error = e;
    } catch (final IOException e) {
        this.error = new TransferException("Failed to construct directory listing for: {}. Reason: {}", e, url,
                e.getMessage());
    } finally {
        closeQuietly(in);
        cleanup();
        if (oldName != null) {
            Thread.currentThread().setName(oldName);
        }
    }

    // A recorded error always wins over a partially built result.
    return error == null ? result : null;
}