List of usage examples for org.jsoup.nodes.Document#select
public Elements select(String cssQuery)
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
/**
 * Collects the albums referenced by the collection links of a page.
 *
 * <p>Every anchor matched by {@code .panel .fm-little li:has(a) a} whose {@code href}
 * contains "collection" and whose trimmed text is not blank is passed to
 * {@code createOrQueryAlbum}; non-null results are gathered in document order.
 *
 * @param doc parsed page to scan
 * @return the albums found, possibly empty, never {@code null}
 */
private List<Album> createOrGetAlbumList(Document doc) {
    List<Album> albumList = Lists.newArrayList();
    Elements anchors = doc.select(".panel .fm-little li:has(a) a");
    if (CollectionUtils.isEmpty(anchors)) {
        return albumList;
    }

    for (Element anchor : anchors) {
        String link = anchor.attr("href");
        if (StringUtils.isBlank(link) || !StringUtils.contains(link, "collection")) {
            continue;
        }

        String title = StringUtils.trimToEmpty(anchor.text());
        if (StringUtils.isBlank(title)) {
            continue;
        }

        Album found = createOrQueryAlbum(title);
        if (found != null) {
            albumList.add(found);
        }
    }
    return albumList;
}
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
private String getFilmUrlYear(Document doc, String value) { String urlYear = StringUtils.EMPTY; Elements yearElements = doc.select(".fm-intro span .fm-genre"); if (CollectionUtils.isNotEmpty(yearElements)) { Element yearElement = yearElements.get(0); if (null != yearElement) { String yearText = StringUtils.trimToEmpty(yearElement.text().toString()); if (StringUtils.isNotBlank(yearText)) { urlYear = covertToUrlYear(yearText); }//from w ww . ja va 2 s . c o m } } if (StringUtils.isBlank(urlYear) && StringUtils.isNotBlank(value)) { String yearText = value.substring(0, value.indexOf("-")); if (StringUtils.isNotBlank(yearText)) { urlYear = covertToUrlYear(yearText); } } return urlYear; }
From source file:qhindex.controller.SearchAuthorWorksController.java
private boolean searchWebAuthorWorks(String authorUrl, ArrayList<AuthorWork> results) { Debug.info("Searching author works"); int page = 0; // Search only for the first 200 works since no author has an h-index bigger of 200 according to observations for (int i = 0; i < 2; i++) { int maxRecordsPerPage = 100; // Max number of records allow to be retrieved at a time by google scholar String resultIndex = "&cstart=" + (page * maxRecordsPerPage) + "&pagesize=" + maxRecordsPerPage; Document authorDoc = requestWebDocFromScholar( "https://scholar.google.com.au" + authorUrl + resultIndex); AppHelper.waitBeforeNewRequest(); if (authorDoc != null) { Elements authorWorksElems = authorDoc.select("tr.gsc_a_tr"); for (Element aWorkElems : authorWorksElems) { try { results.add(extractAuthorWorkData(aWorkElems)); } catch (IOException ioe) { Debug.print("Exception while processing author works: " + ioe.toString()); resultsMsg += "Exception while processing author works.\n"; }/* w w w .java 2s. c o m*/ } } page += 1; } return true; }
From source file:gui.InboxPanel.java
private void setTextBody(String sbody) { String html = BodyTextPane.getText(); Document doc = Jsoup.parseBodyFragment(html); //Element body = doc.body(); //body.text(sbody); doc.select("body").html(sbody); BodyTextPane.setText(doc.html());/*from w ww .j av a 2s.com*/ }
From source file:gov.medicaid.screening.dao.impl.OptometryLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param identifier The value to be searched. * @return the search result for licenses * @throws URISyntaxException When an error occurs while building the URL. * @throws ClientProtocolException When client does not support protocol used. * @throws IOException When an error occurs while parsing response. * @throws ParseException When an error occurs while parsing response. * @throws PersistenceException for database related errors * @throws ServiceException for any other problems encountered */// ww w . jav a 2s . c o m private SearchResult<License> getAllResults(String identifier) throws URISyntaxException, ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(); URIBuilder builder = new URIBuilder(getSearchURL()).setPath("/Default.aspx"); String hostId = builder.build().toString(); builder.setParameter("tabid", "799"); HttpGet httpget = new HttpGet(builder.build()); HttpResponse landing = client.execute(httpget); Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity())); HttpPost httppost = new HttpPost(builder.build()); HttpEntity entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); // licenses list List<License> licenseList = new ArrayList<License>(); while (entity != null) { String result = EntityUtils.toString(entity); document = Jsoup.parse(result); Elements trs = document.select("table.Datagrid tr"); if (trs != null) { for (Element element : trs) { String cssClass = element.attr("class"); if (!"DatagridHeaderStyle".equals(cssClass.trim()) && element.children().size() == 8) { Elements tds = element.children(); licenseList.add(parseLicense(tds)); } } } // done, 
check if there are additional results entity = null; Elements elements = document.getElementsByTag("a"); for (Element element : elements) { if (element.text().equals("Next >>")) { entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); break; } } } SearchResult<License> result = new SearchResult<License>(); result.setItems(licenseList); return result; }
From source file:com.laudandjolynn.mytv.crawler.tvmao.TvMaoCrawler.java
/**
 * Parses the TV stations listed in a page.
 *
 * <p>The classification is the trimmed text of the first element matching
 * {@code div.chlsnav div.pbar b}. One station is built per element matching
 * {@code div.chlsnav ul.r li}, named after the trimmed text of that element's
 * first child. Each station is announced to every registered listener before
 * it is added to the result.
 *
 * @param city city assigned to every station found
 * @param html page markup to parse
 * @return the stations in document order
 */
private List<TvStation> parseTvStation(String city, String html) {
    Document doc = Jsoup.parse(html);
    String classify = doc.select("div.chlsnav div.pbar b").get(0).text().trim();

    List<TvStation> resultList = new ArrayList<TvStation>();
    for (Element item : doc.select("div.chlsnav ul.r li")) {
        Element channel = item.child(0);

        TvStation station = new TvStation();
        station.setName(channel.text().trim());
        station.setCity(city);
        station.setClassify(classify);
        station.setSequence(SEQUENCE.incrementAndGet());

        for (CrawlEventListener listener : listeners) {
            listener.itemFound(new TvStationFoundEvent(this, station));
        }
        resultList.add(station);
    }
    return resultList;
}
From source file:me.vertretungsplan.parser.UntisInfoParser.java
private void parseTimetable(SubstitutionSchedule v, String lastChange, Document doc, String klasse, String weekName) throws JSONException { v.setLastChange(ParserUtils.parseDateTime(lastChange)); LocalDate weekStart = DateTimeFormat.forPattern("d.M.yyyy").parseLocalDate(weekName); Element table = doc.select("table").first(); List<SubstitutionScheduleDay> days = new ArrayList<>(); for (int i = 0; i < table.select("tr").first().select("td:gt(0)").size(); i++) { LocalDate date = weekStart.plusDays(i); SubstitutionScheduleDay day = null; for (SubstitutionScheduleDay d : v.getDays()) { if (d.getDate().equals(date)) { day = d;// w w w .ja va 2 s . c o m break; } } if (day == null) { day = new SubstitutionScheduleDay(); day.setDate(date); v.addDay(day); } days.add(day); } Elements rows = table.select("> tbody > tr:gt(0)"); Map<Integer, String> lessons = new HashMap<>(); int i = 0; int lessonCounter = 1; while (i < rows.size()) { Element cell = rows.get(i).select("td").first(); String lessonName = cell.text().trim(); if (lessonName.length() > 3) { lessonName = String.valueOf(lessonCounter); } lessons.put(i, lessonName); i += getRowspan(cell); lessonCounter += 1; } // counts the number of columns that will be missing from each row due to a cell with colspan Map<Integer, Integer> columnsToSkip = new HashMap<>(); for (int j = 0; j < rows.size(); j++) { columnsToSkip.put(j, 0); } for (int col = 1; col < days.size(); col++) { int row = 0; while (row < rows.size()) { Element cell = rows.get(row).select("> td").get(col - columnsToSkip.get(row)); String lesson = getTimetableLesson(cell, row, lessons); days.get(col - 1).addAllSubstitutions( parseTimetableCell(cell, lesson, klasse, data.getJSONArray("cellFormat"), colorProvider)); for (int skippedRow = row + 1; skippedRow < row + getRowspan(cell); skippedRow++) { columnsToSkip.put(skippedRow, columnsToSkip.get(skippedRow) + 1); } row += getRowspan(cell); } } }
From source file:eu.riscoss.dataproviders.providers.FossologyDataProvider.java
/** * Analyses a fossology html file// w ww .ja va 2 s . c om * @param target * @param licensesMap * @return * @throws IOException */ private HashMap<String, Integer> analyseOverviewReport(String target, HashMap<String, Collection<String>> licensesMap) throws IOException { //private static HashMap<String, Integer> analyseFossologyReport(String target, String licenseFile) throws IOException { // List<String> result = new ArrayList<String>(); Document document; if (target.startsWith("http")) { document = Jsoup.connect(target).get(); } else { File file = new File(target); document = Jsoup.parse(file, "UTF-8", "http://localhost"); } Element table = document.select("table[id=lichistogram]").first(); Elements rows = table.select("tr"); List<LicenseEntry> llist = new ArrayList<LicenseEntry>(); //list of licenses in the fossology file //for each license, parses the name (0) and the number of occurrences (2) and saves it as a LicenseEntry for (Element element : rows) { Elements col = element.select("td"); if (col.size() != 0) { int c = Integer.parseInt(col.get(0).ownText());//num of occurrences String lic = col.get(2).text(); llist.add(new LicenseEntry(c, lic)); //mlist.put(lic, c); } // System.out.println(col.get(1).ownText()); // Element count=col.get(0); } //get license type buckets HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>(); int total = 0; Set<String> licenseTypes = licensesMap.keySet(); //initialize with 0 to avoid missing types for (String licensetype : licenseTypes) { licenseBuckets.put(licensetype, 0); } boolean matched = false; int numUnknown = 0; for (LicenseEntry le : llist) { for (String licenseType : licenseTypes) {//cycles on license types from config file if (le.matchesOneOf(licensesMap.get(licenseType), licenseType)) { Integer currentcount = licenseBuckets.get(le.licensetype); if (currentcount == null) //for safety, but should be initialised currentcount = 0; licenseBuckets.put(le.licensetype, currentcount + le.count); matched = 
true; } } total += le.count; if (matched == false) { //unknown numUnknown += le.count; System.out.println("Unknown license: " + le.getName()); } } licenseBuckets.put("_unknown_", numUnknown); licenseBuckets.put("_sum_", total); licenseBuckets.put("_count_", llist.size()); System.out.println("\nLicense Buckets Fossology from HTML overview scanning:"); System.out.println(licenseBuckets); // for (String license : result) { // System.out.format("%s\n", license); // } return licenseBuckets; }
From source file:org.commonjava.maven.galley.transport.htcli.internal.HttpListing.java
@Override public ListingResult call() { request = new HttpGet(url); // return null if something goes wrong, after setting the error. // What we should be doing here is trying to retrieve the html directory // listing, then parse out the filenames from that... ////from w w w .ja va 2 s. c om // They'll be links, so that's something to key in on. // // I'm wondering about this: // http://jsoup.org/cookbook/extracting-data/selector-syntax // the dependency is: org.jsoup:jsoup:1.7.2 ListingResult result = null; InputStream in = null; String oldName = Thread.currentThread().getName(); try { String newName = oldName + ": LIST " + url; Thread.currentThread().setName(newName); if (executeHttp()) { in = response.getEntity().getContent(); String listing = IOUtils.toString(in); Logger logger = LoggerFactory.getLogger(getClass()); logger.debug("Got raw listing content:\n\n{}\n\n", listing); final ArrayList<String> al = new ArrayList<>(); // TODO: Charset!! Document doc = Jsoup.parse(listing, url); // try // { // } // catch ( final IOException e ) // { // this.error = // new TransferLocationException( resource.getLocation(), "Invalid HTML in: {}. Reason: {}", e, url, e.getMessage() ); // } if (doc != null) { for (final Element link : doc.select("a")) { String linkText = link.text(); String linkHref = link.attr("href"); URL url = new URL(this.url); boolean sameServer = isSameServer(url, linkHref); boolean subpath = isSubpath(url, linkHref); if ((sameServer && subpath) && (linkHref.endsWith(linkText) || linkHref.endsWith(linkText + '/')) && !EXCLUDES.contains(linkText)) { al.add(linkText); } } result = new ListingResult(resource, al.toArray(new String[al.size()])); } } } catch (final TransferException e) { this.error = e; } catch (final IOException e) { this.error = new TransferException("Failed to construct directory listing for: {}. 
Reason: {}", e, url, e.getMessage()); } finally { closeQuietly(in); cleanup(); if (oldName != null) { Thread.currentThread().setName(oldName); } } return error == null ? result : null; }