List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:org.aliuge.crawler.fetcher.FetchWorker.java
/** * @param url//from ww w . j a va 2s.c o m * @desc */ public void fetchPage(WebURL url) { PageFetchResult result = null; try { if (null != url && StringUtils.isNotBlank(url.getUrl())) { result = fetcher.fetch(url, true); // ?? int statusCode = result.getStatusCode(); if (statusCode == CustomFetchStatus.PageTooBig) { onIgnored(url); return; } if (statusCode != HttpStatus.SC_OK) { onFailed(url); } else { Page page = new Page(url); onSuccessed(); if (!result.fetchContent(page)) { onFailed(url); return; } if (!parser.parse(page, url.getUrl())) { onFailed(url); return; } // ?? String e_url = extractFilterAndChangeUrl(url.getUrl()); if (StringUtils.isNoneBlank(e_url)) { url.setUrl(e_url); page.setWebURL(url); pendingPages.addElement(page); return; } // depth if (url.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) { return; } // ???Url?Url Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getUrl())); Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); // ???url if ((fetchFilter(linkHref) || extractFilter(linkHref)) && !bloomfilterHelper.exist(linkHref)) { WebURL purl = new WebURL(); purl.setName(link.text()); purl.setUrl(linkHref); purl.setDepth((short) (url.getDepth() + 1)); if (purl.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) return; try { if (!pendingUrls.addElement(purl)) { FileUtils.writeStringToFile(new File("status/_urls.good"), url.getUrl() + "\n", true); } } catch (QueueException e) { log.error(e.getMessage()); } } } } } } } catch (QueueException e) { onFailed(url); } catch (Exception e) { e.printStackTrace(); onFailed(url); } finally { if (null != result) result.discardContentIfNotConsumed(); } }
From source file:de.geeksfactory.opacclient.apis.Littera.java
protected void addAdvancedSearchFields(List<SearchField> fields) throws IOException, JSONException { final String html = httpGet(getApiUrl() + "&mode=a", getDefaultEncoding()); final Document doc = Jsoup.parse(html); final Elements options = doc.select("select#adv_search_crit_0").first().select("option"); for (final Element option : options) { final SearchField field; if (SEARCH_FIELDS_FOR_DROPDOWN.contains(option.val())) { field = new DropdownSearchField(); addDropdownValuesForField(((DropdownSearchField) field), option.val()); } else {//from w w w. j a v a2 s .c om field = new TextSearchField(); ((TextSearchField) field).setHint(""); } field.setDisplayName(option.text()); field.setId(option.val()); field.setData(new JSONObject()); field.getData().put("meaning", field.getId()); fields.add(field); } }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/** * ??:/*from ww w. j a v a 2 s. c o m*/ * 1. ??? * 2. ??????? * 3. ?? * 4. ?? ?? * 5. ? * * @return * @throws XpathSyntaxErrorException */ private String getAuthor() throws XpathSyntaxErrorException { String author = ""; if (StringUtils.isBlank(srcTime)) { author = getAuthor(doc.body().html()); return author; } Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first(); if (cur == null) { LOG.warn("?srcTime=" + srcTime); author = getAuthor(doc.body().html()); return author; } if (!noText(cur)) { String arr[] = cur.html().split(srcTime); for (String text : arr) { author = getShortText(text); if (!StringUtils.isBlank(author)) return author; } } Element parent = cur.parent(); while (parent != null && noText(parent)) { cur = parent; parent = parent.parent(); } author = getAuthor(parent.html()); if (!StringUtils.isBlank(author)) return author; Element pre = cur.previousElementSibling(); while (pre != null && noText(pre)) { pre = pre.previousElementSibling(); } if (pre != null) { author = getShortText(pre.text()); } if (!StringUtils.isBlank(author)) return author; Element next = cur.nextElementSibling(); while (next != null && noText(next)) { next = next.nextElementSibling(); } if (next != null) { author = getShortText(next.text()); } if (!StringUtils.isBlank(author)) return author; author = getShortText(parent.html().replace(srcTime, " ")); if (!StringUtils.isBlank(author)) return author; author = getAuthor(doc.body().html()); if (StringUtils.isBlank(author)) { return author_bak; } return author; }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Determines the title of the page.
 *
 * <p>When the document's {@code <title>} is not empty, every {@code h1}-{@code h6} element is
 * compared with it via {@code strSim}; the similarity is weighted by the heading's position
 * and the best-scoring heading wins if its similarity is at least 0.3. Otherwise the
 * headings are ranked by length and heading level. If that yields nothing, elements whose
 * {@code id} or {@code class} starts or ends with "title" are tried, and finally
 * {@code getTitleByEditDistance}.
 *
 * @param contentElement the main content element, passed to {@code getTitleByEditDistance}
 * @return the detected title
 * @throws Exception if no title can be found
 */
protected String getTitle(final Element contentElement) throws Exception {
    final ArrayList<Element> titleList = new ArrayList<Element>();
    final ArrayList<Double> titleSim = new ArrayList<Double>();
    final String metaTitle = getText(doc.title().trim());
    if (!metaTitle.isEmpty()) {
        // Collect every heading together with its similarity to the <title> text.
        doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int i) {
                if (node instanceof Element) {
                    Element tag = (Element) node;
                    String tagName = tag.tagName();
                    if (Pattern.matches("h[1-6]", tagName)) {
                        String title = tag.text().trim();
                        double sim = strSim(title, metaTitle);
                        titleSim.add(sim);
                        titleList.add(tag);
                    }
                }
            }

            @Override
            public void tail(Node node, int i) {
            }
        });
        // Only rank headings when there are any; the former "size() >= 0" test was always
        // true and ended in titleList.get(0) on an empty list.
        if (!titleList.isEmpty()) {
            double maxScore = 0;
            int maxIndex = -1;
            for (int i = 0; i < titleSim.size(); i++) {
                // Later headings get a higher weight.
                double score = (i + 1) * titleSim.get(i);
                if (score > maxScore) {
                    maxScore = score;
                    maxIndex = i;
                }
            }
            if (maxIndex == -1 || titleSim.get(maxIndex) < 0.3) {
                String title = getText(metaTitle);
                // NOTE(review): endsWith("") is always true, so this branch can never be
                // taken; the suffix literal looks lost to an encoding problem - restore it
                // from the upstream source.
                if (!title.endsWith("") && title.length() > 7) {
                    return title;
                }
                Collections.sort(titleList, new Comparator<Element>() {
                    /** Returns 1 when the heading has 7 to 26 CJK characters, otherwise 0. */
                    private int lengthRank(Element heading) {
                        int cjkLength = heading.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        return (cjkLength > 26 || cjkLength < 7) ? 0 : 1;
                    }

                    @Override
                    public int compare(Element o1, Element o2) {
                        int len1 = lengthRank(o1);
                        int len2 = lengthRank(o2);
                        if (len1 == len2) {
                            // Same length class: the smaller heading level (h1 before h2) first.
                            return o1.tagName().charAt(1) - o2.tagName().charAt(1);
                        }
                        return len2 - len1;
                    }
                });
                return getText(titleList.get(0).text());
            }
            return titleList.get(maxIndex).text();
        }
    }

    // Fall back to elements whose id or class starts or ends with "title".
    Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
    if (titles.size() > 0) {
        String title = titles.first().text();
        if (title.length() > 5 && title.length() < 40) {
            return titles.first().text();
        }
    }
    try {
        return getTitleByEditDistance(contentElement);
    } catch (Exception ex) {
        // Keep the original failure as the cause.
        throw new Exception("title not found", ex);
    }
}
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
/**
 * Registers a {@code ConfluenceLink} in {@code confluenceLinkMap} for every given element,
 * keyed by the element's original {@code href}.
 *
 * <p>The link markup depends on the configured pagination mode; the page title is built from
 * the link text and a running, 1-based link counter.
 *
 * @param confluenceLinkMap map that receives the links
 * @param elements          link elements to process
 * @param pageType          page type the links belong to
 * @param numericPrefix     prefix passed to the title builder for {@code INDIVIDUAL} pages
 */
private void addLinksByType(final Map<String, ConfluenceLink> confluenceLinkMap, final Elements elements,
        final PageType pageType, final Integer numericPrefix) {
    final SwaggerConfluenceConfig swaggerConfluenceConfig = SWAGGER_CONFLUENCE_CONFIG.get();

    int position = 0;
    for (final Element anchor : elements) {
        position++;
        final String href = anchor.attr("href");
        final String label = anchor.text();

        final String pageTitle = (pageType == INDIVIDUAL)
                ? buildConfluenceTitle(label, numericPrefix, position)
                : buildConfluenceTitle(label, position, null);

        final String markup;
        switch (swaggerConfluenceConfig.getPaginationMode()) {
        case SINGLE_PAGE:
            markup = formatSinglePageLink(label);
            break;
        case CATEGORY_PAGES:
            markup = formatCategoryPageLink(label, pageTitle, pageType);
            break;
        case INDIVIDUAL_PAGES:
            markup = formatIndividualPageLink(label, pageTitle);
            break;
        default:
            throw new SwaggerConfluenceConfigurationException("Unhandled Pagination Mode!");
        }

        final ConfluenceLink link = ConfluenceLinkBuilder.aConfluenceLink()
                .withPageType(pageType)
                .withOriginalHref(href)
                .withText(label)
                .withConfluenceLinkMarkup(markup)
                .build();

        LOG.debug("LINK MAP: {} -> {}", href, markup);
        confluenceLinkMap.put(href, link);
    }
}
From source file:net.kevxu.purdueassist.course.ScheduleDetail.java
private ScheduleDetailEntry parseDocument(Document document) throws HtmlParseException, CourseNotFoundException, ResultNotMatchException { ScheduleDetailEntry entry = new ScheduleDetailEntry(term, crn); Elements tableElements = document.getElementsByAttributeValue("summary", "This table is used to present the detailed class information."); if (!tableElements.isEmpty()) { for (Element tableElement : tableElements) { // get basic info for selected course Element tableBasicInfoElement = tableElement.getElementsByClass("ddlabel").first(); if (tableBasicInfoElement != null) { setBasicInfo(entry, tableBasicInfoElement.text()); } else { throw new HtmlParseException("Basic info element empty."); }/*from w ww .ja va2 s . c o m*/ // get detailed course info Element tableDetailedInfoElement = tableElement.getElementsByClass("dddefault").first(); if (tableDetailedInfoElement != null) { // process seat info Elements tableSeatDetailElements = tableDetailedInfoElement.getElementsByAttributeValue( "summary", "This layout table is used to present the seating numbers."); if (tableSeatDetailElements.size() == 1) { Element tableSeatDetailElement = tableSeatDetailElements.first(); Elements tableSeatDetailEntryElements = tableSeatDetailElement.getElementsByTag("tbody") .first().children(); if (tableSeatDetailEntryElements.size() == 3 || tableSeatDetailEntryElements.size() == 4) { setSeats(entry, tableSeatDetailEntryElements.get(1).text()); setWaitlistSeats(entry, tableSeatDetailEntryElements.get(2).text()); if (tableSeatDetailEntryElements.size() == 4) { setCrosslistSeats(entry, tableSeatDetailEntryElements.get(3).text()); } } else { throw new HtmlParseException("Seat detail entry elements size not 3. We have " + tableSeatDetailEntryElements.size() + "."); } } else { throw new HtmlParseException( "Seat detail elements size not 1. 
We have " + tableSeatDetailElements.size() + "."); } // remove the seat info from detailed info tableSeatDetailElements.remove(); // remaining information setRemainingInfo(entry, tableDetailedInfoElement.html()); } else { throw new HtmlParseException("Detailed info element empty."); } } } else { // test empty Elements informationElements = document.getElementsByAttributeValue("summary", "This layout table holds message information"); if (!informationElements.isEmpty() && informationElements.text().contains("No detailed class information found")) { throw new CourseNotFoundException(informationElements.text()); } else { throw new HtmlParseException( "Course table not found, but page does not contain message stating no course found."); } } return entry; }
From source file:de.geeksfactory.opacclient.apis.Open.java
protected DetailledItem parse_result(Document doc) { DetailledItem item = new DetailledItem(); // Title and Subtitle item.setTitle(doc.select("span[id$=LblShortDescriptionValue]").text()); String subtitle = doc.select("span[id$=LblSubTitleValue]").text(); if (!subtitle.equals("")) { item.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle)); }//from w ww. j a va 2s. com // Cover if (doc.select("input[id$=mediumImage]").size() > 0) { item.setCover(doc.select("input[id$=mediumImage]").attr("src")); } else if (doc.select("img[id$=CoverView_Image]").size() > 0) { item.setCover(getCoverUrl(doc.select("img[id$=CoverView_Image]").first())); } // ID item.setId(doc.select("input[id$=regionmednr]").val()); // Description if (doc.select("span[id$=ucCatalogueContent_LblAnnotation]").size() > 0) { String name = doc.select("span[id$=lblCatalogueContent]").text(); String value = doc.select("span[id$=ucCatalogueContent_LblAnnotation]").text(); item.addDetail(new Detail(name, value)); } // Details for (Element detail : doc.select("div[id$=CatalogueDetailView] .spacingBottomSmall:has(span+span)")) { String name = detail.select("span").get(0).text().replace(": ", ""); String value = detail.select("span").get(1).text(); item.addDetail(new Detail(name, value)); } // Copies Element table = doc.select("table[id$=grdViewMediumCopies]").first(); Elements trs = table.select("tr"); List<String> columnmap = new ArrayList<>(); for (Element th : trs.first().select("th")) { columnmap.add(getCopyColumnKey(th.text())); } DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (int i = 1; i < trs.size(); i++) { Elements tds = trs.get(i).select("td"); Copy copy = new Copy(); for (int j = 0; j < tds.size(); j++) { if (columnmap.get(j) == null) continue; String text = tds.get(j).text().replace("\u00a0", ""); if (text.equals("")) continue; copy.set(columnmap.get(j), text, fmt); } item.addCopy(copy); } return item; }
From source file:de.geeksfactory.opacclient.apis.Heidi.java
/**
 * Logs in and loads the account overview: pending fees, lent items and reservations.
 *
 * <p>Lent items are read from the rows of {@code table.kontopos}; reservations come from the
 * two {@code konto.cgi} views {@code konto=v} and {@code konto=b}, parsed by
 * {@code parse_reservations}.
 *
 * @param account the account to load
 * @return the account data
 * @throws IOException        declared for the HTTP requests
 * @throws JSONException      declared by the original signature
 * @throws OpacErrorException declared by the original signature
 */
@Override
public AccountData account(Account account) throws IOException, JSONException, OpacErrorException {
    login(account);
    String html;
    Document doc;
    AccountData adata = new AccountData(account.getId());
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    html = httpGet(opac_url + "/konto.cgi?sess=" + sessid, getDefaultEncoding());
    doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url + "/");

    // Pending fees: reduce the cell text to "<amount> <currency>".
    for (Element td : doc.select("table.konto td")) {
        if (td.text().contains("Offene")) {
            String text = td.text().trim().replaceAll(
                    "Offene[^0-9]+Geb.+hren:[^0-9]+([0-9.," + "]+)[^0-9A-Z]*(|EUR|CHF|Fr.)", "$1 $2");
            adata.setPendingFees(text);
        }
    }

    List<LentItem> lent = new ArrayList<>();
    for (Element tr : doc.select("table.kontopos tr")) {
        // Cells 0..2 are read below; skip rows that do not have them instead of failing.
        if (tr.children().size() < 3) {
            continue;
        }
        LentItem item = new LentItem();
        Element desc = tr.child(1).select("label").first();
        String dates = tr.child(2).text().trim();

        Elements detailLinks = tr.child(1).select("a");
        if (detailLinks.size() > 0) {
            String kk = getQueryParamsFirst(detailLinks.first().absUrl("href")).get("katkey");
            item.setId(kk);
        }

        // The label's text nodes are read in order: first author, second title, and any
        // later one containing "Mediennummer" is the barcode.
        if (desc != null) {
            int i = 0;
            for (Node node : desc.childNodes()) {
                if (node instanceof TextNode) {
                    String text = ((TextNode) node).text().trim();
                    if (i == 0) {
                        item.setAuthor(text);
                    } else if (i == 1) {
                        item.setTitle(text);
                    } else if (text.contains("Mediennummer")) {
                        item.setBarcode(text.replace("Mediennummer: ", ""));
                    }
                    i++;
                }
            }
        }

        Elements prolongInputs = tr.child(0).select("input");
        if (prolongInputs.size() == 1) {
            item.setProlongData(prolongInputs.first().val());
            item.setRenewable(true);
        } else {
            Element status = tr.child(0).select("span").first();
            if (status != null) {
                item.setProlongData("" + status.attr("class"));
            }
            item.setRenewable(false);
        }

        // For a "from - to" range the part after the dash is the deadline.
        String todate = dates;
        if (todate.contains("-")) {
            String[] datesplit = todate.split("-");
            if (datesplit.length > 1) {
                todate = datesplit[1].trim();
            }
        }
        // A date needs 10 characters (dd.MM.yyyy); shorter cells carry no deadline.
        if (todate.length() >= 10) {
            try {
                item.setDeadline(fmt.parseLocalDate(todate.substring(0, 10)));
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
            }
        }
        lent.add(item);
    }
    adata.setLent(lent);

    List<ReservedItem> reservations = new ArrayList<>();
    html = httpGet(opac_url + "/konto.cgi?konto=v&sess=" + sessid, getDefaultEncoding());
    reservations.addAll(parse_reservations(html));
    html = httpGet(opac_url + "/konto.cgi?konto=b&sess=" + sessid, getDefaultEncoding());
    reservations.addAll(parse_reservations(html));
    adata.setReservations(reservations);

    return adata;
}
From source file:gov.medicaid.screening.dao.impl.OptometryLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param identifier The value to be searched. * @return the search result for licenses * @throws URISyntaxException When an error occurs while building the URL. * @throws ClientProtocolException When client does not support protocol used. * @throws IOException When an error occurs while parsing response. * @throws ParseException When an error occurs while parsing response. * @throws PersistenceException for database related errors * @throws ServiceException for any other problems encountered *//*from ww w .j a v a 2s . c o m*/ private SearchResult<License> getAllResults(String identifier) throws URISyntaxException, ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(); URIBuilder builder = new URIBuilder(getSearchURL()).setPath("/Default.aspx"); String hostId = builder.build().toString(); builder.setParameter("tabid", "799"); HttpGet httpget = new HttpGet(builder.build()); HttpResponse landing = client.execute(httpget); Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity())); HttpPost httppost = new HttpPost(builder.build()); HttpEntity entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); // licenses list List<License> licenseList = new ArrayList<License>(); while (entity != null) { String result = EntityUtils.toString(entity); document = Jsoup.parse(result); Elements trs = document.select("table.Datagrid tr"); if (trs != null) { for (Element element : trs) { String cssClass = element.attr("class"); if (!"DatagridHeaderStyle".equals(cssClass.trim()) && element.children().size() == 8) { Elements tds = element.children(); licenseList.add(parseLicense(tds)); } } } // 
done, check if there are additional results entity = null; Elements elements = document.getElementsByTag("a"); for (Element element : elements) { if (element.text().equals("Next >>")) { entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); break; } } } SearchResult<License> result = new SearchResult<License>(); result.setItems(licenseList); return result; }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
/**
 * Parses an Untis substitution table ({@link UntisSubstitutionParser}).
 *
 * <p>If the configured columns contain no {@code "date"} column, the whole table is parsed as
 * one day whose date is taken from the page heading. Otherwise every table row is assigned
 * to the day named in its date column, creating the day when the schedule does not have it
 * yet.
 *
 * @param v          schedule that receives the parsed days
 * @param lastChange last-change string stored on every created day
 * @param doc        the parsed page
 * @throws JSONException              declared for reading the schedule data
 * @throws CredentialInvalidException declared by the original signature
 */
protected void parseSubstitutionTable(SubstitutionSchedule v, String lastChange, Document doc)
        throws JSONException, CredentialInvalidException {
    final JSONObject data = scheduleData.getData();
    final LocalDateTime lastChangeDate = ParserUtils.parseDateTime(lastChange);

    // Index of the "date" column, or -1 when the configuration has none.
    final JSONArray columns = data.getJSONArray("columns");
    int dateColumn = -1;
    for (int col = 0; col < columns.length() && dateColumn == -1; col++) {
        if ("date".equals(columns.getString(col))) {
            dateColumn = col;
        }
    }

    final Element table = doc.select("table[rules=all], table:has(tr:has(td[align=center]))").first();
    final String tableText = table.text().replace("\u00a0", "").trim();
    if (tableText.equals("Keine Vertretungen")) {
        return;
    }

    if (dateColumn == -1) {
        // Single-day table: the date is taken from the heading fonts.
        final SubstitutionScheduleDay singleDay = new SubstitutionScheduleDay();
        singleDay.setLastChangeString(lastChange);
        singleDay.setLastChange(lastChangeDate);

        final String heading = doc.select("font[size=5], font[size=4], font[size=3] b").text();
        final Matcher dayMatcher = Pattern.compile("\\d\\d?.\\d\\d?. / \\w+").matcher(heading);
        if (dayMatcher.find()) {
            final String dateText = dayMatcher.group();
            singleDay.setDateString(dateText);
            singleDay.setDate(ParserUtils.parseDate(dateText));
        }
        parseSubstitutionScheduleTable(table, data, singleDay);
        v.addDay(singleDay);
        return;
    }

    final Elements rows = table.select("tr.list.odd:not(:has(td.inline_header)), "
            + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]):gt(0)");
    for (Element row : rows) {
        String dateText = row.select("td").get(dateColumn).text().trim();
        final int dash = dateText.indexOf("-");
        if (dash > 0) {
            // Keep only the part before the dash, dropping the character right before it.
            dateText = dateText.substring(0, dash - 1).trim();
        }
        final LocalDate parsedDate = ParserUtils.parseDate(dateText);

        // Reuse a day with the same date or date string if the schedule already has one.
        SubstitutionScheduleDay target = null;
        for (SubstitutionScheduleDay candidate : v.getDays()) {
            final boolean sameDate = Objects.equals(candidate.getDate(), parsedDate);
            if (sameDate || Objects.equals(candidate.getDateString(), dateText)) {
                target = candidate;
                break;
            }
        }
        if (target == null) {
            target = new SubstitutionScheduleDay();
            target.setDateString(dateText);
            target.setDate(parsedDate);
            target.setLastChangeString(lastChange);
            target.setLastChange(lastChangeDate);
            v.addDay(target);
        }
        parseSubstitutionScheduleTable(row, data, target);
    }
}