List of usage examples for org.jsoup.nodes Element text
public String text()
From source file:tkbautobooking.BookingSystem.java
/**
 * Parses the HTML held in {@code BookingPageHTML} and rebuilds {@code classMap} from the
 * {@code <option>} elements found under the element with id {@code class_selector}.
 *
 * <p>Each option contributes one entry keyed by its {@code value} attribute, with the option's
 * visible text as the entry value. Options with an empty {@code value} are skipped.
 *
 * @throws Exception if the page has no element with id {@code class_selector}
 */
private void praseBookingPage() throws Exception {
    Document doc = Jsoup.parse(BookingPageHTML);
    Element classSelector = doc.getElementById("class_selector");
    if (classSelector == null) {
        throw new Exception("Prase Booking Page fail !");
    }

    classMap = new TreeMap<>();
    for (Element option : classSelector.getElementsByTag("option")) {
        String value = option.attr("value");
        if (value.isEmpty()) {
            continue;
        }
        // The previous code called replace("", " "), which matches the empty string at every
        // position and therefore inserted a space between all characters of the label.
        // NOTE(review): the target literal looks like a lost non-breaking space (U+00A0);
        // confirm against the upstream source.
        classMap.put(value, option.text().replace('\u00A0', ' '));
    }
}
From source file:gov.medicaid.screening.dao.impl.BBHTLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param criteria The search criteria./*from w ww.j a v a 2s. co m*/ * @param byName flag indicating it is a name search * @return the search result for licenses * * @throws URISyntaxException if an error occurs while building the URL. * @throws ClientProtocolException if client does not support protocol used. * @throws IOException if an error occurs while parsing response. * @throws ParseException if an error occurs while parsing response. * @throws ServiceException for any other problems encountered */ private SearchResult<License> getAllResults(BBHTLicenseSearchCriteria criteria, boolean byName) throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager()); client.setRedirectStrategy(new LaxRedirectStrategy()); HttpGet getSearch = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getSearch); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); List<License> allLicenses = new ArrayList<License>(); // switch to search by name screen if (byName) { HttpEntity entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTTARGET", "_ctl7_rbtnSearch_1" }, { "__EVENTARGUMENT", "" }, { "_ctl7:ddlbLicenseType", "CD" }, { "_ctl7:rbtnSearch", "2" }, { "_ctl7:txtLicenseNumber", "" }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); entity = getResultPage(criteria, client, page, search, "_ctl7:cmdSearch", getSearchURL()); page = Jsoup.parse(EntityUtils.toString(entity)); // get the data grid entries if (page.select("table#_ctl7_grdSearchResults").size() < 1) { throw new ParsingException(ErrorCode.MITA50002.getDesc()); } Elements rows = 
page.select(GRID_ROW_SELECTOR); while (rows.size() > 0) { for (Element row : rows) { String url = row.select("a").first().attr("href"); String licenseNo = row.select("td:eq(5)").text(); HttpGet getDetail = new HttpGet(Util.replaceLastURLPart(getSearchURL(), url)); response = client.execute(getDetail); verifyAndAuditCall(getSearchURL(), response); Document licenseDetails = Jsoup.parse(EntityUtils.toString(response.getEntity())); allLicenses.add(parseLicense(licenseDetails, licenseNo)); } rows.clear(); // check for next page Element currentPage = page.select("#_ctl7_grdSearchResults tr.TablePager span").first(); if (getLog() != null) { getLog().log(Level.DEBUG, "Current page is: " + currentPage.text()); } Element pageLink = currentPage.nextElementSibling(); if (pageLink != null && pageLink.hasAttr("href")) { if (getLog() != null) { getLog().log(Level.DEBUG, "There are more results, getting the next page."); } String target = parseEventTarget(pageLink.attr("href")); entity = getResultPage(criteria, client, page, search, target, getSearchURL()); page = Jsoup.parse(EntityUtils.toString(entity)); rows = page.select(GRID_ROW_SELECTOR); } } } else { // search by license number (site supports only exact match) HttpEntity entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTTARGET", "_ctl7:cmdSearch" }, { "__EVENTARGUMENT", "" }, { "_ctl7:ddlbLicenseType", Util.defaultString(criteria.getLicenseType().getName()) }, { "_ctl7:rbtnSearch", "1" }, { "_ctl7:txtLicenseNumber", Util.defaultString(criteria.getIdentifier()) }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); if (page.select("span#lblFormTitle").text().equals("License Details")) { String prefLicenseNo = criteria.getIdentifier(); allLicenses.add(parseLicense(page, prefLicenseNo)); } } SearchResult<License> searchResult = new SearchResult<License>(); searchResult.setItems(allLicenses); return searchResult; }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Recursively collects text and tag statistics for {@code node}.
 *
 * <ul>
 *   <li>For an element, the {@code style} and {@code class} attributes are removed first.</li>
 *   <li>An element whose text looks like a {@code a>b>c} trail gets a fixed density of
 *       {@code -200} and is not descended into.</li>
 *   <li>Otherwise the counts of all child nodes are summed, the element's own tag is counted,
 *       the density (non-link text length per non-link tag) is computed, and the result is
 *       stored in {@code infoMap} under the element.</li>
 *   <li>A text node contributes its text length as a single leaf.</li>
 * </ul>
 *
 * @param node the node to analyse
 * @return the statistics for {@code node}; an empty {@code CountInfo} for any other node type
 */
protected CountInfo computeInfo(Node node) {
    if (node instanceof TextNode) {
        int length = ((TextNode) node).text().length();
        CountInfo leaf = new CountInfo();
        leaf.textCount = length;
        leaf.leafList.add(length);
        return leaf;
    }
    if (!(node instanceof Element)) {
        return new CountInfo();
    }

    node.removeAttr("style").removeAttr("class");
    Element tag = (Element) node;

    if (tag.text().matches(".{1,20}>.{1,10}>.{1,20}")) {
        CountInfo penalised = new CountInfo();
        penalised.density = -200;
        return penalised;
    }

    CountInfo total = new CountInfo();
    for (Node child : tag.childNodes()) {
        CountInfo part = computeInfo(child);
        total.textCount += part.textCount;
        total.linkTextCount += part.linkTextCount;
        total.tagCount += part.tagCount;
        total.linkTagCount += part.linkTagCount;
        total.leafList.addAll(part.leafList);
        total.densitySum += part.density;
        total.pCount += part.pCount;
    }
    total.tagCount++;

    String name = tag.tagName();
    boolean isLinkLike = "a".equals(name) || "img".equals(name);
    if (isLinkLike) {
        // Everything inside a link or image counts as link text.
        total.linkTextCount = total.textCount;
        total.linkTagCount++;
    } else if ("p".equals(name)) {
        total.pCount++;
    }

    int plainTextLength = total.textCount - total.linkTextCount;
    int plainTagCount = total.tagCount - total.linkTagCount;
    if (plainTextLength != 0 && plainTagCount != 0) {
        total.density = (plainTextLength + 0.0) / plainTagCount;
    } else {
        total.density = 0;
    }

    infoMap.put(tag, total);
    return total;
}
From source file:gui.InboxPanel.java
/**
 * Returns the text of {@code BodyTextPane} with the HTML markup stripped, obtained by parsing
 * the pane content as a body fragment and reading the text of the resulting body element.
 */
private String getTextBody() {
    return Jsoup.parseBodyFragment(BodyTextPane.getText()).body().text();
}
From source file:org.confab.PhpBB3Parser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table//from w w w . jav a2s.c om Elements forum_tables = root.select("ul[class=topiclist forums]"); assert !forum_tables.isEmpty() : root.html(); for (Element forum_table : forum_tables) { Elements els_li = forum_table.select("li.row"); assert !els_li.isEmpty(); for (Element el_li : els_li) { Forum new_forum = new Forum(parent); // Get the forum url Elements els_a = el_li.select("a.forumtitle"); Element el_a = els_a.first(); assert el_a != null; new_forum.url = el_a.attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text new_forum.title = el_a.text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element _el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get the description/message of this topic String el_description = el_a.parent().text(); if (el_description != null) { new_forum.description = el_description; } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } } Utilities.debug("end parseForums"); return ret; }
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Parse the url and get all the content * @param link_html the url to parse/*w ww .j av a 2s. c o m*/ * @return The content parsed */ public String cleanhtml(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) { link_html = link_html.substring(0, link_html.length() - 1); } if (link_html.substring(0, 5).equalsIgnoreCase("https")) { link_html = link_html.substring(8); } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) { link_html = link_html.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element link : links) { String str_check = link.attr("abs:href").toString(); if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) { anchortext = anchortext + link.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { 
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:neembuu.release1.externalImpl.linkhandler.YoutubeLinkHandlerProvider.java
private BasicLinkHandler.Builder linkYoutubeExtraction(TrialLinkHandler tlh, int retryCount) throws Exception { String url = tlh.getReferenceLinkString(); BasicLinkHandler.Builder linkHandlerBuilder = BasicLinkHandler.Builder.create(); try {/* w w w. j av a2 s . c om*/ DefaultHttpClient httpClient = NHttpClient.getNewInstance(); String requestUrl = "http://www.linkyoutube.com/watch/index.php?video=" + URLEncoder.encode(url, "UTF-8"); final String responseString = NHttpClientUtils.getData(requestUrl, httpClient); //Set the group name as the name of the video String nameOfVideo = getVideoName(url); String fileName = "text"; linkHandlerBuilder.setGroupName(nameOfVideo); long c_duration = -1; Document doc = Jsoup.parse(responseString); Elements elements = doc.select("#download_links a"); for (Element element : elements) { String singleUrl = element.attr("href"); fileName = element.text(); if (!singleUrl.equals("#")) { long length = NHttpClientUtils.calculateLength(singleUrl, httpClient); singleUrl = Utils.normalize(singleUrl); LOGGER.log(Level.INFO, "Normalized URL: " + singleUrl); if (length == 0) { length = NHttpClientUtils.calculateLength(singleUrl, httpClient); } //LOGGER.log(Level.INFO,"Length: " + length); if (length <= 0) { continue; /*skip this url*/ } BasicOnlineFile.Builder fileBuilder = linkHandlerBuilder.createFile(); try { // finding video/audio length String dur = StringUtils.stringBetweenTwoStrings(singleUrl, "dur=", "&"); long duration = (int) (Double.parseDouble(dur) * 1000); if (c_duration < 0) { c_duration = duration; } fileBuilder.putLongPropertyValue( PropertyProvider.LongProperty.MEDIA_DURATION_IN_MILLISECONDS, duration); LOGGER.log(Level.INFO, "dur=" + dur); } catch (NumberFormatException a) { // ignore } try { // finding the quality short name String type = fileName.substring(fileName.indexOf("(") + 1); type = type.substring(0, type.indexOf(")")); fileBuilder.putStringPropertyValue(PropertyProvider.StringProperty.VARIANT_DESCRIPTION, type); 
LOGGER.log(Level.INFO, "type=" + type); } catch (Exception a) { a.printStackTrace(); } fileName = nameOfVideo + " " + fileName; fileBuilder.setName(fileName).setUrl(singleUrl).setSize(length).next(); } } for (OnlineFile of : linkHandlerBuilder.getFiles()) { long dur = of.getPropertyProvider() .getLongPropertyValue(PropertyProvider.LongProperty.MEDIA_DURATION_IN_MILLISECONDS); if (dur < 0 && c_duration > 0 && of.getPropertyProvider() instanceof BasicPropertyProvider) { ((BasicPropertyProvider) of.getPropertyProvider()).putLongPropertyValue( PropertyProvider.LongProperty.MEDIA_DURATION_IN_MILLISECONDS, c_duration); } } } catch (Exception ex) { int retryLimit = ((YT_TLH) tlh).retryLimit; ex.printStackTrace(); LOGGER.log(Level.INFO, "retry no. = " + retryCount); if (retryCount > retryLimit) throw ex; return linkYoutubeExtraction(tlh, retryCount + 1); } return linkHandlerBuilder; }
From source file:gov.medicaid.screening.dao.impl.NursingLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param criteria The search criteria.// w ww.j a v a 2s . c o m * @param byName flag indicating it is a name search * @return the search result for licenses * * @throws URISyntaxException if an error occurs while building the URL. * @throws ClientProtocolException if client does not support protocol used. * @throws IOException if an error occurs while parsing response. * @throws ParseException if an error occurs while parsing response. * @throws ServiceException for any other problems encountered */ private SearchResult<License> getAllResults(NursingLicenseSearchCriteria criteria, boolean byName) throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager()); client.setRedirectStrategy(new LaxRedirectStrategy()); client.setCookieStore(loginAsPublicUser()); HttpGet getSearch = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getSearch); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); List<License> allLicenses = new ArrayList<License>(); // switch to search by name screen if (byName) { HttpEntity entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTTARGET", "_ctl7_rbtnSearch_1" }, { "__EVENTARGUMENT", "" }, { "_ctl7:ddlbLicenseType", "R" }, { "_ctl7:rbtnSearch", "2" }, { "_ctl7:txtCheckDigit", "" }, { "_ctl7:txtLicenseNumber", "" }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); entity = getResultPage(criteria, client, page, search, "_ctl7:cmdSearch", getSearchURL()); page = Jsoup.parse(EntityUtils.toString(entity)); // get the data grid entries if (page.select("table#_ctl7_grdSearchResults").size() < 1) { throw 
new ParsingException(ErrorCode.MITA50002.getDesc()); } Elements rows = page.select(GRID_ROW_SELECTOR); while (rows.size() > 0) { for (Element row : rows) { String url = row.select("a").first().attr("href"); String licenseNo = row.select("td:eq(4)").text(); HttpGet getDetail = new HttpGet(Util.replaceLastURLPart(getSearchURL(), url)); response = client.execute(getDetail); verifyAndAuditCall(getSearchURL(), response); Document licenseDetails = Jsoup.parse(EntityUtils.toString(response.getEntity())); allLicenses.add(parseLicense(licenseDetails, licenseNo.substring(0, 1))); } rows.clear(); // check for next page Element currentPage = page.select("#_ctl7_grdSearchResults tr.TablePager span").first(); if (getLog() != null) { getLog().log(Level.DEBUG, "Current page is: " + currentPage.text()); } Element pageLink = currentPage.nextElementSibling(); if (pageLink != null && pageLink.hasAttr("href")) { if (getLog() != null) { getLog().log(Level.DEBUG, "There are more results, getting the next page."); } String target = parseEventTarget(pageLink.attr("href")); entity = getResultPage(criteria, client, page, search, target, getSearchURL()); page = Jsoup.parse(EntityUtils.toString(entity)); rows = page.select(GRID_ROW_SELECTOR); } } } else { // search by license number (site supports only exact match) HttpEntity entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTTARGET", "_ctl7:cmdSearch" }, { "__EVENTARGUMENT", "" }, { "_ctl7:ddlbLicenseType", Util.defaultString(criteria.getLicenseType().getName()) }, { "_ctl7:rbtnSearch", "1" }, { "_ctl7:txtCheckDigit", Util.defaultString(criteria.getCheckDigit()) }, { "_ctl7:txtLicenseNumber", Util.defaultString(criteria.getIdentifier()) }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); if (page.select("span#lblFormTitle").text().equals("License Details")) { String prefLicenseType = criteria.getLicenseType().getName(); 
allLicenses.add(parseLicense(page, prefLicenseType)); } } SearchResult<License> searchResult = new SearchResult<License>(); searchResult.setItems(allLicenses); return searchResult; }
From source file:me.vertretungsplan.parser.UntisInfoParser.java
private void parseSubstitutionDays(SubstitutionSchedule v, String lastChange, Document doc, String klasse) throws JSONException, CredentialInvalidException { Elements days = doc.select("#vertretung > p > b, #vertretung > b, p:has(a[href^=#]) > b"); if (days.size() > 0) { for (Element dayElem : days) { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); day.setLastChangeString(lastChange); day.setLastChange(ParserUtils.parseDateTime(lastChange)); String date = dayElem.text(); day.setDateString(date);//from w ww .ja v a2 s. c o m day.setDate(ParserUtils.parseDate(date)); Element next; if (dayElem.parent().tagName().equals("p")) { next = dayElem.parent().nextElementSibling().nextElementSibling(); } else { next = dayElem.parent().select("p").first().nextElementSibling(); } parseDay(day, next, v, klasse); } } else if (doc.select("tr:has(td[align=center]):gt(0)").size() > 0) { parseSubstitutionTable(v, null, doc); v.setLastChangeString(lastChange); v.setLastChange(ParserUtils.parseDateTime(lastChange)); } }
From source file:me.vertretungsplan.parser.DaVinciParser.java
/**
 * Parses one DaVinci page into {@code schedule}.
 *
 * <p>The page title ({@code h1.list-table-caption}, or the first {@code h2} as a fallback)
 * is either a date or a class name. If no date can be found in the title or in the element
 * following it, the page is treated as covering multiple days and no single day object is
 * created. Text found in the caption row and in {@code .callout} elements is added to the
 * day as messages, the last-change timestamp is read from the copyright row (or the first
 * {@code h1}), and the substitution table is passed to {@code parseDaVinciTable}.
 *
 * @param doc the page (or page fragment) to parse
 * @param schedule the schedule receiving the parsed day and last-change information
 * @throws IOException declared for callers; see {@code parseDaVinciTable}
 */
@NotNull
void parsePage(Element doc, SubstitutionSchedule schedule) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    Element titleElem = doc.select("h1.list-table-caption").first();
    if (titleElem == null) {
        // DaVinci 5
        titleElem = doc.select("h2").first();
    }
    String title = titleElem.text();

    String klasse = null;
    // title can either be date or class
    Pattern datePattern = Pattern.compile("\\d+\\.\\d+.\\d{4}");
    Matcher dateMatcher = datePattern.matcher(title);
    if (dateMatcher.find()) {
        day.setDateString(dateMatcher.group());
        day.setDate(ParserUtils.parseDate(dateMatcher.group()));
    } else {
        klasse = title;
        // The title may be the last element of its parent; treat a missing sibling like a
        // sibling without a date instead of failing with a NullPointerException.
        Element afterTitle = titleElem.nextElementSibling();
        String nextText = afterTitle != null ? afterTitle.text() : "";
        if (nextText.matches("\\w+ \\d+\\.\\d+.\\d{4}")) {
            day.setDateString(nextText);
            day.setDate(ParserUtils.parseDate(nextText));
        } else {
            // could not find date, must be multiple days
            day = null;
        }
    }

    for (Element p : doc.select(".row:has(h1.list-table-caption) p")) {
        for (TextNode node : p.textNodes()) {
            if (!node.text().trim().isEmpty() && day != null) {
                day.addMessage(node.text().trim());
            }
        }
    }
    for (Element message : doc.select(".callout")) {
        for (TextNode node : message.textNodes()) {
            // day is null on multi-day pages; this loop previously dereferenced it without
            // the check the loop above already had.
            if (!node.text().trim().isEmpty() && day != null) {
                day.addMessage(node.text().trim());
            }
        }
    }

    Element lastChangeElem = doc.select(".row.copyright div").first();
    if (lastChangeElem == null) {
        // DaVinci 5
        lastChangeElem = doc.select("h1").first();
    }
    String lastChange = lastChangeElem.ownText();

    // Two timestamp layouts are recognised; the first one that matches wins.
    LocalDateTime lastChangeTime = null;
    Matcher matcher = Pattern.compile("(\\d{2}-\\d{2}-\\d{4} \\d{2}:\\d{2}) \\|").matcher(lastChange);
    if (matcher.find()) {
        lastChangeTime = DateTimeFormat.forPattern("dd-MM-yyyy HH:mm")
                .parseLocalDateTime(matcher.group(1));
    } else {
        Matcher matcher2 = Pattern.compile("(\\d{2}.\\d{2}.\\d{4} \\| \\d+:\\d{2})").matcher(lastChange);
        if (matcher2.find()) {
            lastChangeTime = DateTimeFormat.forPattern("dd.MM.yyyy | HH:mm")
                    .parseLocalDateTime(matcher2.group(1));
        }
    }
    if (lastChangeTime != null) {
        // Without a single day the timestamp belongs to the schedule as a whole.
        if (day != null) {
            day.setLastChange(lastChangeTime);
        } else {
            schedule.setLastChange(lastChangeTime);
        }
    }

    if (doc.select(".list-table").size() > 0
            || !doc.select(".callout").text().contains("Es liegen keine")) {
        Element table = doc.select(".list-table, table").first();
        parseDaVinciTable(table, schedule, klasse, day, colorProvider);
    }

    if (day != null) {
        schedule.addDay(day);
    }
}