List of usage examples for the method org.jsoup.nodes.Element.text()
public String text()
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Element-based convenience form of the String {@code noText} check.
 *
 * @param ele element whose {@code text()} value is examined
 * @return the result of {@code noText(String)} applied to that text
 */
private boolean noText(Element ele) {
    String elementText = ele.text();
    return noText(elementText);
}
From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java
/**
 * Collects the text of every "RTF"-like fragment embedded in the HTML document.
 *
 * <p>Each element carrying {@code name="generator"} marks the start of one fragment; the
 * fragment's text is the concatenated {@code text()} of the element siblings that follow it,
 * up to (but excluding) the next {@code meta} element or the end of the sibling list.
 *
 * @param htmlDoc the parsed HTML document
 * @return the fragment texts, omitting those that are empty after
 *         {@code removeNonBreakingSpacesAndTrim}
 */
private ImmutableList<String> druckSachenContents(Document htmlDoc) {
    /*
     * In this way we can identify the bits of "RTF" like text inserted into the overall HTML.
     * JSoup cleans up the broken HTML removing the xml declaration and inserted html roots
     * that ALLRIS manages to put in.
     */
    Elements contentMetaElements = htmlDoc.getElementsByAttributeValue("name", "generator");
    ImmutableList.Builder<String> listBuilder = ImmutableList.builder();

    /*
     * Iterate over our candidates. Sometimes there are several.
     */
    for (Element contentMetaElement : contentMetaElements) {
        StringBuilder contentAsTextBuilder = new StringBuilder();
        Element nextSibling = contentMetaElement.nextElementSibling();

        /*
         * In the cleaned up HTML DOM returned by JSoup the "RTF" content is
         * rendered as siblings of the meta node (JSoup having removed the html, head, body
         * elements which should never have been there in the first place).
         *
         * The stop condition must compare the tag NAME: Element.tag() returns a Tag object,
         * and Tag.equals(String) is always false, so the walk would otherwise run past the
         * next meta element and merge several fragments into one.
         */
        while (nextSibling != null && !"meta".equals(nextSibling.tagName())) {
            contentAsTextBuilder.append(nextSibling.text());
            nextSibling = nextSibling.nextElementSibling();
        }

        /*
         * Only carry over non-empty content.
         */
        String contentAsText = contentAsTextBuilder.toString();
        if (!removeNonBreakingSpacesAndTrim(contentAsText).isEmpty()) {
            listBuilder.add(contentAsText);
        }
    }
    return listBuilder.build();
}
From source file:me.vertretungsplan.parser.UntisInfoParser.java
/**
 * Logs in, reads the list of weeks from the navbar page and parses one schedule page per week
 * (or, in single-class / timetable mode, one page per week and class) into a
 * {@link SubstitutionSchedule}.
 *
 * <p>The "last change" text is whatever follows the {@code "Stand:"} marker in the navbar's
 * {@code .description} element; if the marker is absent there, {@code /frames/title.htm} is
 * tried, and if that fails too an empty string is used.
 *
 * @return the populated schedule
 * @throws IOException if the week selector is missing from the navbar, or if no week could be
 *         loaded in per-week mode (the last {@code HttpResponseException} is rethrown)
 * @throws JSONException propagated from the calls made here
 * @throws CredentialInvalidException propagated from the calls made here
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    // Strip non-breaking spaces only (entity and raw character). Removing ordinary spaces from
    // raw HTML would fuse tag names with their attributes and "select[name=week]" could never
    // match.
    // NOTE(review): the original literal appears as a plain space in this copy; it is assumed
    // to have been "&nbsp;" / U+00A0 before extraction - confirm against upstream.
    Document navbarDoc = Jsoup.parse(getNavbarDoc().replace("&nbsp;", "").replace("\u00a0", ""));
    Element select = navbarDoc.select("select[name=week]").first();
    if (select == null) {
        throw new IOException("Week selector (select[name=week]) not found in navbar document");
    }

    SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData);

    String lastChange = extractLastChange(navbarDoc.select(".description").text());
    if (lastChange == null) {
        try {
            String infoHtml = httpGet(baseUrl + "/frames/title.htm", data.optString(PARAM_ENCODING, null));
            lastChange = extractLastChange(Jsoup.parse(infoHtml).select(".description").text());
        } catch (Exception ignored) {
            // Best effort: the schedule is still usable without a last-change stamp.
        }
        if (lastChange == null) {
            lastChange = "";
        }
    }

    // Loop-invariant: one page per class instead of one page per week.
    boolean perClassPages = data.optBoolean(PARAM_SINGLE_CLASSES,
            data.optBoolean("single_classes", false)) // backwards compatibility
            || data.optString(PARAM_SCHEDULE_TYPE, "substitution").equals("timetable");

    int successfulWeeks = 0;
    HttpResponseException lastException = null;
    for (Element option : select.children()) {
        String week = option.attr("value");
        String weekName = option.text();
        if (perClassPages) {
            int classNumber = 1;
            for (String klasse : getAllClasses()) {
                String url = getScheduleUrl(week, classNumber, data);
                try {
                    parsePage(v, lastChange, klasse, url, weekName);
                } catch (HttpResponseException e) {
                    if (e.getStatusCode() != 500) {
                        throw e;
                    }
                    // status 500 occurs in Hannover_MMBS; skip this class
                }
                classNumber++;
            }
            successfulWeeks++;
        } else {
            String url = getScheduleUrl(week, 0, data);
            try {
                parsePage(v, lastChange, null, url, weekName);
                successfulWeeks++;
            } catch (HttpResponseException e) {
                // Remember the failure; it is only fatal if no week at all succeeds.
                lastException = e;
            }
        }
    }
    if (successfulWeeks == 0 && lastException != null) {
        throw lastException;
    }

    v.setClasses(getAllClasses());
    v.setTeachers(getAllTeachers());
    v.setWebsite(baseUrl + "/default.htm");
    return v;
}

/**
 * Returns the trimmed text after the first {@code "Stand:"} marker, or {@code null} when the
 * marker does not occur (so that callers can fall back instead of slicing at a bogus offset).
 */
private static String extractLastChange(String info) {
    final String marker = "Stand:";
    int markerPos = info.indexOf(marker);
    if (markerPos < 0) {
        return null;
    }
    return info.substring(markerPos + marker.length()).trim();
}
From source file:gov.medicaid.screening.dao.impl.DieteticsAndNutritionPracticeLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param identifier The value to be searched. * @return the search result for licenses * @throws URISyntaxException When an error occurs while building the URL. * @throws ClientProtocolException When client does not support protocol used. * @throws IOException When an error occurs while parsing response. * @throws ParseException When an error occurs while parsing response. * @throws PersistenceException for database related errors * @throws ServiceException for any other errors *//*from w w w . j a va 2 s .com*/ private SearchResult<License> getAllResults(String identifier) throws URISyntaxException, ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(); URIBuilder builder = new URIBuilder(getSearchURL()); String hostId = builder.build().toString(); HttpGet httpget = new HttpGet(builder.build()); HttpResponse landing = client.execute(httpget); Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity())); HttpPost httppost = new HttpPost(builder.build()); HttpEntity entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); // licenses list List<License> licenseList = new ArrayList<License>(); while (entity != null) { String result = EntityUtils.toString(entity); document = Jsoup.parse(result); Elements trs = document.select(GRID_ROW_SELECTOR); if (trs != null) { for (Element element : trs) { licenseList.add(parseLicense(element.children())); } } // done, check if there are additional results entity = null; Elements elements = document.getElementsByTag("a"); for (Element element : elements) { if (element.text().equals("Next >>")) { entity = postForm(hostId, client, httppost, 
new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); break; } } } SearchResult<License> result = new SearchResult<License>(); result.setItems(licenseList); return result; }
From source file:org.confab.VBulletinParser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table//from w ww . ja va2 s . c o m Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr"); assert !forum_table.isEmpty(); for (Element el_tr : forum_table) { Forum new_forum = new Forum(parent); // Get the table data for this row Elements el_tds = el_tr.select("td"); assert !el_tds.isEmpty() : el_tr.html(); // xbox360achievements has a lot of subforums and puts these in their own table // The <a>'s are picked up as children of the parent <td> so don't parse this sub- // tables row's seperatly if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) { //Utilities.debug("tr doesn't seem to have anything we want, skipping."); continue; } // Get the title URL Elements els_a = el_tds.get(1).select("a"); assert !els_a.isEmpty() : el_tds.html(); new_forum.url = els_a.first().attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text assert els_a.first() != null; new_forum.title = els_a.first().text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get num viewing the current forum Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first(); if (el_viewing != null) { new_forum.numViewing = el_viewing.text(); } else { new_forum.numViewing = "0"; } Utilities.debug("new_forum.numViewing : " + new_forum.numViewing); // Get the description/message of this topic Element el_description = 
el_tds.get(1).select("div.smallfont").first(); if (el_description != null) { new_forum.description = el_description.text(); } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } Utilities.debug("end parseForums"); return ret; }
From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java
/**
 * Checks whether the text of {@code element} occurs inside the text of {@code prevElement}.
 *
 * @param prevElement element whose text is searched
 * @param element     element whose text is looked for
 * @return {@code true} only if both elements have text and the former's text contains the
 *         latter's
 */
private boolean prevElementContainsElementText(Element prevElement, Element element) {
    if (!prevElement.hasText() || !element.hasText()) {
        return false;
    }
    String haystack = prevElement.text();
    return haystack.contains(element.text());
}
From source file:net.groupbuy.entity.Article.java
/**
 * Splits the article content into pages.
 *
 * <p>Empty content yields a single empty page. Content that contains
 * {@code PAGE_BREAK_SEPARATOR} is split on it (via {@link String#split(String)}). Otherwise
 * the content is parsed as HTML and the body's top-level nodes are accumulated into a page
 * until the collected text length reaches {@code PAGE_CONTENT_LENGTH}; text nodes are cut at
 * {@code PARAGRAPH_SEPARATOR_PATTERN} matches, each piece keeping its separator.
 *
 * @return the page contents in order
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    }

    List<String> pages = new ArrayList<String>();
    List<Node> topLevelNodes = Jsoup.parse(content).body().childNodes();
    if (topLevelNodes != null) {
        StringBuilder pageHtml = new StringBuilder();
        int pageTextLength = 0;

        for (Node node : topLevelNodes) {
            if (node instanceof Element) {
                Element element = (Element) node;
                pageHtml.append(element.outerHtml());
                pageTextLength += element.text().length();
                if (pageTextLength >= PAGE_CONTENT_LENGTH) {
                    pages.add(pageHtml.toString());
                    pageTextLength = 0;
                    pageHtml.setLength(0);
                }
            } else if (node instanceof TextNode) {
                String text = ((TextNode) node).text();
                Matcher separators = PARAGRAPH_SEPARATOR_PATTERN.matcher(text);
                for (String paragraph : PARAGRAPH_SEPARATOR_PATTERN.split(text)) {
                    // Re-attach the separator that followed this paragraph, if there was one.
                    String piece = separators.find() ? paragraph + separators.group() : paragraph;
                    pageHtml.append(piece);
                    pageTextLength += piece.length();
                    if (pageTextLength >= PAGE_CONTENT_LENGTH) {
                        pages.add(pageHtml.toString());
                        pageTextLength = 0;
                        pageHtml.setLength(0);
                    }
                }
            }
        }

        // Whatever is left over forms the last page.
        String remainder = pageHtml.toString();
        if (StringUtils.isNotEmpty(remainder)) {
            pages.add(remainder);
        }
    }
    return pages.toArray(new String[pages.size()]);
}
From source file:gov.medicaid.screening.dao.impl.MedicalPracticeLicenseDAOBean.java
/** * Searches for the available specialty options matching the criteria. * * @param criteria the criteria for specialty search * @param document the current page// w w w.j av a2 s. c om * @return the matched code * @throws ServiceException if the code provided is not present */ private String matchSpecialtyCode(MedicalPracticeLicenseSearchCriteria criteria, Document document) throws ServiceException { Elements specialtyOptions = document.select("select#_ctl7_ddlbSpecialty option"); Specialty specialty = criteria.getSpecialty(); String code = null; boolean found = false; for (Element option : specialtyOptions) { code = option.attr("value"); if (Util.isNotBlank(specialty.getName())) { // match the name if (specialty.getName().equalsIgnoreCase(option.text())) { if (specialty.getCode() > 0 && Integer.parseInt(code) != specialty.getCode()) { throw new ServiceException(ErrorCode.MITA10007.getDesc()); } found = true; break; } } else { // match only the code if (Integer.parseInt(code) == specialty.getCode()) { found = true; break; } } } if (!found) { throw new ServiceException(ErrorCode.MITA10007.getDesc()); } return code; }
From source file:us.colloquy.index.IndexHandler.java
public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) { ///Documents/Tolstoy/diaries Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) {//from w w w.j a v a2 s . c om stream.forEach(results::add); // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------"); } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + 
url.replaceAll("#.*","")); } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } // System.out.println("Size: " + uriList.size()); // for (DocumentPointer pointer : uriList) // { // //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); // } }
From source file:us.colloquy.index.IndexHandler.java
public void getURIForAllDiaries(List<DocumentPointer> documentPointers, Path pathToLetters) { List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) {//from ww w .j a v a 2 s.c o m stream.forEach(results::add); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } // System.out.println("========================== " + res.toString() + " =========================="); boolean startPrinting = false; boolean newFile = true; for (Element element : doc.getElementsByTag("navPoint")) { //get nav label and content Element navLabelElement = element.select("navLabel").first(); Element srsElement = element.select("content").first(); String navLabel = ""; String srs = ""; if (navLabelElement != null) { navLabel = navLabelElement.text().replaceAll("\\*", "").trim(); } if (srsElement != null) { srs = srsElement.attr("src"); } if ("??".matches(navLabel)) { startPrinting = false; // System.out.println("----------------- end of file pointer ---------------"); } if (StringUtils.isNotEmpty(navLabel) && navLabel.matches("??.*|?? ?.*") && newFile) { newFile = false; startPrinting = true; } if (startPrinting && !navLabel .matches("(|??? 
??)")) { // System.out.println("----------------- file pointer ---------------"); // System.out.println(navLabel + "\t" + srs); DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + srs.replaceAll("#.*", ""), title); documentPointers.add(documentPointer); } } // System.out.println("========================== END OF FILE =========================="); } } catch (Exception e) { e.printStackTrace(); } System.out.println("Size: " + documentPointers.size()); // for (DocumentPointer pointer : documentPointers) // { //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); }