List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Method to get all the elements with a specific html feature (not used in SWebRank's current version) * @param link_html the url to check/*from w w w . j a v a2s .co m*/ * @param dir the directory to save the file * @return a list with the text of all the elements */ public List<String> getbold(String link_html, String dir) { List<String> SEwords = new ArrayList<String>(); try { //link_html="http://www.themismavridis.com/"; Document doc = Jsoup.connect(link_html).get(); //---------to select the rest of the terms Elements bold = doc.select("em"); //bold=bold.select("b"); for (Element btext : bold) { String stringtosplit = btext.text().toString().toString(); if (!(stringtosplit == null) && (!(stringtosplit.equalsIgnoreCase("")))) { stringtosplit = stringtosplit.replaceAll("[\\W&&[^\\s]]", ""); if (!(stringtosplit == null) && (!(stringtosplit.equalsIgnoreCase("")))) { String[] tokenizedTerms = stringtosplit.split("\\W+"); for (int j = 0; j < tokenizedTerms.length; j++) { if (!(tokenizedTerms[j] == null) && (!(tokenizedTerms[j].equalsIgnoreCase("")))) { SEwords.add(tokenizedTerms[j]); } } } } } File file_thelist = new File(dir + "Javawords.txt"); FileUtils.writeLines(file_thelist, SEwords); return SEwords; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); System.out.print("can not create the content file for SEwords"); return SEwords; } }
From source file:gov.medicaid.screening.dao.impl.SocialWorkLicenseDAOBean.java
/** * Retrieves all results from the source site. * * @param searchCriteria the search criteria. * @return the providers matched//from w ww. j a v a 2 s . c om * @throws URISyntaxException if the URL could not be correctly constructed * @throws IOException for any I/O related errors * @throws ServiceException for any other errors encountered */ private SearchResult<License> getAllResults(SocialWorkCriteria searchCriteria) throws URISyntaxException, IOException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager()); client.setRedirectStrategy(new LaxRedirectStrategy()); HttpGet getSearchPage = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getSearchPage); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); String licenseNo = ""; if (searchCriteria instanceof SocialWorkLicenseSearchByLicenseNumberCriteria) { licenseNo = "" + ((SocialWorkLicenseSearchByLicenseNumberCriteria) searchCriteria).getLicenseNumber(); } String level = "none"; if (searchCriteria.getLevel() != null) { level = Util.defaultString(searchCriteria.getLevel().getName()); } HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); HttpEntity entity = postForm(getSearchURL(), client, search, buildParams(searchCriteria, page, licenseNo, level, null), true); page = Jsoup.parse(EntityUtils.toString(entity)); List<License> allLicenses = new ArrayList<License>(); // check if detail page (single match) if (page.select("#lblFormTitle").text().equals("License Details")) { allLicenses.add(parseLicenseDetail(page)); } else { Elements rows = page.select(RESULT_ROWS_SELECTOR); while (rows.size() > 0) { for (Element row : rows) { License license = parseLicense(row.children()); if (license != null) { allLicenses.add(license); } } rows.clear(); // check for next page Element currentPage = page.select("#_ctl7_grdSearchResults tr.TablePager 
span").first(); getLog().log(Level.DEBUG, "Current page is: " + currentPage.text()); Element pageLink = currentPage.nextElementSibling(); if (pageLink != null && pageLink.hasAttr("href")) { getLog().log(Level.DEBUG, "There are more results, getting the next page."); String target = parseEventTarget(pageLink.attr("href")); entity = postForm(getSearchURL(), client, search, buildParams(searchCriteria, page, licenseNo, level, target), true); page = Jsoup.parse(EntityUtils.toString(entity)); rows = page.select(RESULT_ROWS_SELECTOR); } } } SearchResult<License> results = new SearchResult<License>(); results.setItems(allLicenses); return results; }
From source file:net.GoTicketing.GoTicketing.java
/** * ??/*w w w. java2 s.c o m*/ * @return ? */ private int praseTicketingResultPage() { Document doc = Jsoup.parse(FinishTicketingPageHTML); Elements fonts = doc.getElementsByTag("font"); if (fonts != null) { // ? for (Element font : fonts) { if (font.text().equals(", ?")) return ROCID_WRONG; } } Elements strongs = doc.getElementsByTag("strong"); if (strongs != null) { // ? for (Element strong : strongs) { if (strong.text().equals("")) return RAND_OR_TIMEOUT_FAIL; } // ? for (Element strong : strongs) { if (strong.text().contains( "????")) return TRAIN_STATION_WRONG; } // ?? for (Element strong : strongs) { if (strong.text().equals("??") || strong.text().equals("????")) return TRAIN_NO_SEAT; } // ?? for (Element strong : strongs) { if (strong.text().contains("?(?)?")) return TICKETING_TO_LATE; } // ? for (Element strong : strongs) { if (strong.text().contains("")) return TICKETING_TO_EARLY; } // ? for (Element strong : strongs) { if (strong.text().contains("??")) return TRAIN_NUM_WRONG; } // ?? for (Element strong : strongs) { if (strong.text().equals("")) { String orderCode = doc.getElementById("spanOrderCode").text(); ticket.setOrderCode(orderCode); return TICKETING_SUCCESS; } } } // ?? return UNKNOW_PRASE_RESULT; }
From source file:me.vertretungsplan.parser.UntisInfoParser.java
private void parseTimetable(SubstitutionSchedule v, String lastChange, Document doc, String klasse, String weekName) throws JSONException { v.setLastChange(ParserUtils.parseDateTime(lastChange)); LocalDate weekStart = DateTimeFormat.forPattern("d.M.yyyy").parseLocalDate(weekName); Element table = doc.select("table").first(); List<SubstitutionScheduleDay> days = new ArrayList<>(); for (int i = 0; i < table.select("tr").first().select("td:gt(0)").size(); i++) { LocalDate date = weekStart.plusDays(i); SubstitutionScheduleDay day = null; for (SubstitutionScheduleDay d : v.getDays()) { if (d.getDate().equals(date)) { day = d;/* ww w .ja v a 2 s . c om*/ break; } } if (day == null) { day = new SubstitutionScheduleDay(); day.setDate(date); v.addDay(day); } days.add(day); } Elements rows = table.select("> tbody > tr:gt(0)"); Map<Integer, String> lessons = new HashMap<>(); int i = 0; int lessonCounter = 1; while (i < rows.size()) { Element cell = rows.get(i).select("td").first(); String lessonName = cell.text().trim(); if (lessonName.length() > 3) { lessonName = String.valueOf(lessonCounter); } lessons.put(i, lessonName); i += getRowspan(cell); lessonCounter += 1; } // counts the number of columns that will be missing from each row due to a cell with colspan Map<Integer, Integer> columnsToSkip = new HashMap<>(); for (int j = 0; j < rows.size(); j++) { columnsToSkip.put(j, 0); } for (int col = 1; col < days.size(); col++) { int row = 0; while (row < rows.size()) { Element cell = rows.get(row).select("> td").get(col - columnsToSkip.get(row)); String lesson = getTimetableLesson(cell, row, lessons); days.get(col - 1).addAllSubstitutions( parseTimetableCell(cell, lesson, klasse, data.getJSONArray("cellFormat"), colorProvider)); for (int skippedRow = row + 1; skippedRow < row + getRowspan(cell); skippedRow++) { columnsToSkip.put(skippedRow, columnsToSkip.get(skippedRow) + 1); } row += getRowspan(cell); } } }
From source file:com.mythesis.userbehaviouranalysis.WebParser.java
/** * Parse the url and get all the content * @param link the url to parse//www.jav a 2 s .c o m * @return The content parsed */ private String cleanhtml(String link) { try { Document doc = Jsoup.connect(link).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) { link = link.substring(0, link.length() - 1); } if (link.substring(0, 5).equalsIgnoreCase("https")) { link = link.substring(8); } else if (link.substring(0, 4).equalsIgnoreCase("http")) { link = link.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element el : links) { String str_check = el.attr("abs:href"); if (el.attr("abs:href").contains(link) && el.text().length() > 1) { anchortext = anchortext + el.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").contains(link)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } if (medi.getElementsByTag("img").attr("src").startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:gov.medicaid.screening.dao.impl.OIGDAOBean.java
/**
 * Performs a search for all exclusion types.
 *
 * <p>Fetches the exclusion page and turns the text of every link inside the
 * {@code table#CountList} body into an {@code ExclusionType}.
 *
 * @return the exclusion types found on the page, in document order
 *
 * @throws URISyntaxException if an error occurs while building the URL.
 * @throws ClientProtocolException if client does not support protocol used.
 * @throws IOException if an error occurs while parsing response.
 * @throws ServiceException for any other problems encountered
 *
 * @deprecated not updated in new site layout.
 */
@Deprecated
private List<ExclusionType> getAllExclusions()
        throws URISyntaxException, ClientProtocolException, IOException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        HttpGet getSearch = new HttpGet(new URIBuilder(getExclusionURL()).build());
        HttpResponse response = client.execute(getSearch);
        verifyAndAuditCall(getExclusionURL(), response);

        Document page = Jsoup.parse(EntityUtils.toString(response.getEntity()));

        List<ExclusionType> allExclusions = new ArrayList<ExclusionType>();
        Elements links = page.select("table#CountList tbody tr td a");
        for (Element link : links) {
            ExclusionType exclusionType = new ExclusionType();
            exclusionType.setName(link.text());
            allExclusions.add(exclusionType);
        }
        return allExclusions;
    } finally {
        // The client is created here and used for this one call only, so release its
        // connections whether or not the request succeeded.
        client.getConnectionManager().shutdown();
    }
}
From source file:qhindex.controller.SearchAuthorWorksController.java
/**
 * Builds an {@code AuthorWork} from one row of an author's list of works.
 *
 * <p>The title and work URL come from the first {@code td.gsc_a_t > a} link. When at least two
 * {@code td.gsc_a_t > div} elements exist, the first supplies the authors and the second the
 * publisher text. When a {@code td.gsc_a_c > a} link exists, its target becomes the citations
 * URL and its text the citation count (0 if the text is empty or cannot be parsed).
 *
 * @param authorWorkElements the row element describing a single work
 * @return the populated work
 * @throws IOException declared for the call to {@code handlePublicationMedium}
 */
private AuthorWork extractAuthorWorkData(Element authorWorkElements) throws IOException {
    AuthorWork work = new AuthorWork();

    Element titleLink = authorWorkElements.select("td.gsc_a_t > a").get(0);
    work.setTitle(titleLink.text());
    String workUrl = titleLink.attr("href");

    Elements details = authorWorkElements.select("td.gsc_a_t > div");
    if (details.size() > 1) {
        String publisherText = details.get(1).text();
        work.setPublisherInGoogle(publisherText);
        work.setPublisher(handlePublicationMedium(publisherText, workUrl));
        work.setAuthors(details.get(0).text());
    }

    Elements citationLinks = authorWorkElements.select("td.gsc_a_c > a");
    if (!citationLinks.isEmpty()) {
        Element citationLink = citationLinks.get(0);
        work.setCitationsUrl(citationLink.attr("href"));

        int citationCount = 0;
        try {
            String citationText = citationLink.text();
            if (!citationText.isEmpty()) {
                citationCount = Integer.parseInt(citationText);
            }
        } catch (Exception ex) {
            // an unparsable count is reported but leaves the count at 0
            Debug.print("Exception while extracting author work data: " + ex.toString());
            resultsMsg += "Exception while extracting author work data.\n";
        }
        work.setCitations(citationCount);
    }

    return work;
}
From source file:org.manalith.ircbot.plugin.uriinfo.UriInfoPlugin.java
private String getSiteSpecificTitle(String uri, Document document) { if (uri.startsWith("http://mlbpark.donga.com/bbs/view.php?") || uri.startsWith("http://mlbpark.donga.com/mbs/articleV.php?")) { // MLB Park article Element element = document.getElementsByClass("D14").first(); if (element != null) { try { return element.child(0).text(); } catch (IndexOutOfBoundsException e) { return null; }/*from ww w .ja v a2 s . c o m*/ } } else if (uri.startsWith("http://www.slrclub.com/bbs/vx2.php?")) { Element element = document.getElementsByClass("sbj").first(); if (element != null) { return element.text(); } } return null; }
From source file:gov.medicaid.screening.dao.impl.ChiropracticLicenseDAOBean.java
/** * Parses the Chiropractic license details page. * // ww w . j av a 2 s .co m * @param page * the details page * @param licenseType * if user has multiple licenses, this one will be used * @return the parsed license details * @throws ParsingException * if the expected tags were not found */ private License parseLicense(Document page, String licenseType) throws ParsingException { License license = new License(); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); User user = new User(); profile.setUser(user); Elements tables = page.select("table"); for (Element cell : tables.get(0).select("td")) { if (cell.text().equals("First Name")) { user.setFirstName(cell.nextElementSibling().text()); } else if (cell.text().equals("Middle Name")) { user.setMiddleName(cell.nextElementSibling().text()); } else if (cell.text().equals("Last Name")) { user.setLastName(cell.nextElementSibling().text()); } else if (cell.text().equals("Gender")) { String gender = cell.nextElementSibling().text(); if (Util.isNotBlank(gender)) { if ("M".equalsIgnoreCase(gender)) { profile.setSex(Sex.MALE); } else { profile.setSex(Sex.FEMALE); } } } } List<Address> addresses = new ArrayList<Address>(); Address address = new Address(); addresses.add(address); profile.setAddresses(addresses); StringBuffer locBuffer = new StringBuffer(); for (Element cell : tables.get(1).select("td")) { if (cell.text().equals("Address Line1")) { locBuffer.insert(0, cell.nextElementSibling().text() + " "); } else if (cell.text().equals("Address Line2")) { locBuffer.append(cell.nextElementSibling().text()); } else if (cell.text().equals("City")) { address.setCity(cell.nextElementSibling().text()); } else if (cell.text().equals("State")) { address.setState(cell.nextElementSibling().text()); } else if (cell.text().equals("ZIP")) { address.setZipcode(cell.nextElementSibling().text()); } else if (cell.text().equals("Phone Number")) { profile.setContactPhoneNumber(cell.nextElementSibling().text()); } } 
address.setLocation(locBuffer.toString().trim()); for (Element row : tables.get(2).select("tr")) { String lType = row.select("td:eq(0)").text(); if (licenseType != null && !lType.startsWith(licenseType)) { // user has multiple licenses, the results will show this user twice (search by name) continue; } LicenseType type = new LicenseType(); type.setName(row.select("td:eq(0)").text()); license.setType(type); license.setLicenseNumber(row.select("td:eq(1)").text()); LicenseStatus status = new LicenseStatus(); status.setName(row.select("td:eq(2)").text()); license.setStatus(status); String issueDate = row.select("td:eq(3)").text(); if (Util.isNotBlank(issueDate)) { license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT)); } String renewalDate = row.select("td:eq(4)").text(); if (Util.isNotBlank(renewalDate)) { license.setRenewalDate(parseDate(renewalDate, DATE_FORMAT)); } String expirationDate = row.select("td:eq(5)").text(); if (Util.isNotBlank(expirationDate)) { license.setExpireDate(parseDate(expirationDate, DATE_FORMAT)); } } return license; }
From source file:com.johan.vertretungsplan.parser.UntisCommonParser.java
/** * Parst eine Vertretungstabelle eines Untis-Vertretungsplans * /*from ww w. j av a2 s.c om*/ * @param table * das <code>table</code>-Element des HTML-Dokuments, das geparst * werden soll * @param data * Daten von der Schule (aus <code>Schule.getData()</code>) * @param tag * der {@link VertretungsplanTag} in dem die Vertretungen * gespeichert werden sollen * @throws JSONException */ protected void parseVertretungsplanTable(Element table, JSONObject data, VertretungsplanTag tag) throws JSONException { if (data.optBoolean("class_in_extra_line")) { for (Element element : table.select("td.inline_header")) { String className = getClassName(element.text(), data); if (isValidClass(className)) { KlassenVertretungsplan kv = new KlassenVertretungsplan(className); Element zeile = null; try { zeile = element.parent().nextElementSibling(); if (zeile.select("td") == null) { zeile = zeile.nextElementSibling(); } while (zeile != null && !zeile.select("td").attr("class").equals("list inline_header")) { Vertretung v = new Vertretung(); int i = 0; for (Element spalte : zeile.select("td")) { if (!hasData(spalte.text())) { i++; continue; } String type = data.getJSONArray("columns").getString(i); if (type.equals("lesson")) v.setLesson(spalte.text()); else if (type.equals("subject")) v.setSubject(spalte.text()); else if (type.equals("previousSubject")) v.setPreviousSubject(spalte.text()); else if (type.equals("type")) v.setType(spalte.text()); else if (type.equals("type-entfall")) { if (spalte.text().equals("x")) v.setType("Entfall"); else v.setType("Vertretung"); } else if (type.equals("room")) v.setRoom(spalte.text()); else if (type.equals("teacher")) v.setTeacher(spalte.text()); else if (type.equals("previousTeacher")) v.setPreviousTeacher(spalte.text()); else if (type.equals("desc")) v.setDesc(spalte.text()); else if (type.equals("desc-type")) { v.setDesc(spalte.text()); v.setType(recognizeType(spalte.text())); } else if (type.equals("previousRoom")) 
v.setPreviousRoom(spalte.text()); i++; } if (v.getDesc() != null && v.getLesson() == null && v.getPreviousRoom() == null && v.getPreviousSubject() == null && v.getPreviousTeacher() == null && v.getRoom() == null && v.getSubject() == null && v.getTeacher() == null && v.getType() == null) { // Beschreibung aus der letzten Zeile fortgesetzt Vertretung previousVertretung = kv.getVertretung() .get(kv.getVertretung().size() - 1); previousVertretung.setDesc(previousVertretung.getDesc() + " " + v.getDesc()); zeile = zeile.nextElementSibling(); continue; } if (v.getType() == null) v.setType("Vertretung"); if (!v.getLesson().equals("")) { kv.add(v); } zeile = zeile.nextElementSibling(); } tag.getKlassen().put(className, kv); } catch (Throwable e) { e.printStackTrace(); } } } } else { boolean hasType = false; for (int i = 0; i < data.getJSONArray("columns").length(); i++) { if (data.getJSONArray("columns").getString(i).equals("type")) hasType = true; } Vertretung previousVertretung = null; for (Element zeile : table.select("tr.list.odd:not(:has(td.inline_header)), " + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]:has(font[color]))")) { Vertretung v = new Vertretung(); String klassen = ""; int i = 0; for (Element spalte : zeile.select("td")) { if (!hasData(spalte.text())) { i++; continue; } String type = data.getJSONArray("columns").getString(i); if (type.equals("lesson")) v.setLesson(spalte.text()); else if (type.equals("subject")) v.setSubject(spalte.text()); else if (type.equals("previousSubject")) v.setPreviousSubject(spalte.text()); else if (type.equals("type")) v.setType(spalte.text()); else if (type.equals("type-entfall")) { if (spalte.text().equals("x")) v.setType("Entfall"); else if (!hasType) v.setType("Vertretung"); } else if (type.equals("room")) v.setRoom(spalte.text()); else if (type.equals("previousRoom")) v.setPreviousRoom(spalte.text()); else if (type.equals("desc")) v.setDesc(spalte.text()); else if (type.equals("desc-type")) { 
v.setDesc(spalte.text()); v.setType(recognizeType(spalte.text())); } else if (type.equals("teacher")) v.setTeacher(spalte.text()); else if (type.equals("previousTeacher")) v.setPreviousTeacher(spalte.text()); else if (type.equals("class")) klassen = getClassName(spalte.text(), data); i++; } if (v.getDesc() != null && v.getLesson() == null && v.getPreviousRoom() == null && v.getPreviousSubject() == null && v.getPreviousTeacher() == null && v.getRoom() == null && v.getSubject() == null && v.getTeacher() == null && v.getType() == null && previousVertretung != null) { // Beschreibung aus der letzten Zeile fortgesetzt previousVertretung.setDesc(previousVertretung.getDesc() + " " + v.getDesc()); continue; } if (v.getType() == null) { if (zeile.select("strike").size() > 0 || (v.getSubject() == null && v.getRoom() == null && v.getTeacher() == null && v.getPreviousSubject() != null)) v.setType("Entfall"); else v.setType("Vertretung"); } List<String> affectedClasses; // Detect things like "5-12" Pattern pattern = Pattern.compile("(\\d+) ?- ?(\\d+)"); Matcher matcher = pattern.matcher(klassen); if (matcher.find()) { affectedClasses = new ArrayList<String>(); int min = Integer.parseInt(matcher.group(1)); int max = Integer.parseInt(matcher.group(2)); try { for (String klasse : getAllClasses()) { Pattern pattern2 = Pattern.compile("\\d+"); Matcher matcher2 = pattern2.matcher(klasse); if (matcher2.find()) { int num = Integer.parseInt(matcher2.group()); if (min <= num && num <= max) affectedClasses.add(klasse); } } } catch (IOException e) { e.printStackTrace(); } } else { if (data.optBoolean("classes_separated", true)) { affectedClasses = Arrays.asList(klassen.split(", ")); } else { affectedClasses = new ArrayList<String>(); try { for (String klasse : getAllClasses()) { // TODO: // Gibt es // eine // bessere // Mglichkeit? 
StringBuilder regex = new StringBuilder(); for (char character : klasse.toCharArray()) { regex.append(character); regex.append(".*"); } if (klassen.matches(regex.toString())) affectedClasses.add(klasse); } } catch (IOException e) { e.printStackTrace(); } } } for (String klasse : affectedClasses) { if (isValidClass(klasse)) { KlassenVertretungsplan kv = tag.getKlassen().get(klasse); if (kv == null) kv = new KlassenVertretungsplan(klasse); kv.add(v); tag.getKlassen().put(klasse, kv); } } previousVertretung = v; } } if (data.optBoolean("sort_classes")) { List<KlassenVertretungsplan> list = new ArrayList<>(tag.getKlassen().values()); Collections.sort(list, new Comparator<KlassenVertretungsplan>() { @Override public int compare(KlassenVertretungsplan o1, KlassenVertretungsplan o2) { return o1.getKlasse().compareTo(o2.getKlasse()); } }); LinkedHashMap<String, KlassenVertretungsplan> hashMap = new LinkedHashMap<>(); for (KlassenVertretungsplan klasse : list) { hashMap.put(klasse.getKlasse(), klasse); } tag.setKlassen(hashMap); } }