List of usage examples for the org.jsoup.nodes.Element method text()
public String text()
From source file:mergedoc.core.APIDocument.java
/** * Javadoc ? ??????/*from www . j av a 2 s. c om*/ * @param className ?? * @param context * @param comment */ private void parseCommonTag(String className, Element element, Comment comment) { Elements dts = element.select("dl dt"); for (Element dt : dts) { String dtText = dt.text(); if (dtText.contains("")) { Elements aTags = dt.nextElementSibling().select("a:has(code)"); for (Element a : aTags) { String url = a.attr("href"); String ref; if (a.childNodeSize() != 1) { ref = aTags.outerHtml(); } else { ref = formatClassName(className, url); ref = FastStringUtils.replace(ref, "%28", "("); ref = FastStringUtils.replace(ref, "%29", ")"); Pattern methodRefPat = PatternCache.getPattern("-(.*)-$"); Matcher methodRefMat = methodRefPat.matcher(ref); if (methodRefMat.find()) { ref = FastStringUtils.replaceAll(ref, "-(.*)-$", "($1)"); // for Java8 ref = FastStringUtils.replace(ref, "-", ","); // for Java8 ref = FastStringUtils.replace(ref, ":A", "[]"); // for Java8 } } comment.addSee(ref); } } else if (dtText.contains("???:")) { comment.addSince(dt.nextElementSibling().text()); } } }
From source file:by.heap.remark.convert.TextCleaner.java
/** * Replaces all {@code <br/>} tags with a newline in a copy of the input node, and * returns the resulting innter text./* ww w . j a va 2s . co m*/ * This is necessary to ensure that manual linebreaks are supported in preformatted code. * * @param oinput Preformatted node to process * @return inner text of the node. */ private String getPreformattedText(Element oinput) { Element el = oinput.clone(); fixLineBreaks(el); return el.text(); }
From source file:accountgen.controller.Controller.java
private void setBday(Document doc, Person p) { Element bday = doc.select(".bday").first(); Date bd = new Date(); Date date = null;//from w w w. j a v a 2 s . c o m try { date = new SimpleDateFormat("MMM", Locale.ENGLISH).parse(bday.text().split(" ")[0]); } catch (ParseException ex) { Logger.getLogger(Controller.class.getName()).log(Level.SEVERE, null, ex); } Calendar cal = Calendar.getInstance(); cal.setTime(date); int month = cal.get(Calendar.MONTH); bd.setMonth(month); bd.setDate(Integer.parseInt(bday.text().split(" ")[1].replace(",", ""))); bd.setYear(Integer.parseInt(bday.text().split(",")[1].substring(1, 5)) - 1900); p.setBirthday(bd); }
From source file:GIST.IzbirkomExtractor.TableExtractor.java
public void processHTMLfile(File input_html) throws IOException, TableExtractorException, CloneNotSupportedException, SQLException, ResultSinkException { logger.info("Start processing " + input_html); Document doc = Jsoup.parse(input_html, "UTF-8"); Elements tables = doc.getElementsByTag("table"); /* count of parseable tables found */ int tables_found = 0; /* determine raion name */ String raion_name = extractRaionFromFileName(input_html.getName()); //System.err.println(raion_name); // TODO: inflect raion name in case /* searches for a table that has " . -" in its very 1st cell */ for (Element table : tables) { Elements rows = table.getElementsByTag("tr"); boolean firstRow = true; row_loop: for (Element row : rows) { Elements cells = row.getElementsByTag("td"); if (firstRow) { //System.err.println(row.text()); if (isParsableTable(row)) { firstRow = false; logger.info("Processing table #" + ++tables_found + " in " + input_html); } else break row_loop; }/*from www .j av a2 s .c o m*/ if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()), " . -") < 3) continue row_loop; /* skip the row if it looks like a table header */ /* skip rows with all cells empty */ boolean emptyRow = true; for (Element cell : cells) emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty(); if (emptyRow) continue; int i_cell = 0; Element station_id = null; Element address_field = null; Element org_address = null; /* address of the ??? 
*/ Element station_address = null; for (Element cell : cells) { switch (i_cell) { case 0: station_id = cell; break; case 1: address_field = cell; break; case 2: org_address = cell; break; case 3: station_address = cell; default: break; } i_cell++; } if (station_id == null) throw new TableExtractorException("Polling station ID not found", row, input_html); if (address_field == null) throw new TableExtractorException("Address list not found", row, input_html); /* extract int from poll station id */ int psid; try { psid = Integer.valueOf(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", "")); } catch (NumberFormatException e) { Exception te = new TableExtractorException("Failed to parse polling station ID >" + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html); logger.severe(te.getMessage() + "; rest of " + input_html + " ignored."); return; } /* extraction from HTML completely finished, now we work only with the addresses in the text form */ extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field), cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address)); } } if (tables_found == 0) logger.severe("No parsable tables found in " + input_html); resultSink.commit(); logger.info("" + tables_found + " table(s) processed in " + input_html); }
From source file:me.vertretungsplan.parser.SVPlanParser.java
/**
 * Parses one day of an SVPlan substitution schedule from {@code svp} and
 * adds it to {@code v}.
 * <p>
 * The day is only parsed when {@code svp} contains one of the known date or
 * title elements, or the document title starts with the expected prefix;
 * otherwise an {@link IOException} is thrown. Substitution rows are read
 * from the schedule table, followed by the optional "LehrerVerplant",
 * "Abwesenheiten" and "Mitteilungen" sections, which are added as messages.
 *
 * @param v schedule that receives the parsed day
 * @param svp element containing the markup of one day
 * @param doc document the element belongs to (used for the title and header checks)
 * @throws IOException if no SVPlan date/title marker is found
 */
private void parseSvPlanDay(SubstitutionSchedule v, Element svp, Document doc) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();
    if ((svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0
            || doc.title().startsWith("Vertretungsplan fr "))) {
        setDate(svp, doc, day);
        if (svp.select(".svp-tabelle, table:has(.Klasse)").size() > 0) {

            Elements rows = svp.select(".svp-tabelle tr, table:has(.Klasse) tr");
            // Values of the most recent non-empty lesson/class cell; an empty
            // cell in a later row inherits them.
            String lastLesson = "";
            String lastClass = "";
            for (Element row : rows) {
                // Skip header rows (by class or by containing <th>) and rows without text.
                if ((doc.select(".svp-header").size() > 0 && row.hasClass("svp-header"))
                        || row.select("th").size() > 0 || row.text().trim().equals("")) {
                    continue;
                }

                Substitution substitution = new Substitution();

                for (Element column : row.select("td")) {
                    // The CSS class name of the cell decides which field it fills.
                    String type = column.className();
                    if (!hasData(column.text())) {
                        // Empty cell: only lesson and class fall back to the previous row's value.
                        if ((type.startsWith("svp-stunde") || type.startsWith("Stunde")) && hasData(lastLesson)) {
                            substitution.setLesson(lastLesson);
                        } else if ((type.startsWith("svp-klasse") || type.startsWith("Klasse"))
                                && hasData(lastClass)) {
                            substitution.getClasses().addAll(Arrays
                                    .asList(lastClass.split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                        }
                        continue;
                    }
                    if (type.startsWith("svp-stunde") || type.startsWith("Stunde")) {
                        substitution.setLesson(column.text());
                        lastLesson = column.text();
                    } else if (type.startsWith("svp-klasse") || type.startsWith("Klasse")) {
                        // Class cell may list several classes, split by the configured separator.
                        substitution.getClasses().addAll(Arrays
                                .asList(column.text().split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                        lastClass = column.text();
                    } else if (type.startsWith("svp-esfehlt") || type.startsWith("Lehrer")) {
                        // Teacher names are dropped when the exclude-teachers option is set.
                        if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                            substitution.setPreviousTeacher(column.text());
                        }
                    } else if (type.startsWith("svp-esvertritt") || type.startsWith("Vertretung")) {
                        if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                            // Strip a trailing " +" from the substitute teacher's name.
                            substitution.setTeacher(column.text().replaceAll(" \\+$", ""));
                        }
                    } else if (type.startsWith("svp-fach") || type.startsWith("Fach")) {
                        substitution.setSubject(column.text());
                    } else if (type.startsWith("svp-bemerkung") || type.startsWith("Anmerkung")) {
                        // The remark doubles as description and as input for type/colour detection.
                        substitution.setDesc(column.text());
                        String recognizedType = recognizeType(column.text());
                        substitution.setType(recognizedType);
                        substitution.setColor(colorProvider.getColor(recognizedType));
                    } else if (type.startsWith("svp-raum") || type.startsWith("Raum")) {
                        substitution.setRoom(column.text());
                    }
                }

                // No type was set from a remark cell: fall back to the default type.
                if (substitution.getType() == null) {
                    substitution.setType("Vertretung");
                    substitution.setColor(colorProvider.getColor("Vertretung"));
                }

                day.addSubstitution(substitution);
            }
        }
        if (svp.select(".LehrerVerplant").size() > 0) {
            day.addMessage("<b>Verplante Lehrer:</b> " + svp.select(".LehrerVerplant").text());
        }
        if (svp.select(".Abwesenheiten").size() > 0) {
            day.addMessage("<b>Abwesenheiten:</b> " + svp.select(".Abwesenheiten").text());
        }

        if (svp.select("h2:contains(Mitteilungen)").size() > 0) {
            // Messages are the consecutive <p> elements following the heading;
            // each paragraph is split into separate messages at double line breaks.
            Element h2 = svp.select("h2:contains(Mitteilungen)").first();
            Element sibling = h2.nextElementSibling();
            while (sibling != null && sibling.tagName().equals("p")) {
                for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht)) day.addMessage(nachricht);
                }
                sibling = sibling.nextElementSibling();
            }
        } else if (svp.select(".Mitteilungen").size() > 0) {
            // Alternative layout: messages live in elements with class "Mitteilungen".
            for (Element p : svp.select(".Mitteilungen")) {
                for (String nachricht : TextNode.createFromEncoded(p.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht)) day.addMessage(nachricht);
                }
            }
        }
        v.addDay(day);
    } else {
        throw new IOException("keine SVPlan-Tabelle gefunden");
    }
}
From source file:GIST.IzbirkomExtractor.TableExtractor.java
/** * Cleaning up leftover of HTML code from the cell content. * /* ww w . j a v a 2s.co m*/ * @param cell_content HTML code contains in the table cell * @return an array list containing each line of the cell_content withh all HTML markup removed */ private ArrayList<String> cleanLeftoverHTML(Element cell_content) { ArrayList<String> streets_and_numbers = new ArrayList<String>(); /* <div>s designate separate lines inside the table cell */ for (Element addr_line : cell_content.getElementsByTag("div")) { /* skip empty address lines */ String addr_line_text = cleanupUNICODE(addr_line.text()); if (StringUtils.isBlank(addr_line_text)) continue; /* <strong> is not particularly useful, but can designate placement of simple separators like space */ Elements streets = addr_line.getElementsByTag("strong"); if (!streets.isEmpty()) { addr_line_text = addr_line_text.replaceFirst(Pattern.quote(streets.text()), " " + streets.text() + " "); } streets_and_numbers.add(addr_line_text); } return streets_and_numbers; }
From source file:net.parser.JobParser.java
public Set<Category> getCategories() { Elements elements = doc.select("#job-position .details dd a"); Category category = null;// w ww . j a va2 s .c o m if (elements.size() == 0) { return null; } else { Set<Category> categories = new HashSet<>(); for (Element element : elements) { category = new Category(); category.setId(CategoryEnum.getId(element.text())); categories.add(category); } return categories; } }
From source file:eu.sisob.uma.extractors.adhoc.websearchers_cv.WebSearchersCVExtractor.java
/** * * @param nextLine/* www .ja v a 2 s. c om*/ * @param idStaffIdentifier * @param idName * @param idFirstName * @param idLastName * @param idInitials * @param idSubject * @param idInstitutionName * @param idWebAddress * @param expression * @param params * @return */ @Override protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName, int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress, String expression, Object[] params) { String domain = clean_site(nextLine[idWebAddress]); String subject = nextLine[idSubject]; String expression_subject = expression + " " + subject + " " + files + " " + cv_keywords_in_query; expression_subject = expression_subject.replaceAll("\t", " "); expression_subject = expression_subject.replaceAll(" ", " "); String url = "https://duckduckgo.com/html/?q=" + expression_subject; Logger.getRootLogger().info("Go with " + url); boolean again = false; Document doc = null; do { doc = getDocumentFromPage(url, 10, 2000, 5000); if (doc != null && doc.text().contains("If this error persists, please let us know")) { try { Thread.sleep(30000); } catch (InterruptedException ex) { } again = true; } else { again = false; } } while (again); //if(doc.select("div[class*=links_main] > a[href*=" + domain + "]").size() > 0){ String final_result = ""; if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) { /* Write resercher founded */ Elements elements = doc.select("div[class*=links_main] > a"); /* We will take the first html page and the first pdf */ List<String[]> results = new ArrayList<String[]>(); final int EXT_I = 0; final int SCORE_INT_I = 1; final int SCORE_LETTER_I = 2; final int RESULT_I = 3; final int WORST_SCORE = 67; //int max_results = elements.size(); //int i_result = 0; for (Element e : elements) { if ((e.text().startsWith("[") && !e.text().startsWith("[PDF]")) || e.absUrl("href").contains("duckduckgo.com/y.js") || e.absUrl("href").contains("wikipedia.") 
|| e.absUrl("href").contains("microsoft.com") || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin") || e.absUrl("href").contains("www.biography.com") || e.absUrl("href").contains("biomedexperts.com") || e.absUrl("href").contains("www.experts.scival.com") || e.absUrl("href").contains("ratemyprofessors.com") || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt") || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml") || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx") || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs") || e.absUrl("href").contains("www.amazon")) { continue; } boolean add = false; int score_int = WORST_SCORE; String score = ""; String ext = ""; if (e.text().startsWith("[PDF]") || e.text().startsWith("[DOCX]") || e.text().startsWith("[DOC]") || e.text().startsWith("[RTF]")) { String clean_name_1 = e.text().replaceAll("[^\\w\\s]", "").toLowerCase(); int i = e.absUrl("href").lastIndexOf("/"); int f = e.absUrl("href").lastIndexOf("."); String clean_name_2 = ""; if (i != -1 && f != -1) clean_name_2 = e.absUrl("href").substring(i, f).toLowerCase(); boolean b = false; for (String k : cv_keywords_in_name_list) { if (clean_name_1.contains(k) || clean_name_2.contains(k)) { b = true; break; } } if (b) { score_int--; } if (clean_name_1.contains(nextLine[idLastName]) || clean_name_2.contains(nextLine[idLastName])) { score_int--; } score = Character.toChars(score_int)[0] + ""; add = true; ext = "PDF"; } //if(!results.containsKey("HTML") && !e.text().startsWith("[")){ //} if (add) { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\";"; result += "\"" + nextLine[idLastName] + "\";"; result += "\"" + nextLine[idInitials] + "\";"; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\";"; if (idName != -1) result += "\"" + nextLine[idName] + "\";"; result += "\"" + e.absUrl("href") + "\";"; result += "\"" + ext + "\";"; 
result += "\"" + "CV" + "\";"; result += "\"" + score + "\""; result += "\r\n"; results.add(new String[] { ext, score_int + "", score, result }); Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + score + " - " + e.text()); } } final_result = ""; int best_score = WORST_SCORE; for (String[] result : results) { if (result[EXT_I].equals("PDF")) { int act_score = Integer.parseInt(result[SCORE_INT_I]); if (act_score < best_score) { best_score = act_score; final_result = result[RESULT_I]; } } } } return final_result; }
From source file:com.obnsoft.ptcm3.MyApplication.java
private void parseCommandHtml() { mCommands = new ArrayList<Command>(); mCategories = new ArrayList<String>(); int categoryId = -1; try {//from w ww . ja va2 s . com InputStream in = openFileInput(FNAME_CMD_HTML); Document document = Jsoup.parse(in, "UTF-8", URL_CMD_HTML); in.close(); Element divContentArea = document.getElementById(ID_CONTENTAREA); for (Element e : divContentArea.children()) { if (e.tagName().equals(TAG_TABLE)) { if (e.className().equals("")) { mCommands.add(new Command(e, categoryId)); } } else if (e.tagName().equals(TAG_H3)) { mCategories.add(e.text()); categoryId++; } } } catch (IOException e) { e.printStackTrace(); mCommands = null; mCategories = null; } }
From source file:org.confab.PhpBB3Parser.java
/** * Parses each topic for a particular forum. * @param forum Document of html containing topics * @param parent Forum the threads belong to * @return List of ForumThread objects *//* www . ja v a 2 s .c om*/ public List<ForumThread> parseForumThreads(Document forum, Forum parent) { Utilities.debug("parseForumThreads"); List<ForumThread> ret = new ArrayList<ForumThread>(); // Get topic table Elements thread_table_tds = forum.select("tbody[id*=threadbits_forum_] td"); if (thread_table_tds.isEmpty()) { Utilities.debug("It seems " + parent.url + " has no topics."); return ret; } // Get any stickies Elements stickies = thread_table_tds.select("td:contains(Sticky:) a[id*=thread_title_]"); // Get all topics Elements els_a = thread_table_tds.select("a[id*=thread_title_]"); assert !els_a.isEmpty(); // Loop topics and grab info about each for (Element el_a : els_a) { ForumThread new_topic = new ForumThread(parent); // Get topic new_topic.title = el_a.text(); assert new_topic.title != null; Utilities.debug("new_topic.title: " + new_topic.title); // Check if sticky if (stickies.html().contains(new_topic.title)) { new_topic.isSticky = true; Utilities.debug("new_topic.isSticky: " + new_topic.isSticky); } // Get URL new_topic.url = el_a.attr("href"); assert new_topic.url != null; Utilities.debug("new_topic.url:" + new_topic.url); ret.add(new_topic); } Utilities.debug("end printForumThreads"); return ret; }