List of usage examples for the org.jsoup.nodes.Element.text() method
public String text()
From source file:GIST.IzbirkomExtractor.TableExtractor.java
/**
 * Checks whether a row looks like the first row of a parsable table.
 *
 * @param row the table row whose {@code td} cells are inspected
 * @return {@code true} if the row has exactly four cells and either the cleaned-up text of
 *         its first cell is within a Levenshtein distance of 2 from the expected marker, or
 *         no cell is empty after clean-up and the first cell parses as an integer;
 *         {@code false} otherwise
 */
private boolean isParsableTable(Element row) {
    final Elements columns = row.getElementsByTag("td");

    // Anything other than exactly four columns is rejected outright.
    if (columns.size() != 4) {
        return false;
    }

    // A first cell that is close enough to the number-sign marker is accepted immediately.
    final int distanceToMarker =
            StringUtils.getLevenshteinDistance(cleanupUNICODE(columns.first().text()), " . -");
    if (distanceToMarker < 3) {
        return true;
    }

    // Every cell must still carry some text once it has been cleaned up.
    for (int index = 0; index < columns.size(); index++) {
        if (cleanupUNICODE(columns.get(index).text()).isEmpty()) {
            return false;
        }
    }

    // Finally, the first column has to hold a number.
    boolean firstCellIsNumber;
    try {
        Integer.parseInt(cleanupUNICODE(columns.first().text()).trim());
        firstCellIsNumber = true;
    } catch (NumberFormatException notANumber) {
        firstCellIsNumber = false;
    }
    return firstCellIsNumber;
}
From source file:eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java
/** * * @param nextLine/*w w w. j a v a2s . c om*/ * @param idStaffIdentifier * @param idName * @param idFirstName * @param idLastName * @param idInitials * @param idSubject * @param idInstitutionName * @param idWebAddress * @param expression * @param params * @return */ @Override protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName, int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress, String expression, Object[] params) { String keywords = " (PROFILE OR PHD OR RESEARCHER OR FACULTY OR PROFESSOR OR RESEARCH) AND "; keywords = ""; String domain = clean_site(nextLine[idWebAddress]); String subject = nextLine[idSubject]; String and_institution_name = (idInstitutionName != -1 ? " AND " + nextLine[idInstitutionName] : ""); String expression_subject = expression + " AND " + subject; String expression_site = expression + " site: " + domain; String expression_inst_name = expression + and_institution_name; String expression_inst_name_and_subject = expression + and_institution_name + " AND " + subject; String url = ""; switch (search_patterns) { case P1: url = "https://duckduckgo.com/html/?q=" + keywords + expression; break; case P2: url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject; break; case P3: url = "https://duckduckgo.com/html/?q=" + keywords + expression_site; break; case P4: url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name; break; case P5: url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name_and_subject; break; default: url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject; break; } Logger.getRootLogger().info("Go with " + url); boolean again = false; Document doc = null; do { doc = getDocumentFromPage(url, 10, 1000, 5000); if (doc != null && doc.text().contains("If this error persists, please let us know")) { try { Thread.sleep(30000); } catch (InterruptedException ex) { } again = true; } else { again 
= false; } } while (again); String final_result = ""; if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) { /* Write resercher founded */ Elements elements = doc.select("div[class*=links_main] > a"); /* We will take the first html page and the first pdf */ HashMap<String, String> results = new HashMap<String, String>(); int max_results = 2; int i_result = 0; for (Element e : elements) { if ((e.text().startsWith("[") //&& !e.text().startsWith("[PDF]") ) || e.absUrl("href").contains("duckduckgo.com/y.js") || e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("facebook.com") || e.absUrl("href").contains("microsoft.com") || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin") || e.absUrl("href").contains("www.biography.com") || e.absUrl("href").contains("biomedexperts.com") || e.absUrl("href").contains("www.experts.scival.com") || e.absUrl("href").contains("ratemyprofessors.com") || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt") || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml") || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx") || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs") || e.absUrl("href").contains("www.amazon")) { max_results++; continue; } boolean add = false; String score = ""; String ext = ""; if (!results.containsKey("HTML") && !e.text().startsWith("[")) { //results.put("html", ) File temp; try { temp = File.createTempFile("temp-file-name", ".tmp"); URL fetched_url = Downloader.fetchURL(e.absUrl("href")); FileUtils.copyURLToFile(fetched_url, temp); long sizeInBytes = temp.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { score = "B"; } else { String content = FileUtils.readFileToString(temp); if (content.contains(nextLine[idLastName])) { score = "A"; } else { score = "B"; } } } catch (IOException ex) { score = "B"; } ext = "HTML"; add = true; } 
//if(!results.containsKey("PDF") && e.text().startsWith("[PDF]")){ // score = "A"; // ext = "PDF"; // add = true; //} if (add) { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\";"; result += "\"" + nextLine[idLastName] + "\";"; result += "\"" + nextLine[idInitials] + "\";"; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\";"; if (idName != -1) result += "\"" + nextLine[idName] + "\";"; result += "\"" + e.absUrl("href") + "\";"; result += "\"" + ext + "\";"; result += "\"" + "CV" + "\";"; result += "\"" + score + "\""; result += "\r\n"; results.put(ext, result); Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + e.text()); } // if(results.containsKey("PDF") && results.containsKey("HTML")){ // break; // } i_result++; if (max_results <= i_result) { break; } } // if(results.containsKey("PDF")) // final_result = results.get("PDF"); // else if (results.containsKey("HTML")) final_result = results.get("HTML"); else final_result = ""; } return final_result; }
From source file:com.johan.vertretungsplan.parser.UntisCommonParser.java
protected VertretungsplanTag parseMonitorVertretungsplanTag(Document doc, JSONObject data) throws JSONException { VertretungsplanTag tag = new VertretungsplanTag(); tag.setDatum(doc.select(".mon_title").first().text().replaceAll(" \\(Seite \\d+ / \\d+\\)", "")); if (doc.select("table.mon_head td[align=right] p").size() == 0 || schule.getData().optBoolean("stand_links", false)) { tag.setStand(doc.select("body").html().substring(0, doc.select("body").html().indexOf("<p>") - 1)); } else {/*from w w w . java2s . co m*/ Element stand = doc.select("table.mon_head td[align=right] p").first(); String info = stand.text(); tag.setStand(info.substring(info.indexOf("Stand:"))); } // NACHRICHTEN if (doc.select("table.info").size() > 0) parseNachrichten(doc.select("table.info").first(), data, tag); // VERTRETUNGSPLAN if (doc.select("table:has(tr.list)").size() > 0) parseVertretungsplanTable(doc.select("table:has(tr.list)").first(), data, tag); return tag; }
From source file:net.parser.JobParser.java
/**
 * Extracts the employer's address from the detail list of the employer profile.
 *
 * @return the text of the last list item containing {@code "Adresa:"}, with every
 *         {@code "Adresa: "} occurrence removed, or {@code null} if no item matches
 */
public String getEmployerAddress() {
    String result = null;
    for (Element item : doc.select("#employer-profile .details li")) {
        final String itemText = item.text();
        if (!itemText.contains("Adresa:")) {
            continue;
        }
        // No early exit: a later matching item overrides an earlier one.
        result = itemText.replace("Adresa: ", "");
    }
    return result;
}
From source file:net.poemerchant.scraper.ShopScraper.java
public String scrapeAccountName() { Element profileLink = doc.select(".profile-link").select("a[href^=/account/view-profile]").first(); // TODO, take the challenge data also // TODO, take the twitch data accountName = profileLink.text(); return accountName; }
From source file:com.liato.bankdroid.banking.banks.Bioklubben.java
@Override public void update() throws BankException, LoginException, BankChoiceException { super.update(); if (username == null || password == null || username.length() == 0 || password.length() == 0) { throw new LoginException(res.getText(R.string.invalid_username_password).toString()); }//from w w w. j ava2s. c o m urlopen = login(); try { Document d = Jsoup .parse(urlopen.open("http://bioklubben.sf.se/MyPurchases.aspx?ParentTreeID=1&TreeID=1")); Element e = d.getElementById("ctl00_ContentPlaceHolder1_BonusPointsLabel"); if (e == null) { throw new BankException(res.getText(R.string.unable_to_find).toString() + " points element."); } BigDecimal b = Helpers.parseBalance(e.text()); Account a = new Account("Pong", b, "1"); a.setCurrency(context.getString(R.string.points)); accounts.add(a); balance = balance.add(a.getBalance()); Elements es = d.select(".GridViewStd_Item,.GridViewStd_ItemAlt"); List<Transaction> transactions = new ArrayList<Transaction>(); if (es != null) { for (Element el : es) { transactions.add(new Transaction(el.child(0).text().trim(), el.child(1).text().trim(), Helpers.parseBalance(el.child(2).text()))); } } a.setTransactions(transactions); } catch (IOException e) { if (e == null) { throw new BankException(e.getMessage()); } } if (accounts.isEmpty()) { throw new BankException(res.getText(R.string.no_accounts_found).toString()); } super.updateComplete(); }
From source file:DownloadDialog.java
/******************************************************************** * Method: storeTerms/* w w w. j a va 2 s . c o m*/ * Purpose: store available terms to use /*******************************************************************/ public void storeTerms() { try { // Default terms termsName = new ArrayList<String>(); termsValue = new ArrayList<String>(); // Create client for terms DefaultHttpClient client = new DefaultHttpClient(); HttpGet dynamicGet = new HttpGet("http://jweb.kettering.edu/cku1/xhwschedule.P_SelectSubject"); // Execute post call HttpResponse response = client.execute(dynamicGet); Document doc = Jsoup.parse(HTMLParser.parse(response)); Elements options = doc.getElementsByTag("option"); // Store every option for (Element option : options) { // First term option if (!option.text().contains("None")) { this.termsName.add(option.text()); this.termsValue.add(option.val()); } } //client.close(); } // Catch all exceptions catch (Exception e) { // Print track, set false, return false e.printStackTrace(); } }
From source file:me.vertretungsplan.parser.DaVinciParser.java
@Override public List<String> getAllClasses() throws IOException, JSONException, CredentialInvalidException { if (scheduleData.getData().has(PARAM_CLASSES_SOURCE)) { Document doc = Jsoup.parse(httpGet(scheduleData.getData().getString("classesSource"), ENCODING)); List<String> classes = new ArrayList<>(); Elements elems = doc.select("li.Class"); if (elems.size() == 0) { // daVinci 5 elems = doc.select("td[align=left] a"); }//from w ww . ja v a 2 s . c om for (Element li : elems) { classes.add(li.text()); } return classes; } else { return getClassesFromJson(); } }
From source file:com.lingxiang2014.entity.Article.java
/**
 * Splits the article content into pages.
 *
 * <p>Empty content yields a single empty page. Content containing
 * {@code PAGE_BREAK_SEPARATOR} is split on it (the separator is passed to
 * {@link String#split(String)}, i.e. treated as a regular expression). Otherwise the content
 * is parsed as HTML and the top-level nodes of the body are accumulated: elements contribute
 * their outer HTML and the length of their text, text nodes are cut at
 * {@code PARAGRAPH_SEPARATOR_PATTERN} with the matched separator re-attached to each piece.
 * Whenever the accumulated text length reaches {@code PAGE_CONTENT_LENGTH}, the collected
 * markup is emitted as a page; a non-empty remainder becomes the last page.
 *
 * @return the page contents in order
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }

    // Explicit page breaks take precedence over automatic pagination.
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    }

    final List<String> pages = new ArrayList<String>();
    final List<Node> nodes = Jsoup.parse(content).body().childNodes();
    if (nodes != null) {
        final StringBuilder buffer = new StringBuilder();
        int accumulated = 0;

        for (Node node : nodes) {
            if (node instanceof Element) {
                final Element element = (Element) node;
                buffer.append(element.outerHtml());
                accumulated += element.text().length();
                if (accumulated >= PAGE_CONTENT_LENGTH) {
                    pages.add(buffer.toString());
                    accumulated = 0;
                    buffer.setLength(0);
                }
            } else if (node instanceof TextNode) {
                final String text = ((TextNode) node).text();
                final String[] pieces = PARAGRAPH_SEPARATOR_PATTERN.split(text);
                final Matcher separators = PARAGRAPH_SEPARATOR_PATTERN.matcher(text);
                for (String piece : pieces) {
                    // Re-attach the next separator that the split removed, if there is one.
                    String chunk = piece;
                    if (separators.find()) {
                        chunk += separators.group();
                    }
                    buffer.append(chunk);
                    accumulated += chunk.length();
                    if (accumulated >= PAGE_CONTENT_LENGTH) {
                        pages.add(buffer.toString());
                        accumulated = 0;
                        buffer.setLength(0);
                    }
                }
            }
        }

        final String remainder = buffer.toString();
        if (StringUtils.isNotEmpty(remainder)) {
            pages.add(remainder);
        }
    }
    return pages.toArray(new String[pages.size()]);
}
From source file:de.stkl.gbgvertretungsplan.sync.SyncAdapter.java
private List<List<String>> parseRows(Element root) { Element table = root.select("table.mon_list").first(); // each row has categories.size() categories, build a two dimensional array: // <row-index><category-index> = <value> // rows[0] is the name of the class, if multiple classes are set there, split them (separator: ,) List<List<String>> allRows = new ArrayList<List<String>>(); Elements rows = table.select("tr:gt(0)"); for (Element row : rows) { int i = 0; ArrayList<String> newrow = new ArrayList<String>(); String[] pendingClasses = null; // each category for (Element categ : row.select("td")) { if (i == 0) { // split class field by separator(,) if needed String text = categ.text(); pendingClasses = text.split(","); }/*from ww w . j av a 2s.co m*/ // dont add class if multiple classes are given if (i != 0 || (pendingClasses == null || pendingClasses.length == 0)) newrow.add(categ.text()); // Log.i(LOG_TAG, categ.text()); i++; } // add row with category info to allRows array, if not multiple classes if (pendingClasses == null || pendingClasses.length == 0) allRows.add(newrow); // otherwise set class names to multiple rows else { for (String classN : pendingClasses) { ArrayList<String> n = (ArrayList<String>) newrow.clone(); n.add(0, classN.trim()); allRows.add(n); } } } return allRows; }