List of usage examples for the org.jsoup.nodes.Element method text()
public String text()
From source file:com.liato.bankdroid.banking.banks.Hors.java
@Override public void update() throws BankException, LoginException, BankChoiceException, IOException { super.update(); urlopen = login();//from w w w . j ava 2s .c o m Document document = Jsoup.parse(response); Element balanceElement = document.getElementById("cphMain_lblAmount"); if (balanceElement == null) { throw new BankException( res.getText(R.string.unable_to_find).toString() + res.getText(R.string.balance).toString()); } Element nameElement = document.getElementById("lblCardName"); String accountName = nameElement == null ? NAME.toUpperCase() : nameElement.text(); if (this.getCustomName().isEmpty()) { this.setCustomName(accountName); } Account account = new Account(accountName, Helpers.parseBalance(balanceElement.text()), "0"); accounts.add(account); balance = balance.add(account.getBalance()); document = Jsoup.parse(urlopen.open("https://www.dittkort.se/q/Partial/Transactions.aspx?cnt=20")); Elements transactionElements = document.select("tr"); List<Transaction> transactions = new ArrayList<Transaction>(); if (transactionElements != null) { for (Element element : transactionElements) { transactions.add(asTransaction(element)); } } account.setTransactions(transactions); super.updateComplete(); }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("authors/editors") && !prev.text().trim().toLowerCase().startsWith("author/editor")) { skip = true;//w ww . j ava 2 s. c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("authors/editors") || next.text().trim().toLowerCase().startsWith("author/editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", url + ": This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if 
(!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:cvegrabber.CVEController.java
private String grabMitreData(String cveid, String data) throws IOException { //String url = "http://www.cvedetails.com/cve/" + cveid + "/"; String url = "http://cve.mitre.org/cgi-bin/cvename.cgi?name=" + cveid; Document doc = Jsoup.connect(url).get(); String dataToReturn = ""; if (doc.select("h2").text().contains("ERROR")) { dataToReturn = "CVE " + cveid + " Unknown or CVE Not Loaded Yet."; return dataToReturn; } else if (data.matches("references")) { //Elements references = doc.select("td.r_average"); Elements references = doc.select("li"); int counter = 0; for (Element reference : references) { if (counter == 0) { //dataToReturn += link.select("a[href]").text(); dataToReturn += reference.text(); counter++;/* ww w .ja v a 2 s.c o m*/ } else { //dataToReturn += "," + link.select("a[href]").text(); dataToReturn += "," + reference.text(); } } } else if (data.matches("description")) { //Element description = doc.select("div.cvedetailssummary").first(); Elements tds = doc.select("td[colspan=\"2\"]"); if (tds.eq(2).text().contains("** RESERVED **")) { return "No data on mitre yet."; } dataToReturn = tds.eq(2).text(); } return dataToReturn; }
From source file:com.ignorelist.kassandra.steam.scraper.HtmlTagLoader.java
@Override public GameInfo load(Long gameId, EnumSet<TagType> types) { GameInfo gameInfo = new GameInfo(); gameInfo.setId(gameId);//from w w w.j a v a 2 s. c o m try { if (!types.isEmpty()) { InputStream inputStream = cache.get(gameId.toString()); try { Document document = Jsoup.parse(inputStream, Charsets.UTF_8.name(), buildPageUrl(gameId)); Elements appName = document.select("div.apphub_AppName"); Element nameElement = Iterables.getFirst(appName, null); if (null != nameElement && null != nameElement.text()) { gameInfo.setName(nameElement.text().trim()); } Elements appIconElements = document.select("div.apphub_AppIcon img"); gameInfo.setIcon(getSrcUri(appIconElements)); Elements headerImageElements = document.select("img.game_header_image_full"); gameInfo.setHeaderImage(getSrcUri(headerImageElements)); final SetMultimap<TagType, String> tags = gameInfo.getTags(); if (types.contains(TagType.CATEGORY)) { Elements categories = document.select("div#category_block a.name"); copyText(categories, tags.get(TagType.CATEGORY)); } if (types.contains(TagType.GENRE)) { Elements genres = document.select("div.details_block a[href*=/genre/]"); copyText(genres, tags.get(TagType.GENRE)); } if (types.contains(TagType.USER)) { Elements userTags = document.select("a.app_tag"); copyText(Iterables.filter(userTags, Predicates.not(DISPLAY_NONE_PREDICATE)), tags.get(TagType.USER)); copyText(Iterables.filter(userTags, DISPLAY_NONE_PREDICATE), tags.get(TagType.USER_HIDDEN)); } if (types.contains(TagType.VR)) { Elements vrSupport = document .select("div.game_area_details_specs a.name[href*=#vrsupport="); copyText(vrSupport, tags.get(TagType.VR)); } } finally { IOUtils.closeQuietly(inputStream); } } } catch (ExecutionException ex) { Logger.getLogger(HtmlTagLoader.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(HtmlTagLoader.class.getName()).log(Level.SEVERE, null, ex); } return gameInfo; }
From source file:net.niyonkuru.koodroid.html.SubscribersHandler.java
/**
 * Turns the subscriber list items of the document into content-provider
 * operations, one per subscriber.
 *
 * <p>Starting at the first {@code li} with a nested {@code div} inside
 * {@code div#banSelector}, every following element sibling is read. The
 * digits after the last space of the item text form the subscriber id, and
 * the text before it is the name. An item without an {@code onClick}
 * attribute is stored as the default subscriber in the settings.
 *
 * @param doc      the parsed page
 * @param resolver used to check for existing subscribers and to store the default
 * @return the batch of insert/update operations
 * @throws HandlerException if an id is not exactly ten digits long or no
 *                          subscriber was found
 */
@Override
public ArrayList<ContentProviderOperation> parse(Document doc, ContentResolver resolver)
        throws HandlerException {
    final ArrayList<ContentProviderOperation> operations = new ArrayList<ContentProviderOperation>();

    for (Element item = doc.select("div#banSelector li:has(div)").first();
            item != null;
            item = item.nextElementSibling()) {
        String itemText = item.text();

        /* the name and the phone number are assumed to be separated by a space */
        int idStart = itemText.lastIndexOf(' ') + 1;

        String subscriberId = itemText.substring(idStart).replaceAll("\\D", "");
        if (subscriberId.length() != 10) {
            throw new HandlerException(getString(R.string.parser_error_unexpected_input));
        }

        final Uri subscriberUri = Subscribers.buildSubscriberUri(subscriberId);
        final ContentProviderOperation.Builder builder;
        if (!subscriberExists(subscriberUri, resolver)) {
            builder = ContentProviderOperation.newInsert(Subscribers.CONTENT_URI);
        } else {
            builder = ContentProviderOperation.newUpdate(subscriberUri);
            builder.withValue(Subscribers.UPDATED, System.currentTimeMillis());
        }
        builder.withValue(Subscribers.SUBSCRIBER_ID, subscriberId);

        StringBuilder fullName = new StringBuilder();
        for (String part : itemText.substring(0, idStart).split("\\s")) {
            fullName.append(ParserUtils.capitalize(part)).append(" ");
        }
        builder.withValue(Subscribers.SUBSCRIBER_FULL_NAME, fullName.toString().trim());

        if (!item.hasAttr("onClick")) {
            /* no switcher url means this is the default subscriber */
            ContentValues defaults = new ContentValues(1);
            defaults.put(Settings.SUBSCRIBER, subscriberId);
            resolver.insert(Settings.CONTENT_URI, defaults);
        } else {
            String onClick = item.attr("onClick");
            /* keep only the url: from the first slash up to the last single quote */
            String switchUrl = onClick.substring(onClick.indexOf('/'), onClick.lastIndexOf('\''));
            builder.withValue(Subscribers.SUBSCRIBER_SWITCHER, switchUrl);
        }
        builder.withValue(Subscribers.SUBSCRIBER_EMAIL, mParent);

        operations.add(builder.build());
    }

    if (operations.isEmpty()) {
        throw new HandlerException(getString(R.string.parser_error_unexpected_input));
    }

    JSONObject metadata = new JSONObject();
    try {
        metadata.put("subscribers", operations.size());
        metadata.put("language", getString(R.string.locale));
    } catch (JSONException ignored) {
    }
    Crittercism.setMetadata(metadata);
    Crittercism.setUsername(mParent);

    return operations;
}
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/** * cleans up and converts any nodes that should be considered text into text *///from www .j av a2s . c o m private void convertLinksToText() { if (logger.isDebugEnabled()) { logger.debug("Turning links to text"); } Elements links = topNode.getElementsByTag("a"); for (Element item : links) { if (item.getElementsByTag("img").size() == 0) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } } }
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/**
 * Concatenates the HTML-unescaped, trimmed text of every {@code p} element
 * found from the top node, each followed by a blank line ({@code \n\n}).
 * // todo move this to an output formatter object instead of inline here
 *
 * @return the paragraphs' text separated by blank lines
 * @deprecated use {@link #getFormattedText(Element)}
 */
@Deprecated
public String getFormattedText() {
    StringBuilder formatted = new StringBuilder();

    for (Element candidate : topNode.getAllElements()) {
        if (!candidate.tagName().equals("p")) {
            continue;
        }
        String paragraph = StringEscapeUtils.unescapeHtml(candidate.text()).trim();
        formatted.append(paragraph).append("\n\n");
    }

    return formatted.toString();
}
From source file:io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDateDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;//ww w .j a v a 2s . c o m if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) { String cssSelector = attributeDetail.extractor_args.get(0); Element element = articleDoc.select(cssSelector).first(); if (StringUtils.isNotBlank(cssSelector)) { if (element != null) { attrib_value = element.text(); } } } if (attrib_value != null) { String pubtext = attrib_value; SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat("dd/mm/yyyy hh:mm", Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date withUTC format " + pubtext); } // try a simpler format df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date " + pubtext); } if (result != null) { attrib_value = dateFormatter.format(result); } else { logger.error("Failed to parse date " + pubtext); } } return attrib_value; }
From source file:org.commonjava.maven.galley.transport.htcli.internal.HttpListing.java
@Override public ListingResult call() { request = new HttpGet(url); // return null if something goes wrong, after setting the error. // What we should be doing here is trying to retrieve the html directory // listing, then parse out the filenames from that... ///*from ww w . java2 s. com*/ // They'll be links, so that's something to key in on. // // I'm wondering about this: // http://jsoup.org/cookbook/extracting-data/selector-syntax // the dependency is: org.jsoup:jsoup:1.7.2 ListingResult result = null; InputStream in = null; String oldName = Thread.currentThread().getName(); try { String newName = oldName + ": LIST " + url; Thread.currentThread().setName(newName); if (executeHttp()) { in = response.getEntity().getContent(); String listing = IOUtils.toString(in); Logger logger = LoggerFactory.getLogger(getClass()); logger.debug("Got raw listing content:\n\n{}\n\n", listing); final ArrayList<String> al = new ArrayList<>(); // TODO: Charset!! Document doc = Jsoup.parse(listing, url); // try // { // } // catch ( final IOException e ) // { // this.error = // new TransferLocationException( resource.getLocation(), "Invalid HTML in: {}. Reason: {}", e, url, e.getMessage() ); // } if (doc != null) { for (final Element link : doc.select("a")) { String linkText = link.text(); String linkHref = link.attr("href"); URL url = new URL(this.url); boolean sameServer = isSameServer(url, linkHref); boolean subpath = isSubpath(url, linkHref); if ((sameServer && subpath) && (linkHref.endsWith(linkText) || linkHref.endsWith(linkText + '/')) && !EXCLUDES.contains(linkText)) { al.add(linkText); } } result = new ListingResult(resource, al.toArray(new String[al.size()])); } } } catch (final TransferException e) { this.error = e; } catch (final IOException e) { this.error = new TransferException("Failed to construct directory listing for: {}. 
Reason: {}", e, url, e.getMessage()); } finally { closeQuietly(in); cleanup(); if (oldName != null) { Thread.currentThread().setName(oldName); } } return error == null ? result : null; }
From source file:com.johan.vertretungsplan.parser.SVPlanParser.java
public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); // JSONArray urls = schule.getData().getJSONArray("urls"); String encoding = schule.getData().getString("encoding"); List<Document> docs = new ArrayList<Document>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); loadUrl(url.getString("url"), encoding, docs); }//from ww w. j a va 2 s .c o m LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>(); for (Document doc : docs) { if (doc.select(".svp-tabelle").size() > 0) { VertretungsplanTag tag = new VertretungsplanTag(); String date = "Unbekanntes Datum"; if (doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").size() > 0) date = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").text(); else if (doc.title().startsWith("Vertretungsplan fr ")) date = doc.title().substring("Vertretungsplan fr ".length()); tag.setDatum(date); if (doc.select(".svp-uploaddatum").size() > 0) tag.setStand(doc.select(".svp-uploaddatum").text().replace("Aktualisierung: ", "")); Elements rows = doc.select(".svp-tabelle tr"); String lastLesson = ""; for (Element row : rows) { if (row.hasClass("svp-header")) continue; Vertretung vertretung = new Vertretung(); List<String> affectedClasses = new ArrayList<String>(); for (Element column : row.select("td")) { if (!hasData(column.text())) { continue; } String type = column.className(); if (type.startsWith("svp-stunde")) { vertretung.setLesson(column.text()); lastLesson = column.text(); } else if (type.startsWith("svp-klasse")) affectedClasses = Arrays.asList(column.text().split(", ")); else if (type.startsWith("svp-esfehlt")) vertretung.setPreviousTeacher(column.text()); else if (type.startsWith("svp-esvertritt")) vertretung.setTeacher(column.text()); else if (type.startsWith("svp-fach")) vertretung.setSubject(column.text()); else if 
(type.startsWith("svp-bemerkung")) { vertretung.setDesc(column.text()); vertretung.setType(recognizeType(column.text())); } else if (type.startsWith("svp-raum")) vertretung.setRoom(column.text()); if (vertretung.getLesson() == null) vertretung.setLesson(lastLesson); } if (vertretung.getType() == null) { vertretung.setType("Vertretung"); } for (String klasse : affectedClasses) { KlassenVertretungsplan kv = tag.getKlassen().get(klasse); if (kv == null) kv = new KlassenVertretungsplan(klasse); kv.add(vertretung); tag.getKlassen().put(klasse, kv); } } List<String> nachrichten = new ArrayList<String>(); if (doc.select("h2:contains(Mitteilungen)").size() > 0) { Element h2 = doc.select("h2:contains(Mitteilungen)").first(); Element sibling = h2.nextElementSibling(); while (sibling != null && sibling.tagName().equals("p")) { for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) nachrichten.add(nachricht); } sibling = sibling.nextElementSibling(); } } tag.setNachrichten(nachrichten); tage.put(date, tag); } else { throw new IOException("keine SVPlan-Tabelle gefunden"); } } Vertretungsplan v = new Vertretungsplan(); v.setTage(new ArrayList<VertretungsplanTag>(tage.values())); return v; }