Example usage for org.jsoup.nodes Element text

Introduction

This page collects example usages of the text() method of org.jsoup.nodes.Element.

Prototype

public String text()

Source Link

Documentation

Gets the combined text of this element and all its children.

Usage

From source file:com.liato.bankdroid.banking.banks.Hors.java

/**
 * Refreshes this bank's state: logs in, reads the balance and card name from the
 * parsed {@code response} page, registers a single account, and then loads the
 * transaction rows from the partial transactions page.
 *
 * @throws BankException if the balance element ({@code cphMain_lblAmount}) is not
 *                       present in the parsed page
 * @throws IOException   declared by the original signature; presumably raised by the
 *                       network calls (confirm against {@code login()} / {@code urlopen})
 */
@Override
public void update() throws BankException, LoginException, BankChoiceException, IOException {
    super.update();
    urlopen = login();

    Document overviewPage = Jsoup.parse(response);

    // Without a balance there is nothing meaningful to report, so fail early.
    Element balanceNode = overviewPage.getElementById("cphMain_lblAmount");
    if (balanceNode == null) {
        String prefix = res.getText(R.string.unable_to_find).toString();
        String subject = res.getText(R.string.balance).toString();
        throw new BankException(prefix + subject);
    }

    // Fall back to the upper-cased bank name when the page carries no card name.
    Element nameNode = overviewPage.getElementById("lblCardName");
    String accountName;
    if (nameNode != null) {
        accountName = nameNode.text();
    } else {
        accountName = NAME.toUpperCase();
    }
    if (this.getCustomName().isEmpty()) {
        this.setCustomName(accountName);
    }

    Account cardAccount = new Account(accountName, Helpers.parseBalance(balanceNode.text()), "0");
    accounts.add(cardAccount);
    balance = balance.add(cardAccount.getBalance());

    // Every table row of the transactions fragment is converted to a Transaction.
    Document transactionPage = Jsoup
            .parse(urlopen.open("https://www.dittkort.se/q/Partial/Transactions.aspx?cnt=20"));
    Elements rows = transactionPage.select("tr");
    List<Transaction> parsedTransactions = new ArrayList<Transaction>();
    if (rows != null) {
        for (int i = 0; i < rows.size(); i++) {
            parsedTransactions.add(asTransaction(rows.get(i)));
        }
    }
    cardAccount.setTransactions(parsedTransactions);

    super.updateComplete();
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java

/**
 * Extracts editors from the {@code dd} elements that follow an "Authors/Editors"
 * (or "Author/Editor") {@code dt} heading.
 *
 * <p>Each {@code dd} either holds a single editor or several editors separated by
 * {@code <br />}. For every editor the name is parsed with {@link NameParser} and the
 * {@code href} of each contained anchor is stored as e-mail (when it contains
 * {@code @}) or as website.
 *
 * @param url the URL of the document, used only in the warning log message
 * @param doc the parsed document to inspect
 * @return the parsed editors; {@code null} when no matching {@code dd} exists or no
 *         editor could be parsed; or the result of {@link EditorsRule5} when an entry
 *         looks like a " - " separated list
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd");
    if (editors.size() == 0) {
        return null;
    }

    boolean skip = false;
    for (Element editor : editors) {
        // The sibling selector also matches dd elements of later, unrelated dt headings;
        // start skipping as soon as such a heading precedes the current dd.
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("dt") && !isEditorsLabel(prev.text())) {
            skip = true;
        }

        if (skip) {
            // Stop skipping once the next sibling is an editors heading again.
            Element next = editor.nextElementSibling();
            if (next != null && isEditorsLabel(next.text())) {
                skip = false;
            }
            continue;
        }

        String editorText = editor.text();
        if (StringUtils.countMatches(editorText, " - ") > 2) {
            Log.log("warning", url + ": This editor may be a list of editors separated by  - ");
            EditorsRule5 ed5 = new EditorsRule5();

            return ed5.run(url, doc);
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            // Single editor in this dd.
            if (editorText.equals("WHATWG:") || editorText.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(editorText);
            if (result == null) {
                continue;
            }

            applyLinks(result, editor);
            editorList.add(result);
        } else {
            // Several editors separated by line breaks: parse each fragment on its own.
            for (String split : splitted) {
                if (split.isEmpty() || split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                applyLinks(result, newdoc);
                editorList.add(result);
            }
        }

        // A following dt starts a new definition term, so the editors section is over.
        Element next = editor.nextElementSibling();
        if (next != null && next.tag().getName().equals("dt")) {
            break;
        }
    }

    if (editorList.size() == 0) {
        return null;
    }

    return editorList;
}

/**
 * Tells whether the given heading text starts with "authors/editors" or
 * "author/editor", ignoring case and surrounding whitespace. Lower-casing uses
 * {@code Locale.ROOT} so the comparison does not depend on the default locale.
 */
private static boolean isEditorsLabel(String text) {
    String normalized = text.trim().toLowerCase(java.util.Locale.ROOT);
    return normalized.startsWith("authors/editors") || normalized.startsWith("author/editor");
}

/**
 * Copies the non-empty {@code href} of every anchor inside {@code container} onto
 * {@code person}: as e-mail (with "mailto:" removed) when it contains {@code @},
 * otherwise as website. The anchors are selected once instead of once per access.
 */
private static void applyLinks(Person person, Element container) {
    for (Element anchor : container.select("a")) {
        String href = anchor.attr("href");
        if (href.isEmpty()) {
            continue;
        }
        if (href.contains("@")) {
            person.setEmail(href.replace("mailto:", ""));
        } else {
            person.addWebsite(href);
        }
    }
}

From source file:cvegrabber.CVEController.java

/**
 * Fetches the MITRE page for a CVE and extracts the requested piece of data.
 *
 * @param cveid the CVE identifier appended to the MITRE lookup URL
 * @param data  which data to extract: {@code "references"} yields the text of every
 *              {@code li} element joined by commas, {@code "description"} yields the
 *              text of the third {@code td[colspan="2"]} cell; any other value
 *              (including {@code null}) yields an empty string
 * @return the extracted text; a fixed message when the page's {@code h2} text contains
 *         "ERROR" or the description is still marked "** RESERVED **"
 * @throws IOException if the page cannot be fetched
 */
private String grabMitreData(String cveid, String data) throws IOException {
    String url = "http://cve.mitre.org/cgi-bin/cvename.cgi?name=" + cveid;
    Document doc = Jsoup.connect(url).get();

    if (doc.select("h2").text().contains("ERROR")) {
        return "CVE " + cveid + " Unknown or CVE Not Loaded Yet.";
    }

    // Plain equality is intended here; String.matches would treat the literal as a regex.
    if ("references".equals(data)) {
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (Element reference : doc.select("li")) {
            if (!first) {
                joined.append(',');
            }
            joined.append(reference.text());
            first = false;
        }
        return joined.toString();
    }

    if ("description".equals(data)) {
        String description = doc.select("td[colspan=\"2\"]").eq(2).text();
        if (description.contains("** RESERVED **")) {
            return "No data on mitre yet.";
        }
        return description;
    }

    return "";
}

From source file:com.ignorelist.kassandra.steam.scraper.HtmlTagLoader.java

/**
 * Builds a {@link GameInfo} for the given game by parsing its cached HTML page.
 *
 * <p>Name, icon and header image are always read when {@code types} is non-empty;
 * tags are collected only for the tag types contained in {@code types}. Failures
 * while loading or parsing are logged and result in a partially filled (possibly
 * id-only) {@link GameInfo}.
 *
 * @param gameId the id of the game; also used as cache key and to build the base URL
 * @param types  the tag types to extract; when empty the page is not loaded at all
 * @return the game info, never {@code null}
 */
@Override
public GameInfo load(Long gameId, EnumSet<TagType> types) {
    GameInfo gameInfo = new GameInfo();
    gameInfo.setId(gameId);
    try {
        if (!types.isEmpty()) {
            InputStream inputStream = cache.get(gameId.toString());
            try {
                Document document = Jsoup.parse(inputStream, Charsets.UTF_8.name(), buildPageUrl(gameId));

                Elements appName = document.select("div.apphub_AppName");
                Element nameElement = Iterables.getFirst(appName, null);
                if (null != nameElement && null != nameElement.text()) {
                    gameInfo.setName(nameElement.text().trim());
                }

                Elements appIconElements = document.select("div.apphub_AppIcon img");
                gameInfo.setIcon(getSrcUri(appIconElements));

                Elements headerImageElements = document.select("img.game_header_image_full");
                gameInfo.setHeaderImage(getSrcUri(headerImageElements));
                final SetMultimap<TagType, String> tags = gameInfo.getTags();

                if (types.contains(TagType.CATEGORY)) {
                    Elements categories = document.select("div#category_block a.name");
                    copyText(categories, tags.get(TagType.CATEGORY));
                }
                if (types.contains(TagType.GENRE)) {
                    Elements genres = document.select("div.details_block a[href*=/genre/]");
                    copyText(genres, tags.get(TagType.GENRE));
                }
                if (types.contains(TagType.USER)) {
                    // Visible and hidden user tags share one selector and are split by predicate.
                    Elements userTags = document.select("a.app_tag");
                    copyText(Iterables.filter(userTags, Predicates.not(DISPLAY_NONE_PREDICATE)),
                            tags.get(TagType.USER));
                    copyText(Iterables.filter(userTags, DISPLAY_NONE_PREDICATE), tags.get(TagType.USER_HIDDEN));
                }
                if (types.contains(TagType.VR)) {
                    // The attribute selector must be closed with ']'; an unbalanced
                    // selector is malformed and is rejected by strict selector parsers.
                    Elements vrSupport = document
                            .select("div.game_area_details_specs a.name[href*=#vrsupport=]");
                    copyText(vrSupport, tags.get(TagType.VR));
                }
            } finally {
                IOUtils.closeQuietly(inputStream);
            }
        }
    } catch (ExecutionException ex) {
        Logger.getLogger(HtmlTagLoader.class.getName()).log(Level.SEVERE,
                "Failed to load page for game " + gameId, ex);
    } catch (IOException ex) {
        Logger.getLogger(HtmlTagLoader.class.getName()).log(Level.SEVERE,
                "Failed to parse page for game " + gameId, ex);
    }

    return gameInfo;
}

From source file:net.niyonkuru.koodroid.html.SubscribersHandler.java

/**
 * Turns the subscriber list of the page into content provider operations.
 *
 * <p>Starting at the first {@code div#banSelector li:has(div)} element, every following
 * sibling is read as "name ... phone-number": the part after the last space (digits only)
 * is the subscriber id, the part before it the name. Existing subscribers produce an
 * update operation, unknown ones an insert. An entry without an {@code onClick}
 * attribute is written to the settings as the default subscriber.
 *
 * @param doc      the parsed page
 * @param resolver used to look up existing subscribers and to store the default subscriber
 * @return the operations, one per subscriber entry
 * @throws HandlerException if a subscriber id does not have exactly 10 digits or no
 *                          subscriber entry was found
 */
@Override
public ArrayList<ContentProviderOperation> parse(Document doc, ContentResolver resolver)
        throws HandlerException {
    final ArrayList<ContentProviderOperation> batch = new ArrayList<ContentProviderOperation>();

    for (Element entry = doc.select("div#banSelector li:has(div)").first(); entry != null; entry = entry
            .nextElementSibling()) {
        final String entryText = entry.text();

        /* the name and the phone number are assumed to be separated by a space */
        final int numberStart = entryText.lastIndexOf(' ') + 1;

        final String subscriberId = entryText.substring(numberStart).replaceAll("\\D", "");
        if (subscriberId.length() != 10) {
            throw new HandlerException(getString(R.string.parser_error_unexpected_input));
        }

        final Uri subscriberUri = Subscribers.buildSubscriberUri(subscriberId);
        final boolean alreadyKnown = subscriberExists(subscriberUri, resolver);

        final ContentProviderOperation.Builder operation = alreadyKnown
                ? ContentProviderOperation.newUpdate(subscriberUri)
                : ContentProviderOperation.newInsert(Subscribers.CONTENT_URI);
        if (alreadyKnown) {
            operation.withValue(Subscribers.UPDATED, System.currentTimeMillis());
        }
        operation.withValue(Subscribers.SUBSCRIBER_ID, subscriberId);

        /* capitalize every word of the name and join the words with single spaces */
        StringBuilder nameBuilder = new StringBuilder();
        for (String part : entryText.substring(0, numberStart).split("\\s")) {
            nameBuilder.append(ParserUtils.capitalize(part)).append(" ");
        }
        operation.withValue(Subscribers.SUBSCRIBER_FULL_NAME, nameBuilder.toString().trim());

        if (!entry.hasAttr("onClick")) {
            /* no switcher url: this is the default subscriber */
            ContentValues defaults = new ContentValues(1);
            defaults.put(Settings.SUBSCRIBER, subscriberId);

            resolver.insert(Settings.CONTENT_URI, defaults);
        } else {
            /* keep only the url part: from the first '/' up to the last single quote */
            String handler = entry.attr("onClick");
            int urlStart = handler.indexOf('/');
            int urlEnd = handler.lastIndexOf('\'');
            operation.withValue(Subscribers.SUBSCRIBER_SWITCHER, handler.substring(urlStart, urlEnd));
        }
        operation.withValue(Subscribers.SUBSCRIBER_EMAIL, mParent);

        batch.add(operation.build());
    }

    if (batch.isEmpty()) {
        throw new HandlerException(getString(R.string.parser_error_unexpected_input));
    }

    JSONObject crashMetadata = new JSONObject();
    try {
        crashMetadata.put("subscribers", batch.size());
        crashMetadata.put("language", getString(R.string.locale));
    } catch (JSONException ignored) {
        /* metadata is best effort only */
    }
    Crittercism.setMetadata(crashMetadata);
    Crittercism.setUsername(mParent);

    return batch;
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Replaces every anchor below {@code topNode} that contains no image with a plain
 * text node carrying the anchor's text. Anchors that wrap an {@code img} are kept.
 */
private void convertLinksToText() {
    if (logger.isDebugEnabled()) {
        logger.debug("Turning links to text");
    }

    for (Element anchor : topNode.getElementsByTag("a")) {
        boolean wrapsImage = anchor.getElementsByTag("img").size() != 0;
        if (wrapsImage) {
            continue;
        }
        anchor.replaceWith(new TextNode(anchor.text(), topNode.baseUri()));
    }
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Renders the text of all {@code p} elements found in {@code topNode} (the node itself
 * included), each HTML-unescaped, trimmed and followed by a blank line.
 * todo move this to an output formatter object instead of inline here
 *
 * @return the concatenated paragraph texts, each terminated by {@code "\n\n"}
 * @deprecated use {@link #getFormattedText(Element)}
 */
@Deprecated
public String getFormattedText() {
    StringBuilder formatted = new StringBuilder();

    for (Element candidate : topNode.getAllElements()) {
        if (!candidate.tagName().equals("p")) {
            continue;
        }
        String paragraph = StringEscapeUtils.unescapeHtml(candidate.text()).trim();
        formatted.append(paragraph).append("\n\n");
    }

    return formatted.toString();
}

From source file:io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDateDynamicExtractor.java

/**
 * Extracts the text of the first element matching the configured CSS selector and, when
 * it is a recognised date, normalises it to {@code yyyy-MM-dd HH:mm:ss}.
 *
 * <p>Two input formats are tried in order: {@code dd/MM/yyyy HH:mm} and
 * {@code yyyy-MM-dd HH:mm:ss}. The second is only attempted when the first fails.
 *
 * @param attributeDetail carries the extractor arguments; the first argument is the CSS selector
 * @param url             not used by this extractor
 * @param articleDoc      the parsed article document
 * @return the normalised date; the raw element text when it matches neither format;
 *         {@code null} when no selector is configured, the selector is blank, or nothing matches
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Check the selector before using it: selecting with a blank query is an error.
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                attrib_value = element.text();
            }
        }
    }

    if (attrib_value == null) {
        return null;
    }

    String pubtext = attrib_value;
    Date result = null;

    // "MM" is month and "HH" the 24-hour clock; lower-case "mm"/"hh" would mean
    // minutes and a 12-hour clock and silently yield wrong dates.
    DateFormat df = new SimpleDateFormat("dd/MM/yyyy HH:mm", Locale.ENGLISH);
    try {
        result = df.parse(pubtext);
    } catch (ParseException e) {
        logger.info("Failed to parse date with dd/MM/yyyy HH:mm format " + pubtext);
    }

    if (result == null) {
        // Fall back to the ISO-like format only when the first one did not match.
        df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
        try {
            result = df.parse(pubtext);
        } catch (ParseException e) {
            logger.info("Failed to parse date " + pubtext);
        }
    }

    if (result != null) {
        SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        attrib_value = dateFormatter.format(result);
    } else {
        logger.error("Failed to parse date " + pubtext);
    }

    return attrib_value;
}

From source file:org.commonjava.maven.galley.transport.htcli.internal.HttpListing.java

/**
 * Retrieves the HTML directory listing at {@code url} and extracts the entry names
 * from its links.
 *
 * <p>A link is accepted when it points to the same server, is a sub-path of the listing
 * URL, its {@code href} ends with the link text (optionally followed by {@code '/'}),
 * and the link text is not in {@code EXCLUDES}. While running, the current thread is
 * renamed to include the URL; the old name is restored afterwards.
 *
 * @return the listing result, or {@code null} when the request did not succeed or an
 *         error occurred (the error is stored in {@code error})
 */
@Override
public ListingResult call() {
    request = new HttpGet(url);

    ListingResult result = null;
    InputStream in = null;

    String oldName = Thread.currentThread().getName();
    try {
        String newName = oldName + ": LIST " + url;
        Thread.currentThread().setName(newName);

        if (executeHttp()) {
            in = response.getEntity().getContent();
            // TODO: Charset!! The content is decoded with the platform default charset.
            String listing = IOUtils.toString(in);
            Logger logger = LoggerFactory.getLogger(getClass());
            logger.debug("Got raw listing content:\n\n{}\n\n", listing);

            final ArrayList<String> al = new ArrayList<>();

            Document doc = Jsoup.parse(listing, url);

            if (doc != null) {
                // The base URL is the same for every link, so build it once.
                final URL baseUrl = new URL(url);

                for (final Element link : doc.select("a")) {
                    String linkText = link.text();
                    String linkHref = link.attr("href");

                    boolean sameServer = isSameServer(baseUrl, linkHref);
                    boolean subpath = isSubpath(baseUrl, linkHref);

                    if ((sameServer && subpath)
                            && (linkHref.endsWith(linkText) || linkHref.endsWith(linkText + '/'))
                            && !EXCLUDES.contains(linkText)) {
                        al.add(linkText);
                    }
                }

                result = new ListingResult(resource, al.toArray(new String[al.size()]));
            }
        }
    } catch (final TransferException e) {
        this.error = e;
    } catch (final IOException e) {
        this.error = new TransferException("Failed to construct directory listing for: {}. Reason: {}", e, url,
                e.getMessage());
    } finally {
        closeQuietly(in);
        cleanup();
        if (oldName != null) {
            Thread.currentThread().setName(oldName);
        }
    }

    return error == null ? result : null;
}

From source file:com.johan.vertretungsplan.parser.SVPlanParser.java

/**
 * Logs in, downloads every configured URL and parses the SVPlan tables into a
 * {@link Vertretungsplan}.
 *
 * <p>For each document the date, the upload timestamp, the substitution rows (grouped
 * by affected class) and the messages following the "Mitteilungen" heading are read.
 * Days are keyed by their date text, so a later document with the same date replaces
 * an earlier one.
 *
 * @return the parsed substitution plan
 * @throws IOException   if a document contains no {@code .svp-tabelle} element;
 *                       presumably also on network failures (confirm against {@code loadUrl})
 * @throws JSONException if the school configuration lacks the expected keys
 */
public Vertretungsplan getVertretungsplan() throws IOException, JSONException {
    new LoginHandler(schule).handleLogin(executor, cookieStore, username, password);

    JSONArray urls = schule.getData().getJSONArray("urls");
    String encoding = schule.getData().getString("encoding");
    List<Document> docs = new ArrayList<Document>();

    for (int i = 0; i < urls.length(); i++) {
        JSONObject url = urls.getJSONObject(i);
        loadUrl(url.getString("url"), encoding, docs);
    }

    // Written with a unicode escape so the umlaut survives any source-file encoding.
    final String titlePrefix = "Vertretungsplan f\u00fcr ";

    LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>();
    for (Document doc : docs) {
        if (doc.select(".svp-tabelle").size() > 0) {
            VertretungsplanTag tag = new VertretungsplanTag();

            // Prefer the explicit date element; fall back to the page title.
            String date = "Unbekanntes Datum";
            Elements dateElements = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen");
            if (dateElements.size() > 0) {
                date = dateElements.text();
            } else if (doc.title().startsWith(titlePrefix)) {
                date = doc.title().substring(titlePrefix.length());
            }
            tag.setDatum(date);

            Elements uploadDate = doc.select(".svp-uploaddatum");
            if (uploadDate.size() > 0) {
                tag.setStand(uploadDate.text().replace("Aktualisierung: ", ""));
            }

            Elements rows = doc.select(".svp-tabelle tr");
            String lastLesson = "";
            for (Element row : rows) {
                if (row.hasClass("svp-header")) {
                    continue;
                }

                Vertretung vertretung = new Vertretung();
                List<String> affectedClasses = new ArrayList<String>();

                for (Element column : row.select("td")) {
                    String cellText = column.text();
                    if (!hasData(cellText)) {
                        continue;
                    }
                    String type = column.className();
                    if (type.startsWith("svp-stunde")) {
                        vertretung.setLesson(cellText);
                        lastLesson = cellText;
                    } else if (type.startsWith("svp-klasse")) {
                        affectedClasses = Arrays.asList(cellText.split(", "));
                    } else if (type.startsWith("svp-esfehlt")) {
                        vertretung.setPreviousTeacher(cellText);
                    } else if (type.startsWith("svp-esvertritt")) {
                        vertretung.setTeacher(cellText);
                    } else if (type.startsWith("svp-fach")) {
                        vertretung.setSubject(cellText);
                    } else if (type.startsWith("svp-bemerkung")) {
                        vertretung.setDesc(cellText);
                        vertretung.setType(recognizeType(cellText));
                    } else if (type.startsWith("svp-raum")) {
                        vertretung.setRoom(cellText);
                    }

                    // Rows without their own lesson cell inherit the most recent lesson.
                    if (vertretung.getLesson() == null) {
                        vertretung.setLesson(lastLesson);
                    }
                }

                if (vertretung.getType() == null) {
                    vertretung.setType("Vertretung");
                }

                for (String klasse : affectedClasses) {
                    KlassenVertretungsplan kv = tag.getKlassen().get(klasse);
                    if (kv == null) {
                        kv = new KlassenVertretungsplan(klasse);
                    }
                    kv.add(vertretung);
                    tag.getKlassen().put(klasse, kv);
                }
            }

            // Messages are the paragraphs directly following the "Mitteilungen" heading,
            // split at double line breaks.
            List<String> nachrichten = new ArrayList<String>();
            Elements messageHeadings = doc.select("h2:contains(Mitteilungen)");
            if (messageHeadings.size() > 0) {
                Element h2 = messageHeadings.first();
                Element sibling = h2.nextElementSibling();
                while (sibling != null && sibling.tagName().equals("p")) {
                    for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                            .split("<br />\\s*<br />")) {
                        if (hasData(nachricht)) {
                            nachrichten.add(nachricht);
                        }
                    }
                    sibling = sibling.nextElementSibling();
                }
            }
            tag.setNachrichten(nachrichten);

            tage.put(date, tag);
        } else {
            throw new IOException("keine SVPlan-Tabelle gefunden");
        }
    }
    Vertretungsplan v = new Vertretungsplan();
    v.setTage(new ArrayList<VertretungsplanTag>(tage.values()));

    return v;
}