Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element method text().

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * Decides whether an element consists mostly of link text.
 *
 * <p>The score is the ratio of words found inside the element's {@code <a>} tags to the
 * words of the whole element, multiplied by the number of links. An element without any
 * link is never considered link-dense.
 *
 * @param e the element to inspect
 * @return {@code true} if the computed score is greater than 1, {@code false} otherwise
 */
private static boolean isHighLinkDensity(Element e) {
    Elements anchors = e.getElementsByTag("a");
    if (anchors.isEmpty()) {
        return false;
    }

    float totalWordCount = SPACE_SPLITTER.split(e.text().trim()).length;

    // The link texts are glued together without a separator before being split into words.
    StringBuilder anchorText = new StringBuilder();
    for (Element anchor : anchors) {
        anchorText.append(anchor.text());
    }
    float linkWordCount = SPACE_SPLITTER.split(anchorText.toString()).length;
    float linkCount = anchors.size();

    float linkWordRatio = linkWordCount / totalWordCount;
    float score = linkWordRatio * linkCount;

    if (logger.isDebugEnabled()) {
        // Only the first 50 characters of longer texts are logged.
        String fullText = e.text();
        String logText = fullText.length() >= 51 ? fullText.substring(0, 50) : fullText;
        logger.debug("Calulated link density score as: " + score + " for node: " + logText);
    }

    return score > 1;
}

From source file:org.ala.lucene.CreateWordPressIndex.java

/**
 * Read the Google sitemap file on WP site and load up a list
 * of page URL./*from w w w .j  av  a  2  s  .c o  m*/
 *
 * @throws IOException
 */
protected void loadSitemap() throws IOException {
    Document doc = Jsoup.connect(WP_SITEMAP_URI).get();
    Elements pages = doc.select("loc");
    logger.info("Sitemap file lists " + pages.size() + " pages.");

    for (Element page : pages) {
        // add it to list of page urls Field
        this.pageUrls.add(page.text());
    }
}

From source file:crawler.AScraper.java

/**
 * Filter between {@code channel2} and {@code channel3} that lets a payload pass only when
 * its text matches {@code patter}.
 *
 * <p>Payloads whose text starts with "Beautyleg" are dropped silently; every other
 * non-matching payload is dropped and reported as an error.
 *
 * @param payload the element whose text is checked
 * @return {@code true} if the text matches the pattern, {@code false} otherwise
 */
@Filter(inputChannel = "channel2", outputChannel = "channel3")
public boolean filter(Element payload) {
    if (patter.matcher(payload.text()).find()) {
        return true;
    }
    if (!payload.text().startsWith("Beautyleg")) {
        LOG.error(String.format("Anchor text dose not match pattern:[%s]", payload.text()));
    }
    return false;
}

From source file:io.seldon.importer.articles.ItemAttributesImporter.java

/**
 * Downloads an article page and extracts its attributes using the configured CSS selectors.
 *
 * <p>Consecutive fetches are throttled so that at least {@code minFetchGapMsecs}
 * milliseconds lie between them. The outcome is always reported through
 * {@code AttributesImporterUtils.logResult}.
 *
 * @param url              the article URL; also used as the client item id in the logged result
 * @param existingCategory not used by this method
 * @return the extracted attributes, or {@code null} when a required attribute (title, and
 *         image/category/domain where needed) is missing or the extraction failed with an
 *         exception
 */
public static Map<String, String> getAttributes(String url, String existingCategory) {
    ItemProcessResult itemProcessResult = new ItemProcessResult();
    itemProcessResult.client_item_id = url;
    itemProcessResult.extraction_status = "EXTRACTION_FAILED";

    logger.info("Trying to get attributes for " + url);
    Map<String, String> attributes = null;
    String title = "";
    String category = "";
    String subCategory = "";
    String img_url = "";
    String description = "";
    String tags = "";
    String leadtext = "";
    String link = "";
    String publishDate = "";
    String domain = "";
    try {
        // Throttle: wait until the minimum gap since the previous fetch has elapsed.
        long now = System.currentTimeMillis();
        long timeSinceLastRequest = now - lastUrlFetchTime;
        if (timeSinceLastRequest < minFetchGapMsecs) {
            long timeToSleep = minFetchGapMsecs - timeSinceLastRequest;
            logger.info(
                    "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest);
            Thread.sleep(timeToSleep);
        }
        Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get();
        lastUrlFetchTime = System.currentTimeMillis();
        //get IMAGE URL
        if (StringUtils.isNotBlank(imageCssSelector)) {
            Element imageElement = articleDoc.select(imageCssSelector).first();
            if (imageElement != null && imageElement.attr("content") != null) {
                img_url = imageElement.attr("content");
            }
            // fall back to the src and then the href attribute
            if (imageElement != null && StringUtils.isBlank(img_url)) {
                img_url = imageElement.attr("src");
            }
            if (imageElement != null && StringUtils.isBlank(img_url)) {
                img_url = imageElement.attr("href");
            }

        }

        if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) {
            logger.info("Setting image to default: " + defImageUrl);
            img_url = defImageUrl;
        }
        img_url = StringUtils.strip(img_url);

        //get TITLE
        if (StringUtils.isNotBlank(titleCssSelector)) {
            Element titleElement = articleDoc.select(titleCssSelector).first();
            if ((titleElement != null) && (titleElement.attr("content") != null)) {
                title = titleElement.attr("content");
            }

            // if still blank get from text instead
            if (StringUtils.isBlank(title) && (titleElement != null)) {
                title = titleElement.text();
            }
        }

        //get LEAD TEXT
        if (StringUtils.isNotBlank(leadTextCssSelector)) {
            Element leadElement = articleDoc.select(leadTextCssSelector).first();
            if (leadElement != null && leadElement.attr("content") != null) {
                leadtext = leadElement.attr("content");
            }
        }

        //get publish date
        if (StringUtils.isNotBlank(publishDateCssSelector)) {
            //2013-01-21T10:40:55Z
            Element pubElement = articleDoc.select(publishDateCssSelector).first();
            if (pubElement != null && pubElement.attr("content") != null) {
                String pubtext = pubElement.attr("content");
                SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
                Date result = null;
                try {
                    result = df.parse(pubtext);
                } catch (ParseException e) {
                    logger.info("Failed to parse date withUTC format " + pubtext);
                }
                // Only fall back to the simpler format when the first one did not match;
                // otherwise a successful parse would still log a misleading failure.
                if (result == null) {
                    df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
                    try {
                        result = df.parse(pubtext);
                    } catch (ParseException e) {
                        logger.info("Failed to parse date " + pubtext);
                    }
                }

                if (result != null)
                    publishDate = dateFormatter.format(result);
                else
                    logger.error("Failed to parse date " + pubtext);
            }
        }

        //get Link
        if (StringUtils.isNotBlank(linkCssSelector)) {
            Element linkElement = articleDoc.select(linkCssSelector).first();
            if (linkElement != null && linkElement.attr("content") != null) {
                link = linkElement.attr("content");
            }
        }

        //get CONTENT
        if (StringUtils.isNotBlank(textCssSelector)) {
            Element descriptionElement = articleDoc.select(textCssSelector).first();
            if (descriptionElement != null)
                description = Jsoup.parse(descriptionElement.html()).text();
        }

        //get TAGS
        Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title);

        if (tagSet.size() > 0)
            tags = CollectionTools.join(tagSet, ",");

        //get CATEGORY - client specific
        if (StringUtils.isNotBlank(categoryCssSelector)) {
            Element categoryElement = articleDoc.select(categoryCssSelector).first();
            if (categoryElement != null && categoryElement.attr("content") != null) {
                category = categoryElement.attr("content");
                if (StringUtils.isNotBlank(category))
                    category = category.toUpperCase();
            }
        } else if (StringUtils.isNotBlank(categoryClassPrefix)) {
            // no selector configured: delegate to a client specific extractor loaded by name
            String className = "io.seldon.importer.articles.category." + categoryClassPrefix
                    + "CategoryExtractor";
            Class<?> clazz = Class.forName(className);
            Constructor<?> ctor = clazz.getConstructor();
            CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
            category = extractor.getCategory(url, articleDoc);
        }

        //get Sub CATEGORY - client specific
        if (StringUtils.isNotBlank(subCategoryCssSelector)) {
            Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first();
            if (subCategoryElement != null && subCategoryElement.attr("content") != null) {
                subCategory = subCategoryElement.attr("content");
                // upper-case the sub category itself (previously it was overwritten with the category)
                if (StringUtils.isNotBlank(subCategory))
                    subCategory = subCategory.toUpperCase();
            }
        } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) {
            String className = "io.seldon.importer.articles.category." + subCategoryClassPrefix
                    + "SubCategoryExtractor";
            Class<?> clazz = Class.forName(className);
            Constructor<?> ctor = clazz.getConstructor();
            CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
            subCategory = extractor.getCategory(url, articleDoc);
        }

        // Get domain
        if (domainIsNeeded) {
            domain = getDomain(url);
        }

        // The title is always required; image, category and domain only when configured as needed.
        if ((StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url))
                && (categoryNotNeeded || StringUtils.isNotBlank(category))
                && (!domainIsNeeded || StringUtils.isNotBlank(domain)))) {
            attributes = new HashMap<String, String>();
            attributes.put(TITLE, title);
            if (StringUtils.isNotBlank(category))
                attributes.put(CATEGORY, category);
            if (StringUtils.isNotBlank(subCategory))
                attributes.put(SUBCATEGORY, subCategory);
            if (StringUtils.isNotBlank(link))
                attributes.put(LINK, link);
            if (StringUtils.isNotBlank(leadtext))
                attributes.put(LEAD_TEXT, leadtext);
            if (StringUtils.isNotBlank(img_url))
                attributes.put(IMG_URL, img_url);
            if (StringUtils.isNotBlank(tags))
                attributes.put(TAGS, tags);
            attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE);
            if (StringUtils.isNotBlank(description))
                attributes.put(DESCRIPTION, description);
            if (StringUtils.isNotBlank(publishDate))
                attributes.put(PUBLISH_DATE, publishDate);
            if (StringUtils.isNotBlank(domain))
                attributes.put(DOMAIN, domain);
            System.out.println("Item: " + url + "; Category: " + category + " SubCategory: " + subCategory);
            itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED";
        } else {
            logger.warn("Failed to get needed attributes for article " + url);
            logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain="
                    + domain + "]");
        }

        { // check for failures for the log result
            if (StringUtils.isBlank(title)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "title";
            }
            if (!imageNotNeeded && StringUtils.isBlank(img_url)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url";
            }
            if (!categoryNotNeeded && StringUtils.isBlank(category)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",")
                        + "category";
            }
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still observe the interruption.
        Thread.currentThread().interrupt();
        logger.error("Article: " + url + ". Attributes import FAILED", e);
        itemProcessResult.error = e.toString();
    } catch (Exception e) {
        logger.error("Article: " + url + ". Attributes import FAILED", e);
        itemProcessResult.error = e.toString();
    }

    AttributesImporterUtils.logResult(logger, itemProcessResult);

    return attributes;
}

From source file:de.geeksfactory.opacclient.apis.TouchPoint.java

/**
 * Parses the reservations table ({@code .data tr}) of an account page.
 *
 * <p>The first row is skipped. For each remaining row the title is read from the bold text
 * of the third cell; author, branch and status are taken from the {@code <br>}-separated
 * parts of the third and fourth cells when enough parts are present.
 *
 * @param doc the parsed account page
 * @return the reserved items, or {@code null} if the table has at most one row, a row
 *         contains "keine Daten", or a row has exactly one child
 */
static List<ReservedItem> parse_reslist(Document doc) {
    Elements rows = doc.select(".data tr");
    int rowCount = rows.size();
    if (rowCount <= 1) {
        return null;
    }

    List<ReservedItem> reservations = new ArrayList<>();
    for (Element tr : rows.subList(1, rowCount)) {
        if (tr.text().contains("keine Daten") || tr.children().size() == 1) {
            return null;
        }

        ReservedItem item = new ReservedItem();
        item.setTitle(tr.child(2).select("b, strong").text().trim());
        try {
            String[] titleParts = tr.child(2).html().split("<br[ /]*>");
            String[] statusParts = tr.child(3).html().split("<br[ /]*>");
            if (titleParts.length > 1) {
                item.setAuthor(titleParts[1].trim());
            }
            // Branch and status are only set when all three parts of the cell exist.
            if (statusParts.length > 2) {
                item.setBranch(statusParts[2].trim());
                item.setStatus(statusParts[0].trim() + " (" + statusParts[1].trim() + ")");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        reservations.add(item);
    }
    return reservations;
}

From source file:fr.arlefebvre.pronostics.controller.UEFATeamsController.java

/**
 * Returns the UEFA national teams scraped from the FIFA world ranking page, sorted by name.
 *
 * <p>A previously fetched, non-empty list is served from {@code pseudoCache}. If the page
 * cannot be fetched the stack trace is printed and an empty list is returned; empty results
 * are not cached so that a later call retries the fetch.
 *
 * @return the list of teams, possibly empty
 */
@RequestMapping("/uefa/teams")
public List<Team> teams() {
    if (pseudoCache != null && !pseudoCache.isEmpty()) {
        return pseudoCache;
    }
    ArrayList<Team> result = new ArrayList<Team>();
    String uri = "http://fr.fifa.com/fifa-world-ranking/ranking-table/men/uefa.html";

    // Connect to the site and load the HTML document.
    try {
        Document doc = Jsoup.connect(uri).get();
        for (Element table : doc.getElementsByClass("table")) {
            Element tbody = table.getElementsByTag("tbody").first();
            if (tbody == null) {
                // table without a body: nothing to read
                continue;
            }
            for (Element row : tbody.children()) {
                Element teamNameElement = row.getElementsByClass("tbl-teamname").first();
                Element countryCodeElement = row.getElementsByClass("tbl-countrycode").first();
                if (teamNameElement == null || countryCodeElement == null) {
                    // skip rows that do not describe a team instead of failing with an NPE
                    continue;
                }
                Element image = teamNameElement.select("img").first();

                Team team = new Team();
                team.setName(teamNameElement.text());
                team.setCountryCode(countryCodeElement.text());
                team.setImgUrl(image != null ? image.absUrl("src") : "");
                team.setNationalTeam(true);
                result.add(team);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    result.sort((t1, t2) -> t1.getName().compareTo(t2.getName()));
    // Cache whenever teams were found. Checking only for a null cache meant that an empty
    // list stored after a failed fetch could never be replaced by a later successful result.
    if (!result.isEmpty()) {
        pseudoCache = result;
    }
    return result;
}

From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java

/**
 * Extracts a single debate article from the given HTML page.
 *
 * <p>Reads timestamp, title, text, author and topics from the {@code article.rfd} element
 * and the debate title, debate URL, debate description and document URL from the
 * surrounding document.
 *
 * @param html the HTML source of the page
 * @return the populated article
 * @throws ParseException if the publication date matches neither supported format
 * @throws IOException    if the page contains no {@code article.rfd} element
 */
public Article extractArticle(String html) throws ParseException, IOException {
    Article result = new Article();

    Document doc = Jsoup.parse(html, getBaseName());

    Element element = doc.select("article.rfd").first();
    if (element == null) {
        throw new IOException("Cannot find article.rfd element");
    }

    // timestamp: date with time first, date only (e.g. "June 24, 2015") as fallback
    String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", "");
    Date timestamp;
    try {
        timestamp = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH).parse(dateText);
    } catch (ParseException e) {
        timestamp = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH).parse(dateText);
    }
    result.setTimestamp(timestamp);

    // title
    String title = element.select("h1").text();
    result.setTitle(TextCleaningUtils.normalizeWithParagraphs(title));

    // text: one line per paragraph
    StringBuilder paragraphs = new StringBuilder();
    for (Element p : element.select("div.nytint-post > p")) {
        paragraphs.append(p.text()).append("\n");
    }
    result.setText(TextCleaningUtils.normalizeWithParagraphs(paragraphs.toString()));

    // debate title
    String debateTitle = doc.select("div.nytint-discussion-overview > h2").text();
    result.setDebateTitle(TextCleaningUtils.normalizeWithParagraphs(debateTitle));

    // debate url
    Element debateLink = doc.select("div.nytint-discussion-overview > h2 > a").iterator().next();
    result.setDebateUrl(debateLink.attr("href"));

    // document url
    result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content"));

    // debate description: the first child node of the overview paragraph
    Element overviewParagraph = doc.select("div.nytint-discussion-overview > p").iterator().next();
    TextNode descriptionNode = (TextNode) overviewParagraph.childNodes().iterator().next();
    result.setDebateDescription(TextCleaningUtils.normalizeWithParagraphs(descriptionNode.text()));

    // author
    Element mugshot = element.select("div.nytint-mugshots > img").iterator().next();
    result.setAuthor(mugshot.attr("alt"));

    // topics
    for (Element topicLink : element.select("p.nytint-tags > a")) {
        result.getTopics().add(topicLink.attr("href"));
    }

    return result;
}

From source file:org.talend.license.LicenseRetriver.java

/**
 * Retrieves the licenses of the given version from the newest build that provides any.
 *
 * <p>The license page is searched for elements whose own text matches the configured
 * license item pattern. The matching texts are ordered descending by the pattern's second
 * capture group and tried one after another until {@code checkout} returns files.
 *
 * @param version the product version, substituted into the configured URL and pattern
 * @param file    passed through to {@code checkout}
 * @return the files returned by the first successful checkout, or {@code null} if the page
 *         could not be loaded, lists no matching build, or no build yielded any file
 */
public Collection<File> updateLicense(final String version, final File file) {
    logger.info("start to update {} license ", version);
    String url = String.format(Configer.getBuildURL() + Configer.getLicenseURL(), version);

    Document doc = connector.getPage(url);
    if (null == doc) {
        logger.error("no {} license page url:{}", version, url);
        return null;
    }
    String regex = String.format(Configer.getLicenseItem(), version);

    Elements eles = doc.getElementsMatchingOwnText(regex);

    if (eles.isEmpty()) {
        logger.error("no {} license page url:{}", version, url);
        return null;
    }

    final Pattern pattern = Pattern.compile(regex);

    // Order by the second capture group, descending, so the iteration starts with the
    // highest value.
    SortedSet<String> set = new TreeSet<String>(new Comparator<String>() {

        @Override
        public int compare(String o1, String o2) {
            String m1;
            String m2;
            Matcher matcher = pattern.matcher(o1);
            if (matcher.find()) {
                m1 = matcher.group(2);
            } else {
                return 1;
            }
            matcher = pattern.matcher(o2);
            if (matcher.find()) {
                m2 = matcher.group(2);
            } else {
                return -1;
            }
            return m2.compareTo(m1);
        }
    });
    logger.info("there are {} license build", eles.size());
    for (Element ele : eles) {
        set.add(ele.text());
    }
    if (set.isEmpty()) {
        return null;
    }

    for (String target : set) {
        // Build each candidate URL from the unchanged base URL; appending to `url` itself
        // would carry the previous build names over into every later attempt.
        String buildUrl = url + target;
        logger.info("retrive from newest build {}", buildUrl);
        Collection<File> fs = checkout(version, file, buildUrl);
        if (fs != null && !fs.isEmpty()) {
            return fs;
        }
        logger.info("no available license in build");
    }
    logger.error("retrive license failed");
    return null;
}

From source file:de.geeksfactory.opacclient.apis.TouchPoint.java

/**
 * Parses the table of lent media ({@code .data tr}) of an account page.
 *
 * <p>The first row is skipped. For each remaining row the title, author, deadline, home
 * branch and renewal link are read where present. A failure while reading one of these
 * details is printed and does not prevent the item from being added.
 *
 * @param doc the parsed account page
 * @return the lent items, or {@code null} if the table has exactly one row or a row
 *         contains "keine Daten"
 */
static List<LentItem> parse_medialist(Document doc) {
    List<LentItem> media = new ArrayList<>();
    Elements copytrs = doc.select(".data tr");

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs == 1) {
        return null;
    }
    assert (trs > 0);
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        LentItem item = new LentItem();

        if (tr.text().contains("keine Daten")) {
            return null;
        }
        item.setTitle(tr.select(".account-display-title").select("b, strong").text().trim());
        try {
            // The author follows the first <br>. A title cell without one must not abort
            // the parsing of deadline, branch and renewal link.
            String[] titleSplit = tr.select(".account-display-title").html().split("<br[ /]*>");
            if (titleSplit.length > 1) {
                item.setAuthor(titleSplit[1].trim());
            }

            String[] col3split = tr.select(".account-display-state").html().split("<br[ /]*>");
            String deadline = Jsoup.parse(col3split[0].trim()).text().trim();
            if (deadline.contains(":")) {
                // BSB Munich: <span class="hidden-sm hidden-md hidden-lg">Fälligkeitsdatum : </span>26.02.2016<br>
                String[] labelAndDate = deadline.split(":");
                if (labelAndDate.length > 1) {
                    deadline = labelAndDate[1].trim();
                }
            }
            if (deadline.contains("-")) {
                // Chemnitz: 22.07.2015 - 20.10.2015<br>
                String[] fromAndTo = deadline.split("-");
                if (fromAndTo.length > 1) {
                    deadline = fromAndTo[1].trim();
                }
            }

            try {
                item.setDeadline(fmt.parseLocalDate(deadline).toString());
            } catch (IllegalArgumentException e1) {
                e1.printStackTrace();
            }

            if (col3split.length > 1) {
                item.setHomeBranch(col3split[1].trim());
            }

            for (Element link : tr.select("a")) {
                String href = link.attr("abs:href");
                Map<String, String> hrefq = getQueryParamsFirst(href);
                // Null-safe comparison: links without a methodToCall parameter are skipped
                // instead of raising an NPE that would end the search for the renewal link.
                if (hrefq != null && "renewal".equals(hrefq.get("methodToCall"))) {
                    item.setProlongData(href.split("\\?")[1]);
                    item.setRenewable(true);
                    break;
                }
            }

        } catch (Exception ex) {
            ex.printStackTrace();
        }

        media.add(item);
    }
    return media;
}