Android Open Source - Reader Article Text Extractor






From Project

Back to project page Reader.

License

The source code is released under:

GNU General Public License

If you think the Android project Reader listed on this page is inappropriate (for example, because it contains malicious code or tools, or violates copyright), please email info at java2s dot com. Thanks.

Java Source Code

package com.carlrice.reader.utils;
// from www.java2s.com
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;

/**
 * This class is thread safe.
 *
 * @author Alex P (ifesdjeen from jreadability)
 * @author Peter Karich
 */
public class ArticleTextExtractor {

    // Tag names of the elements that are scored as possible article containers.
    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");

    // Heading tags h1..h6 (used with matches(), so the whole tag name must match).
    private static final Pattern HEADINGS = Pattern.compile("h[1-6]");

    // Unlikely candidates: class/id fragments that cost an element 20 points each.
    // NOTE(review): there is no '|' between "sponsor" and "a(d|ll|...)", so the two
    // alternatives fuse into "sponsora(d|ll|gegate|rchive|ttachment)". The expression is
    // left untouched on purpose: inserting the separator would make a bare "ad" or "all"
    // match (e.g. inside "thread" or "gallery") and shift many scores. Confirm the
    // intended behavior before changing it.
    private static final Pattern UNLIKELY = Pattern.compile("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
            + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor"
            + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
            + "login|si(debar|gn|ngle)");

    // Most likely positive candidates: class match is worth +35, id match +40.
    private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
            + "|arti(cle|kel)|instapaper_body");

    // Most likely negative candidates: class/id match costs 50 points each.
    private static final Pattern NEGATIVE = Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
            + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
            + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");

    // Inline-style fragments that cost an element 50 points.
    private static final Pattern NEGATIVE_STYLE =
            Pattern.compile("hidden|display: ?none|font-size: ?small");

    /**
     * Parses the given HTML stream with jsoup and extracts the most article-like element.
     * <p>
     * The stream is handed to {@code Jsoup.parse(input, null, "")}, i.e. no explicit
     * charset and an empty base URI. This method does not close the stream.
     *
     * @param input            stream containing the HTML document
     * @param contentIndicator a text which should be included into the extracted content;
     *                         {@code null} or empty if there is none
     * @return the HTML of the best scoring element, or {@code null} if no candidate
     *         reached a positive score
     * @throws Exception if reading or parsing the stream fails
     */
    public static String extractContent(InputStream input, String contentIndicator) throws Exception {
        return extractContent(Jsoup.parse(input, null, ""), contentIndicator);
    }

    /**
     * Extracts the most article-like element from an already parsed document.
     * <p>
     * The document is modified: script, noscript and style elements are removed
     * before scoring. Candidates are visited in document order; the search stops early
     * as soon as a candidate scores above 300.
     *
     * @param doc              document to analyse, must not be {@code null}
     * @param contentIndicator a text which should be included into the extracted content;
     *                         {@code null} or empty if there is none
     * @return the result of {@code toString()} on the best scoring element (its HTML,
     *         tags included), or {@code null} if no candidate reached a positive score
     * @throws NullPointerException if {@code doc} is {@code null}
     */
    public static String extractContent(Document doc, String contentIndicator) {
        if (doc == null) {
            throw new NullPointerException("missing document");
        }

        // now remove the clutter
        prepareDocument(doc);

        Element bestMatchElement = null;
        int maxWeight = 0; // only strictly positive scores can win
        for (Element entry : getNodes(doc)) {
            int currentWeight = getWeight(entry, contentIndicator);
            if (currentWeight > maxWeight) {
                maxWeight = currentWeight;
                bestMatchElement = entry;
                if (maxWeight > 300) {
                    break; // good enough, do not score the remaining candidates
                }
            }
        }

        return bestMatchElement == null ? null : bestMatchElement.toString();
    }

    /**
     * Weights an element: the class/id/style score of the element itself, plus one
     * point per 10 characters of its own text (rounded), plus the score of its direct
     * children.
     *
     * @param e                element to weight, along with its child nodes
     * @param contentIndicator a text which should be included into the extracted content;
     *                         {@code null} or empty if there is none
     * @return the combined weight, may be negative
     */
    private static int getWeight(Element e, String contentIndicator) {
        int weight = calcWeight(e);
        weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
        weight += weightChildNodes(e, contentIndicator);
        return weight;
    }

    /**
     * Weights the direct children of the given element. Children whose own text is
     * shorter than 20 characters are ignored. For the remaining children:
     * <ul>
     * <li>+100 if the own text contains the content indicator,</li>
     * <li>+max(50, length / 10) if the own text is longer than 200 characters,</li>
     * <li>+30 for an h1 or h2,</li>
     * <li>for a div or p, the result of {@code calcWeightForChild} on the own text.</li>
     * </ul>
     * Additionally +30 is granted once if any scored div/p has the class "caption", and,
     * when at least two paragraphs with more than 50 characters were seen, +20 for every
     * direct child that is a heading (h1..h6).
     *
     * @param rootEl           element whose child nodes will be weighted
     * @param contentIndicator a text which should be included into the extracted content;
     *                         {@code null} or empty if there is none
     * @return the accumulated weight of the children, may be negative
     */
    private static int weightChildNodes(Element rootEl, String contentIndicator) {
        // An empty indicator is contained in every string, so treat it like "no indicator"
        // instead of handing +100 to every child.
        boolean hasIndicator = contentIndicator != null && !contentIndicator.isEmpty();

        int weight = 0;
        boolean captionFound = false;
        int longParagraphs = 0;
        for (Element child : rootEl.children()) {
            String ownText = child.ownText();
            int ownTextLength = ownText.length();
            if (ownTextLength < 20) {
                continue;
            }

            if (hasIndicator && ownText.contains(contentIndicator)) {
                weight += 100; // We certainly found the item
            }

            if (ownTextLength > 200) {
                weight += Math.max(50, ownTextLength / 10);
            }

            String tag = child.tagName();
            if (tag.equals("h1") || tag.equals("h2")) {
                weight += 30;
            } else if (tag.equals("div") || tag.equals("p")) {
                weight += calcWeightForChild(ownText);
                if (tag.equals("p") && ownTextLength > 50) {
                    longParagraphs++;
                }

                // equalsIgnoreCase avoids the default-locale dependency of toLowerCase()
                if ("caption".equalsIgnoreCase(child.className())) {
                    captionFound = true;
                }
            }
        }

        if (captionFound) {
            weight += 30;
        }

        if (longParagraphs >= 2) {
            for (Element subEl : rootEl.children()) {
                if (HEADINGS.matcher(subEl.tagName()).matches()) {
                    weight += 20;
                }
            }
        }
        return weight;
    }

    /**
     * Scores the own text of a div/p child. Text containing more than five occurrences
     * (in total) of "&amp;quot;", "&amp;lt;", "&amp;gt;" or "px" is penalised with -30;
     * otherwise the score is one point per 25 characters (rounded).
     *
     * @param ownText own text of the child element
     * @return -30 for penalised text, otherwise the length based score
     */
    private static int calcWeightForChild(String ownText) {
        int c = count(ownText, "&quot;")
                + count(ownText, "&lt;")
                + count(ownText, "&gt;")
                + count(ownText, "px");
        if (c > 5) {
            return -30;
        }
        return (int) Math.round(ownText.length() / 25.0);
    }

    /**
     * Scores an element by its class name, id and inline style only: +35/+40 for a
     * POSITIVE class/id, -20 each for an UNLIKELY class/id, -50 each for a NEGATIVE
     * class/id and -50 for a NEGATIVE_STYLE style attribute.
     *
     * @param e element to score
     * @return the attribute based weight, may be negative
     */
    private static int calcWeight(Element e) {
        String className = e.className();
        String id = e.id();

        int weight = 0;
        if (POSITIVE.matcher(className).find()) {
            weight += 35;
        }
        if (POSITIVE.matcher(id).find()) {
            weight += 40;
        }

        if (UNLIKELY.matcher(className).find()) {
            weight -= 20;
        }
        if (UNLIKELY.matcher(id).find()) {
            weight -= 20;
        }

        if (NEGATIVE.matcher(className).find()) {
            weight -= 50;
        }
        if (NEGATIVE.matcher(id).find()) {
            weight -= 50;
        }

        String style = e.attr("style");
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find()) {
            weight -= 50;
        }
        return weight;
    }

    /**
     * Prepares the document for scoring. Currently this only removes script, noscript
     * and style elements; unlikely candidates are not stripped but penalised during
     * scoring instead.
     *
     * @param doc document to prepare, modified in place
     */
    private static void prepareDocument(Document doc) {
        removeScriptsAndStyles(doc);
    }

    /**
     * Removes all script, noscript and style elements from the document.
     *
     * @param doc document to clean, modified in place
     * @return the same document instance
     */
    private static Document removeScriptsAndStyles(Document doc) {
        String[] tags = {"script", "noscript", "style"};
        for (String tag : tags) {
            Elements matches = doc.getElementsByTag(tag);
            for (Element item : matches) {
                item.remove();
            }
        }
        return doc;
    }

    /**
     * Collects every element below body whose tag name matches NODES.
     *
     * @param doc document to scan
     * @return the candidate elements in the order the selector returned them, so that
     *         repeated runs score candidates in the same sequence
     */
    private static Collection<Element> getNodes(Document doc) {
        Collection<Element> nodes = new ArrayList<>(64);
        for (Element el : doc.select("body").select("*")) {
            if (NODES.matcher(el.tagName()).matches()) {
                nodes.add(el);
            }
        }
        return nodes;
    }

    /**
     * Counts the non-overlapping occurrences of a substring.
     *
     * @param str       text to search in
     * @param substring text to look for
     * @return number of non-overlapping occurrences; 0 if {@code substring} is empty
     */
    private static int count(String str, String substring) {
        if (substring.isEmpty()) {
            return 0; // an empty needle matches at every index and would never advance
        }
        int c = 0;
        int index = str.indexOf(substring);
        while (index >= 0) {
            c++;
            index = str.indexOf(substring, index + substring.length());
        }
        return c;
    }
}




Java Source Code List

com.carlrice.reader.Application.java
com.carlrice.reader.Constants.java
com.carlrice.reader.activity.AboutActivity.java
com.carlrice.reader.activity.AddGoogleNewsActivity.java
com.carlrice.reader.activity.BaseActivity.java
com.carlrice.reader.activity.EditFeedActivity.java
com.carlrice.reader.activity.EditFeedsListActivity.java
com.carlrice.reader.activity.EntryActivity.java
com.carlrice.reader.activity.GeneralPrefsActivity.java
com.carlrice.reader.activity.HomeActivity.java
com.carlrice.reader.adapter.CursorLoaderExpandableListAdapter.java
com.carlrice.reader.adapter.DrawerAdapter.java
com.carlrice.reader.adapter.EntriesCursorAdapter.java
com.carlrice.reader.adapter.FeedsCursorAdapter.java
com.carlrice.reader.adapter.FiltersCursorAdapter.java
com.carlrice.reader.fragment.EditFeedsListFragment.java
com.carlrice.reader.fragment.EntriesListFragment.java
com.carlrice.reader.fragment.EntryFragment.java
com.carlrice.reader.fragment.GeneralPrefsFragment.java
com.carlrice.reader.fragment.SwipeRefreshFragment.java
com.carlrice.reader.fragment.SwipeRefreshListFragment.java
com.carlrice.reader.loader.BaseLoader.java
com.carlrice.reader.parser.OPML.java
com.carlrice.reader.parser.RssAtomParser.java
com.carlrice.reader.provider.DatabaseHelper.java
com.carlrice.reader.provider.FeedDataContentProvider.java
com.carlrice.reader.provider.FeedData.java
com.carlrice.reader.receiver.BootCompletedBroadcastReceiver.java
com.carlrice.reader.receiver.ConnectionChangeReceiver.java
com.carlrice.reader.service.FetcherService.java
com.carlrice.reader.service.RefreshService.java
com.carlrice.reader.utils.ArticleTextExtractor.java
com.carlrice.reader.utils.CircleTransform.java
com.carlrice.reader.utils.FileUtils.java
com.carlrice.reader.utils.HtmlUtils.java
com.carlrice.reader.utils.NetworkUtils.java
com.carlrice.reader.utils.PrefUtils.java
com.carlrice.reader.utils.StringUtils.java
com.carlrice.reader.utils.ThrottledContentObserver.java
com.carlrice.reader.utils.UiUtils.java
com.carlrice.reader.view.AutoSummaryListPreference.java
com.carlrice.reader.view.BakedBezierInterpolator.java
com.carlrice.reader.view.DragNDropExpandableListView.java
com.carlrice.reader.view.DragNDropListener.java
com.carlrice.reader.view.EntryView.java
com.carlrice.reader.view.SwipeProgressBar.java
com.carlrice.reader.view.SwipeRefreshLayout.java
com.carlrice.reader.widget.ColorPickerDialogPreference.java
com.carlrice.reader.widget.TickerWidgetProvider.java
com.carlrice.reader.widget.TickerWidgetService.java
com.carlrice.reader.widget.WidgetConfigActivity.java
com.carlrice.reader.widget.WidgetConfigFragment.java
com.carlrice.reader.widget.WidgetProvider.java
com.carlrice.reader.widget.WidgetService.java