Back to project page CATaZine-Live.
The source code is released under:
GNU General Public License
If you think the Android project CATaZine-Live listed in this page is inappropriate, such as containing malicious code/tools or violating the copyright, please email info at java2s dot com, thanks.
package com.melegy.catazine.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Heuristic extractor that picks the element of an HTML document most likely
 * to contain the main article.
 *
 * <p>Every candidate element (see {@link #getNodes(Document)}) receives a
 * score built from its class/id/style attributes and from the amount of text
 * held by its direct children; the highest scoring element wins.
 */
public class ArticleTextExtractor {

    // Interesting nodes: only elements with exactly one of these tag names are scored.
    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");

    // Unlikely candidates
    // NOTE(review): there is no '|' between "sponsor" and "a(d|ll|...)", so the
    // regex contains the single alternative "sponsora(d|ll|gegate|rchive|ttachment)".
    // Neither a bare "sponsor" nor "ad"/"archive"/... is matched through this
    // pattern. Left unchanged on purpose: splitting the two would make short
    // fragments such as "ad" and "all" hit many unrelated names (find() matches
    // anywhere in the string). Confirm the intent before changing it.
    private static final Pattern UNLIKELY = Pattern.compile("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
            + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor"
            + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
            + "login|si(debar|gn|ngle)");

    // Most likely positive candidates
    private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
            + "|arti(cle|kel)|instapaper_body");

    // Most likely negative candidates
    private static final Pattern NEGATIVE = Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
            + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
            + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");

    // Inline styles that hide an element or shrink its text.
    private static final Pattern NEGATIVE_STYLE = Pattern.compile("hidden|display: ?none|font-size: ?small");

    // Exact heading tag names h1..h6 (used with matches(), not find()).
    private static final Pattern HEADING = Pattern.compile("h[1-6]");

    /**
     * Parses the given HTML stream with jsoup and extracts the article element.
     * The stream is handed to {@code Jsoup.parse} with a {@code null} charset
     * name and an empty base URI; this method does not explicitly close it.
     * Wasn't tested with improper HTML, although jsoup should be able to handle
     * minor problems.
     *
     * @param input stream containing the HTML document
     * @return the markup of the best matching element as produced by
     *         {@code Element.toString()} (tags are NOT stripped), or
     *         {@code null} if no candidate scored above zero
     * @throws Exception if jsoup fails to read or parse the stream
     */
    public static String extractContent(InputStream input) throws Exception {
        return extractContent(Jsoup.parse(input, null, ""));
    }

    /**
     * Extracts the article element from an already parsed document. The
     * document is modified: script, noscript and style elements are removed
     * before scoring.
     *
     * @param doc parsed document; must not be {@code null}
     * @return the markup of the best matching element as produced by
     *         {@code Element.toString()}, or {@code null} if no candidate
     *         scored above zero
     * @throws NullPointerException if {@code doc} is {@code null}
     */
    public static String extractContent(Document doc) throws Exception {
        if (doc == null) {
            throw new NullPointerException("missing document");
        }

        // now remove the clutter
        prepareDocument(doc);

        Element bestMatchElement = null;
        int maxWeight = 0;
        for (Element entry : getNodes(doc)) {
            int currentWeight = getWeight(entry);
            if (currentWeight > maxWeight) {
                maxWeight = currentWeight;
                bestMatchElement = entry;
                // Good enough: stop at the first candidate scoring above 200.
                if (maxWeight > 200) {
                    break;
                }
            }
        }

        return bestMatchElement != null ? bestMatchElement.toString() : null;
    }

    /**
     * Weights the given element: attribute based score, plus one point per ten
     * characters of the element's own text (rounded), plus the score of its
     * direct children. Since it's impossible to predict which exact names, ids
     * or class names will be used in HTML, the major role is played by the
     * child nodes.
     *
     * @param e element to weight, along with its child nodes
     * @return the total weight; may be negative
     */
    protected static int getWeight(Element e) {
        int weight = calcWeight(e);
        weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
        weight += weightChildNodes(e);
        return weight;
    }

    /**
     * Weights the direct children of the given element. Only direct children
     * are inspected, and children with fewer than 20 characters of own text
     * are ignored. For the remaining children:
     * <ul>
     * <li>more than 200 characters of own text adds
     * {@code max(50, length / 10)};</li>
     * <li>an {@code h1} or {@code h2} adds 30;</li>
     * <li>a {@code div} or {@code p} adds the value of
     * {@code calcWeightForChild};</li>
     * </ul>
     * Afterwards 30 is added if any such {@code div}/{@code p} has the class
     * "caption", and, when at least two {@code p} children have more than 50
     * characters of own text, 20 is added for every heading ({@code h1} to
     * {@code h6}) among the direct children.
     *
     * @param rootEl element whose child nodes will be weighted
     * @return the accumulated weight of the children; may be negative
     */
    protected static int weightChildNodes(Element rootEl) {
        int weight = 0;
        boolean hasCaption = false;
        int longParagraphs = 0;

        for (Element child : rootEl.children()) {
            String ownText = child.ownText();
            int ownTextLength = ownText.length();
            if (ownTextLength < 20) {
                continue;
            }

            if (ownTextLength > 200) {
                weight += Math.max(50, ownTextLength / 10);
            }

            String tag = child.tagName();
            if (tag.equals("h1") || tag.equals("h2")) {
                weight += 30;
            } else if (tag.equals("div") || tag.equals("p")) {
                weight += calcWeightForChild(child, ownText);
                if (tag.equals("p") && ownTextLength > 50) {
                    longParagraphs++;
                }
                // equalsIgnoreCase avoids the default-locale dependence of toLowerCase().
                if (child.className().equalsIgnoreCase("caption")) {
                    hasCaption = true;
                }
            }
        }

        // use caption and image
        if (hasCaption) {
            weight += 30;
        }

        if (longParagraphs >= 2) {
            for (Element subEl : rootEl.children()) {
                // Exact tag match; a substring test would also accept tags like "h".
                if (HEADING.matcher(subEl.tagName()).matches()) {
                    weight += 20;
                }
            }
        }
        return weight;
    }

    /**
     * Scores the own text of a {@code div} or {@code p} child. Text containing
     * more than five markers of markup/code clutter is penalised with -30;
     * otherwise the text earns one point per 25 characters (rounded).
     *
     * @param child   the child element (currently unused)
     * @param ownText own text of the child
     * @return -30 for cluttered text, otherwise {@code round(length / 25)}
     */
    private static int calcWeightForChild(Element child, String ownText) {
        // Entity literals: jsoup decodes entities while parsing, so these
        // presumably only remain in double-escaped markup or code samples.
        int c = count(ownText, "&quot;");
        c += count(ownText, "&lt;");
        c += count(ownText, "&gt;");
        c += count(ownText, "px");

        if (c > 5) {
            return -30;
        }
        return (int) Math.round(ownText.length() / 25.0);
    }

    /**
     * Scores an element from its attributes alone: class name and id are
     * matched against the POSITIVE (+35 / +40), UNLIKELY (-20 each) and
     * NEGATIVE (-50 each) patterns, and an inline style matching
     * NEGATIVE_STYLE costs another 50.
     *
     * @param e element to inspect
     * @return the attribute based weight; may be negative
     */
    private static int calcWeight(Element e) {
        String className = e.className();
        String id = e.id();

        int weight = 0;
        if (POSITIVE.matcher(className).find()) {
            weight += 35;
        }
        if (POSITIVE.matcher(id).find()) {
            weight += 40;
        }
        if (UNLIKELY.matcher(className).find()) {
            weight -= 20;
        }
        if (UNLIKELY.matcher(id).find()) {
            weight -= 20;
        }
        if (NEGATIVE.matcher(className).find()) {
            weight -= 50;
        }
        if (NEGATIVE.matcher(id).find()) {
            weight -= 50;
        }

        String style = e.attr("style");
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find()) {
            weight -= 50;
        }
        return weight;
    }

    /**
     * Prepares the document for scoring. Currently this only removes script,
     * noscript and style elements; stripping of unlikely candidates is not
     * performed.
     *
     * @param doc document to prepare; it is modified in place
     */
    protected static void prepareDocument(Document doc) {
        removeScriptsAndStyles(doc);
    }

    /**
     * Removes every {@code script}, {@code noscript} and {@code style} element
     * from the document.
     *
     * @param doc document to clean; it is modified in place
     * @return the same document instance
     */
    private static Document removeScriptsAndStyles(Document doc) {
        String[] clutterTags = {"script", "noscript", "style"};
        for (String tag : clutterTags) {
            Elements matches = doc.getElementsByTag(tag);
            for (Element item : matches) {
                item.remove();
            }
        }
        return doc;
    }

    /**
     * Collects all elements below (and including) {@code body} whose tag name
     * is one of the interesting NODES tags.
     *
     * @param doc document to scan
     * @return the candidate elements, iterated in the order the jsoup selector
     *         returned them (presumably document order), so that the early
     *         exit in {@link #extractContent(Document)} does not depend on
     *         hash ordering
     */
    public static Collection<Element> getNodes(Document doc) {
        Collection<Element> nodes = new LinkedHashSet<Element>(64);
        for (Element el : doc.select("body").select("*")) {
            if (NODES.matcher(el.tagName()).matches()) {
                nodes.add(el);
            }
        }
        return nodes;
    }

    /**
     * Counts the non-overlapping occurrences of {@code substring} in
     * {@code str}, scanning from left to right.
     *
     * @param str       text to search
     * @param substring text to look for
     * @return number of occurrences; 0 when {@code substring} is empty
     */
    public static int count(String str, String substring) {
        // An empty needle matches at every index and would never advance the scan.
        if (substring.isEmpty()) {
            return 0;
        }
        int occurrences = 0;
        int from = str.indexOf(substring);
        while (from >= 0) {
            occurrences++;
            from = str.indexOf(substring, from + substring.length());
        }
        return occurrences;
    }
}