cn.edu.hfut.dmic.contentextractor.ContentExtractor.java Source code

Introduction

Here is the source code for cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
Source

/*
 * Copyright (C) 2015 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package cn.edu.hfut.dmic.contentextractor;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;

/**
 * ContentExtractor extracts the content, title, time and author of a news web page.
 * See the WebCollector project on GitHub:
 * https://github.com/CrawlScript/WebCollector
 */
public class ContentExtractor {

    /** Logger shared by all instances of this class. */
    public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class);

    /**
     * Formats a number with at least two digits (pattern "00"); used by format(String).
     * NOTE(review): java.text.DecimalFormat is documented as not thread-safe and this
     * instance is a shared, non-final static - confirm it is never used concurrently.
     */
    public static DecimalFormat TWONUMBERFORMAT = new DecimalFormat("00");

    /** The parsed document this extractor reads from and modifies. */
    protected Document doc;

    /** Creates an extractor bound to the given parsed document. */
    ContentExtractor(Document doc) {
        this.doc = doc;
    }

    /** Statistics per element, filled in by computeInfo(Node) and read by the scoring code. */
    protected HashMap<Element, CountInfo> infoMap = new HashMap<Element, CountInfo>();

    /**
     * Raw text of the most recent date/time match made by getTime(Element, String);
     * stays empty until a match is found.
     */
    private String srcTime = "";

    /** Last candidate without Chinese characters seen by getShortText; used as a fallback author. */
    private String author_bak = "";

    /**
     * Mutable bag of statistics that {@code computeInfo} accumulates for one node.
     * All numeric members start at zero and the leaf list starts empty.
     */
    class CountInfo {

        /** Total length of the text nodes counted for this node. */
        int textCount;
        /** Portion of {@link #textCount} attributed to {@code a}/{@code img} elements. */
        int linkTextCount;
        /** Number of elements counted for this node (itself included). */
        int tagCount;
        /** Number of {@code a}/{@code img} elements counted for this node. */
        int linkTagCount;
        /** Non-link text length divided by non-link tag count; 0 when either is 0. */
        double density;
        /** Sum of the densities of the direct children. */
        double densitySum;
        /** Not assigned anywhere in the code visible in this file. */
        double score;
        /** Number of {@code p} elements counted for this node. */
        int pCount;
        /** Lengths of the individual text nodes found beneath this node. */
        ArrayList<Integer> leafList = new ArrayList<Integer>();

    }

    /**
     * Removes every {@code script}, {@code noscript}, {@code style} and {@code iframe}
     * element from the document so they do not take part in the statistics.
     */
    protected void clean() {
        Elements noise = doc.select("script,noscript,style,iframe");
        noise.remove();
    }

    /**
     * Recursively collects text and tag statistics for {@code node}.
     * <p>
     * For an element this method
     * <ol>
     * <li>removes its {@code style} and {@code class} attributes,</li>
     * <li>sums the statistics of its children and derives the text density,</li>
     * <li>counts {@code p} elements,</li>
     * </ol>
     * and stores the result in {@code infoMap}. An element whose text has the shape
     * {@code x>y>z} (presumably a breadcrumb trail) gets a density of -200; its children
     * are not visited and it is not stored in {@code infoMap}.
     *
     * @param node the node to analyse
     * @return the statistics for {@code node}; all zero for nodes that are neither
     *         elements nor text nodes
     */
    protected CountInfo computeInfo(Node node) {
        if (node instanceof TextNode) {
            // A text node is a leaf: its only contribution is its own length.
            int length = ((TextNode) node).text().length();
            CountInfo leaf = new CountInfo();
            leaf.textCount = length;
            leaf.leafList.add(length);
            return leaf;
        }
        if (!(node instanceof Element)) {
            return new CountInfo();
        }

        node.removeAttr("style").removeAttr("class");
        Element element = (Element) node;
        CountInfo total = new CountInfo();

        if (element.text().matches(".{1,20}>.{1,10}>.{1,20}")) {
            total.density = -200;
            return total;
        }

        for (Node child : element.childNodes()) {
            CountInfo part = computeInfo(child);
            total.textCount += part.textCount;
            total.linkTextCount += part.linkTextCount;
            total.tagCount += part.tagCount;
            total.linkTagCount += part.linkTagCount;
            total.leafList.addAll(part.leafList);
            total.densitySum += part.density;
            total.pCount += part.pCount;
        }

        total.tagCount += 1;
        String name = element.tagName();
        if ("p".equals(name)) {
            total.pCount += 1;
        } else if ("a".equals(name) || "img".equals(name)) {
            // Everything inside a link or image counts as link text.
            total.linkTextCount = total.textCount;
            total.linkTagCount += 1;
        }

        int plainChars = total.textCount - total.linkTextCount;
        int plainTags = total.tagCount - total.linkTagCount;
        if (plainChars != 0 && plainTags != 0) {
            total.density = (double) plainChars / plainTags;
        } else {
            total.density = 0;
        }

        infoMap.put(element, total);
        return total;
    }

    /**
     * Scores an element from the statistics stored for it in {@code infoMap}.
     * The score is the product of: the log of the square root of (leaf-length variance + 1),
     * the summed child density, the natural log of (non-link text length + 1) and
     * log10 of (paragraph count + 10).
     *
     * @param tag an element that was recorded by {@code computeInfo}
     * @return the content score of {@code tag}
     */
    private double computeScore(Element tag) {
        CountInfo info = infoMap.get(tag);
        double spread = Math.sqrt(computeVar(info.leafList) + 1);
        int plainTextLength = info.textCount - info.linkTextCount;

        // Factors are multiplied in a fixed left-to-right order to keep the result stable.
        double score = Math.log(spread) * info.densitySum;
        score *= Math.log(plainTextLength + 1);
        score *= Math.log10(info.pCount + 10);
        return score;
    }

    /**
     * Computes the population variance of the given values.
     *
     * @param data the values; may be empty
     * @return 0 for an empty list, half of the single value (integer division) for a
     *         one-element list, otherwise the mean squared deviation from the average
     */
    private double computeVar(ArrayList<Integer> data) {
        int count = data.size();
        switch (count) {
            case 0:
                return 0;
            case 1:
                // Integer division on purpose of fidelity: 5 yields 2.0, not 2.5.
                return data.get(0) / 2;
            default:
                break;
        }

        double total = 0;
        for (int value : data) {
            total += value;
        }
        double mean = total / count;

        double squaredDeviation = 0;
        for (int value : data) {
            squaredDeviation += (value - mean) * (value - mean);
        }
        return squaredDeviation / count;
    }

    /**
     * Cleans the document, computes statistics for the whole body and returns the
     * element with the highest positive score. {@code a} and {@code p} elements and
     * the body itself are never candidates.
     *
     * @return the best-scoring element
     * @throws Exception with message "extraction failed" when no element scores above 0
     */
    private Element getContentElement() throws Exception {
        clean();
        computeInfo(doc.body());

        Element best = null;
        double bestScore = 0;
        for (Element candidate : infoMap.keySet()) {
            String name = candidate.tagName();
            boolean excluded = name.equals("a") || name.equals("p") || candidate == doc.body();
            if (excluded) {
                continue;
            }
            double score = computeScore(candidate);
            if (score > bestScore) {
                bestScore = score;
                best = candidate;
            }
        }

        if (best != null) {
            return best;
        }
        throw new Exception("extraction failed");
    }

    /**
     * Runs the full extraction and assembles the result.
     * <p>
     * Content extraction is mandatory: if it fails the failure is logged and rethrown.
     * Time, author and title are best-effort: a failure is logged and the field is left
     * unset.
     *
     * @param flag when {@code true}, all {@code li} elements are removed from the document
     *             after the content element has been chosen and before time, author and
     *             title are extracted
     * @return the populated news object
     * @throws Exception if the content element cannot be determined
     */
    private News getNews(boolean flag) throws Exception {
        News news = new News();
        Element contentElement;
        try {
            contentElement = getContentElement();
            news.setContentElement(contentElement);
        } catch (Exception ex) {
            LOG.info("news content extraction failed,extraction abort", ex);
            throw new Exception(ex);
        }

        if (doc.baseUri() != null) {
            news.setUrl(doc.baseUri());
        }

        if (flag) {
            clearLi();
        }

        // Time must be extracted before the author: getTime stores the matched text in
        // srcTime, which getAuthor uses to locate the byline.
        try {
            news.setTime(getTime(contentElement));
        } catch (Exception ex) {
            // Previously logged as a *title* failure, which made the logs misleading.
            LOG.info("news time extraction failed", ex);
        }

        try {
            news.setAuthor(getAuthor());
        } catch (Exception ex) {
            LOG.info("news author extraction failed", ex);
        }

        try {
            news.setTitle(getTitle(contentElement));
        } catch (Exception ex) {
            LOG.info("title extraction failed", ex);
        }
        news.setSrcTime(srcTime);
        return news;
    }

    /** Removes every {@code li} element from the document. */
    private void clearLi() {
        Elements listItems = doc.select("li");
        listItems.remove();
    }

    /**
     * Looks for the author near the place where the publication time was found.
     * <p>
     * Search order:
     * <ol>
     * <li>the element whose own text contains the matched time text,</li>
     * <li>the nearest ancestor that has text besides the time,</li>
     * <li>the closest preceding sibling that has text,</li>
     * <li>the closest following sibling that has text,</li>
     * <li>that ancestor again with the time text blanked out, then the whole body.</li>
     * </ol>
     * When no time text is known, or no element contains it, only the whole body is
     * searched.
     *
     * @return the author, the non-Chinese fallback candidate, or an empty string
     * @throws XpathSyntaxErrorException declared in the signature; nothing in this
     *         method's own code throws it
     */
    private String getAuthor() throws XpathSyntaxErrorException {
        String author = "";
        if (StringUtils.isBlank(srcTime)) {
            return getAuthor(doc.body().html());
        }
        // NOTE(review): srcTime is raw matched text; characters that are special in the
        // selector syntax (e.g. parentheses) could make this query fail - confirm.
        Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first();
        if (cur == null) {
            LOG.warn("?srcTime=" + srcTime);
            return getAuthor(doc.body().html());
        }

        if (!noText(cur)) {
            // srcTime is literal text, not a pattern: quote it so that characters such
            // as '.', '(' or '[' cannot change the split or raise a syntax error.
            String[] parts = cur.html().split(Pattern.quote(srcTime));
            for (String text : parts) {
                author = getShortText(text);
                if (!StringUtils.isBlank(author)) {
                    return author;
                }
            }
        }

        // Climb until an ancestor carries text of its own; cur ends up as the topmost
        // text-less node on that path.
        Element parent = cur.parent();
        while (parent != null && noText(parent)) {
            cur = parent;
            parent = parent.parent();
        }
        // parent is null when no ancestor has text; skip the ancestor-based steps then
        // instead of failing with a NullPointerException.
        if (parent != null) {
            author = getAuthor(parent.html());
            if (!StringUtils.isBlank(author)) {
                return author;
            }
        }

        Element pre = cur.previousElementSibling();
        while (pre != null && noText(pre)) {
            pre = pre.previousElementSibling();
        }
        if (pre != null) {
            author = getShortText(pre.text());
        }
        if (!StringUtils.isBlank(author)) {
            return author;
        }

        Element next = cur.nextElementSibling();
        while (next != null && noText(next)) {
            next = next.nextElementSibling();
        }
        if (next != null) {
            author = getShortText(next.text());
        }
        if (!StringUtils.isBlank(author)) {
            return author;
        }

        if (parent != null) {
            author = getShortText(parent.html().replace(srcTime, " "));
            if (!StringUtils.isBlank(author)) {
                return author;
            }
        }

        author = getAuthor(doc.body().html());
        if (StringUtils.isBlank(author)) {
            return author_bak;
        }
        return author;
    }

    /**
     * Searches a piece of HTML/text for a labelled author: a label, one to three
     * separator characters (colon, space or slash) and then up to ten characters ending
     * at a word boundary.
     *
     * @param str HTML or plain text to search
     * @return the text captured after the label (group 2), or an empty string when
     *         nothing matches
     */
    private String getAuthor(String str) {
        // Replace anything shaped like a tag, and the &nbsp; entity, with a space.
        str = str.replaceAll("</?.*?>", " ").replace("&nbsp;", " ");
        // NOTE(review): the label alternation below looks damaged - it appears to have lost
        // non-ASCII label words to an encoding problem. As written, "(??" is presumably not
        // valid java.util.regex syntax, so Pattern.compile would throw. TODO restore the
        // original labels from an uncorrupted copy of this file.
        String reg = "(??|||?|)[: /]{1,3}(.{1,10}?)\\b";
        Pattern authorPattern = Pattern.compile(reg);
        Matcher matcher = authorPattern.matcher(str);
        if (matcher.find()) {
            return matcher.group(2);
        }
        return "";
    }

    /**
     * Tries to pull an author name out of a short piece of text.
     * First delegates to getAuthor(String); if that finds nothing, strips markup and
     * everything up to the last colon, then returns the first run of 1-15 Chinese/Latin
     * letters that contains a Chinese character and passes the exclusion check below.
     * Runs without Chinese characters are remembered in author_bak as a fallback.
     *
     * @param str HTML or plain text to search
     * @return the candidate author, or an empty string when none qualifies
     */
    private String getShortText(String str) {
        String author = getAuthor(str);
        if (!StringUtils.isBlank(author))
            return author;

        // NOTE(review): this pattern may also have lost non-ASCII characters; as written it
        // can match almost any run of 2-6 characters that ends at a word boundary.
        str = str.replaceAll("</?.*?>", " ").replace("&nbsp;", " ").replaceAll("[: /]{0,3}?(.{2,6}?)\\b",
                " ");
        // Keep only what follows the last ASCII colon.
        while (str.contains(":")) {
            str = str.substring(str.indexOf(":") + 1, str.length());
        }
        // NOTE(review): the literal below is empty, presumably because a non-ASCII character
        // (possibly a full-width colon) was lost. contains("") is true for every string, so
        // as written this loop consumes the whole string and then appears to fail in
        // substring(1, 0). TODO restore the original character.
        while (str.contains("")) {
            str = str.substring(str.indexOf("") + 1, str.length());
        }
        String reg = "[\\u4e00-\\u9fa5a-zA-Z]{1,15}";
        Pattern authorPattern = Pattern.compile(reg);
        Matcher matcher = authorPattern.matcher(str);
        while (matcher.find()) {
            author = matcher.group(0);
            if (!hasChinese(author)) {
                // Remember the Latin-only candidate; getAuthor() falls back to it.
                author_bak = author;
                continue;
            }
            // NOTE(review): both literals are empty (presumably lost non-ASCII exclusion
            // words); as written the condition is never true, so nothing is returned here.
            if (!(author.contains("") || author.contains(""))) {
                return author;
            }
        }
        return "";
    }

    /**
     * Tells whether the text contains at least one character in the range
     * U+4E00..U+9FA5. The whole string must be matchable by {@code .}, so text with a
     * line terminator never qualifies.
     */
    private boolean hasChinese(String author) {
        return Pattern.matches(".*[\\u4e00-\\u9fa5].*", author);
    }

    /** Applies {@code noText(String)} to the combined text of the element. */
    private boolean noText(Element ele) {
        String combinedText = ele.text();
        return noText(combinedText);
    }

    /**
     * Tells whether the string, once the matched time text is removed, contains no
     * Chinese characters and no ASCII letters.
     */
    private boolean noText(String str) {
        String withoutTime = str.replace(srcTime, "");
        return Pattern.matches("[^\\u4e00-\\u9fa5a-zA-Z]*", withoutTime);
    }

    /**
     * Extracts the publication time, trying progressively less specific patterns:
     * date with hours, minutes and seconds; date with hours and minutes; date only.
     *
     * @param contentElement the element chosen as the article body
     * @return the formatted time of the first pattern that matches, or an empty string
     * @throws Exception propagated from {@code getTime(Element, String)}
     */
    protected String getTime(Element contentElement) throws Exception {
        String[] patternsBySpecificity = {
            "\\b([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3]?[0-9])[^0-9]{1,6}?([0-5]?[0-9])[:]([0-5]?[0-9])[:]([0-5]?[0-9])\\b",
            "\\b([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3]?[0-9])[^0-9]{1,6}?([0-5]?[0-9])[:]([0-5]?[0-9])\\b",
            "\\b([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3]?[0-9])[^0-9]{0,6}?\\b"
        };
        for (String regex : patternsBySpecificity) {
            String time = getTime(contentElement, regex);
            if (!StringUtils.isBlank(time)) {
                return time;
            }
        }
        return "";
    }

    /**
     * Searches for a date/time near the content element using one pattern.
     * <p>
     * The search starts up to two levels above {@code contentElement} (never above the
     * body) and then inspects at most six elements, moving one parent up after each
     * miss and stopping once the body has been inspected. The outer HTML of each element
     * is matched, so text inside attributes can match as well.
     * <p>
     * On a match the raw matched text is stored in {@code srcTime}.
     *
     * @param contentElement the element chosen as the article body; may be {@code null}
     * @param regex pattern whose groups 1-3 are year, month and day, and whose optional
     *              groups 4-6 are hour, minute and second
     * @return the time as {@code yyyy-MM-dd}, optionally followed by {@code HH},
     *         {@code HH:mm} or {@code HH:mm:ss}; an empty string when nothing matches
     * @throws Exception declared in the signature for callers
     */
    protected String getTime(Element contentElement, String regex) throws Exception {
        Pattern pattern = Pattern.compile(regex);
        Element body = doc.body();

        Element current = contentElement;
        for (int i = 0; i < 2; i++) {
            if (current != null && current != body) {
                Element parent = current.parent();
                if (parent != null) {
                    current = parent;
                }
            }
        }

        for (int i = 0; i < 6 && current != null; i++) {
            Matcher matcher = pattern.matcher(current.outerHtml());
            if (matcher.find()) {
                srcTime = matcher.group(0);
                StringBuilder sb = new StringBuilder(
                        matcher.group(1) + "-" + format(matcher.group(2)) + "-" + format(matcher.group(3)));
                if (matcher.groupCount() >= 4) {
                    sb.append(" ").append(format(matcher.group(4)));
                }
                if (matcher.groupCount() >= 5) {
                    sb.append(":").append(format(matcher.group(5)));
                }
                if (matcher.groupCount() >= 6) {
                    sb.append(":").append(format(matcher.group(6)));
                }
                return sb.toString();
            }
            if (current == body) {
                // The walk never goes above the body, so further iterations would only
                // serialise and scan the same HTML again with the same outcome.
                break;
            }
            current = current.parent();
        }
        return "";
    }

    /**
     * Parses a decimal integer and renders it with at least two digits.
     *
     * @throws NumberFormatException if {@code str} is not a parsable integer
     */
    private String format(String str) {
        int value = Integer.parseInt(str);
        return TWONUMBERFORMAT.format(value);
    }

    /**
     * Similarity of two strings in the range 0..1: the length of their longest common
     * subsequence divided by the length of the longer string.
     *
     * @return 0 when either string is empty or one is at least three times as long as
     *         the other; otherwise the LCS ratio
     */
    protected double strSim(String a, String b) {
        int lengthA = a.length();
        int lengthB = b.length();
        if (lengthA == 0 || lengthB == 0) {
            return 0;
        }

        int longer = Math.max(lengthA, lengthB);
        int shorter = Math.min(lengthA, lengthB);
        // Strings of very different length are treated as unrelated.
        if ((double) longer / shorter >= 3) {
            return 0;
        }
        return (double) lcs(a, b) / longer;
    }

    /**
     * Determines the article title.
     * <p>
     * When the document has a non-empty {@code <title>} (cut at the first run of
     * {@code - / _ |}), every {@code h1}-{@code h6} in the body is compared with it:
     * <ol>
     * <li>the heading with the best position-weighted similarity wins if its similarity
     * is at least 0.3;</li>
     * <li>otherwise the cut {@code <title>} text itself may be returned;</li>
     * <li>otherwise the headings are ranked (7-26 Chinese characters first, then lower
     * heading level) and the first one is returned.</li>
     * </ol>
     * If none of that yields a title - including the case of a page with no headings,
     * which used to fail with an IndexOutOfBoundsException - the method falls back to the
     * first element whose id or class starts or ends with "title", and finally to the
     * text node most similar to the {@code <title>}.
     *
     * @param contentElement the element chosen as the article body
     * @return the title text
     * @throws Exception with message "title not found" when every strategy fails
     */
    protected String getTitle(final Element contentElement) throws Exception {
        final ArrayList<Element> titleList = new ArrayList<Element>();
        final ArrayList<Double> titleSim = new ArrayList<Double>();
        final String metaTitle = getText(doc.title().trim());
        if (!metaTitle.isEmpty()) {
            doc.body().traverse(new NodeVisitor() {
                @Override
                public void head(Node node, int i) {
                    if (node instanceof Element) {
                        Element tag = (Element) node;
                        String tagName = tag.tagName();
                        if (Pattern.matches("h[1-6]", tagName)) {
                            String title = tag.text().trim();
                            double sim = strSim(title, metaTitle);
                            titleSim.add(sim);
                            titleList.add(tag);
                        }
                    }
                }

                @Override
                public void tail(Node node, int i) {
                }
            });

            // Later headings get a higher weight: score = (position + 1) * similarity.
            double maxScore = 0;
            int maxIndex = -1;
            for (int i = 0; i < titleSim.size(); i++) {
                double score = (i + 1) * titleSim.get(i);
                if (score > maxScore) {
                    maxScore = score;
                    maxIndex = i;
                }
            }

            if (maxIndex != -1 && titleSim.get(maxIndex) >= 0.3) {
                return titleList.get(maxIndex).text();
            }

            String title = getText(metaTitle);
            // NOTE(review): the suffix literal is empty, presumably because a non-ASCII
            // character was lost; endsWith("") is always true, so as written this branch
            // never returns. TODO restore the original suffix.
            if (!title.endsWith("") && title.length() > 7) {
                return title;
            }

            // Only rank headings when there are any; otherwise continue with the
            // fallbacks below instead of failing on titleList.get(0).
            if (!titleList.isEmpty()) {
                Collections.sort(titleList, new Comparator<Element>() {
                    @Override
                    public int compare(Element o1, Element o2) {
                        int fit1 = hasTitleLikeLength(o1) ? 1 : 0;
                        int fit2 = hasTitleLikeLength(o2) ? 1 : 0;
                        if (fit1 == fit2) {
                            // Same length class: the lower heading level (h1 before h2) wins.
                            return o1.tagName().charAt(1) - o2.tagName().charAt(1);
                        }
                        return fit2 - fit1;
                    }

                    /** True when the heading holds between 7 and 26 Chinese characters. */
                    private boolean hasTitleLikeLength(Element heading) {
                        int chineseChars = heading.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        return chineseChars >= 7 && chineseChars <= 26;
                    }
                });
                return getText(titleList.get(0).text());
            }
        }

        // Fallback 1: an element whose id or class starts or ends with "title".
        Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
        if (titles.size() > 0) {
            String title = titles.first().text();
            if (title.length() > 5 && title.length() < 40) {
                return title;
            }
        }

        // Fallback 2: the text node most similar to the document title.
        try {
            return getTitleByEditDistance(contentElement);
        } catch (Exception ex) {
            throw new Exception("title not found", ex);
        }

    }

    /**
     * Cuts a title at the first run of one to three {@code -}, {@code /}, {@code _} or
     * {@code |} characters, dropping the separator and everything after it on that line.
     */
    private String getText(String metaTitle) {
        Matcher separatorAndTail = Pattern.compile("[-/_|]{1,3}.*").matcher(metaTitle);
        return separatorAndTail.replaceAll("");
    }

    /**
     * Returns the trimmed text of the body text node that is most similar to the
     * document {@code <title>} according to {@code strSim}. The first node reaching the
     * best similarity wins.
     *
     * @param contentElement not used by this method
     * @return the best-matching text
     * @throws Exception when no text node has a similarity above 0
     */
    protected String getTitleByEditDistance(Element contentElement) throws Exception {
        final String metaTitle = doc.title();
        // One-element array so the anonymous visitor can update the running maximum.
        final double[] bestSimilarity = { 0.0 };
        final StringBuilder bestText = new StringBuilder();

        doc.body().traverse(new NodeVisitor() {

            @Override
            public void head(Node node, int depth) {
                if (!(node instanceof TextNode)) {
                    return;
                }
                String text = ((TextNode) node).text().trim();
                double similarity = strSim(text, metaTitle);
                if (similarity > 0 && similarity > bestSimilarity[0]) {
                    bestSimilarity[0] = similarity;
                    bestText.setLength(0);
                    bestText.append(text);
                }
            }

            @Override
            public void tail(Node node, int depth) {
            }
        });

        if (bestText.length() == 0) {
            throw new Exception();
        }
        return bestText.toString();

    }

    /**
     * Length of the longest common subsequence of two strings, compared by
     * {@code char}.
     *
     * @return the LCS length; 0 when either string is empty
     */
    protected int lcs(String x, String y) {
        int rows = x.length();
        int cols = y.length();
        if (rows == 0 || cols == 0) {
            return 0;
        }

        // table[i][j] = LCS length of the first i chars of x and the first j chars of y.
        int[][] table = new int[rows + 1][cols + 1];
        for (int i = 1; i <= rows; i++) {
            char fromX = x.charAt(i - 1);
            for (int j = 1; j <= cols; j++) {
                if (fromX == y.charAt(j - 1)) {
                    table[i][j] = table[i - 1][j - 1] + 1;
                } else {
                    table[i][j] = Math.max(table[i - 1][j], table[i][j - 1]);
                }
            }
        }
        return table[rows][cols];
    }

    /**
     * Levenshtein distance between two strings: the minimum number of single-character
     * insertions, deletions and substitutions that turn one into the other.
     */
    protected int editDistance(String word1, String word2) {
        int rows = word1.length();
        int cols = word2.length();

        // cost[i][j] = distance between the first i chars of word1 and first j of word2.
        int[][] cost = new int[rows + 1][cols + 1];
        for (int i = 1; i <= rows; i++) {
            cost[i][0] = i;
        }
        for (int j = 1; j <= cols; j++) {
            cost[0][j] = j;
        }

        for (int i = 1; i <= rows; i++) {
            char left = word1.charAt(i - 1);
            for (int j = 1; j <= cols; j++) {
                if (left == word2.charAt(j - 1)) {
                    cost[i][j] = cost[i - 1][j - 1];
                    continue;
                }
                int cheapestNeighbour = Math.min(cost[i - 1][j - 1],
                        Math.min(cost[i - 1][j], cost[i][j - 1]));
                cost[i][j] = cheapestNeighbour + 1;
            }
        }
        return cost[rows][cols];
    }

    /**
     * Extracts a news item from an already parsed jsoup document.
     *
     * @param flag passed through to {@code getNews(boolean)}
     */
    private static News getNewsByDoc(Document doc, boolean flag) throws Exception {
        return new ContentExtractor(doc).getNews(flag);
    }

    /**
     * Extracts a news item from raw HTML.
     * The HTML is parsed and extracted once with list items removed (flag true); if no
     * time was found, it is parsed afresh and extracted again with list items kept.
     *
     * @param html the page source
     * @return the extracted news item
     * @throws Exception if content extraction fails
     */
    private static News getNewsByHtml(String html) throws Exception {
        // Strips HTML comments; '.' does not match line terminators here, so only comments
        // that sit on a single line are removed.
        // NOTE(review): the replace("", " ") target is empty, presumably because a non-ASCII
        // character (perhaps a non-breaking space) was lost. As written it inserts a space
        // between every character of the HTML. TODO restore the original character.
        html = html.replaceAll("\\<!--.*?--\\>", "").replace("", " ");
        Document doc = Jsoup.parse(html);
        News news = getNewsByDoc(doc, true);
        if (StringUtils.isBlank(news.getTime())) {
            // The first pass mutated the document, so parse a fresh copy for the retry.
            doc = Jsoup.parse(html);
            news = getNewsByDoc(doc, false);
        }
        return news;
    }

    /**
     * Extracts a news item from raw HTML, using {@code url} as the document base URI.
     * With a blank URL this delegates to getNewsByHtml(String). Otherwise the HTML is
     * extracted once with list items removed (flag true) and, if no time was found,
     * parsed afresh and extracted again with list items kept.
     *
     * @param html the page source
     * @param url  the page address; may be blank
     * @return the extracted news item
     * @throws Exception if content extraction fails
     */
    private static News getNewsByHtml(String html, String url) throws Exception {
        if (StringUtils.isBlank(url)) {
            return getNewsByHtml(html);
        }
        // Strips HTML comments; '.' does not match line terminators here, so only comments
        // that sit on a single line are removed.
        // NOTE(review): the replace("", " ") target is empty, presumably because a non-ASCII
        // character (perhaps a non-breaking space) was lost. As written it inserts a space
        // between every character of the HTML. TODO restore the original character.
        html = html.replaceAll("\\<!--.*?--\\>", "").replace("", " ");
        Document doc = Jsoup.parse(html, url);
        News news = getNewsByDoc(doc, true);
        if (StringUtils.isBlank(news.getTime())) {
            // The first pass mutated the document, so parse a fresh copy for the retry.
            doc = Jsoup.parse(html, url);
            news = getNewsByDoc(doc, false);
        }
        return news;
    }

    /**
     * Downloads the page at {@code url} and extracts a news item from it.
     *
     * @param url the page address
     * @return the extracted news item
     * @throws Exception if the download or the content extraction fails
     */
    public static News getNewsByUrl(String url) throws Exception {
        String html = new HttpRequest(url).response().decode();
        return getNewsByHtml(html, url);
    }

    /** Ad-hoc check: prints whether a compact date string matches a date pattern. */
    public static void main(String[] args) {
        String sample = "20170317";
        String datePattern =
                "\\b([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3]?[0-9])[^0-9]{1,5}?\\b";
        boolean matched = sample.matches(datePattern);
        System.out.println(matched);
    }
}