org.apdplat.extractor.html.HtmlExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.apdplat.extractor.html.HtmlExtractor.java

Source

/**
 * 
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package org.apdplat.extractor.html;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apdplat.extractor.html.model.CssPath;
import org.apdplat.extractor.html.model.ExtractFailLog;
import org.apdplat.extractor.html.model.ExtractFunction;
import org.apdplat.extractor.html.model.ExtractResult;
import org.apdplat.extractor.html.model.ExtractResultItem;
import org.apdplat.extractor.html.model.HtmlTemplate;
import org.apdplat.extractor.html.model.UrlPattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts structured fields from HTML pages.
 * <p>
 * For a given URL the extractor asks its {@link ExtractRegular} for the
 * {@link HtmlTemplate}s registered for that URL, parses the page with jsoup and
 * evaluates every {@link CssPath} of each template against the parsed document.
 * <p>
 * The class is a process-wide singleton. The first successful call to either
 * {@code getInstance} overload creates the instance; every later call returns
 * that same instance and ignores its arguments.
 *
 * @author Yang Shangchuan (yang-shangchuan@qq.com)
 */
public class HtmlExtractor {
    private static final Logger LOGGER = LoggerFactory.getLogger(HtmlExtractor.class);

    /**
     * The singleton. Declared {@code volatile} so that the unsynchronized
     * fast-path read in {@code getInstance} can only observe a fully
     * constructed instance.
     */
    private static volatile HtmlExtractor htmlExtractor;

    /** Source of the templates to apply for a URL; fixed at construction time. */
    private final ExtractRegular extractRegular;

    private HtmlExtractor(ExtractRegular extractRegular) {
        this.extractRegular = extractRegular;
    }

    /**
     * Returns the singleton extractor, creating it on first use.
     *
     * @param extractRegular the rule set used to look up templates by URL; only
     *                       used when the singleton does not exist yet
     * @return the shared extractor instance
     */
    public static HtmlExtractor getInstance(ExtractRegular extractRegular) {
        HtmlExtractor instance = htmlExtractor;
        if (instance == null) {
            synchronized (HtmlExtractor.class) {
                instance = htmlExtractor;
                if (instance == null) {
                    // Publish only after the instance is completely initialised.
                    instance = new HtmlExtractor(extractRegular);
                    htmlExtractor = instance;
                }
            }
        }
        return instance;
    }

    /**
     * Returns the singleton extractor, creating it on first use from an
     * {@link ExtractRegular} obtained via
     * {@code ExtractRegular.getInstance(allExtractRegularUrl, redisHost, redisPort)}.
     * The three arguments are passed through unchanged and are only used when
     * the singleton does not exist yet.
     *
     * @param allExtractRegularUrl presumably the web address that serves all
     *                             extraction rules (see the example in usage2) - confirm
     *                             against ExtractRegular
     * @param redisHost            Redis host name handed to ExtractRegular
     * @param redisPort            Redis port handed to ExtractRegular
     * @return the shared extractor instance
     */
    public static HtmlExtractor getInstance(String allExtractRegularUrl, String redisHost, int redisPort) {
        HtmlExtractor instance = htmlExtractor;
        if (instance == null) {
            synchronized (HtmlExtractor.class) {
                instance = htmlExtractor;
                if (instance == null) {
                    ExtractRegular extractRegular = ExtractRegular.getInstance(allExtractRegularUrl, redisHost,
                            redisPort);
                    instance = new HtmlExtractor(extractRegular);
                    htmlExtractor = instance;
                }
            }
        }
        return instance;
    }

    /**
     * Downloads the page at {@code url} and extracts it.
     * <p>
     * If the page cannot be opened the failure is logged and an empty list is
     * returned. The stream opened here is always closed before returning.
     *
     * @param url      address of the HTML page
     * @param encoding name of the character set used to decode the page
     * @return one result per applied template; empty when nothing was extracted
     */
    public List<ExtractResult> extract(String url, String encoding) {
        InputStream in;
        try {
            in = new URL(url).openConnection().getInputStream();
        } catch (Exception e) {
            LOGGER.error("?URL?" + url, e);
            // Nothing to read: skip the extraction instead of passing a null stream on.
            return new ArrayList<>();
        }
        try {
            return extract(url, in, encoding);
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                LOGGER.warn("Failed to close the stream opened for " + url, e);
            }
        }
    }

    /**
     * Extracts a page whose bytes are already in memory.
     *
     * @param url      address the content was fetched from
     * @param content  raw bytes of the HTML page
     * @param encoding name of the character set used to decode {@code content}
     * @return one result per applied template; empty when nothing was extracted
     */
    public List<ExtractResult> extract(String url, byte[] content, String encoding) {
        InputStream in = new ByteArrayInputStream(content);
        return extract(url, in, encoding);
    }

    /**
     * Extracts a page read from a stream.
     * <p>
     * The stream is read completely but not closed; closing it is left to the
     * caller. An unsupported, illegal or {@code null} encoding name is logged
     * and yields an empty list. A failure while reading, parsing or iterating
     * the templates is logged and the results collected so far are returned; a
     * failure inside one template is logged and only that template is skipped.
     *
     * @param url      address of the page, used for template lookup and as the
     *                 base URI handed to jsoup
     * @param in       stream delivering the raw HTML bytes
     * @param encoding name of the character set used to decode the page
     * @return one result per successfully applied template, possibly empty
     */
    public List<ExtractResult> extract(String url, InputStream in, String encoding) {
        List<ExtractResult> extractResults = new ArrayList<>();
        if (!isSupportedEncoding(encoding)) {
            LOGGER.error("???" + encoding + " URL" + url);
            return extractResults;
        }
        // Look up the templates registered for this URL.
        List<HtmlTemplate> htmlTemplates = extractRegular.getHtmlTemplate(url);
        if (htmlTemplates == null || htmlTemplates.isEmpty()) {
            return extractResults;
        }
        try {
            // Keep the raw bytes: they are parsed once and attached to every result.
            byte[] content = readAll(in);
            Document doc = Jsoup.parse(new ByteArrayInputStream(content), encoding, url);

            // When several meta tags carry the same name the last one wins.
            String keywords = "";
            String description = "";
            for (Element meta : doc.select("meta")) {
                String name = meta.attr("name");
                if ("keywords".equals(name)) {
                    keywords = meta.attr("content");
                }
                if ("description".equals(name)) {
                    description = meta.attr("content");
                }
            }

            Set<String> tableNames = new HashSet<>();
            for (HtmlTemplate htmlTemplate : htmlTemplates) {
                // add() returns false when the table name was already seen for this URL.
                if (!tableNames.add(htmlTemplate.getTableName())) {
                    LOGGER.debug(
                            "?tableName????UrlPattern?"
                                    + htmlTemplate.getUrlPattern().getUrlPattern());
                    LOGGER.debug(htmlTemplates.toString());
                }
                try {
                    ExtractResult extractResult = extractHtmlTemplate(url, htmlTemplate, doc);
                    extractResult.setContent(content);
                    extractResult.setEncoding(encoding);
                    extractResult.setKeywords(keywords);
                    extractResult.setDescription(description);
                    extractResults.add(extractResult);
                } catch (Exception e) {
                    LOGGER.error("???" + htmlTemplate.getTemplateName(), e);
                }
            }
        } catch (Exception e) {
            LOGGER.error("?: " + url, e);
        }
        return extractResults;
    }

    /**
     * Tells whether {@code encoding} names a character set this JVM can use.
     * {@code Charset.isSupported} throws for {@code null} and for syntactically
     * illegal names; both cases are reported here as "not supported".
     */
    private static boolean isSupportedEncoding(String encoding) {
        try {
            return Charset.isSupported(encoding);
        } catch (IllegalArgumentException e) {
            // Also covers IllegalCharsetNameException, which is a subclass.
            return false;
        }
    }

    /**
     * Reads a stream to its end and returns the bytes.
     * <p>
     * The stream is not closed. If an {@link IOException} occurs it is logged
     * and the bytes read up to that point are returned.
     *
     * @param in stream to drain
     * @return all bytes that could be read, never {@code null}
     */
    public static byte[] readAll(InputStream in) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[4096];
            int n;
            // -1 is the end-of-stream signal of InputStream.read(byte[]).
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
        } catch (IOException ex) {
            LOGGER.error("?", ex);
        }
        return out.toByteArray();
    }

    /**
     * Applies one template to a parsed page.
     * <p>
     * Every CSS path of the template is selected against the document. For each
     * matched element the value is the attribute named by the CSS path, or the
     * element text when no attribute is configured. A value is stored directly
     * under the CSS path's field name, or, when the CSS path has extract
     * functions, passed through them with one item stored per function.
     * <p>
     * Extraction of this template stops at the first blank value or the first
     * extract function that returns {@code null}: a fail log is added and the
     * items collected so far are returned.
     *
     * @param url          address of the page
     * @param htmlTemplate template to apply
     * @param doc          parsed page
     * @return the result for this template, holding items and at most one fail log
     */
    private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document doc) {
        ExtractResult extractResult = new ExtractResult();
        extractResult.setUrl(url);
        extractResult.setTableName(htmlTemplate.getTableName());

        for (CssPath cssPath : htmlTemplate.getCssPaths()) {
            Elements elements = doc.select(cssPath.getCssPath());
            for (Element element : elements) {
                String text = StringUtils.isBlank(cssPath.getAttr())
                        ? element.text()
                        : element.attr(cssPath.getAttr());

                if (StringUtils.isBlank(text)) {
                    // Nothing usable in this element: record the failure and stop.
                    extractResult.addExtractFailLog(newFailLog(url, htmlTemplate, cssPath, "",
                            cssPath.getFieldName(), cssPath.getFieldDescription()));
                    return extractResult;
                }

                if (!cssPath.hasExtractFunction()) {
                    // No post-processing configured: store the raw value.
                    extractResult.addExtractResultItem(newResultItem(cssPath.getFieldName(), text));
                    continue;
                }

                for (ExtractFunction pf : cssPath.getExtractFunctions()) {
                    // NOTE(review): each function receives the previous function's
                    // output, not the original element value - confirm this chaining
                    // is intended, since every function stores its own field.
                    text = ExtractFunctionExecutor.execute(text, doc, cssPath, pf.getExtractExpression());
                    if (text == null) {
                        // The function produced nothing: record the failure and stop.
                        extractResult.addExtractFailLog(newFailLog(url, htmlTemplate, cssPath,
                                pf.getExtractExpression(), pf.getFieldName(), pf.getFieldDescription()));
                        return extractResult;
                    }
                    extractResult.addExtractResultItem(newResultItem(pf.getFieldName(), text));
                }
            }
        }
        return extractResult;
    }

    /** Creates a result item holding {@code value} under {@code field}. */
    private static ExtractResultItem newResultItem(String field, String value) {
        ExtractResultItem extractResultItem = new ExtractResultItem();
        extractResultItem.setField(field);
        extractResultItem.setValue(value);
        return extractResultItem;
    }

    /** Creates a fail log describing which template, CSS path and field could not be extracted. */
    private static ExtractFailLog newFailLog(String url, HtmlTemplate htmlTemplate, CssPath cssPath,
            String extractExpression, String fieldName, String fieldDescription) {
        ExtractFailLog extractFailLog = new ExtractFailLog();
        extractFailLog.setUrl(url);
        extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
        extractFailLog.setTemplateName(htmlTemplate.getTemplateName());
        extractFailLog.setCssPath(cssPath.getCssPath());
        extractFailLog.setExtractExpression(extractExpression);
        extractFailLog.setTableName(htmlTemplate.getTableName());
        extractFailLog.setFieldName(fieldName);
        extractFailLog.setFieldDescription(fieldDescription);
        return extractFailLog;
    }

    /**
     * Prints extraction results to standard output: a numbered header per
     * result, then every field (listing the values individually when a field
     * has more than one), then the page description and keywords.
     */
    private static void printResults(List<ExtractResult> extractResults) {
        int i = 1;
        for (ExtractResult extractResult : extractResults) {
            System.out.println((i++) + "? " + extractResult.getUrl() + " ?");
            Map<String, List<ExtractResultItem>> extractResultItems = extractResult.getExtractResultItems();
            for (Map.Entry<String, List<ExtractResultItem>> entry : extractResultItems.entrySet()) {
                String field = entry.getKey();
                List<ExtractResultItem> values = entry.getValue();
                if (values == null || values.isEmpty()) {
                    // A field without values has nothing to print.
                    continue;
                }
                if (values.size() > 1) {
                    int j = 1;
                    System.out.println("\t:" + field);
                    for (ExtractResultItem item : values) {
                        System.out.println("\t\t" + (j++) + "?" + field + " = " + item.getValue());
                    }
                } else {
                    System.out.println("\t" + field + " = " + values.get(0).getValue());
                }
            }
            System.out.println("\tdescription = " + extractResult.getDescription());
            System.out.println("\tkeywords = " + extractResult.getKeywords());
        }
    }

    /**
     * Example: obtains the extractor from a rule URL plus Redis host and port,
     * extracts one news page and prints the results.
     */
    private static void usage2() {
        String allExtractRegularUrl = "http://localhost:8080/html-extractor-web/api/all_extract_regular.jsp";
        String redisHost = "localhost";
        int redisPort = 6379;

        HtmlExtractor htmlExtractor = HtmlExtractor.getInstance(allExtractRegularUrl, redisHost, redisPort);

        String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html";
        List<ExtractResult> extractResults = htmlExtractor.extract(url, "gb2312");

        printResults(extractResults);
    }

    /**
     * Example: builds the extraction rules in code (one URL pattern, one
     * template, two CSS paths), extracts one news page and prints the results.
     */
    private static void usage1() {
        // 1. Build the extraction rules.
        List<UrlPattern> urlPatterns = new ArrayList<>();
        // 1.1 Define the URL pattern.
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html");
        // 1.2 Define the HTML template.
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("??");
        htmlTemplate.setTableName("finance");
        // 1.3 Attach the template to the URL pattern.
        urlPattern.addHtmlTemplate(htmlTemplate);
        // 1.4 Define a CSS path for the title.
        CssPath cssPath = new CssPath();
        cssPath.setCssPath("h1");
        cssPath.setFieldName("title");
        cssPath.setFieldDescription("");
        // 1.5 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.6 Define a CSS path for the body text.
        cssPath = new CssPath();
        cssPath.setCssPath("div#endText");
        cssPath.setFieldName("content");
        cssPath.setFieldDescription("");
        // 1.7 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // Collect the URL pattern.
        urlPatterns.add(urlPattern);
        // 2. Obtain the rule set for these URL patterns.
        ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns);
        // The rule set also offers these three methods:
        //extractRegular.addUrlPatterns(urlPatterns);
        //extractRegular.addUrlPattern(urlPattern);
        //extractRegular.removeUrlPattern(urlPattern.getUrlPattern());
        // 3. Obtain the HTML extractor.
        HtmlExtractor htmlExtractor = HtmlExtractor.getInstance(extractRegular);
        // 4. Extract the page.
        String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html";
        List<ExtractResult> extractResults = htmlExtractor.extract(url, "gb2312");
        // 5. Print the results.
        printResults(extractResults);
    }

    /**
     * Example: builds the extraction rules in code for a product list page
     * (name, link taken from the href attribute, price), extracts the page and
     * prints the results.
     */
    private static void usage3() {
        // 1. Build the extraction rules.
        List<UrlPattern> urlPatterns = new ArrayList<>();
        // 1.1 Define the URL pattern.
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://list.jd.com/list.html\\?cat=([\\d,]+)");
        // 1.2 Define the HTML template.
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("?");
        htmlTemplate.setTableName("jd_goods");
        // 1.3 Attach the template to the URL pattern.
        urlPattern.addHtmlTemplate(htmlTemplate);
        // 1.4 Define a CSS path for the product name.
        CssPath cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-name");
        cssPath.setFieldName("name");
        cssPath.setFieldDescription("??");
        // 1.5 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.6 Define a CSS path for the product link, read from the href attribute.
        cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-name a");
        cssPath.setAttr("href");
        cssPath.setFieldName("link");
        cssPath.setFieldDescription("");
        // 1.7 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.8 Define a CSS path for the price.
        cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-price strong");
        cssPath.setFieldName("price");
        cssPath.setFieldDescription("");
        // 1.9 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // Collect the URL pattern.
        urlPatterns.add(urlPattern);
        // 2. Obtain the rule set for these URL patterns.
        ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns);
        // The rule set also offers these three methods:
        //extractRegular.addUrlPatterns(urlPatterns);
        //extractRegular.addUrlPattern(urlPattern);
        //extractRegular.removeUrlPattern(urlPattern.getUrlPattern());
        // 3. Obtain the HTML extractor.
        HtmlExtractor htmlExtractor = HtmlExtractor.getInstance(extractRegular);
        // 4. Extract the page.
        String url = "http://list.jd.com/list.html?cat=9987,653,655";
        List<ExtractResult> extractResults = htmlExtractor.extract(url, "utf-8");
        // 5. Print the results.
        printResults(extractResults);
    }

    /**
     * Runs one of the usage examples.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Three usage examples are available; enable the one to run.
        //usage1();
        //usage2();
        usage3();
    }
}