org.apdplat.extractor.html.impl.DefaultHtmlExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.apdplat.extractor.html.impl.DefaultHtmlExtractor.java

Source

/**
 *
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.extractor.html.impl;

import org.apache.commons.lang.StringUtils;
import org.apdplat.extractor.html.HtmlExtractor;
import org.apdplat.extractor.html.HtmlFetcher;
import org.apdplat.extractor.html.model.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Default {@link HtmlExtractor} implementation.
 * <p>
 * For a given URL it asks an {@link ExtractRegular} for the HTML templates
 * registered for that URL, then applies every CSS path of each template
 * (plus any extract functions attached to the path) to the jsoup-parsed page.
 *
 * @author Yang Shangchuan
 *
 */
public class DefaultHtmlExtractor implements HtmlExtractor {
    private static final Logger LOGGER = LoggerFactory.getLogger(DefaultHtmlExtractor.class);

    /** Rule registry queried for the templates that apply to a URL. */
    private final ExtractRegular extractRegular;

    /**
     * Creates an extractor backed by the given rule registry.
     *
     * @param extractRegular source of the URL-to-template mapping
     */
    public DefaultHtmlExtractor(ExtractRegular extractRegular) {
        this.extractRegular = extractRegular;
    }

    /**
     * Extracts structured data from a page.
     * <p>
     * One {@link ExtractResult} is produced per template that yielded at least
     * one result item or one failure log; templates that yielded nothing are
     * only reported at debug level. Exceptions are logged, never propagated.
     *
     * @param url  URL of the page, used to look up the matching templates
     * @param html HTML source of the page
     * @return the extraction results; empty when no template is registered for
     *         the URL or when parsing fails
     */
    @Override
    public List<ExtractResult> extract(String url, String html) {
        List<ExtractResult> extractResults = new ArrayList<>();
        // Look up the templates registered for this URL.
        List<HtmlTemplate> htmlTemplates = extractRegular.getHtmlTemplate(url);
        if (htmlTemplates == null || htmlTemplates.isEmpty()) {
            LOGGER.debug("URL?{}", url);
            return extractResults;
        }
        try {
            Document doc = Jsoup.parse(html);
            // Page-level metadata is shared by every result of this page.
            String keywords = "";
            String description = "";
            for (Element meta : doc.select("meta")) {
                // HTML metadata names are case-insensitive ("Keywords" == "keywords").
                String name = meta.attr("name");
                if ("keywords".equalsIgnoreCase(name)) {
                    keywords = meta.attr("content");
                } else if ("description".equalsIgnoreCase(name)) {
                    description = meta.attr("content");
                }
            }
            Set<String> tableNames = new HashSet<>();
            for (HtmlTemplate htmlTemplate : htmlTemplates) {
                // Set.add returns false when the table name was already seen:
                // several templates of this URL target the same table.
                if (!tableNames.add(htmlTemplate.getTableName())) {
                    LOGGER.debug("?tableName????UrlPattern?{}",
                            htmlTemplate.getUrlPattern().getUrlPattern());
                    LOGGER.debug("{}", htmlTemplates);
                }
                try {
                    ExtractResult extractResult = extractHtmlTemplate(url, htmlTemplate, doc);
                    boolean producedNothing = extractResult.getExtractFailLogs().isEmpty()
                            && extractResult.getExtractResultItems().isEmpty();
                    if (producedNothing) {
                        LOGGER.debug("{} ? {} ?", url, htmlTemplate.getTemplateName());
                        continue;
                    }
                    // Each result gets its own byte array so results never alias each other.
                    extractResult.setContent(html.getBytes(StandardCharsets.UTF_8));
                    extractResult.setEncoding("utf-8");
                    extractResult.setKeywords(keywords);
                    extractResult.setDescription(description);
                    extractResults.add(extractResult);
                } catch (Exception e) {
                    // A failing template must not prevent the remaining ones from running.
                    LOGGER.error("???" + htmlTemplate.getTemplateName(), e);
                }
            }
        } catch (Exception e) {
            LOGGER.error("?: " + url, e);
        }
        return extractResults;
    }

    /**
     * Applies a single template to an already parsed page.
     * <p>
     * Extraction stops at the first failure (blank text for a matched element,
     * or an extract function returning {@code null}); the returned result then
     * holds the failure log together with whatever items were collected before.
     *
     * @param url          URL of the page
     * @param htmlTemplate template whose CSS paths are applied
     * @param doc          jsoup document of the page
     * @return the result for this template
     */
    private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document doc) {
        ExtractResult extractResult = new ExtractResult();
        extractResult.setUrl(url);
        extractResult.setTableName(htmlTemplate.getTableName());
        for (CssPath cssPath : htmlTemplate.getCssPaths()) {
            Elements elements = doc.select(cssPath.getCssPath());
            for (Element element : elements) {
                // Without a configured attribute the element text is used,
                // otherwise the value of that attribute.
                String text = StringUtils.isBlank(cssPath.getAttr())
                        ? element.text()
                        : element.attr(cssPath.getAttr());
                if (StringUtils.isBlank(text)) {
                    // The CSS path matched an element but yielded no usable text.
                    ExtractFailLog extractFailLog = newFailLog(url, htmlTemplate, cssPath);
                    extractFailLog.setExtractExpression("");
                    extractFailLog.setFieldName(cssPath.getFieldName());
                    extractFailLog.setFieldDescription(cssPath.getFieldDescription());
                    extractResult.addExtractFailLog(extractFailLog);
                    return extractResult;
                }
                if (!cssPath.hasExtractFunction()) {
                    // Plain CSS path: the text itself is the field value.
                    ExtractResultItem extractResultItem = new ExtractResultItem();
                    extractResultItem.setField(cssPath.getFieldName());
                    extractResultItem.setValue(text);
                    extractResult.addExtractResultItem(extractResultItem);
                    continue;
                }
                // Refine the text with the extract functions attached to the CSS path.
                for (ExtractFunction pf : cssPath.getExtractFunctions()) {
                    // NOTE(review): `text` is reassigned, so each function receives the
                    // output of the previous one. Confirm that chaining (rather than
                    // applying every function to the element's original text) is intended.
                    text = ExtractFunctionExecutor.execute(text, doc, cssPath, pf.getExtractExpression());
                    if (text == null) {
                        ExtractFailLog extractFailLog = newFailLog(url, htmlTemplate, cssPath);
                        extractFailLog.setExtractExpression(pf.getExtractExpression());
                        extractFailLog.setFieldName(pf.getFieldName());
                        extractFailLog.setFieldDescription(pf.getFieldDescription());
                        extractResult.addExtractFailLog(extractFailLog);
                        return extractResult;
                    }
                    ExtractResultItem extractResultItem = new ExtractResultItem();
                    extractResultItem.setField(pf.getFieldName());
                    extractResultItem.setValue(text);
                    extractResult.addExtractResultItem(extractResultItem);
                }
            }
        }
        return extractResult;
    }

    /**
     * Creates a failure log pre-filled with the values common to every failure
     * of the given CSS path; the caller adds the extract expression, field name
     * and field description.
     */
    private static ExtractFailLog newFailLog(String url, HtmlTemplate htmlTemplate, CssPath cssPath) {
        ExtractFailLog extractFailLog = new ExtractFailLog();
        extractFailLog.setUrl(url);
        extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
        extractFailLog.setTemplateName(htmlTemplate.getTemplateName());
        extractFailLog.setCssPath(cssPath.getCssPath());
        extractFailLog.setTableName(htmlTemplate.getTableName());
        return extractFailLog;
    }

    /**
     * Demo helper: fetches {@code url} with a {@link JSoupHtmlFetcher}, runs a
     * {@link DefaultHtmlExtractor} built on the given rules and prints the outcome.
     */
    private static void runExtraction(ExtractRegular extractRegular, String url) {
        HtmlExtractor htmlExtractor = new DefaultHtmlExtractor(extractRegular);
        HtmlFetcher htmlFetcher = new JSoupHtmlFetcher();
        String html = htmlFetcher.fetch(url);
        printExtractResults(htmlExtractor.extract(url, html));
    }

    /**
     * Demo helper: prints every result to standard output, either its failure
     * logs (when the result is not successful) or its fields, description and keywords.
     */
    private static void printExtractResults(List<ExtractResult> extractResults) {
        int i = 1;
        for (ExtractResult extractResult : extractResults) {
            System.out.println((i++) + "? " + extractResult.getUrl() + " ?");
            if (!extractResult.isSuccess()) {
                System.out.println("?");
                for (ExtractFailLog extractFailLog : extractResult.getExtractFailLogs()) {
                    System.out.println("\turl:" + extractFailLog.getUrl());
                    System.out.println("\turlPattern:" + extractFailLog.getUrlPattern());
                    System.out.println("\ttemplateName:" + extractFailLog.getTemplateName());
                    System.out.println("\tfieldName:" + extractFailLog.getFieldName());
                    System.out.println("\tfieldDescription:" + extractFailLog.getFieldDescription());
                    System.out.println("\tcssPath:" + extractFailLog.getCssPath());
                    if (extractFailLog.getExtractExpression() != null) {
                        System.out.println("\textractExpression:" + extractFailLog.getExtractExpression());
                    }
                }
                continue;
            }
            Map<String, List<ExtractResultItem>> extractResultItems = extractResult.getExtractResultItems();
            for (Map.Entry<String, List<ExtractResultItem>> entry : extractResultItems.entrySet()) {
                String field = entry.getKey();
                List<ExtractResultItem> values = entry.getValue();
                if (values.size() > 1) {
                    // Multi-valued field: print a numbered list.
                    int j = 1;
                    System.out.println("\t:" + field);
                    for (ExtractResultItem item : values) {
                        System.out.println("\t\t" + (j++) + "?" + field + " = " + item.getValue());
                    }
                } else if (!values.isEmpty()) {
                    System.out.println("\t" + field + " = " + values.get(0).getValue());
                }
                // A field without values has nothing to print.
            }
            System.out.println("\tdescription = " + extractResult.getDescription());
            System.out.println("\tkeywords = " + extractResult.getKeywords());
        }
    }

    /**
     * Usage example 1: rules built in code for a money.163.com article page
     * (title from {@code h1}, content from {@code div#endText}).
     */
    private static void usage1() {
        // 1. Build the extraction rules.
        List<UrlPattern> urlPatterns = new ArrayList<>();
        // 1.1 URL pattern the rules apply to.
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html");
        // 1.2 HTML template, i.e. the target table for the extracted fields.
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("??");
        htmlTemplate.setTableName("finance");
        // 1.3 Attach the template to the URL pattern.
        urlPattern.addHtmlTemplate(htmlTemplate);
        // 1.4 CSS path for the title field.
        CssPath cssPath = new CssPath();
        cssPath.setCssPath("h1");
        cssPath.setFieldName("title");
        cssPath.setFieldDescription("");
        // 1.5 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.6 CSS path for the content field.
        cssPath = new CssPath();
        cssPath.setCssPath("div#endText");
        cssPath.setFieldName("content");
        cssPath.setFieldDescription("");
        // 1.7 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        urlPatterns.add(urlPattern);
        // 2. Obtain the rule registry.
        ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns);
        // Per the original author's notes, the rules can also be changed afterwards:
        //extractRegular.addUrlPatterns(urlPatterns);
        //extractRegular.addUrlPattern(urlPattern);
        //extractRegular.removeUrlPattern(urlPattern.getUrlPattern());
        // 3.-5. Create the extractor, fetch and extract the page, print the results.
        runExtraction(extractRegular, "http://money.163.com/08/1219/16/4THR2TMP002533QK.html");
    }

    /**
     * Usage example 2: rules obtained through
     * {@code ExtractRegular.getInstance(url, redisHost, redisPort)}, pointing at a
     * local web application and a local Redis instance.
     */
    private static void usage2() {
        String allExtractRegularUrl = "http://localhost:8080/html-extractor-web/api/all_extract_regular.jsp";
        String redisHost = "localhost";
        int redisPort = 6379;

        ExtractRegular extractRegular = ExtractRegular.getInstance(allExtractRegularUrl, redisHost, redisPort);
        runExtraction(extractRegular, "http://money.163.com/08/1219/16/4THR2TMP002533QK.html");
    }

    /**
     * Usage example 3: rules built in code for a list.jd.com product listing
     * (name, link taken from the {@code href} attribute, and price).
     */
    private static void usage3() {
        // 1. Build the extraction rules.
        List<UrlPattern> urlPatterns = new ArrayList<>();
        // 1.1 URL pattern the rules apply to.
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://list.jd.com/list.html\\?cat=([\\d,]+)");
        // 1.2 HTML template, i.e. the target table for the extracted fields.
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("?");
        htmlTemplate.setTableName("jd_goods");
        // 1.3 Attach the template to the URL pattern.
        urlPattern.addHtmlTemplate(htmlTemplate);
        // 1.4 CSS path for the name field.
        CssPath cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-name");
        cssPath.setFieldName("name");
        cssPath.setFieldDescription("??");
        // 1.5 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.6 CSS path for the link field; the value is read from the href attribute.
        cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-name a");
        cssPath.setAttr("href");
        cssPath.setFieldName("link");
        cssPath.setFieldDescription("");
        // 1.7 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.8 CSS path for the price field.
        cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-price strong");
        cssPath.setFieldName("price");
        cssPath.setFieldDescription("");
        // 1.9 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        urlPatterns.add(urlPattern);
        // 2. Obtain the rule registry.
        ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns);
        // Per the original author's notes, the rules can also be changed afterwards:
        //extractRegular.addUrlPatterns(urlPatterns);
        //extractRegular.addUrlPattern(urlPattern);
        //extractRegular.removeUrlPattern(urlPattern.getUrlPattern());
        // 3.-5. Create the extractor, fetch and extract the page, print the results.
        runExtraction(extractRegular, "http://list.jd.com/list.html?cat=9987,653,655");
    }

    /**
     * Runs one of the three usage examples.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Pick one of the three usage examples.
        //usage1();
        //usage2();
        usage3();
    }
}