Java tutorial
/**
 *
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, ??, yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.extractor.html.impl;

import org.apache.commons.lang.StringUtils;
import org.apdplat.extractor.html.HtmlExtractor;
import org.apdplat.extractor.html.HtmlFetcher;
import org.apdplat.extractor.html.model.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Default {@link HtmlExtractor} implementation.
 *
 * <p>Looks up the HTML templates registered for a URL in an {@link ExtractRegular},
 * evaluates each template's CSS paths against the parsed page with jsoup, and
 * collects the extracted field values (or failure logs) into {@link ExtractResult}s.
 *
 * @author ??
 *
 */
public class DefaultHtmlExtractor implements HtmlExtractor {
    private static final Logger LOGGER = LoggerFactory.getLogger(DefaultHtmlExtractor.class);

    /** Source of the URL-pattern to HTML-template mapping used by {@link #extract}. */
    private final ExtractRegular extractRegular;

    /**
     * Creates an extractor backed by the given extraction rules.
     *
     * @param extractRegular rules used to look up the HTML templates for a URL
     */
    public DefaultHtmlExtractor(ExtractRegular extractRegular) {
        this.extractRegular = extractRegular;
    }

    /**
     * Extracts data from a page using every HTML template registered for its URL.
     *
     * <p>The page's {@code keywords} and {@code description} meta tags are read once
     * and attached to every returned result, together with the UTF-8 bytes of
     * {@code html}. A template contributes a result only when it yields at least
     * one extracted item or one failure log. A failure while processing one
     * template is logged and does not prevent the remaining templates from running.
     *
     * @param url URL of the page; used to look up the templates
     * @param html HTML source of the page
     * @return the results, one per contributing template; empty (never {@code null})
     *         when no template is registered for the URL or the page cannot be processed
     */
    @Override
    public List<ExtractResult> extract(String url, String html) {
        List<ExtractResult> extractResults = new ArrayList<>();
        // Look up the templates registered for this URL.
        List<HtmlTemplate> htmlTemplates = extractRegular.getHtmlTemplate(url);
        if (htmlTemplates == null || htmlTemplates.isEmpty()) {
            LOGGER.debug("URL?{}", url);
            return extractResults;
        }
        try {
            Document doc = Jsoup.parse(html);
            // If a meta name occurs more than once, the last occurrence wins.
            String keywords = "";
            String description = "";
            for (Element meta : doc.select("meta")) {
                String name = meta.attr("name");
                if ("keywords".equals(name)) {
                    keywords = meta.attr("content");
                }
                if ("description".equals(name)) {
                    description = meta.attr("content");
                }
            }
            Set<String> tableNames = new HashSet<>();
            for (HtmlTemplate htmlTemplate : htmlTemplates) {
                // Set.add returns false when the table name was already seen; a
                // duplicate is only reported, the template is still processed.
                if (!tableNames.add(htmlTemplate.getTableName())) {
                    LOGGER.debug("?tableName????UrlPattern?{}", htmlTemplate.getUrlPattern().getUrlPattern());
                    LOGGER.debug("{}", htmlTemplates);
                }
                try {
                    ExtractResult extractResult = extractHtmlTemplate(url, htmlTemplate, doc);
                    boolean hasOutput = !extractResult.getExtractFailLogs().isEmpty()
                            || !extractResult.getExtractResultItems().isEmpty();
                    if (hasOutput) {
                        extractResult.setContent(html.getBytes(StandardCharsets.UTF_8));
                        extractResult.setEncoding("utf-8");
                        extractResult.setKeywords(keywords);
                        extractResult.setDescription(description);
                        extractResults.add(extractResult);
                    } else {
                        LOGGER.debug("{} ? {} ?", url, htmlTemplate.getTemplateName());
                    }
                } catch (Exception e) {
                    // Isolate failures per template so the others still run.
                    LOGGER.error("???" + htmlTemplate.getTemplateName(), e);
                }
            }
        } catch (Exception e) {
            LOGGER.error("?: " + url, e);
        }
        return extractResults;
    }

    /**
     * Applies a single HTML template to a parsed page.
     *
     * <p>For every element matched by each CSS path, the element's text (or the
     * configured attribute, when the CSS path names one) is taken as the raw value.
     * If the CSS path has extract functions, each is run and every non-null output
     * is stored under that function's field name; otherwise the raw value is stored
     * under the CSS path's field name.
     *
     * <p>Extraction is fail-fast: on the first blank raw value or the first
     * extract function returning {@code null}, a failure log is recorded and the
     * result is returned immediately. Items gathered before that point stay in it.
     *
     * @param url URL of the page being extracted
     * @param htmlTemplate template whose CSS paths are evaluated
     * @param doc parsed jsoup document of the page
     * @return the extraction result for this template
     */
    private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document doc) {
        ExtractResult extractResult = new ExtractResult();
        extractResult.setUrl(url);
        extractResult.setTableName(htmlTemplate.getTableName());
        for (CssPath cssPath : htmlTemplate.getCssPaths()) {
            Elements elements = doc.select(cssPath.getCssPath());
            for (Element element : elements) {
                // No attribute configured: use the element text; otherwise the attribute value.
                String text = StringUtils.isBlank(cssPath.getAttr())
                        ? element.text()
                        : element.attr(cssPath.getAttr());
                if (StringUtils.isBlank(text)) {
                    // Nothing extracted for this element: record the failure and stop (fail-fast).
                    ExtractFailLog extractFailLog = newFailLog(url, htmlTemplate, cssPath);
                    extractFailLog.setExtractExpression("");
                    extractFailLog.setFieldName(cssPath.getFieldName());
                    extractFailLog.setFieldDescription(cssPath.getFieldDescription());
                    extractResult.addExtractFailLog(extractFailLog);
                    return extractResult;
                }
                if (!cssPath.hasExtractFunction()) {
                    // Plain CSS path: the raw value is the field value.
                    addResultItem(extractResult, cssPath.getFieldName(), text);
                    continue;
                }
                for (ExtractFunction pf : cssPath.getExtractFunctions()) {
                    // NOTE(review): text is reassigned, so each function receives the
                    // previous function's output rather than the element's raw value.
                    // Confirm this chaining is intended; behavior kept as-is.
                    text = ExtractFunctionExecutor.execute(text, doc, cssPath, pf.getExtractExpression());
                    if (text == null) {
                        // The function produced nothing: record the failure and stop (fail-fast).
                        ExtractFailLog extractFailLog = newFailLog(url, htmlTemplate, cssPath);
                        extractFailLog.setExtractExpression(pf.getExtractExpression());
                        extractFailLog.setFieldName(pf.getFieldName());
                        extractFailLog.setFieldDescription(pf.getFieldDescription());
                        extractResult.addExtractFailLog(extractFailLog);
                        return extractResult;
                    }
                    addResultItem(extractResult, pf.getFieldName(), text);
                }
            }
        }
        return extractResult;
    }

    /**
     * Builds a failure log pre-filled with the values shared by every failure of a
     * CSS path: URL, URL pattern, template name, CSS path and table name. The caller
     * sets the extract expression, field name and field description.
     */
    private static ExtractFailLog newFailLog(String url, HtmlTemplate htmlTemplate, CssPath cssPath) {
        ExtractFailLog extractFailLog = new ExtractFailLog();
        extractFailLog.setUrl(url);
        extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
        extractFailLog.setTemplateName(htmlTemplate.getTemplateName());
        extractFailLog.setCssPath(cssPath.getCssPath());
        extractFailLog.setTableName(htmlTemplate.getTableName());
        return extractFailLog;
    }

    /** Adds a field/value pair to the given result. */
    private static void addResultItem(ExtractResult extractResult, String field, String value) {
        ExtractResultItem extractResultItem = new ExtractResultItem();
        extractResultItem.setField(field);
        extractResultItem.setValue(value);
        extractResult.addExtractResultItem(extractResultItem);
    }

    /**
     * Fetches {@code url} with a {@link JSoupHtmlFetcher}, extracts it with a
     * {@link DefaultHtmlExtractor} built on the given rules and prints the results
     * to standard output. Shared by the usage demos.
     */
    private static void fetchExtractAndPrint(ExtractRegular extractRegular, String url) {
        HtmlExtractor htmlExtractor = new DefaultHtmlExtractor(extractRegular);
        HtmlFetcher htmlFetcher = new JSoupHtmlFetcher();
        String html = htmlFetcher.fetch(url);
        List<ExtractResult> extractResults = htmlExtractor.extract(url, html);
        printExtractResults(extractResults);
    }

    /**
     * Prints extraction results to standard output.
     *
     * <p>For an unsuccessful result the failure logs are printed and nothing else.
     * For a successful one each field is printed (fields holding several values are
     * listed one value per line), followed by the page description and keywords.
     */
    private static void printExtractResults(List<ExtractResult> extractResults) {
        int i = 1;
        for (ExtractResult extractResult : extractResults) {
            System.out.println((i++) + "? " + extractResult.getUrl() + " ?");
            if (!extractResult.isSuccess()) {
                System.out.println("?");
                for (ExtractFailLog extractFailLog : extractResult.getExtractFailLogs()) {
                    System.out.println("\turl:" + extractFailLog.getUrl());
                    System.out.println("\turlPattern:" + extractFailLog.getUrlPattern());
                    System.out.println("\ttemplateName:" + extractFailLog.getTemplateName());
                    System.out.println("\tfieldName:" + extractFailLog.getFieldName());
                    System.out.println("\tfieldDescription:" + extractFailLog.getFieldDescription());
                    System.out.println("\tcssPath:" + extractFailLog.getCssPath());
                    if (extractFailLog.getExtractExpression() != null) {
                        System.out.println("\textractExpression:" + extractFailLog.getExtractExpression());
                    }
                }
                continue;
            }
            Map<String, List<ExtractResultItem>> extractResultItems = extractResult.getExtractResultItems();
            for (Map.Entry<String, List<ExtractResultItem>> entry : extractResultItems.entrySet()) {
                String field = entry.getKey();
                List<ExtractResultItem> values = entry.getValue();
                if (values == null || values.isEmpty()) {
                    // Nothing to print for this field; avoids get(0) on an empty list.
                    continue;
                }
                if (values.size() > 1) {
                    int j = 1;
                    System.out.println("\t:" + field);
                    for (ExtractResultItem item : values) {
                        System.out.println("\t\t" + (j++) + "?" + field + " = " + item.getValue());
                    }
                } else {
                    System.out.println("\t" + field + " = " + values.get(0).getValue());
                }
            }
            System.out.println("\tdescription = " + extractResult.getDescription());
            System.out.println("\tkeywords = " + extractResult.getKeywords());
        }
    }

    /**
     * Demo: builds the extraction rules in code (one URL pattern, one template with
     * {@code title} and {@code content} CSS paths), then fetches and extracts a
     * money.163.com article and prints the results.
     */
    private static void usage1() {
        // 1. Build the extraction rules.
        List<UrlPattern> urlPatterns = new ArrayList<>();
        // 1.1 URL pattern the template applies to.
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html");
        // 1.2 HTML template.
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("??");
        htmlTemplate.setTableName("finance");
        // 1.3 Attach the template to the URL pattern.
        urlPattern.addHtmlTemplate(htmlTemplate);
        // 1.4 CSS path for the title field.
        CssPath cssPath = new CssPath();
        cssPath.setCssPath("h1");
        cssPath.setFieldName("title");
        cssPath.setFieldDescription("");
        // 1.5 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.6 CSS path for the content field.
        cssPath = new CssPath();
        cssPath.setCssPath("div#endText");
        cssPath.setFieldName("content");
        cssPath.setFieldDescription("");
        // 1.7 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        urlPatterns.add(urlPattern);

        // 2. Create the rule registry from the URL patterns.
        ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns);
        // Other ExtractRegular calls for changing the rules (kept for reference):
        //extractRegular.addUrlPatterns(urlPatterns);
        //extractRegular.addUrlPattern(urlPattern);
        //extractRegular.removeUrlPattern(urlPattern.getUrlPattern());

        // 3-5. Create the extractor, fetch and extract the page, print the results.
        String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html";
        fetchExtractAndPrint(extractRegular, url);
    }

    /**
     * Demo: obtains the extraction rules via
     * {@code ExtractRegular.getInstance(allExtractRegularUrl, redisHost, redisPort)}
     * instead of building them in code, then fetches and extracts a money.163.com
     * article and prints the results.
     */
    private static void usage2() {
        String allExtractRegularUrl = "http://localhost:8080/html-extractor-web/api/all_extract_regular.jsp";
        String redisHost = "localhost";
        int redisPort = 6379;
        ExtractRegular extractRegular = ExtractRegular.getInstance(allExtractRegularUrl, redisHost, redisPort);

        String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html";
        fetchExtractAndPrint(extractRegular, url);
    }

    /**
     * Demo: builds rules in code for a list.jd.com listing page (name, link taken
     * from the {@code href} attribute, and price), then fetches and extracts the
     * page and prints the results.
     */
    private static void usage3() {
        // 1. Build the extraction rules.
        List<UrlPattern> urlPatterns = new ArrayList<>();
        // 1.1 URL pattern the template applies to.
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://list.jd.com/list.html\\?cat=([\\d,]+)");
        // 1.2 HTML template.
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("?");
        htmlTemplate.setTableName("jd_goods");
        // 1.3 Attach the template to the URL pattern.
        urlPattern.addHtmlTemplate(htmlTemplate);
        // 1.4 CSS path for the name field.
        CssPath cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-name");
        cssPath.setFieldName("name");
        cssPath.setFieldDescription("??");
        // 1.5 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.6 CSS path for the link field; the value is read from the href attribute.
        cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-name a");
        cssPath.setAttr("href");
        cssPath.setFieldName("link");
        cssPath.setFieldDescription("");
        // 1.7 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        // 1.8 CSS path for the price field.
        cssPath = new CssPath();
        cssPath.setCssPath("html body div div div ul li div div.p-price strong");
        cssPath.setFieldName("price");
        cssPath.setFieldDescription("");
        // 1.9 Attach the CSS path to the template.
        htmlTemplate.addCssPath(cssPath);
        urlPatterns.add(urlPattern);

        // 2. Create the rule registry from the URL patterns.
        ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns);
        // Other ExtractRegular calls for changing the rules (kept for reference):
        //extractRegular.addUrlPatterns(urlPatterns);
        //extractRegular.addUrlPattern(urlPattern);
        //extractRegular.removeUrlPattern(urlPattern.getUrlPattern());

        // 3-5. Create the extractor, fetch and extract the page, print the results.
        String url = "http://list.jd.com/list.html?cat=9987,653,655";
        fetchExtractAndPrint(extractRegular, url);
    }

    /**
     * Runs one of the usage demos; only {@code usage3} is enabled.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        //usage1();
        //usage2();
        usage3();
    }
}