Java tutorial
/** * Copyright (c) 2016-2020 Weibo, Inc. * All rights reserved. * * This software is the confidential and proprietary information of Weibo, * Inc. ("Confidential Information"). You shall not * disclose such Confidential Information and shall use it only in * accordance with the terms of the license agreement you entered into with Weibo. */ package com.weibo.datasys.crawler.base.factory; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import org.dom4j.Document; import org.dom4j.DocumentHelper; import org.dom4j.Node; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.weibo.datasys.common.util.StringUtils; import com.weibo.datasys.crawler.base.strategy.rule.seed.AbstractSeedGenerateRule; import com.weibo.datasys.crawler.base.crawlUnit.deduplicator.AbstractDeduplicator; import com.weibo.datasys.crawler.base.crawlUnit.fetcher.AbstractFetcher; import com.weibo.datasys.crawler.base.crawlUnit.preparser.AbstractParser; import com.weibo.datasys.crawler.base.crawlUnit.saver.AbstractSaver; import com.weibo.datasys.crawler.base.crawlUnit.seedProvider.AbstractSeedProvider; import com.weibo.datasys.crawler.base.entity.Task; import com.weibo.datasys.crawler.base.exception.TaskException; import com.weibo.datasys.crawler.base.manager.TaskManager; import com.weibo.datasys.crawler.base.strategy.rule.concurrent.ConcurrentControlRule; import com.weibo.datasys.crawler.base.strategy.rule.crawl.CrawlRule; import com.weibo.datasys.crawler.base.strategy.rule.filter.AbstractFilterRule; import com.weibo.datasys.crawler.base.strategy.rule.parse.AbstractContentExtractRule; import com.weibo.datasys.crawler.base.strategy.rule.parse.FieldRule; import com.weibo.datasys.crawler.base.strategy.rule.parse.UrlExtractRule; import com.weibo.datasys.crawler.base.strategy.rule.process.AbstractProcessRule; import com.weibo.datasys.crawler.base.strategy.rule.save.AbstractSaveRule; import com.weibo.datasys.crawler.impl.crawlUnit.deduplicator.LinkDBBaseDeduplicator; import com.weibo.datasys.crawler.impl.strategy.CrawlStrategy; import com.weibo.datasys.crawler.impl.strategy.ParseStrategy; import com.weibo.datasys.crawler.impl.strategy.SaveStrategy; /** * @author zouyandi * */ public class TaskFactory { private static Logger logger = LoggerFactory.getLogger(TaskFactory.class); /** * * ?xmlTask * * @param xml * @return * @throws TaskException */ public static Task buildTask(String xml) throws TaskException { Task task = null; try { // xml dom Document doc = DocumentHelper.parseText(xml); // task task = new Task(); // task xml task.setTaskXML(xml); // task idname String taskId = doc.selectSingleNode("//Config/taskId").getText().trim(); task.setTaskId(taskId); String taskName = doc.selectSingleNode("//Config/taskName").getText().trim(); task.setTaskName(taskName); // task String cycleString = doc.selectSingleNode("//Config/taskCycle").getText().trim(); task.setTaskCycle(StringUtils.parseLong(cycleString, 0)); // ???? buildCrawlUnit(task, doc); // ? buildCrawlStrategy(task, doc); // ? buildParseStrategy(task, doc); // buildSaveStrategy(task, doc); } catch (Exception e) { logger.error("[buildTask] - {}", e.toString()); throw new TaskException("build task error.", e); } return task; } /** * * * * @param task * @param doc */ @SuppressWarnings("unchecked") private static void buildSaveStrategy(Task task, Document doc) throws Exception { SaveStrategy saveStrategy = new SaveStrategy(task); task.setSaveStrategy(saveStrategy); // ??? saveStrategy.setSeedDS(doc.selectSingleNode("//Config/SaveStrategy/seedDB/dsname").getText().trim()); saveStrategy.setSeedDB(doc.selectSingleNode("//Config/SaveStrategy/seedDB/db").getText().trim()); saveStrategy.setSeedTable(doc.selectSingleNode("//Config/SaveStrategy/seedDB/table").getText().trim()); // ? saveStrategy.setLinkDS(doc.selectSingleNode("//Config/SaveStrategy/linkDB/dsname").getText().trim()); saveStrategy.setLinkDB(doc.selectSingleNode("//Config/SaveStrategy/linkDB/db").getText().trim()); saveStrategy.setLinkTable(doc.selectSingleNode("//Config/SaveStrategy/linkDB/table").getText().trim()); // ? saveStrategy.setPageDS(doc.selectSingleNode("//Config/SaveStrategy/pageDB/dsname").getText().trim()); saveStrategy.setPageDB(doc.selectSingleNode("//Config/SaveStrategy/pageDB/db").getText().trim()); saveStrategy.setPageTable(doc.selectSingleNode("//Config/SaveStrategy/pageDB/table").getText().trim()); List<Node> saveRuleNodes = doc.selectNodes("//Config/SaveStrategy/saveRule"); for (Node saveRuleNode : saveRuleNodes) { String saveRuleName = saveRuleNode.selectSingleNode("@name").getText().trim(); String saveRuleClass = saveRuleNode.selectSingleNode("@class").getText().trim(); AbstractSaveRule saveRule = (AbstractSaveRule) Class.forName(saveRuleClass).getConstructor(Task.class) .newInstance(task); saveRule.setName(saveRuleName); saveRule.configWithParameters(getParaMap(saveRuleNode)); saveStrategy.addSaveRule(saveRule); } } /** * * ? * * @param task * @param doc */ @SuppressWarnings("unchecked") private static void buildParseStrategy(Task task, Document doc) throws Exception { ParseStrategy parseStrategy = new ParseStrategy(task); task.setParseStrategy(parseStrategy); // Field??? List<Node> fieldRuleNodes = doc.selectNodes("//Config/ParseStrategy/FieldRule"); for (Node fieldRuleNode : fieldRuleNodes) { FieldRule fieldRule = new FieldRule(task); fieldRule.configWithParameters(getParaMap(fieldRuleNode)); parseStrategy.addFieldRule(fieldRule); // ??? Node contentRuleNode = fieldRuleNode.selectSingleNode("contentExtractRule"); String contentRuleClass = contentRuleNode.selectSingleNode("@class").getText().trim(); AbstractContentExtractRule contentRule = (AbstractContentExtractRule) Class.forName(contentRuleClass) .getConstructor(Task.class).newInstance(task); contentRule.configWithParameters(getParaMap(contentRuleNode)); fieldRule.setContentRule(contentRule); // ?s List<Node> processRuleNodes = fieldRuleNode.selectNodes("processRule"); for (Node processRuleNode : processRuleNodes) { String processRuleClass = processRuleNode.selectSingleNode("@class").getText().trim(); AbstractProcessRule processRule = (AbstractProcessRule) Class.forName(processRuleClass) .getConstructor(Task.class).newInstance(task); processRule.configWithParameters(getParaMap(processRuleNode)); fieldRule.addProcessRule(processRule); } // s List<Node> filterNodes = fieldRuleNode.selectNodes("filterRule"); for (Node filterNode : filterNodes) { // ??? String filterClass = filterNode.selectSingleNode("@class").getText().trim(); AbstractFilterRule filterRule = (AbstractFilterRule) Class.forName(filterClass) .getConstructor(Task.class).newInstance(task); // ??? filterRule.configWithParameters(getParaMap(filterNode)); fieldRule.addFilter(filterRule); } } } /** * * ? * * @param task * @param doc * @throws Exception */ @SuppressWarnings("unchecked") private static void buildCrawlStrategy(Task task, Document doc) throws Exception { CrawlStrategy crawlStrategy = new CrawlStrategy(task); task.setCrawlStrategy(crawlStrategy); // String priorityString = doc.selectSingleNode("//Config/CrawlStrategy/priority").getText().trim(); crawlStrategy.setPriority(StringUtils.parseInt(priorityString, 0)); // ? String siteEncoding = doc.selectSingleNode("//Config/CrawlStrategy/siteEncoding").getText().trim(); crawlStrategy.setSiteEncoding(siteEncoding); // ? String maxCrawlDepthString = doc.selectSingleNode("//Config/CrawlStrategy/maxCrawlDepth").getText().trim(); int maxCrawlDepth = StringUtils.parseInt(maxCrawlDepthString, Integer.MAX_VALUE); if (maxCrawlDepth <= 0) { maxCrawlDepth = Integer.MAX_VALUE; } crawlStrategy.setMaxCrawlDepth(maxCrawlDepth); // ??? Node seedRuleNode = doc.selectSingleNode("//Config/CrawlStrategy/seedGenerateRule"); String seedRuleClass = seedRuleNode.selectSingleNode("@class").getText().trim(); // ??? AbstractSeedGenerateRule seedRule = (AbstractSeedGenerateRule) Class.forName(seedRuleClass) .getConstructor(Task.class).newInstance(task); seedRule.configWithParameters(getParaMap(seedRuleNode)); crawlStrategy.setSeedRule(seedRule); // ? ConcurrentControlRule concurrentRule = new ConcurrentControlRule(task); crawlStrategy.setConcurrentRule(concurrentRule); Node controlRuleNode = doc.selectSingleNode("//Config/CrawlStrategy/concurrentControlRule"); concurrentRule.configWithParameters(getParaMap(controlRuleNode)); // ? List<Node> crawlRuleNodes = doc.selectNodes("//Config/CrawlStrategy/crawlRule"); for (Node crawlRuleNode : crawlRuleNodes) { CrawlRule crawlRule = new CrawlRule(task); // ??? crawlRule.configWithParameters(getParaMap(crawlRuleNode)); // crawlStrategy.addCrawlRule(crawlRule); // url??? List<Node> urlExtractNodes = crawlRuleNode.selectNodes("urlExtractRule"); for (Node urlExtractNode : urlExtractNodes) { UrlExtractRule urlExtractRule = new UrlExtractRule(task); // ??? urlExtractRule.configWithParameters(getParaMap(urlExtractNode)); crawlRule.addUrlExtractRule(urlExtractRule); // ?s List<Node> processNodes = urlExtractNode.selectNodes("processRule"); for (Node processNode : processNodes) { // ??? String processClass = processNode.selectSingleNode("@class").getText().trim(); AbstractProcessRule processRule = (AbstractProcessRule) Class.forName(processClass) .getConstructor(Task.class).newInstance(task); // ??? processRule.configWithParameters(getParaMap(processNode)); urlExtractRule.addProcessRule(processRule); } // s List<Node> filterNodes = urlExtractNode.selectNodes("filterRule"); for (Node filterNode : filterNodes) { // ??? String filterClass = filterNode.selectSingleNode("@class").getText().trim(); AbstractFilterRule filterRule = (AbstractFilterRule) Class.forName(filterClass) .getConstructor(Task.class).newInstance(task); // ??? filterRule.configWithParameters(getParaMap(filterNode)); urlExtractRule.addFilter(filterRule); } } // http? Node httpParaNode = crawlRuleNode.selectSingleNode("httpRequest"); Map<String, String> paraMap = getParaMap(httpParaNode); crawlRule.setHttpReqParameters(paraMap); } } /** * * ???? * * @param task * @param doc * @throws Exception */ private static void buildCrawlUnit(Task task, Document doc) throws Exception { String fetcherName = doc.selectSingleNode("//Config/Fetcher").getText().trim(); AbstractFetcher fetcher = (AbstractFetcher) Class.forName(fetcherName).newInstance(); fetcher.configWithKeyValues(getParaMap(doc.selectSingleNode("//Config/Fetcher"))); String parserName = doc.selectSingleNode("//Config/Parser").getText().trim(); AbstractParser parser = (AbstractParser) Class.forName(parserName).newInstance(); parser.configWithKeyValues(getParaMap(doc.selectSingleNode("//Config/Parser"))); String saverName = doc.selectSingleNode("//Config/Saver").getText().trim(); AbstractSaver saver = (AbstractSaver) Class.forName(saverName).newInstance(); saver.configWithKeyValues(getParaMap(doc.selectSingleNode("//Config/Saver"))); String seedProviderName = doc.selectSingleNode("//Config/SeedProvider").getText().trim(); AbstractSeedProvider seedProvider = (AbstractSeedProvider) Class.forName(seedProviderName).newInstance(); seedProvider.configWithKeyValues(getParaMap(doc.selectSingleNode("//Config/SeedProvider"))); Node deduplicatorNode = doc.selectSingleNode("//Config/Deduplicator"); AbstractDeduplicator deduplicator = new LinkDBBaseDeduplicator(); if (deduplicatorNode != null) { String deduplicatorName = deduplicatorNode.getText().trim(); if (StringUtils.isNotEmpty(deduplicatorName)) { deduplicator = (AbstractDeduplicator) Class.forName(deduplicatorName).newInstance(); } deduplicator.configWithKeyValues(getParaMap(deduplicatorNode)); } else { deduplicator.configWithKeyValues(new LinkedHashMap<String, String>()); } task.setFetcher(fetcher); task.setParser(parser); task.setSaver(saver); task.setSeedProvider(seedProvider); task.setDeduplicator(deduplicator); } /** * * ???node??Map * * @param node * @return */ @SuppressWarnings("unchecked") private static Map<String, String> getParaMap(Node node) { Map<String, String> paraMap = new LinkedHashMap<String, String>(); if (node != null) { List<Node> paraNodes = node.selectNodes("parameter"); for (Node paraNode : paraNodes) { String key = paraNode.selectSingleNode("@key").getText().trim(); String value = paraNode.getText().trim(); Node valueNode = paraNode.selectSingleNode("@value"); if (valueNode != null) { value = paraNode.selectSingleNode("@value").getText().trim(); } paraMap.put(key, value); } } return paraMap; } }