// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.aliuge.crawler.jobconf; import java.util.List; import java.util.Properties; import org.aliuge.crawler.exception.ConfigurationException; import org.aliuge.crawler.exception.QueueException; import org.aliuge.crawler.model.KeyValue; import org.aliuge.crawler.pendingqueue.PendingManager; import org.aliuge.crawler.url.WebURL; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.google.common.collect.Lists; /** * @author * @date 2015-7-22 * @desc */ public class FetchConfig extends Configuration { private static Logger log = Logger.getLogger(FetchConfig.class); public FetchConfig() { } private String jobTag = "default"; private String proxyPath = "conf/proxyips.properties"; private String type = "default"; /** * job?? */ private int threadNum = 1; /** * Socket?? */ private int socketTimeoutMilliseconds = 5000; /** * connection?? */ private int connectionTimeout = 5000; /** * */ private int delayBetweenRequests = 200; /** * ?-1? */ private int maxDepthOfCrawling = -1; /** * ??? */ private int maxOutgoingLinksToFollow = 5000; /** * ? 
*/ private boolean fetchBinaryContent = false; /** * ?? */ private String fileSuffix = "jpg,gif,png,avi,mtk"; /** * agent */ private String agent = ""; /** * ?https */ private boolean https = true; /** * ????? */ private boolean onlyDomain = true; /** * ??robots?? */ private boolean robots = true; /** * */ private int maxTotalConnections = 200; /** * ? */ private int maxConnectionsPerHost = 200; /** * ????? */ private int maxDownloadSizePerPage = 1048576; private List<String> proxyIps = null; /** * ? */ private String proxyHost = null; /** * ?? */ private int proxyPort = 80; /** * ??? */ private String proxyUsername = null; /** * ?? */ private String proxyPassword = null; /** * ??? */ private List<String> seeds = Lists.newArrayList(); /** * url */ private List<String> fetchUrlFilters = Lists.newArrayList(); /** * ?Url? */ private List<KeyValue<String, String>> extractUrlfilters = Lists.newArrayList(); /** * ??? * * @param confFile * @return */ @SuppressWarnings("unchecked") public FetchConfig loadConfig(Document confDoc) throws ConfigurationException { try { Document doc = confDoc; super.setJobName(doc.select("job").attr("name")); super.setIndexName(doc.select("job").attr("indexName")); Elements e = doc.select("fetch"); this.type = e.select("type").text(); this.agent = e.select("agent").text(); String temp = e.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } temp = e.select("delayBetweenRequests").text(); if (StringUtils.isNotBlank(temp)) { this.delayBetweenRequests = Integer.parseInt(temp); } temp = e.select("maxDepthOfCrawling").text(); if (StringUtils.isNotBlank(temp)) { this.maxDepthOfCrawling = Integer.parseInt(temp); } temp = e.select("fetchBinaryContent").text(); if (StringUtils.isNotBlank(temp)) { this.fetchBinaryContent = Boolean.parseBoolean(temp); } if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) { this.maxOutgoingLinksToFollow = 
Integer.parseInt(e.select("maxOutgoingLinksToFollow").text()); } temp = e.select("fileSuffix").text(); if (StringUtils.isNotBlank(temp)) { this.fileSuffix = temp; } temp = e.select("maxDownloadSizePerPage").text(); if (StringUtils.isNotBlank(temp)) { this.maxDownloadSizePerPage = Integer.parseInt(temp); } temp = e.select("https").text(); if (StringUtils.isNotBlank(temp)) { this.https = Boolean.parseBoolean(temp); } temp = e.select("onlyDomain").text(); if (StringUtils.isNotBlank(temp)) { this.onlyDomain = Boolean.parseBoolean(temp); } temp = e.select("socketTimeoutMilliseconds").text(); if (StringUtils.isNotBlank(temp)) { this.socketTimeoutMilliseconds = Integer.parseInt(temp); } temp = e.select("connectionTimeout").text(); if (StringUtils.isNotBlank(temp)) { this.connectionTimeout = Integer.parseInt(temp); } temp = e.select("maxTotalConnections").text(); if (StringUtils.isNotBlank(temp)) { this.maxTotalConnections = Integer.parseInt(temp); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text()); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(temp); } temp = e.select("proxy").text(); if (StringUtils.isNotBlank(temp)) { Properties p = PropertyConfigurationHelper.getProperties(temp); this.proxyIps = Lists.newLinkedList(); for (Object o : p.keySet()) { proxyIps.add((String) p.get(o)); } } // seed Elements seeds = doc.select("fetch seeds seed"); for (Element element : seeds) { // WebURL url = new WebURL(); String url = element.text(); if (StringUtils.isBlank(url)) { continue; } url = url.trim(); String area = element.attr("area"); this.seeds.add(url); WebURL areaUrl = new WebURL(area, url); try { PendingManager.getPendingArea(super.getJobName()).addElement(areaUrl); } catch (QueueException e1) { log.error("", e1); e1.printStackTrace(); } // 
BloomfilterHelper.getInstance().add(url.getURL()); } /* * ??Url */ Elements fetchUrlFilters = doc.select("fetchUrlFilters filter"); for (Element element : fetchUrlFilters) { String tmp = element.text(); if (StringUtils.isNoneBlank(tmp)) this.fetchUrlFilters.add(element.text()); } /* * ?????Url */ Elements extractUrlfilters = doc.select("extractUrlfilters filter"); for (Element element : extractUrlfilters) { String tmp = element.text(); String tmp_rep = element.attr("replace"); if (StringUtils.isNoneBlank(tmp)) this.extractUrlfilters.add(new KeyValue(tmp, tmp_rep)); } } catch (NumberFormatException e) { throw new ConfigurationException("?" + e.getMessage()); } // super.setFetchConfig(this); return this; } public String getJobTag() { return jobTag; } public void setJobTag(String jobTag) { this.jobTag = jobTag; } public String getProxyPath() { return proxyPath; } public void setProxyPath(String proxyPath) { this.proxyPath = proxyPath; } public List<String> getProxyIps() { return proxyIps; } public void setProxyIps(List<String> proxyIps) { this.proxyIps = proxyIps; } public int getThreadNum() { return threadNum; } public void setThreadNum(int threadNum) { this.threadNum = threadNum; } public int getSocketTimeoutMilliseconds() { return socketTimeoutMilliseconds; } public void setSocketTimeoutMilliseconds(int socketTimeoutMilliseconds) { this.socketTimeoutMilliseconds = socketTimeoutMilliseconds; } public int getConnectionTimeout() { return connectionTimeout; } public void setConnectionTimeout(int connectionTimeout) { this.connectionTimeout = connectionTimeout; } public int getDelayBetweenRequests() { return delayBetweenRequests; } public void setDelayBetweenRequests(int delayBetweenRequests) { this.delayBetweenRequests = delayBetweenRequests; } public int getMaxDepthOfCrawling() { return maxDepthOfCrawling; } public void setMaxDepthOfCrawling(int maxDepthOfCrawling) { this.maxDepthOfCrawling = maxDepthOfCrawling; } public int getMaxOutgoingLinksToFollow() { return 
maxOutgoingLinksToFollow; } public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) { this.maxOutgoingLinksToFollow = maxOutgoingLinksToFollow; } public boolean isFetchBinaryContent() { return fetchBinaryContent; } public void setFetchBinaryContent(boolean fetchBinaryContent) { this.fetchBinaryContent = fetchBinaryContent; } public String getFileSuffix() { return fileSuffix; } public void setFileSuffix(String fileSuffix) { this.fileSuffix = fileSuffix; } public String getAgent() { return agent; } public void setAgent(String agent) { this.agent = agent; } public boolean isHttps() { return https; } public void setHttps(boolean https) { this.https = https; } public boolean isOnlyDomain() { return onlyDomain; } public void setOnlyDomain(boolean onlyDomain) { this.onlyDomain = onlyDomain; } public boolean isRobots() { return robots; } public void setRobots(boolean robots) { this.robots = robots; } public int getMaxTotalConnections() { return maxTotalConnections; } public void setMaxTotalConnections(int maxTotalConnections) { this.maxTotalConnections = maxTotalConnections; } public int getMaxConnectionsPerHost() { return maxConnectionsPerHost; } public void setMaxConnectionsPerHost(int maxConnectionsPerHost) { this.maxConnectionsPerHost = maxConnectionsPerHost; } public int getMaxDownloadSizePerPage() { return maxDownloadSizePerPage; } public void setMaxDownloadSizePerPage(int maxDownloadSizePerPage) { this.maxDownloadSizePerPage = maxDownloadSizePerPage; } public List<String> getSeeds() { return seeds; } public void setSeeds(List<String> seeds) { this.seeds = seeds; } public List<String> getFetchUrlFilters() { return fetchUrlFilters; } public void setFetchUrlFilters(List<String> fetchUrlFilters) { this.fetchUrlFilters = fetchUrlFilters; } public String getType() { return type; } public void setType(String type) { this.type = type; } public List<KeyValue<String, String>> getExtractUrlfilters() { return extractUrlfilters; } public void 
setExtractUrlfilters(List<KeyValue<String, String>> extractUrlfilters) { this.extractUrlfilters = extractUrlfilters; } public String getProxyHost() { return proxyHost; } public void setProxyHost(String proxyHost) { this.proxyHost = proxyHost; } public int getProxyPort() { return proxyPort; } public void setProxyPort(int proxyPort) { this.proxyPort = proxyPort; } public String getProxyUsername() { return proxyUsername; } public void setProxyUsername(String proxyUsername) { this.proxyUsername = proxyUsername; } public String getProxyPassword() { return proxyPassword; } public void setProxyPassword(String proxyPassword) { this.proxyPassword = proxyPassword; } @Override public String toString() { return "FetchConfig [type=" + type + ", threadNum=" + threadNum + ", socketTimeoutMilliseconds=" + socketTimeoutMilliseconds + ", connectionTimeout=" + connectionTimeout + ", delayBetweenRequests=" + delayBetweenRequests + ", maxDepthOfCrawling=" + maxDepthOfCrawling + ", maxOutgoingLinksToFollow=" + maxOutgoingLinksToFollow + ", fetchBinaryContent=" + fetchBinaryContent + ", fileSuffix=" + fileSuffix + ", agent=" + agent + ", https=" + https + ", onlyDomain=" + onlyDomain + ", robots=" + robots + ", maxTotalConnections=" + maxTotalConnections + ", maxConnectionsPerHost=" + maxConnectionsPerHost + ", maxDownloadSizePerPage="; } }