org.aliuge.crawler.jobconf.FetchConfig.java Source code

Java tutorial

Introduction

Here is the source code for org.aliuge.crawler.jobconf.FetchConfig.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.jobconf;

import java.util.List;
import java.util.Properties;

import org.aliuge.crawler.exception.ConfigurationException;
import org.aliuge.crawler.exception.QueueException;
import org.aliuge.crawler.model.KeyValue;
import org.aliuge.crawler.pendingqueue.PendingManager;
import org.aliuge.crawler.url.WebURL;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.common.collect.Lists;

/**
 * @author
 * @date 2015-7-22
 * @desc
 */
public class FetchConfig extends Configuration {

    private static Logger log = Logger.getLogger(FetchConfig.class);

    public FetchConfig() {

    }

    private String jobTag = "default";
    private String proxyPath = "conf/proxyips.properties";
    private String type = "default";
    /**
     * job??
     */
    private int threadNum = 1;
    /**
     * Socket??
     */
    private int socketTimeoutMilliseconds = 5000;
    /**
     * connection??
     */
    private int connectionTimeout = 5000;
    /**
     * 
     */
    private int delayBetweenRequests = 200;
    /**
     * ?-1?
     */
    private int maxDepthOfCrawling = -1;
    /**
     * ???
     */
    private int maxOutgoingLinksToFollow = 5000;
    /**
     * ?
     */
    private boolean fetchBinaryContent = false;
    /**
     * ??
     */
    private String fileSuffix = "jpg,gif,png,avi,mtk";
    /**
     * agent
     */
    private String agent = "";
    /**
     * ?https
     */
    private boolean https = true;
    /**
     * ?????
     */
    private boolean onlyDomain = true;
    /**
     * ??robots??
     */
    private boolean robots = true;
    /**
     * 
     */
    private int maxTotalConnections = 200;
    /**
     * ?
     */
    private int maxConnectionsPerHost = 200;
    /**
     * ?????
     */
    private int maxDownloadSizePerPage = 1048576;

    private List<String> proxyIps = null;
    /**
     * ?
     */
    private String proxyHost = null;
    /**
     * ??
     */
    private int proxyPort = 80;

    /**
     * ???
     */
    private String proxyUsername = null;

    /**
     * ??
     */
    private String proxyPassword = null;

    /**
     * ???
     */
    private List<String> seeds = Lists.newArrayList();
    /**
     * url
     */
    private List<String> fetchUrlFilters = Lists.newArrayList();
    /**
     * ?Url?
     */
    private List<KeyValue<String, String>> extractUrlfilters = Lists.newArrayList();

    /**
     * ???
     * 
     * @param confFile
     * @return
     */
    @SuppressWarnings("unchecked")
    public FetchConfig loadConfig(Document confDoc) throws ConfigurationException {
        try {
            Document doc = confDoc;
            super.setJobName(doc.select("job").attr("name"));
            super.setIndexName(doc.select("job").attr("indexName"));
            Elements e = doc.select("fetch");
            this.type = e.select("type").text();
            this.agent = e.select("agent").text();
            String temp = e.select("threadNum").text();
            if (StringUtils.isNotBlank(temp)) {
                this.threadNum = Integer.parseInt(temp);
            }

            temp = e.select("delayBetweenRequests").text();
            if (StringUtils.isNotBlank(temp)) {
                this.delayBetweenRequests = Integer.parseInt(temp);
            }

            temp = e.select("maxDepthOfCrawling").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxDepthOfCrawling = Integer.parseInt(temp);
            }

            temp = e.select("fetchBinaryContent").text();
            if (StringUtils.isNotBlank(temp)) {
                this.fetchBinaryContent = Boolean.parseBoolean(temp);
            }

            if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) {
                this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text());
            }

            temp = e.select("fileSuffix").text();
            if (StringUtils.isNotBlank(temp)) {
                this.fileSuffix = temp;
            }

            temp = e.select("maxDownloadSizePerPage").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxDownloadSizePerPage = Integer.parseInt(temp);
            }

            temp = e.select("https").text();
            if (StringUtils.isNotBlank(temp)) {
                this.https = Boolean.parseBoolean(temp);
            }

            temp = e.select("onlyDomain").text();
            if (StringUtils.isNotBlank(temp)) {
                this.onlyDomain = Boolean.parseBoolean(temp);
            }

            temp = e.select("socketTimeoutMilliseconds").text();
            if (StringUtils.isNotBlank(temp)) {
                this.socketTimeoutMilliseconds = Integer.parseInt(temp);
            }

            temp = e.select("connectionTimeout").text();
            if (StringUtils.isNotBlank(temp)) {
                this.connectionTimeout = Integer.parseInt(temp);
            }

            temp = e.select("maxTotalConnections").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxTotalConnections = Integer.parseInt(temp);
            }

            temp = e.select("maxConnectionsPerHost").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text());
            }

            temp = e.select("maxConnectionsPerHost").text();
            if (StringUtils.isNotBlank(temp)) {
                this.maxConnectionsPerHost = Integer.parseInt(temp);
            }

            temp = e.select("proxy").text();
            if (StringUtils.isNotBlank(temp)) {
                Properties p = PropertyConfigurationHelper.getProperties(temp);
                this.proxyIps = Lists.newLinkedList();
                for (Object o : p.keySet()) {
                    proxyIps.add((String) p.get(o));
                }

            }

            // seed
            Elements seeds = doc.select("fetch seeds seed");
            for (Element element : seeds) {
                // WebURL url = new WebURL();
                String url = element.text();
                if (StringUtils.isBlank(url)) {
                    continue;
                }
                url = url.trim();
                String area = element.attr("area");
                this.seeds.add(url);

                WebURL areaUrl = new WebURL(area, url);

                try {
                    PendingManager.getPendingArea(super.getJobName()).addElement(areaUrl);
                } catch (QueueException e1) {
                    log.error("", e1);
                    e1.printStackTrace();
                }
                // BloomfilterHelper.getInstance().add(url.getURL());

            }

            /*
             * ??Url
             */
            Elements fetchUrlFilters = doc.select("fetchUrlFilters filter");
            for (Element element : fetchUrlFilters) {
                String tmp = element.text();
                if (StringUtils.isNoneBlank(tmp))
                    this.fetchUrlFilters.add(element.text());
            }
            /*
             * ?????Url
             */
            Elements extractUrlfilters = doc.select("extractUrlfilters filter");
            for (Element element : extractUrlfilters) {
                String tmp = element.text();
                String tmp_rep = element.attr("replace");
                if (StringUtils.isNoneBlank(tmp))
                    this.extractUrlfilters.add(new KeyValue(tmp, tmp_rep));
            }
        } catch (NumberFormatException e) {
            throw new ConfigurationException("?" + e.getMessage());
        }
        // super.setFetchConfig(this);
        return this;
    }

    public String getJobTag() {
        return jobTag;
    }

    public void setJobTag(String jobTag) {
        this.jobTag = jobTag;
    }

    public String getProxyPath() {
        return proxyPath;
    }

    public void setProxyPath(String proxyPath) {
        this.proxyPath = proxyPath;
    }

    public List<String> getProxyIps() {
        return proxyIps;
    }

    public void setProxyIps(List<String> proxyIps) {
        this.proxyIps = proxyIps;
    }

    public int getThreadNum() {
        return threadNum;
    }

    public void setThreadNum(int threadNum) {
        this.threadNum = threadNum;
    }

    public int getSocketTimeoutMilliseconds() {
        return socketTimeoutMilliseconds;
    }

    public void setSocketTimeoutMilliseconds(int socketTimeoutMilliseconds) {
        this.socketTimeoutMilliseconds = socketTimeoutMilliseconds;
    }

    public int getConnectionTimeout() {
        return connectionTimeout;
    }

    public void setConnectionTimeout(int connectionTimeout) {
        this.connectionTimeout = connectionTimeout;
    }

    public int getDelayBetweenRequests() {
        return delayBetweenRequests;
    }

    public void setDelayBetweenRequests(int delayBetweenRequests) {
        this.delayBetweenRequests = delayBetweenRequests;
    }

    public int getMaxDepthOfCrawling() {
        return maxDepthOfCrawling;
    }

    public void setMaxDepthOfCrawling(int maxDepthOfCrawling) {
        this.maxDepthOfCrawling = maxDepthOfCrawling;
    }

    public int getMaxOutgoingLinksToFollow() {
        return maxOutgoingLinksToFollow;
    }

    public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) {
        this.maxOutgoingLinksToFollow = maxOutgoingLinksToFollow;
    }

    public boolean isFetchBinaryContent() {
        return fetchBinaryContent;
    }

    public void setFetchBinaryContent(boolean fetchBinaryContent) {
        this.fetchBinaryContent = fetchBinaryContent;
    }

    public String getFileSuffix() {
        return fileSuffix;
    }

    public void setFileSuffix(String fileSuffix) {
        this.fileSuffix = fileSuffix;
    }

    public String getAgent() {
        return agent;
    }

    public void setAgent(String agent) {
        this.agent = agent;
    }

    public boolean isHttps() {
        return https;
    }

    public void setHttps(boolean https) {
        this.https = https;
    }

    public boolean isOnlyDomain() {
        return onlyDomain;
    }

    public void setOnlyDomain(boolean onlyDomain) {
        this.onlyDomain = onlyDomain;
    }

    public boolean isRobots() {
        return robots;
    }

    public void setRobots(boolean robots) {
        this.robots = robots;
    }

    public int getMaxTotalConnections() {
        return maxTotalConnections;
    }

    public void setMaxTotalConnections(int maxTotalConnections) {
        this.maxTotalConnections = maxTotalConnections;
    }

    public int getMaxConnectionsPerHost() {
        return maxConnectionsPerHost;
    }

    public void setMaxConnectionsPerHost(int maxConnectionsPerHost) {
        this.maxConnectionsPerHost = maxConnectionsPerHost;
    }

    public int getMaxDownloadSizePerPage() {
        return maxDownloadSizePerPage;
    }

    public void setMaxDownloadSizePerPage(int maxDownloadSizePerPage) {
        this.maxDownloadSizePerPage = maxDownloadSizePerPage;
    }

    public List<String> getSeeds() {
        return seeds;
    }

    public void setSeeds(List<String> seeds) {
        this.seeds = seeds;
    }

    public List<String> getFetchUrlFilters() {
        return fetchUrlFilters;
    }

    public void setFetchUrlFilters(List<String> fetchUrlFilters) {
        this.fetchUrlFilters = fetchUrlFilters;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public List<KeyValue<String, String>> getExtractUrlfilters() {
        return extractUrlfilters;
    }

    public void setExtractUrlfilters(List<KeyValue<String, String>> extractUrlfilters) {
        this.extractUrlfilters = extractUrlfilters;
    }

    public String getProxyHost() {
        return proxyHost;
    }

    public void setProxyHost(String proxyHost) {
        this.proxyHost = proxyHost;
    }

    public int getProxyPort() {
        return proxyPort;
    }

    public void setProxyPort(int proxyPort) {
        this.proxyPort = proxyPort;
    }

    public String getProxyUsername() {
        return proxyUsername;
    }

    public void setProxyUsername(String proxyUsername) {
        this.proxyUsername = proxyUsername;
    }

    public String getProxyPassword() {
        return proxyPassword;
    }

    public void setProxyPassword(String proxyPassword) {
        this.proxyPassword = proxyPassword;
    }

    @Override
    public String toString() {
        return "FetchConfig [type=" + type + ", threadNum=" + threadNum + ", socketTimeoutMilliseconds="
                + socketTimeoutMilliseconds + ", connectionTimeout=" + connectionTimeout + ", delayBetweenRequests="
                + delayBetweenRequests + ", maxDepthOfCrawling=" + maxDepthOfCrawling
                + ", maxOutgoingLinksToFollow=" + maxOutgoingLinksToFollow + ", fetchBinaryContent="
                + fetchBinaryContent + ", fileSuffix=" + fileSuffix + ", agent=" + agent + ", https=" + https
                + ", onlyDomain=" + onlyDomain + ", robots=" + robots + ", maxTotalConnections="
                + maxTotalConnections + ", maxConnectionsPerHost=" + maxConnectionsPerHost
                + ", maxDownloadSizePerPage=";
    }
}