Java tutorial: GoodCrawler's FetchWorker
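This listing walks through FetchWorker, the abstract fetch worker from the GoodCrawler project (package org.sbs.goodcrawler.fetcher). The class compiles the job's fetch and extract URL filters into regular-expression patterns, sets up a robots.txt server, fetches and parses each page, queues parsed pages for extraction, and, in fetchPage, follows the page's out-links up to the configured crawl depth while deduplicating them with a Bloom filter.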
/**
 * ########################## GoodCrawler ############################
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sbs.goodcrawler.fetcher;

import java.io.File;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpStatus;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.sbs.goodcrawler.conf.Worker;
import org.sbs.goodcrawler.exception.QueueException;
import org.sbs.goodcrawler.jobconf.FetchConfig;
import org.sbs.goodcrawler.page.Page;
import org.sbs.goodcrawler.page.PageFetchResult;
import org.sbs.goodcrawler.page.Parser;
import org.sbs.pendingqueue.PendingManager;
import org.sbs.pendingqueue.PendingPages;
import org.sbs.pendingqueue.PendingUrls;
import org.sbs.robotstxt.RobotstxtConfig;
import org.sbs.robotstxt.RobotstxtServer;
import org.sbs.url.WebURL;
import org.sbs.util.BloomfilterHelper;
import org.sbs.util.UrlUtils;

import com.google.common.collect.Lists;

/**
 * Abstract fetch worker: fetches a page, parses it, queues it for
 * extraction, and optionally follows its out-links.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-7-1
 */
public abstract class FetchWorker extends Worker {

    private Log log = LogFactory.getLog(this.getClass());
    protected UrlUtils urlUtils = new UrlUtils();
    protected BloomfilterHelper bloomfilterHelper = BloomfilterHelper.getInstance();
    /**
     * Pending queue of URLs waiting to be fetched.
     */
    protected PendingUrls pendingUrls = null;
    /**
     * Pending queue of fetched pages waiting for extraction.
     */
    protected PendingPages pendingPages = null;
    /**
     * Page fetcher.
     */
    protected PageFetcher fetcher;
    /**
     * Job configuration.
     */
    protected FetchConfig conf;
    /**
     * Page parser.
     */
    protected Parser parser;
    /**
     * robots.txt server.
     */
    public static RobotstxtServer robotstxtServer;
    public List<Pattern> fetchFilters = Lists.newArrayList();
    public List<Pattern> extractFilters = Lists.newArrayList();

    /**
     * @param conf
     *            when using this constructor, the caller must supply the
     *            fetcher afterwards via {@link #setFetcher(PageFetcher)}
     */
    private FetchWorker(FetchConfig conf) {
        this.conf = conf;
        pendingUrls = PendingManager.getPendingUlr(conf.jobName);
        pendingPages = PendingManager.getPendingPages(conf.jobName);
        parser = new Parser(conf.isFetchBinaryContent());
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        robotstxtConfig.setCacheSize(1000);
        robotstxtConfig.setEnabled(conf.isRobots());
        robotstxtConfig.setUserAgentName(conf.getAgent());
        robotstxtServer = new RobotstxtServer(robotstxtConfig, fetcher);
        // Compile the configured fetch/extract URL filters into patterns.
        List<String> urls1 = conf.getFetchUrlFilters();
        List<String> urls2 = conf.getExtractUrlfilters();
        for (String s : urls1) {
            fetchFilters.add(Pattern.compile(s));
        }
        for (String s : urls2) {
            extractFilters.add(Pattern.compile(s));
        }
    }

    /**
     * @param conf
     * @param fetcher the page fetcher to use
     */
    public FetchWorker(FetchConfig conf, PageFetcher fetcher) {
        this.fetcher = fetcher;
        this.conf = conf;
        pendingUrls = PendingManager.getPendingUlr(conf.jobName);
        pendingPages = PendingManager.getPendingPages(conf.jobName);
        parser = new Parser(conf.isFetchBinaryContent());
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        robotstxtConfig.setCacheSize(1000);
        robotstxtConfig.setEnabled(conf.isRobots());
        robotstxtConfig.setUserAgentName(conf.getAgent());
        robotstxtServer = new RobotstxtServer(robotstxtConfig, fetcher);
        // Compile the configured fetch/extract URL filters into patterns.
        List<String> urls1 = conf.getFetchUrlFilters();
        List<String> urls2 = conf.getExtractUrlfilters();
        for (String s : urls1) {
            fetchFilters.add(Pattern.compile(s));
        }
        for (String s : urls2) {
            extractFilters.add(Pattern.compile(s));
        }
    }

    public FetchWorker setFetcher(final PageFetcher fetcher) {
        this.fetcher = fetcher;
        return this;
    }

    /**
     * Called after a page has been fetched and processed successfully.
     */
    public abstract void onSucceeded();

    /**
     * Called when fetching the given URL failed.
     */
    public abstract void onFailed(WebURL url);

    /**
     * Called when the given URL was ignored (filtered out or too big).
     */
    public abstract void onIgnored(WebURL url);

    /**
     * Fetch filter: decides whether a URL should be fetched at all.
     * @param url
     * @return true if the URL matches any fetch filter, or if no filters are configured
     */
    public boolean fetchFilter(String url) {
        if (null == fetchFilters || fetchFilters.size() == 0) {
            return true;
        }
        for (Pattern p : fetchFilters) {
            if (p.matcher(url).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extract filter: decides whether a fetched page should be queued for extraction.
     * @param url
     * @return true if the URL matches any extract filter, or if no filters are configured
     */
    public boolean extractFilter(String url) {
        // bloomfilter it
        if (null == extractFilters || extractFilters.size() == 0) {
            return true;
        }
        for (Pattern p : extractFilters) {
            if (p.matcher(url).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Fetches a page, queues it for extraction, and enqueues the out-links
     * found on it, subject to the fetch filters, the crawl depth limit and
     * the Bloom filter.
     * @param url the URL to fetch
     */
    public void fetchPage(WebURL url) {
        PageFetchResult result = null;
        try {
            if (null != url && StringUtils.isNotBlank(url.getURL())) {
                // Only fetch URLs that pass the fetch filters.
                if (fetchFilter(url.getURL())) {
                    result = fetcher.fetchHeader(url);
                    // Check the response status.
                    int statusCode = result.getStatusCode();
                    if (statusCode == CustomFetchStatus.PageTooBig) {
                        onIgnored(url);
                        return;
                    }
                    if (statusCode != HttpStatus.SC_OK) {
                        onFailed(url);
                    } else {
                        Page page = new Page(url);
                        pendingUrls.processedSuccess();
                        if (!result.fetchContent(page)) {
                            onFailed(url);
                            return;
                        }
                        if (!parser.parse(page, url.getURL())) {
                            onFailed(url);
                            return;
                        }
                        // Queue the page for extraction if it passes the extract filters.
                        if (extractFilter(url.getURL())) {
                            pendingPages.addElement(page);
                        }
                        // Respect the crawl depth limit (-1 means unlimited).
                        if (url.getDepth() > conf.getMaxDepthOfCrawling()
                                && conf.getMaxDepthOfCrawling() != -1) {
                            return;
                        }
                        // Parse the document and enqueue the out-links it contains.
                        Document doc = Jsoup.parse(
                                new String(page.getContentData(), page.getContentCharset()),
                                urlUtils.getBaseUrl(page.getWebURL().getURL()));
                        Elements links = doc.getElementsByTag("a");
                        if (!links.isEmpty()) {
                            for (Element link : links) {
                                String linkHref = link.absUrl("href");
                                // Skip links that fail the fetch filters or were already seen.
                                if (fetchFilter(linkHref) && !bloomfilterHelper.exist(linkHref)) {
                                    WebURL purl = new WebURL();
                                    purl.setURL(linkHref);
                                    purl.setJobName(conf.jobName);
                                    purl.setDepth((short) (url.getDepth() + 1));
                                    if (purl.getDepth() > conf.getMaxDepthOfCrawling()
                                            && conf.getMaxDepthOfCrawling() != -1) {
                                        return;
                                    }
                                    try {
                                        if (!pendingUrls.addElement(purl, 1000)) {
                                            // Queue full: persist the rejected link so it is not lost.
                                            FileUtils.writeStringToFile(new File("status/_urls.good"),
                                                    purl.getURL() + "\n", true);
                                        }
                                    } catch (QueueException e) {
                                        log.error(e.getMessage());
                                    }
                                }
                            }
                        }
                    }
                } else {
                    onIgnored(url);
                }
            }
        } catch (QueueException e) {
            onFailed(url);
        } catch (Exception e) {
            onFailed(url);
        } finally {
            if (null != result)
                result.discardContentIfNotConsumed();
        }
    }

    /**
     * Fetches a page and queues it for extraction without following the
     * out-links it contains.
     * @param url the URL to fetch
     */
    public void fetchPageWithoutExtractUrl(WebURL url) {
        PageFetchResult result = null;
        try {
            if (null != url && StringUtils.isNotBlank(url.getURL())) {
                // Only fetch URLs that pass the fetch filters.
                if (fetchFilter(url.getURL())) {
                    result = fetcher.fetchHeader(url);
                    // Check the response status.
                    int statusCode = result.getStatusCode();
                    if (statusCode == CustomFetchStatus.PageTooBig) {
                        onIgnored(url);
                        return;
                    }
                    if (statusCode != HttpStatus.SC_OK) {
                        onFailed(url);
                    } else {
                        Page page = new Page(url);
                        pendingUrls.processedSuccess();
                        if (!result.fetchContent(page)) {
                            onFailed(url);
                            return;
                        }
                        if (!parser.parse(page, url.getURL())) {
                            onFailed(url);
                            return;
                        }
                        // Queue the page for extraction if it passes the extract filters.
                        if (extractFilter(url.getURL())) {
                            pendingPages.addElement(page);
                        }
                    }
                } else {
                    onIgnored(url);
                }
            }
        } catch (QueueException e) {
            onFailed(url);
        } catch (Exception e) {
            onFailed(url);
        } finally {
            if (null != result)
                result.discardContentIfNotConsumed();
        }
    }
}
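To make the callback contract concrete, here is a minimal sketch of a subclass, assuming it sits in the same package as FetchWorker so the GoodCrawler types above resolve. The ConsoleFetchWorker name and its console logging are hypothetical, and any additional methods the Worker base class may require (its contract is not shown in this listing) are omitted:

package org.sbs.goodcrawler.fetcher;

import org.sbs.goodcrawler.jobconf.FetchConfig;
import org.sbs.url.WebURL;

/**
 * Hypothetical example subclass that reports each fetch outcome on the console.
 */
public class ConsoleFetchWorker extends FetchWorker {

    public ConsoleFetchWorker(FetchConfig conf, PageFetcher fetcher) {
        super(conf, fetcher);
    }

    @Override
    public void onSucceeded() {
        System.out.println("page fetched and queued for extraction");
    }

    @Override
    public void onFailed(WebURL url) {
        System.err.println("fetch failed: " + url.getURL());
    }

    @Override
    public void onIgnored(WebURL url) {
        System.out.println("ignored (filtered out or too big): " + url.getURL());
    }
}

A driver thread would construct the worker with the job's FetchConfig and a configured PageFetcher, then repeatedly take URLs from the pending queue and pass them to fetchPage, or to fetchPageWithoutExtractUrl when out-links should not be followed.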