org.aliuge.crawler.fetcher.FetchWorker.java Source code

Java tutorial

Introduction

Here is the source code for org.aliuge.crawler.fetcher.FetchWorker.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.fetcher;

import java.io.File;
import java.util.List;
import java.util.regex.Pattern;

import org.aliuge.crawler.exception.QueueException;
import org.aliuge.crawler.jobconf.FetchConfig;
import org.aliuge.crawler.model.KeyValue;
import org.aliuge.crawler.page.Page;
import org.aliuge.crawler.page.PageFetchResult;
import org.aliuge.crawler.page.Parser;
import org.aliuge.crawler.pendingqueue.PendingManager;
import org.aliuge.crawler.pendingqueue.PendingPages;
import org.aliuge.crawler.pendingqueue.PendingUrls;
import org.aliuge.crawler.url.WebURL;
import org.aliuge.crawler.util.BloomfilterHelper;
import org.aliuge.crawler.util.UrlUtils;
import org.aliuge.crawler.worker.Worker;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.common.collect.Lists;

/**
 * Base class for crawler workers that download a page, parse it and then either
 * hand the parsed page to the pending-page queue or push the links found on it
 * to the pending-URL queue.
 *
 * <p>Subclasses supply the bookkeeping callbacks {@link #onSuccessed()},
 * {@link #onFailed(WebURL)} and {@link #onIgnored(WebURL)}.
 */
public abstract class FetchWorker extends Worker {

    private static final Logger log = Logger.getLogger(FetchWorker.class);

    /** Helper used to derive the base URL against which page links are resolved. */
    protected UrlUtils urlUtils = new UrlUtils();

    /** Consulted before enqueueing a link so that already-known URLs are skipped. */
    protected BloomfilterHelper bloomfilterHelper = BloomfilterHelper.getInstance();

    /** Queue of URLs for this job; discovered links are added here. */
    protected PendingUrls pendingUrls = null;

    /** Queue of parsed pages for this job. */
    protected PendingPages pendingPages = null;

    /** Fetcher used to download pages. */
    protected DefaultFetcher fetcher;

    /** Job configuration (filters, maximum crawl depth, binary-content flag, ...). */
    protected FetchConfig config;

    /** Parser applied to every successfully downloaded page. */
    protected Parser parser;

    /** Compiled patterns built from {@code config.getFetchUrlFilters()}. */
    public List<Pattern> fetchFilters = Lists.newArrayList();

    /**
     * Compiled extract rules built from {@code config.getExtractUrlfilters()}. The key is
     * the URL pattern; the value, when it has the form {@code "from,to"}, describes a
     * literal replacement applied by {@link #extractFilterAndChangeUrl(String)}.
     */
    public List<KeyValue<Pattern, String>> extractFilters = Lists.newArrayList();

    /**
     * Creates a worker that downloads with a new {@link DefaultFetcher} built from
     * {@code config}.
     *
     * @param jobTag
     *            tag passed to the {@link Worker} super constructor
     * @param config
     *            job configuration
     */
    public FetchWorker(String jobTag, FetchConfig config) {
        this(jobTag, config, new DefaultFetcher(config));
    }

    /**
     * Creates a worker using the given fetcher, looks up the job's pending queues and
     * compiles the configured URL filters.
     *
     * @param jobTag
     *            tag passed to the {@link Worker} super constructor
     * @param config
     *            job configuration
     * @param fetcher
     *            fetcher used to download pages
     */
    public FetchWorker(String jobTag, FetchConfig config, DefaultFetcher fetcher) {
        super(jobTag);
        this.fetcher = fetcher;
        this.config = config;
        pendingUrls = PendingManager.getPendingUlr(config.getJobName());
        pendingPages = PendingManager.getPendingPages(config.getJobName());
        parser = new Parser(config.isFetchBinaryContent());

        // A missing filter list is treated like an empty one instead of failing with an NPE.
        List<String> fetchRules = config.getFetchUrlFilters();
        if (null != fetchRules) {
            for (String regex : fetchRules) {
                fetchFilters.add(Pattern.compile(regex));
            }
        }
        List<KeyValue<String, String>> extractRules = config.getExtractUrlfilters();
        if (null != extractRules) {
            for (KeyValue<String, String> rule : extractRules) {
                extractFilters.add(
                        new KeyValue<Pattern, String>(Pattern.compile(rule.getKey()), rule.getValue()));
            }
        }
    }

    /**
     * Replaces the fetcher used by this worker.
     *
     * @param fetcher
     *            the new fetcher
     * @return this worker, for chaining
     */
    public FetchWorker setFetcher(final DefaultFetcher fetcher) {
        this.fetcher = fetcher;
        return this;
    }

    /**
     * Called by {@link #fetchPage(WebURL)} as soon as the server answers with HTTP 200,
     * before the content is loaded or parsed.
     */
    public abstract void onSuccessed();

    /**
     * Called when a URL could not be processed: non-200 status, content that could not
     * be loaded or parsed, or an exception during processing.
     *
     * @param url
     *            the URL that failed
     */
    public abstract void onFailed(WebURL url);

    /**
     * Called when a URL is skipped: the fetcher reported the page as too big, or (in
     * {@link #fetchPageWhitoutExtractUrl(WebURL)}) the URL did not pass the fetch filter.
     *
     * @param url
     *            the URL that was skipped
     */
    public abstract void onIgnored(WebURL url);

    /**
     * Tests a URL against the fetch filters.
     *
     * @param url
     *            URL to test
     * @return {@code true} if no fetch filters are configured or the whole URL matches at
     *         least one of them; {@code false} otherwise (including a {@code null} URL
     *         when filters are configured)
     */
    public boolean fetchFilter(String url) {
        if (null == fetchFilters || fetchFilters.isEmpty()) {
            return true;
        }
        if (null == url) {
            return false;
        }
        for (Pattern pattern : fetchFilters) {
            if (pattern.matcher(url).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests a URL against the extract filters.
     *
     * @param url
     *            URL to test
     * @return {@code true} if no extract filters are configured or the whole URL matches
     *         at least one of them; {@code false} otherwise (including a {@code null} URL
     *         when filters are configured)
     */
    public boolean extractFilter(String url) {
        if (null == extractFilters || extractFilters.isEmpty()) {
            return true;
        }
        if (null == url) {
            return false;
        }
        for (KeyValue<Pattern, String> rule : extractFilters) {
            if (rule.getKey().matcher(url).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests a URL against the extract filters and applies the replacement attached to the
     * first matching rule.
     *
     * @param url
     *            URL to test
     * @return the URL unchanged if no extract filters are configured; for the first rule
     *         whose pattern matches the whole URL, the URL with every literal occurrence
     *         of {@code from} replaced by {@code to} when the rule value is
     *         {@code "from,to"}, or the unchanged URL when the value is blank or not a
     *         two-part pair; {@code null} if no rule matches
     */
    public String extractFilterAndChangeUrl(String url) {
        if (null == extractFilters || extractFilters.isEmpty()) {
            return url;
        }
        if (null == url) {
            return null;
        }
        for (KeyValue<Pattern, String> rule : extractFilters) {
            if (!rule.getKey().matcher(url).matches()) {
                continue;
            }
            if (StringUtils.isNotBlank(rule.getValue())) {
                String[] replacement = rule.getValue().split(",");
                if (replacement.length == 2) {
                    return url.replace(replacement[0], replacement[1]);
                }
            }
            return url;
        }
        // No rule matched: signal "not an extract URL" to the caller.
        return null;
    }

    /**
     * Downloads and parses a page. If the URL passes the extract filters the parsed page
     * is queued in {@link #pendingPages} (with the possibly rewritten URL); otherwise the
     * links on the page are queued in {@link #pendingUrls}. A {@code null} URL or a blank
     * address is silently skipped.
     *
     * @param url
     *            the URL to fetch
     */
    public void fetchPage(WebURL url) {
        if (null == url || StringUtils.isBlank(url.getUrl())) {
            return;
        }
        PageFetchResult result = null;
        try {
            result = fetcher.fetch(url, true);
            int statusCode = result.getStatusCode();
            if (statusCode == CustomFetchStatus.PageTooBig) {
                onIgnored(url);
                return;
            }
            if (statusCode != HttpStatus.SC_OK) {
                onFailed(url);
                return;
            }

            Page page = new Page(url);
            // NOTE(review): success is reported before the content is loaded and parsed, so
            // a page can be reported both successful and failed. Kept as-is; confirm intent.
            onSuccessed();
            if (!result.fetchContent(page) || !parser.parse(page, url.getUrl())) {
                onFailed(url);
                return;
            }

            // Extract URLs are handed over as pages and their links are not followed.
            String extractUrl = extractFilterAndChangeUrl(url.getUrl());
            if (StringUtils.isNotBlank(extractUrl)) {
                url.setUrl(extractUrl);
                page.setWebURL(url);
                pendingPages.addElement(page);
                return;
            }

            enqueueOutgoingLinks(url, page);
        } catch (QueueException e) {
            log.error("Queue error while processing " + url.getUrl(), e);
            onFailed(url);
        } catch (Exception e) {
            log.error("Failed to process " + url.getUrl(), e);
            onFailed(url);
        } finally {
            if (null != result) {
                result.discardContentIfNotConsumed();
            }
        }
    }

    /**
     * Parses the page as HTML and queues every anchor target that passes the fetch or
     * extract filter and is not yet known to the bloom filter.
     */
    private void enqueueOutgoingLinks(WebURL url, Page page) throws Exception, QueueException {
        // Every link on this page gets the same depth, so the depth limit is checked once
        // up front instead of per link; nothing is parsed when no link could be queued.
        short childDepth = (short) (url.getDepth() + 1);
        if (config.getMaxDepthOfCrawling() != -1
                && (url.getDepth() > config.getMaxDepthOfCrawling()
                        || childDepth > config.getMaxDepthOfCrawling())) {
            return;
        }

        Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                urlUtils.getBaseUrl(page.getWebURL().getUrl()));
        Elements links = doc.getElementsByTag("a");
        for (Element link : links) {
            String linkHref = link.absUrl("href");
            // absUrl yields an empty string when the href is missing or cannot be resolved.
            if (StringUtils.isBlank(linkHref)) {
                continue;
            }
            if (!(fetchFilter(linkHref) || extractFilter(linkHref)) || bloomfilterHelper.exist(linkHref)) {
                continue;
            }

            WebURL purl = new WebURL();
            purl.setName(link.text());
            purl.setUrl(linkHref);
            purl.setDepth(childDepth);
            try {
                if (!pendingUrls.addElement(purl)) {
                    // NOTE(review): records the parent page URL, not the rejected link.
                    FileUtils.writeStringToFile(new File("status/_urls.good"),
                            url.getUrl() + "\n", true);
                }
            } catch (QueueException e) {
                // A single link that cannot be queued must not abort the remaining links.
                log.error("Could not enqueue " + linkHref, e);
            }
        }
    }

    /**
     * Downloads and parses a page without following its links. The URL must pass the
     * fetch filter, otherwise it is reported through {@link #onIgnored(WebURL)}. The
     * parsed page is queued in {@link #pendingPages} only if the URL also passes the
     * extract filter. A {@code null} URL or a blank address is silently skipped.
     *
     * @param url
     *            the URL to fetch
     */
    public void fetchPageWhitoutExtractUrl(WebURL url) {
        if (null == url || StringUtils.isBlank(url.getUrl())) {
            return;
        }
        PageFetchResult result = null;
        try {
            if (!fetchFilter(url.getUrl())) {
                onIgnored(url);
                return;
            }

            result = fetcher.fetch(url, true);
            int statusCode = result.getStatusCode();
            if (statusCode == CustomFetchStatus.PageTooBig) {
                onIgnored(url);
                return;
            }
            if (statusCode != HttpStatus.SC_OK) {
                onFailed(url);
                return;
            }

            Page page = new Page(url);
            pendingUrls.processedSuccess();
            if (!result.fetchContent(page) || !parser.parse(page, url.getUrl())) {
                onFailed(url);
                return;
            }
            if (extractFilter(url.getUrl())) {
                pendingPages.addElement(page);
            }
        } catch (QueueException e) {
            // The narrower catch comes first so it is valid whether or not QueueException
            // is an Exception subtype.
            log.error("Queue error while processing " + url.getUrl(), e);
            onFailed(url);
        } catch (Exception e) {
            log.error("Failed to process " + url.getUrl(), e);
            onFailed(url);
        } finally {
            if (null != result) {
                result.discardContentIfNotConsumed();
            }
        }
    }

}