Source code

Java tutorial


Here is the source code for net.peacesoft.nutch.crawl.ReFetcher.java


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.peacesoft.nutch.crawl;

import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

// Slf4j Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;
import org.apache.nutch.fetcher.FetcherOutputFormat;

/**
 * A queue-based ReFetcher.
 *
 * <p>This fetcher uses a well-known model of one producer (a QueueFeeder) and
 * many consumers (FetcherThread-s).
 *
 * <p>QueueFeeder reads input fetchlists and populates a set of
 * FetchItemQueue-s, which hold FetchItem-s that describe the items to be
 * fetched. There are as many queues as there are unique hosts, but at any given
 * time the total number of fetch items in all queues is less than a fixed
 * number (currently set to a multiple of the number of threads).
 *
 * <p>As items are consumed from the queues, the QueueFeeder continues to add
 * new input items, so that their total count stays fixed (FetcherThread-s may
 * also add new items to the queues e.g. as a result of redirection) - until
 * all input items are exhausted, at which point the number of items in the
 * queues begins to decrease. When this number reaches 0 fetcher will finish.
 *
 * <p>This fetcher implementation handles per-host blocking itself, instead of
 * delegating this work to protocol-specific plugins. Each per-host queue
 * handles its own "politeness" settings, such as the maximum number of
 * concurrent requests and crawl delay between consecutive requests - and also a
 * list of requests in progress, and the time the last request was finished. As
 * FetcherThread-s ask for new items to be fetched, queues may return eligible
 * items or null if for "politeness" reasons this host's queue is not yet ready.
 *
 * <p>If there are still unfetched items in the queues, but none of the items
 * are ready, FetcherThread-s will spin-wait until either some items become
 * available, or a timeout is reached (at which point the Fetcher will abort,
 * assuming the task is hung).
 *
 * @author Andrzej Bialecki
 */
public class ReFetcher extends Configured implements Tool, MapRunnable<Text, CrawlDatum, Text, NutchWritable> {

    // Threshold compared against the refresh time of a parse-level redirect in
    // FetcherThread.run(): the result of "refreshTime < PERM_REFRESH_TIME" is
    // passed as the "temp" argument of handleRedirect.
    // NOTE(review): the unit is presumably seconds - confirm against the parser.
    public static final int PERM_REFRESH_TIME = 5;
    // Redirect-type label passed to handleRedirect for redirects reported by the parse status.
    public static final String CONTENT_REDIR = "content";
    // Redirect-type label; NOTE(review): presumably used for protocol-level
    // (MOVED / TEMP_MOVED) redirects - its use is not visible in this listing.
    public static final String PROTOCOL_REDIR = "protocol";
    // Shared SLF4J logger; the nested classes log through it as well.
    public static final Logger LOG = LoggerFactory.getLogger(ReFetcher.class);

    public static class InputFormat extends SequenceFileInputFormat<Text, CrawlDatum> {

         * Don't split inputs, to keep things polite.
        public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException {
            FileStatus[] files = listStatus(job);
            FileSplit[] splits = new FileSplit[files.length];
            for (int i = 0; i < files.length; i++) {
                FileStatus cur = files[i];
                splits[i] = new FileSplit(cur.getPath(), 0, cur.getLen(), (String[]) null);
            return splits;

    // Collector and reporter of the running task.
    // NOTE(review): the code that assigns them is outside this listing.
    private OutputCollector<Text, NutchWritable> output;
    private Reporter reporter;
    private String segmentName;
    // Number of FetcherThread-s currently inside run(); incremented on entry
    // and decremented in its finally block.
    private AtomicInteger activeThreads = new AtomicInteger(0);
    // NOTE(review): presumably counts threads that are spin-waiting; no update
    // of this counter is visible in this listing.
    private AtomicInteger spinWaiting = new AtomicInteger(0);
    private long start = System.currentTimeMillis(); // start time of fetcher run
    // Initialised to the start time of the run.
    private AtomicLong lastRequestStart = new AtomicLong(start);
    private AtomicLong bytes = new AtomicLong(0); // total bytes fetched
    private AtomicInteger pages = new AtomicInteger(0); // total pages fetched
    private AtomicInteger errors = new AtomicInteger(0); // total pages errored
    private boolean storingContent;
    private boolean parsing;
    // Shared per-host queues that FetcherThread-s pull their work from.
    ReFetcher.FetchItemQueues fetchQueues;
    // Producer thread; FetcherThread.run() checks feeder.isAlive() to decide
    // whether more items may still arrive.
    ReFetcher.QueueFeeder feeder;

    /**
     * This class describes the item to be fetched.
     */
    private static class FetchItem {

        int outlinkDepth = 0;
        String queueID;
        Text url;
        URL u;
        CrawlDatum datum;

        public FetchItem(Text url, URL u, CrawlDatum datum, String queueID) {
            this(url, u, datum, queueID, 0);

        public FetchItem(Text url, URL u, CrawlDatum datum, String queueID, int outlinkDepth) {
            this.url = url;
            this.u = u;
            this.datum = datum;
            this.queueID = queueID;
            this.outlinkDepth = outlinkDepth;

         * Create an item. Queue id will be created based on
         * <code>queueMode</code> argument, either as a protocol + hostname
         * pair, protocol + IP address pair or protocol+domain pair.
        public static ReFetcher.FetchItem create(Text url, CrawlDatum datum, String queueMode) {
            return create(url, datum, queueMode, 0);

        public static ReFetcher.FetchItem create(Text url, CrawlDatum datum, String queueMode, int outlinkDepth) {
            String queueID;
            URL u = null;
            try {
                u = new URL(url.toString());
            } catch (Exception e) {
                LOG.warn("Cannot parse url: " + url, e);
                return null;
            final String proto = u.getProtocol().toLowerCase();
            String key;
            if (ReFetcher.FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
                try {
                    final InetAddress addr = InetAddress.getByName(u.getHost());
                    key = addr.getHostAddress();
                } catch (final UnknownHostException e) {
                    // unable to resolve it, so don't fall back to host name
                    LOG.warn("Unable to resolve: " + u.getHost() + ", skipping.");
                    return null;
            } else if (ReFetcher.FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
                key = URLUtil.getDomainName(u);
                if (key == null) {
                    LOG.warn("Unknown domain for url: " + url + ", using URL string as key");
                    key = u.toExternalForm();
            } else {
                key = u.getHost();
                if (key == null) {
                    LOG.warn("Unknown host for url: " + url + ", using URL string as key");
                    key = u.toExternalForm();
            queueID = proto + "://" + key.toLowerCase();
            return new ReFetcher.FetchItem(url, u, datum, queueID, outlinkDepth);

        public CrawlDatum getDatum() {
            return datum;

        public String getQueueID() {
            return queueID;

        public Text getUrl() {
            return url;

        public URL getURL2() {
            return u;

    /**
     * This class handles FetchItems which come from the same host ID (be it a
     * proto/hostname or proto/IP pair). It also keeps track of requests in
     * progress and elapsed time between requests.
     */
    private static class FetchItemQueue {

        List<ReFetcher.FetchItem> queue = Collections.synchronizedList(new LinkedList<ReFetcher.FetchItem>());
        Set<ReFetcher.FetchItem> inProgress = Collections.synchronizedSet(new HashSet<ReFetcher.FetchItem>());
        AtomicLong nextFetchTime = new AtomicLong();
        AtomicInteger exceptionCounter = new AtomicInteger();
        long crawlDelay;
        long minCrawlDelay;
        int maxThreads;
        Configuration conf;

        public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay, long minCrawlDelay) {
            this.conf = conf;
            this.maxThreads = maxThreads;
            this.crawlDelay = crawlDelay;
            this.minCrawlDelay = minCrawlDelay;
            // ready to start
            setEndTime(System.currentTimeMillis() - crawlDelay);

        public synchronized int emptyQueue() {
            int presize = queue.size();
            return presize;

        public int getQueueSize() {
            return queue.size();

        public int getInProgressSize() {
            return inProgress.size();

        public int incrementExceptionCounter() {
            return exceptionCounter.incrementAndGet();

        public void finishFetchItem(ReFetcher.FetchItem it, boolean asap) {
            if (it != null) {
                setEndTime(System.currentTimeMillis(), asap);

        public void addFetchItem(ReFetcher.FetchItem it) {
            if (it == null) {

        public void addInProgressFetchItem(ReFetcher.FetchItem it) {
            if (it == null) {

        public ReFetcher.FetchItem getFetchItem() {
            if (inProgress.size() >= maxThreads) {
                return null;
            long now = System.currentTimeMillis();
            if (nextFetchTime.get() > now) {
                return null;
            ReFetcher.FetchItem it = null;
            if (queue.size() == 0) {
                return null;
            try {
                it = queue.remove(0);
            } catch (Exception e) {
                LOG.error("Cannot remove FetchItem from queue or cannot add it to inProgress queue", e);
            return it;

        public synchronized void dump() {
  "  maxThreads    = " + maxThreads);
  "  inProgress    = " + inProgress.size());
  "  crawlDelay    = " + crawlDelay);
  "  minCrawlDelay = " + minCrawlDelay);
  "  nextFetchTime = " + nextFetchTime.get());
  "  now           = " + System.currentTimeMillis());
            for (int i = 0; i < queue.size(); i++) {
                ReFetcher.FetchItem it = queue.get(i);
      "  " + i + ". " + it.url);

        private void setEndTime(long endTime) {
            setEndTime(endTime, false);

        private void setEndTime(long endTime, boolean asap) {
            if (!asap) {
                nextFetchTime.set(endTime + (maxThreads > 1 ? minCrawlDelay : crawlDelay));
            } else {

    /**
     * Convenience class - a collection of queues that keeps track of the total
     * number of items, and provides items eligible for fetching from any queue.
     */
    private static class FetchItemQueues {

        public static final String DEFAULT_ID = "default";
        Map<String, ReFetcher.FetchItemQueue> queues = new HashMap<String, ReFetcher.FetchItemQueue>();
        AtomicInteger totalSize = new AtomicInteger(0);
        int maxThreads;
        long crawlDelay;
        long minCrawlDelay;
        long timelimit = -1;
        int maxExceptionsPerQueue = -1;
        Configuration conf;
        public static final String QUEUE_MODE_HOST = "byHost";
        public static final String QUEUE_MODE_DOMAIN = "byDomain";
        public static final String QUEUE_MODE_IP = "byIP";
        String queueMode;

        public FetchItemQueues(Configuration conf) {
            this.conf = conf;
            this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
            queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
            // check that the mode is known
            if (!queueMode.equals(QUEUE_MODE_IP) && !queueMode.equals(QUEUE_MODE_DOMAIN)
                    && !queueMode.equals(QUEUE_MODE_HOST)) {
                LOG.error("Unknown partition mode : " + queueMode + " - forcing to byHost");
                queueMode = QUEUE_MODE_HOST;
  "Using queue mode : " + queueMode);

            this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
            this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000);
            this.timelimit = conf.getLong("fetcher.timelimit", -1);
            this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1);

        public int getTotalSize() {
            return totalSize.get();

        public int getQueueCount() {
            return queues.size();

        public void addFetchItem(Text url, CrawlDatum datum) {
            ReFetcher.FetchItem it = ReFetcher.FetchItem.create(url, datum, queueMode);
            if (it != null) {

        public synchronized void addFetchItem(ReFetcher.FetchItem it) {
            ReFetcher.FetchItemQueue fiq = getFetchItemQueue(it.queueID);

        public void finishFetchItem(ReFetcher.FetchItem it) {
            finishFetchItem(it, false);

        public void finishFetchItem(ReFetcher.FetchItem it, boolean asap) {
            ReFetcher.FetchItemQueue fiq = queues.get(it.queueID);
            if (fiq == null) {
                LOG.warn("Attempting to finish item from unknown queue: " + it);
            fiq.finishFetchItem(it, asap);

        public synchronized ReFetcher.FetchItemQueue getFetchItemQueue(String id) {
            ReFetcher.FetchItemQueue fiq = queues.get(id);
            if (fiq == null) {
                // initialize queue
                fiq = new ReFetcher.FetchItemQueue(conf, maxThreads, crawlDelay, minCrawlDelay);
                queues.put(id, fiq);
            return fiq;

        public synchronized ReFetcher.FetchItem getFetchItem() {
            Iterator<Map.Entry<String, ReFetcher.FetchItemQueue>> it = queues.entrySet().iterator();
            while (it.hasNext()) {
                ReFetcher.FetchItemQueue fiq =;
                // reap empty queues
                if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
                ReFetcher.FetchItem fit = fiq.getFetchItem();
                if (fit != null) {
                    return fit;
            return null;

        // called only once the feeder has stopped
        public synchronized int checkTimelimit() {
            int count = 0;

            if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
                // emptying the queues
                count = emptyQueues();

                // there might also be a case where totalsize !=0 but number of queues
                // == 0
                // in which case we simply force it to 0 to avoid blocking
                if (totalSize.get() != 0 && queues.isEmpty()) {
            return count;

        // empties the queues (used by timebomb and throughput threshold)
        public synchronized int emptyQueues() {
            int count = 0;

            for (String id : queues.keySet()) {
                ReFetcher.FetchItemQueue fiq = queues.get(id);
                if (fiq.getQueueSize() == 0) {
      "* queue: " + id + " >> dropping! ");
                int deleted = fiq.emptyQueue();
                for (int i = 0; i < deleted; i++) {
                count += deleted;

            return count;

         * Increment the exception counter of a queue in case of an exception
         * e.g. timeout; when higher than a given threshold simply empty the
         * queue.
         * @param queueid
         * @return number of purged items
        public synchronized int checkExceptionThreshold(String queueid) {
            ReFetcher.FetchItemQueue fiq = queues.get(queueid);
            if (fiq == null) {
                return 0;
            if (fiq.getQueueSize() == 0) {
                return 0;
            int excCount = fiq.incrementExceptionCounter();
            if (maxExceptionsPerQueue != -1 && excCount >= maxExceptionsPerQueue) {
                // too many exceptions for items in this queue - purge it
                int deleted = fiq.emptyQueue();
      "* queue: " + queueid + " >> removed " + deleted + " URLs from queue because " + excCount
                        + " exceptions occurred");
                for (int i = 0; i < deleted; i++) {
                return deleted;
            return 0;

        public synchronized void dump() {
            for (String id : queues.keySet()) {
                ReFetcher.FetchItemQueue fiq = queues.get(id);
                if (fiq.getQueueSize() == 0) {
      "* queue: " + id);

    /**
     * This class feeds the queues with input items, and re-fills them as items
     * are consumed by FetcherThread-s.
     */
    private static class QueueFeeder extends Thread {

        private RecordReader<Text, CrawlDatum> reader;
        private ReFetcher.FetchItemQueues queues;
        private int size;
        private long timelimit = -1;

        public QueueFeeder(RecordReader<Text, CrawlDatum> reader, ReFetcher.FetchItemQueues queues, int size) {
            this.reader = reader;
            this.queues = queues;
            this.size = size;

        public void setTimeLimit(long tl) {
            timelimit = tl;

        public void run() {
            boolean hasMore = true;
            int cnt = 0;
            int timelimitcount = 0;
            while (hasMore) {
                if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
                    // enough .. lets' simply
                    // read all the entries from the input without processing them
                    try {
                        Text url = new Text();
                        CrawlDatum datum = new CrawlDatum();
                        hasMore =, datum);
                    } catch (IOException e) {
                        LOG.error("QueueFeeder error reading input, record " + cnt, e);
                int feed = size - queues.getTotalSize();
                if (feed <= 0) {
                    // queues are full - spin-wait until they have some free space
                    try {
                    } catch (Exception e) {
                } else {
                    LOG.debug("-feeding " + feed + " input urls ...");
                    while (feed > 0 && hasMore) {
                        try {
                            Text url = new Text();
                            CrawlDatum datum = new CrawlDatum();
                            hasMore =, datum);
                            if (hasMore) {
                                queues.addFetchItem(url, datum);
                        } catch (IOException e) {
                            LOG.error("QueueFeeder error reading input, record " + cnt, e);
  "QueueFeeder finished: total " + cnt + " records + hit by time limit :" + timelimitcount);

    /**
     * This class picks items from queues and fetches the pages.
     */
    private class FetcherThread extends Thread {

        // Configuration this thread was created with; all settings below are read from it.
        private Configuration conf;
        // Plugin facades built from the configuration in the constructor.
        private URLFilters urlFilters;
        private ScoringFilters scfilters;
        private ParseUtil parseUtil;
        private URLNormalizers normalizers;
        private ProtocolFactory protocolFactory;
        // Upper bound compared against the robots.txt crawl delay in run();
        // URLs whose delay exceeds it are skipped. Set from
        // "fetcher.max.crawl.delay" multiplied by 1000.
        private long maxCrawlDelay;
        // One of the FetchItemQueues.QUEUE_MODE_* values; used when creating
        // FetchItems for redirect targets.
        private String queueMode;
        // Value of "http.redirect.max"; run() stops following redirects once
        // redirectCount exceeds it.
        private int maxRedirect;
        // Representative URL of the item currently being fetched.
        private String reprUrl;
        // Per-item redirect state, reset at the start of every fetch in run().
        private boolean redirecting;
        private int redirectCount;
        // Value of "db.ignore.external.links"; when set, handleRedirect drops
        // redirects to a different host.
        private boolean ignoreExternalLinks;
        // Used by ReFetcher.follow.outlinks.depth in parse
        private int maxOutlinksPerPage;
        // maxOutlinksPerPage, with negative values mapped to Integer.MAX_VALUE.
        private final int maxOutlinks;
        // Value of "db.fetch.interval.default".
        private final int interval;
        // Values of the "fetcher.follow.outlinks.*" settings.
        private int maxOutlinkDepth;
        private int maxOutlinkDepthNumLinks;
        private boolean outlinksIgnoreExternal;
        private int outlinksDepthDivisor;
        // Value of ParseSegment.SKIP_TRUNCATED (default true).
        private boolean skipTruncated;

        /**
         * Creates a daemon fetcher thread and reads all of its settings from
         * the configuration. An unrecognised queue mode is logged and replaced
         * by {@code byHost}.
         *
         * @param conf configuration used to build the plugin facades and to
         *        read the fetcher, redirect and outlink settings
         */
        public FetcherThread(Configuration conf) {
            this.setDaemon(true); // don't hang JVM on exit
            this.setName("FetcherThread"); // use an informative name
            this.conf = conf;
            this.urlFilters = new URLFilters(conf);
            this.scfilters = new ScoringFilters(conf);
            this.parseUtil = new ParseUtil(conf);
            this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
            this.protocolFactory = new ProtocolFactory(conf);
            this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
            // Configured in seconds, kept in milliseconds; long arithmetic avoids int overflow.
            this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000L;
            queueMode = conf.get("fetcher.queue.mode", ReFetcher.FetchItemQueues.QUEUE_MODE_HOST);
            // check that the mode is known
            if (!queueMode.equals(ReFetcher.FetchItemQueues.QUEUE_MODE_IP)
                    && !queueMode.equals(ReFetcher.FetchItemQueues.QUEUE_MODE_DOMAIN)
                    && !queueMode.equals(ReFetcher.FetchItemQueues.QUEUE_MODE_HOST)) {
                LOG.error("Unknown partition mode : " + queueMode + " - forcing to byHost");
                queueMode = ReFetcher.FetchItemQueues.QUEUE_MODE_HOST;
            }
            LOG.info("Using queue mode : " + queueMode);
            this.maxRedirect = conf.getInt("http.redirect.max", 3);
            this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);

            // The property name was empty in this listing, which would always yield
            // the default; restored to the Nutch outlink-limit property.
            maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
            // A negative limit means "no limit".
            maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
            interval = conf.getInt("db.fetch.interval.default", 2592000);
            maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
            outlinksIgnoreExternal = conf.getBoolean("fetcher.follow.outlinks.ignore.external", false);
            maxOutlinkDepthNumLinks = conf.getInt("fetcher.follow.outlinks.num.links", 4);
            outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);
        }

        // Main loop of a fetcher thread. Repeatedly takes a FetchItem from
        // fetchQueues, checks the robots rules for it, fetches it through the
        // protocol plugin and dispatches on the returned ProtocolStatus code,
        // following redirects while "redirecting" is set and redirectCount has
        // not exceeded maxRedirect. activeThreads is incremented on entry and
        // decremented in the finally block.
        //
        // NOTE(review): this listing has visibly lost lines - closing braces,
        // single-word statements (break / continue / return and similar), the
        // receivers of several logging calls and the trailing arguments of
        // some calls. The method does not compile as shown; the gaps are
        // flagged below but deliberately not filled in.
        public void run() {
            activeThreads.incrementAndGet(); // count threads

            ReFetcher.FetchItem fit = null;
            try {

                while (true) {
                    fit = fetchQueues.getFetchItem();
                    if (fit == null) {
                        // No item is ready: keep waiting while the feeder is still
                        // running or items remain queued, otherwise finish.
                        if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
                            LOG.trace(getName() + " spin-waiting ...");
                            // spin-wait.
                            // NOTE(review): the body of this try (presumably a sleep) is missing.
                            try {
                            } catch (Exception e) {
                        } else {
                            // all done, finish this thread
                    // Prefer the representative URL stored in the datum's metadata, if any.
                    Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
                    if (reprUrlWritable == null) {
                        reprUrl = fit.url.toString();
                    } else {
                        reprUrl = reprUrlWritable.toString();
                    try {
                        // fetch the page
                        redirecting = false;
                        redirectCount = 0;
                        do {
                            // Optional "category" metadata entry of the datum; it is logged
                            // and, on success, copied into the content metadata.
                            Text category = (Text) fit.datum.getMetaData().get(new Text("category"));
                            if (LOG.isInfoEnabled()) {
                                if (category == null) {
                          "fetching " + fit.url);
                                } else {
                          "fetching category " + category.toString() + " url " + fit.url);
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("redirectCount=" + redirectCount);
                            redirecting = false;
                            Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
                            RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
                            if (!rules.isAllowed(fit.u)) {
                                // unblock
                                fetchQueues.finishFetchItem(fit, true);
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Denied by robots.txt: " + fit.url);
                                // NOTE(review): the last argument of this output(...) call is missing.
                                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                                reporter.incrCounter("FetcherStatus", "robots_denied", 1);
                            if (rules.getCrawlDelay() > 0) {
                                if (rules.getCrawlDelay() > maxCrawlDelay) {
                                    // unblock
                                    fetchQueues.finishFetchItem(fit, true);
                                    LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay()
                                            + "), skipping");
                                    // NOTE(review): the last argument of this output(...) call is missing.
                                    output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                                    reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
                                } else {
                                    // Apply the robots.txt crawl delay to this item's queue.
                                    ReFetcher.FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                                    fiq.crawlDelay = rules.getCrawlDelay();
                            ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
                            ProtocolStatus status = output.getStatus();
                            Content content = output.getContent();
                            ParseStatus pstatus = null;
                            // unblock queue
                            // NOTE(review): the statement this comment refers to is missing.

                            String urlString = fit.url.toString();

                            reporter.incrCounter("FetcherStatus", status.getName(), 1);

                            switch (status.getCode()) {

                            case ProtocolStatus.WOULDBLOCK:
                                // retry ?
                                // NOTE(review): the body of this case is missing.

                            case ProtocolStatus.SUCCESS: // got a page
                                if (category != null) {
                                    content.getMetadata().add("category", category.toString());
                                pstatus = output(fit.url, fit.datum, content, status,
                                        CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                                // A successful parse may itself report a redirect.
                                if (pstatus != null && pstatus.isSuccess()
                                        && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                    String newUrl = pstatus.getMessage();
                                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                                    Text redirUrl = handleRedirect(fit.url, fit.datum, urlString, newUrl,
                                            refreshTime < ReFetcher.PERM_REFRESH_TIME, ReFetcher.CONTENT_REDIR);
                                    if (redirUrl != null) {
                                        CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
                                                fit.datum.getFetchInterval(), fit.datum.getScore());
                                        // transfer existing metadata to the redir
                                        scfilters.initialScore(redirUrl, newDatum);
                                        if (reprUrl != null) {
                                                    new Text(reprUrl));
                                        fit = ReFetcher.FetchItem.create(redirUrl, newDatum, queueMode);
                                        if (fit != null) {
                                            // NOTE(review): the rest of this statement is missing.
                                            ReFetcher.FetchItemQueue fiq = fetchQueues
                                        } else {
                                            // stop redirecting
                                            redirecting = false;
                                            reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect",

                            case ProtocolStatus.MOVED: // redirect
                            case ProtocolStatus.TEMP_MOVED:
                                int code;
                                boolean temp;
                                if (status.getCode() == ProtocolStatus.MOVED) {
                                    code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                    temp = false;
                                } else {
                                    code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                    temp = true;
                                output(fit.url, fit.datum, content, status, code);
                                String newUrl = status.getMessage();
                                // NOTE(review): the last argument of this handleRedirect(...) call is missing.
                                Text redirUrl = handleRedirect(fit.url, fit.datum, urlString, newUrl, temp,
                                if (redirUrl != null) {
                                    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
                                            fit.datum.getFetchInterval(), fit.datum.getScore());
                                    // transfer existing metadata
                                    scfilters.initialScore(redirUrl, newDatum);
                                    if (reprUrl != null) {
                                        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                                    fit = ReFetcher.FetchItem.create(redirUrl, newDatum, queueMode);
                                    if (fit != null) {
                                        ReFetcher.FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                                    } else {
                                        // stop redirecting
                                        redirecting = false;
                                        reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
                                } else {
                                    // stop redirecting
                                    redirecting = false;

                            case ProtocolStatus.EXCEPTION:
                                logError(fit.url, status.getMessage());
                                // Count the failure against the item's queue; a non-zero result
                                // is the number of URLs purged from that queue.
                                int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
                                if (killedURLs != 0) {
                                    reporter.incrCounter("FetcherStatus", "AboveExceptionThresholdInQueue",
                                /* FALLTHROUGH */
                            case ProtocolStatus.RETRY: // retry
                            case ProtocolStatus.BLOCKED:
                                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);

                            case ProtocolStatus.GONE: // gone
                            case ProtocolStatus.NOTFOUND:
                            case ProtocolStatus.ACCESS_DENIED:
                            case ProtocolStatus.ROBOTS_DENIED:
                                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);

                            case ProtocolStatus.NOTMODIFIED:
                                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);

                                // NOTE(review): presumably the default branch of the switch - its label is missing.
                                if (LOG.isWarnEnabled()) {
                                    LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);

                            // Give up on this URL once the redirect chain is too long.
                            if (redirecting && redirectCount > maxRedirect) {
                                if (LOG.isInfoEnabled()) {
                          " - redirect count exceeded " + fit.url);
                                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED,

                        } while (redirecting && (redirectCount <= maxRedirect));

                    } catch (Throwable t) { // unexpected exception
                        // unblock
                        logError(fit.url, StringUtils.stringifyException(t));
                        output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,

            } catch (Throwable e) {
                if (LOG.isErrorEnabled()) {
                    LOG.error("fetcher caught:" + e.toString());
            } finally {
                // NOTE(review): the body of this if is missing.
                if (fit != null) {
                activeThreads.decrementAndGet(); // count threads
      "-finishing thread " + getName() + ", activeThreads=" + activeThreads);

        /**
         * Decides what to do with a redirect target.
         *
         * <p>The target is normalized (fetcher scope) and URL-filtered. When
         * ignoreExternalLinks is set, a target on a different host is dropped.
         * A surviving target that differs from the source URL is either returned
         * for immediate fetching (maxRedirect &gt; 0) or emitted as a
         * STATUS_LINKED datum for a later fetch cycle.
         *
         * <p>NOTE(review): this copy has lost closing braces and some statements
         * (e.g. the tail of the CrawlDatum constructor call and the catch bodies).
         *
         * @param url       original URL as Text; reassigned locally to the redirect target
         * @param datum     datum of the redirecting page; its fetch interval is copied
         * @param urlString original URL as a string
         * @param newUrl    raw redirect target
         * @param temp      passed to URLUtil.chooseRepr; presumably marks a temporary redirect - confirm
         * @param redirType label used only in debug messages
         * @return the redirect target when it should be fetched now, otherwise null
         */
        private Text handleRedirect(Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp,
                String redirType) throws MalformedURLException, URLFilterException {
            // normalize first, then filter; the filtered value is null-checked further down
            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
            newUrl = urlFilters.filter(newUrl);

            if (ignoreExternalLinks) {
                try {
                    // compare hosts case-insensitively
                    String origHost = new URL(urlString).getHost().toLowerCase();
                    String newHost = new URL(newUrl).getHost().toLowerCase();
                    if (!origHost.equals(newHost)) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(" - ignoring redirect " + redirType + " from " + urlString + " to " + newUrl
                                    + " because external links are ignored");
                        return null;
                } catch (MalformedURLException e) {
                // NOTE(review): no handling is visible for the malformed-URL case

            if (newUrl != null && !newUrl.equals(urlString)) {
                // remember the preferred representation and switch to the new URL
                reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
                url = new Text(newUrl);
                if (maxRedirect > 0) {
                    // follow the redirect within this fetch cycle
                    redirecting = true;
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)");
                    return url;
                } else {
                    // do not follow now: record the target as a linked page instead
                    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(),
                    // transfer existing metadata
                    try {
                        scfilters.initialScore(url, newDatum);
                    } catch (ScoringFilterException e) {
                    if (reprUrl != null) {
                        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                    output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)");
                    return null;
            } else {
                // newUrl is null (filtered out) or identical to the source URL
                if (LOG.isDebugEnabled()) {
                    LOG.debug(" - " + redirType + " redirect skipped: "
                            + (newUrl != null ? "to same url" : "filtered"));
                return null;

        private void logError(Text url, String message) {
            if (LOG.isInfoEnabled()) {
      "fetch of " + url + " failed with: " + message);

        private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus,
                int status) {

            return output(key, datum, content, pstatus, status, 0);

        /**
         * Writes the fetch result for one URL to the output collector and, when
         * parsing is enabled, parses the content and emits the parse output too.
         *
         * <p>NOTE(review): this copy of the method has lost closing braces and
         * several statements; the comments below describe only what is visible.
         *
         * @param key          URL the datum is collected under
         * @param datum        crawl datum to emit; protocol status and content type
         *                     are stored in its metadata
         * @param content      fetched content, or null when nothing was fetched
         * @param pstatus      protocol status to record in the datum, may be null
         * @param status       CrawlDatum fetch status code; also stored in the content
         *                     metadata under Nutch.FETCH_STATUS_KEY
         * @param outlinkDepth depth of this page in the outlink-following chain
         * @return the parse status found for content.getUrl(), or null when there is
         *         no parse result for it
         */
        private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status,
                int outlinkDepth) {

            // record the protocol status alongside the datum
            if (pstatus != null) {
                datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

            ParseResult parseResult = null;
            if (content != null) {

                Metadata metadata = content.getMetadata();

                if (LOG.isDebugEnabled()) {
                    LOG.debug("Fetcher content url  " + content.getUrl());
                    LOG.debug("Fetcher content metadata  " + Arrays.toString(metadata.names()));

                // store the guessed content type in the crawldatum
                if (content.getContentType() != null) {
                    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

                // add segment to metadata
                metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
                // add score to content metadata so that ParseSegment can pick it up.
                try {
                    scfilters.passScoreBeforeParsing(key, datum, content);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
                /* Note: Fetcher will only follow meta-redirects coming from the
                 * original URL. */
                // parse only successful fetches, and only when parsing is enabled
                if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
                    // equivalent to: !skipTruncated || !ParseSegment.isTruncated(content)
                    if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
                        try {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Fetcher start parse url " + content.getUrl());
                            parseResult = this.parseUtil.parse(content);
                        } catch (Exception e) {
                            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));

                    // no parse result: a signature is still computed, against an empty parse
                    if (parseResult == null) {
                        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
                                new ParseStatus().getEmptyParse(conf));

                /* Store status code in content So we can read this value during
                 * parsing (as a separate job) and decide to parse or not.
                // NOTE(review): the block comment above has no closing delimiter in this copy.
                content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));

            try {
                // the datum is always emitted; the raw content only when storingContent is set
                output.collect(key, new NutchWritable(datum));
                if (content != null && storingContent) {
                    output.collect(key, new NutchWritable(content));
                if (parseResult != null) {
                    // a parse result may hold several (url, parse) entries
                    for (Entry<Text, Parse> entry : parseResult) {
                        Text url = entry.getKey();
                        Parse parse = entry.getValue();
                        ParseStatus parseStatus = parse.getData().getStatus();
                        ParseData parseData = parse.getData();

                        // failed parses are replaced by an empty parse
                        if (!parseStatus.isSuccess()) {
                            LOG.warn("Error parsing: " + key + ": " + parseStatus);
                            parse = parseStatus.getEmptyParse(getConf());

                        // Calculate page signature. For non-parsing fetchers this will
                        // be done in ParseSegment
                        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
                        // Ensure segment name and score are in parseData metadata
                        parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                        parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                        // Pass fetch time to content meta
                        parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                        // NOTE(review): the body of this condition is missing in this copy
                        if (url.equals(key)) {
                        try {
                            scfilters.passScoreAfterParsing(url, content, parse);
                        } catch (Exception e) {
                            if (LOG.isWarnEnabled()) {
                                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");

                        String fromHost;

                        // collect outlinks for subsequent db update
                        Outlink[] links = parseData.getOutlinks();
                        int outlinksToStore = Math.min(maxOutlinks, links.length);
                        // fromHost is the lower-cased source host, set only when external links are ignored
                        if (ignoreExternalLinks) {
                            try {
                                fromHost = new URL(url.toString()).getHost().toLowerCase();
                            } catch (MalformedURLException e) {
                                fromHost = null;
                        } else {
                            fromHost = null;

                        int validCount = 0;

                        // Process all outlinks, normalize, filter and deduplicate
                        List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
                        HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
                        for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                            String toUrl = links[i].getToUrl();

                            toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, fromHost,
                                    ignoreExternalLinks, urlFilters, normalizers);
                            // NOTE(review): the rest of the loop body (skipping rejected URLs,
                            // filling outlinkList / outlinks, advancing validCount) is missing here
                            if (toUrl == null) {


                        // Only process depth N outlinks
                        if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                            reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());

                            // Counter to limit num outlinks to follow per page
                            int outlinkCounter = 0;

                            // Calculate variable number of outlinks by depth using the divisor (outlinks = Math.floor(divisor / depth * num.links))
                            // NOTE(review): maxOutlinksByDepth is not read in the visible lines; the loop
                            // below is bounded by maxOutlinkDepthNumLinks instead - confirm intent
                            int maxOutlinksByDepth = (int) Math
                                    .floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);

                            String followUrl;

                            // Walk over the outlinks and add as new FetchItem to the queues
                            Iterator<String> iter = outlinks.iterator();
                            while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                                // NOTE(review): the right-hand side was lost; presumably the next element of iter
                                followUrl =;

                                // Check whether we'll follow external outlinks
                                if (outlinksIgnoreExternal) {
                                    if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {

                                reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

                                CrawlDatum fCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);

                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Create new fetcher metadata: "
                                            + Arrays.toString(fCrawlDatum.getMetaData().values().toArray()));

                                // Create new FetchItem datum with depth incremented
                                ReFetcher.FetchItem fit = ReFetcher.FetchItem.create(new Text(followUrl),
                                        fCrawlDatum, queueMode, outlinkDepth + 1);


                        // Overwrite the outlinks in ParseData with the normalized and filtered set
                        parseData.setOutlinks((Outlink[]) outlinkList.toArray(new Outlink[outlinkList.size()]));

                        // copy the datum's "category" metadata entry, when present, into the parse metadata
                        Text category = (Text) datum.getMetaData().get(new Text("category"));
                        if (category != null) {
                            parseData.getContentMeta().add("category", category.toString());

                        // emit the parse text and data under the parse entry's URL
                        output.collect(url, new NutchWritable(
                                new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            } catch (IOException e) {
                // collector failures are logged, not rethrown
                if (LOG.isErrorEnabled()) {
                    LOG.error("fetcher caught:" + e.toString());

            // return parse status if it exists
            if (parseResult != null && !parseResult.isEmpty()) {
                Parse p = parseResult.get(content.getUrl());
                if (p != null) {
                            // NOTE(review): start of this statement is missing; looks like a counter
                            // keyed by the parse status major code - confirm
                            ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
                    return p.getData().getStatus();
            return null;

    /** Creates a ReFetcher without a configuration argument. */
    // NOTE(review): the constructor body and closing brace are missing from this copy.
    public ReFetcher() {

    /**
     * Creates a ReFetcher for the given configuration.
     *
     * @param conf configuration to use
     */
    // NOTE(review): the constructor body and closing brace are missing from this copy.
    public ReFetcher(Configuration conf) {

    /**
     * Status update hook taking the size of a page in bytes.
     *
     * @param bytesInPage number of bytes in the page
     */
    // NOTE(review): body missing in this copy; presumably advances the page/byte
    // counters that reportStatus() reads - confirm.
    private void updateStatus(int bytesInPage) throws IOException {

    private void reportStatus(int pagesLastSec, int bytesLastSec) throws IOException {
        String status;
        long elapsed = (System.currentTimeMillis() - start) / 1000;

        float avgPagesSec = Math.round(((float) pages.get() * 10) / elapsed) / 10;
        float avgBytesSec = Math.round(((((float) bytes.get()) * 8) / 1000) / elapsed);

        status = activeThreads + " threads, " + fetchQueues.getQueueCount() + " queues, "
                + fetchQueues.getTotalSize() + " URLs queued, " + pages + " pages, " + errors + " errors, "
                + avgPagesSec + " (" + pagesLastSec + ") pages/s, " + avgBytesSec + " (" + bytesLastSec
                + ") kbits/s, ";


    /**
     * Reads per-job settings: the segment name, whether content is stored and
     * whether parsing happens during the fetch.
     *
     * @param job job configuration to read from
     */
    // NOTE(review): closing brace (and possibly further statements) missing in this copy.
    public void configure(JobConf job) {

        this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
        this.storingContent = isStoringContent(job);
        this.parsing = isParsing(job);

    /** Close hook; no statements are visible in this copy. */
    // NOTE(review): closing brace missing in this copy.
    public void close() {

    public static boolean isParsing(Configuration conf) {
        return conf.getBoolean("fetcher.parse", true);

    public static boolean isStoringContent(Configuration conf) {
        return conf.getBoolean("", true);

    /**
     * Runs the fetch for one task: stores the collector and reporter, creates the
     * fetch queues and the QueueFeeder, starts the fetcher threads and then polls
     * until no fetcher thread is active any more.
     *
     * <p>NOTE(review): this copy has lost closing braces and several statements
     * (log calls, the wait inside the polling loop, several if-bodies); comments
     * describe only what is visible.
     *
     * @param input    fetch list records (URL and crawl datum)
     * @param output   collector that receives the fetch output
     * @param reporter reporter used for counters
     * @throws IOException declared by the method; reportStatus() also declares it
     */
    public void run(RecordReader<Text, CrawlDatum> input, OutputCollector<Text, NutchWritable> output,
            Reporter reporter) throws IOException {

        this.output = output;
        this.reporter = reporter;
        this.fetchQueues = new ReFetcher.FetchItemQueues(getConf());

        int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
        if (LOG.isInfoEnabled()) {
  "Fetcher: threads: " + threadCount);

        int timeoutDivisor = getConf().getInt("fetcher.threads.timeout.divisor", 2);
        if (LOG.isInfoEnabled()) {
  "Fetcher: time-out divisor: " + timeoutDivisor);

        int queueDepthMuliplier = getConf().getInt("fetcher.queue.depth.multiplier", 50);

        // third argument of the feeder is threadCount * multiplier
        feeder = new ReFetcher.QueueFeeder(input, fetchQueues, threadCount * queueDepthMuliplier);
        //feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2);

        // the value of the time limit is either -1 or the time where it should finish
        long timelimit = getConf().getLong("fetcher.timelimit", -1);
        // NOTE(review): the body of this condition is missing in this copy
        if (timelimit != -1) {

        // set non-blocking & no-robots mode for HTTP protocol plugins.
        getConf().setBoolean(Protocol.CHECK_BLOCKING, false);
        getConf().setBoolean(Protocol.CHECK_ROBOTS, false);

        for (int i = 0; i < threadCount; i++) { // spawn threads
            new ReFetcher.FetcherThread(getConf()).start();

        // select a timeout that avoids a task timeout
        // (task timeout, default 10 minutes in ms, divided by the configured divisor)
        long timeout = getConf().getInt("mapred.task.timeout", 10 * 60 * 1000) / timeoutDivisor;

        // Used for threshold check, holds pages and bytes processed in the last second
        int pagesLastSec;
        int bytesLastSec;

        // Set to true whenever the threshold has been exceeded for the first time
        boolean throughputThresholdExceeded = false;
        int throughputThresholdNumRetries = 0;

        // -1 disables the throughput threshold check (see the condition in the loop)
        int throughputThresholdPages = getConf().getInt("fetcher.throughput.threshold.pages", -1);
        if (LOG.isInfoEnabled()) {
  "Fetcher: throughput threshold: " + throughputThresholdPages);
        int throughputThresholdMaxRetries = getConf().getInt("fetcher.throughput.threshold.retries", 5);
        if (LOG.isInfoEnabled()) {
  "Fetcher: throughput threshold retries: " + throughputThresholdMaxRetries);
        // compared against System.currentTimeMillis() below, i.e. an absolute timestamp
        long throughputThresholdTimeLimit = getConf().getLong("fetcher.throughput.threshold.check.after", -1);

        do { // wait for threads to exit
            // snapshot the counters before waiting
            pagesLastSec = pages.get();
            bytesLastSec = (int) bytes.get();

            // NOTE(review): the try body is missing; presumably a sleep of about one
            // second, given the "LastSec" naming - confirm
            try {
            } catch (InterruptedException e) {

            // difference to the snapshot = work done during the wait
            pagesLastSec = pages.get() - pagesLastSec;
            bytesLastSec = (int) bytes.get() - bytesLastSec;

            reporter.incrCounter("FetcherStatus", "bytes_downloaded", bytesLastSec);

            reportStatus(pagesLastSec, bytesLastSec);

  "-activeThreads=" + activeThreads + ", spinWaiting=" + spinWaiting.get()
                    + ", fetchQueues.totalSize=" + fetchQueues.getTotalSize());

            // NOTE(review): body missing; triggers once the feeder is done and fewer than 5 items remain
            if (!feeder.isAlive() && fetchQueues.getTotalSize() < 5) {

            // if throughput threshold is enabled
            if (throughputThresholdTimeLimit < System.currentTimeMillis() && throughputThresholdPages != -1) {
                // Check if we're dropping below the threshold
                if (pagesLastSec < throughputThresholdPages) {
                            + ": dropping below configured threshold of "
                            + Integer.toString(throughputThresholdPages) + " pages per second");

                    // Quit if we dropped below threshold too many times
                    if (throughputThresholdNumRetries == throughputThresholdMaxRetries) {
                        LOG.warn("Dropped below threshold too many times, killing!");

                        // Disable the threshold checker
                        throughputThresholdPages = -1;

                        // Empty the queues cleanly and get number of items that were dropped
                        int hitByThrougputThreshold = fetchQueues.emptyQueues();

                        if (hitByThrougputThreshold != 0) {
                            reporter.incrCounter("FetcherStatus", "hitByThrougputThreshold",

            // check timelimit
            // (only once the feeder thread has finished)
            if (!feeder.isAlive()) {
                int hitByTimeLimit = fetchQueues.checkTimelimit();
                if (hitByTimeLimit != 0) {
                    reporter.incrCounter("FetcherStatus", "hitByTimeLimit", hitByTimeLimit);

            // some requests seem to hang, despite all intentions
            // (no request has started for longer than the computed timeout)
            if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Aborting with " + activeThreads + " hung threads.");

        } while (activeThreads.get() > 0);"-activeThreads=" + activeThreads);


    /**
     * Prepares the fetch job for one segment: converts the relative time limits
     * in the configuration into absolute timestamps, logs the outlink-following
     * budget and fills in the job configuration (name, thread count, segment
     * name, input and output paths).
     *
     * <p>NOTE(review): this copy has lost closing braces and several statements;
     * the lines that set the job classes and submit the job are not visible.
     *
     * @param segment segment directory; its generate dir is the input, the
     *                segment itself is the output path
     * @param threads number of fetcher threads, stored as "fetcher.threads.fetch"
     * @throws IOException declared by the method
     */
    public void fetch(Path segment, int threads) throws IOException {


        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
  "Fetcher: starting at " + sdf.format(start));
  "Fetcher: segment: " + segment);

        // set the actual time for the timelimit relative
        // to the beginning of the whole job and not of a specific task
        // otherwise it keeps trying again if a task fails
        // NOTE(review): this key is spelled "ReFetcher.timelimit.mins" while every other
        // key in the visible code starts with lower-case "fetcher." - confirm it is intended
        long timelimit = getConf().getLong("ReFetcher.timelimit.mins", -1);
        if (timelimit != -1) {
            // minutes -> absolute deadline in epoch milliseconds
            timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
  "Fetcher Timelimit set for : " + timelimit);
            getConf().setLong("fetcher.timelimit", timelimit);

        // Set the time limit after which the throughput threshold feature is enabled
        // (read as minutes, default 10, written back under the same key as an absolute timestamp)
        timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

        int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
        if (maxOutlinkDepth > 0) {
  "Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

            int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
            int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

            // sum of the per-depth outlink budgets over all depths
            int totalOutlinksToFollow = 0;
            for (int i = 0; i < maxOutlinkDepth; i++) {
                // NOTE(review): all operands are ints, so the division truncates before
                // Math.floor is applied - confirm whether fractional math was intended
                totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);

  "Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));

        JobConf job = new NutchJob(getConf());
        job.setJobName("fetch " + segment);

        job.setInt("fetcher.threads.fetch", threads);
        job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

        // for politeness, don't permit parallel execution of a single task

        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));


        FileOutputFormat.setOutputPath(job, segment);


        long end = System.currentTimeMillis();"Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    /** Run the ReFetcher. */
    public static void main(String[] args) throws Exception {
        int res =, new ReFetcher(), args);

    public int run(String[] args) throws Exception {

        String usage = "Usage: Fetcher <segment> [-threads n]";

        if (args.length < 1) {
            return -1;

        Path segment = new Path(args[0]);

        int threads = getConf().getInt("fetcher.threads.fetch", 10);
        boolean parsing = false;

        for (int i = 1; i < args.length; i++) { // parse command line
            if (args[i].equals("-threads")) { // found -threads option
                threads = Integer.parseInt(args[++i]);

        getConf().setInt("fetcher.threads.fetch", threads);

        try {
            fetch(segment, threads);
            return 0;
        } catch (Exception e) {
            LOG.error("Fetcher: " + StringUtils.stringifyException(e));
            return -1;


    private void checkConfiguration() {

        // ensure that a value has been set for the agent name and that that
        // agent name is the first value in the agents we advertise for robot
        // rules parsing
        String agentName = getConf().get("");
        if (agentName == null || agentName.trim().length() == 0) {
            String message = "Fetcher: No agents listed in ''" + " property.";
            if (LOG.isErrorEnabled()) {
            throw new IllegalArgumentException(message);
        } else {

            // get all of the agents that we advertise
            String agentNames = getConf().get("http.robots.agents");
            StringTokenizer tok = new StringTokenizer(agentNames, ",");
            ArrayList<String> agents = new ArrayList<String>();
            while (tok.hasMoreTokens()) {

            // if the first one is not equal to our agent name, log fatal and throw
            // an exception
            if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
                String message = "Fetcher: Your '' value should be "
                        + "listed first in 'http.robots.agents' property.";
                if (LOG.isWarnEnabled()) {