org.apache.nutch.fetcher.FetcherReducer.java Source code


Introduction

Here is the source code for org.apache.nutch.fetcher.FetcherReducer.java.
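
FetcherReducer implements the fetch phase of Nutch 2.x. A QueueFeeder thread reads FetchEntry records into per-host (or per-domain/per-IP) fetch queues, while a pool of FetcherThreads drains those queues, honoring robots.txt rules, per-queue crawl delays, an overall time limit, and a minimum-throughput threshold.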

Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.fetcher;

import crawlercommons.robots.BaseRobotRules;
import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.nutch.crawl.CrawlStatus;
import org.apache.nutch.host.HostDb;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.protocol.*;
import org.apache.nutch.storage.Host;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.ProtocolStatus;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;

import java.io.IOException;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

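/**
 * Reduce side of the fetcher job: keys are partition numbers, values are
 * {@link FetchEntry} records. URLs are buffered in per-host (or per-domain,
 * per-IP) queues and fetched concurrently by a pool of fetcher threads.
 */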
public class FetcherReducer extends GoraReducer<IntWritable, FetchEntry, String, WebPage> {

    public static final Logger LOG = FetcherJob.LOG;

    private final AtomicInteger activeThreads = new AtomicInteger(0);
    private final AtomicInteger spinWaiting = new AtomicInteger(0);

    private final long start = System.currentTimeMillis(); // start time of
                                                           // fetcher run
    private final AtomicLong lastRequestStart = new AtomicLong(start);

    private final AtomicLong bytes = new AtomicLong(0); // total bytes fetched
    private final AtomicInteger pages = new AtomicInteger(0); // total pages
                                                              // fetched
    private final AtomicInteger errors = new AtomicInteger(0); // total pages
                                                               // errored

    private QueueFeeder feeder;

    private final List<FetcherThread> fetcherThreads = new ArrayList<FetcherThread>();

    private FetchItemQueues fetchQueues;

    private boolean storingContent;
    private boolean parse;

    private ParseUtil parseUtil;
    private boolean skipTruncated;

    /**
     * This class describes the item to be fetched.
     */
    private static class FetchItem {
        WebPage page;
        String queueID;
        String url;
        URL u;

        public FetchItem(String url, WebPage page, URL u, String queueID) {
            this.page = page;
            this.url = url;
            this.u = u;
            this.queueID = queueID;
        }

        /**
         * Creates an item. The queue ID is derived from the <code>queueMode</code>
         * argument: a protocol + hostname pair, a protocol + IP address pair,
         * or a protocol + domain pair.
         */
        public static FetchItem create(String url, WebPage page, String queueMode) {
            String queueID;
            URL u = null;
            try {
                u = new URL(url);
            } catch (final Exception e) {
                LOG.warn("Cannot parse url: " + url, e);
                return null;
            }
            final String proto = u.getProtocol().toLowerCase();
            String host;
            if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
                try {
                    final InetAddress addr = InetAddress.getByName(u.getHost());
                    host = addr.getHostAddress();
                } catch (final UnknownHostException e) {
                    // unable to resolve it, so don't fall back to host name
                    LOG.warn("Unable to resolve: " + u.getHost() + ", skipping.");
                    return null;
                }
            } else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
                host = URLUtil.getDomainName(u);
                if (host == null) {
                    LOG.warn("Unknown domain for url: " + url + ", using URL string as key");
                    host = u.toExternalForm();
                }
            } else {
                host = u.getHost();
                if (host == null) {
                    LOG.warn("Unknown host for url: " + url + ", using URL string as key");
                    host = u.toExternalForm();
                }
            }
            queueID = proto + "://" + host.toLowerCase();
            return new FetchItem(url, page, u, queueID);
        }

        @Override
        public String toString() {
            return "FetchItem [queueID=" + queueID + ", url=" + url + ", u=" + u + ", page=" + page + "]";
        }
    }

    /**
     * This class handles FetchItems that come from the same host ID (be it a
     * proto/hostname, proto/domain or proto/IP pair). It also keeps track of
     * requests in progress and the elapsed time between requests.
     */
    private static class FetchItemQueue {
        List<FetchItem> queue = Collections.synchronizedList(new LinkedList<FetchItem>());
        Set<FetchItem> inProgress = Collections.synchronizedSet(new HashSet<FetchItem>());
        AtomicLong nextFetchTime = new AtomicLong();
        long crawlDelay;
        long minCrawlDelay;
        int maxThreads;

        public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay, long minCrawlDelay) {
            this.maxThreads = maxThreads;
            this.crawlDelay = crawlDelay;
            this.minCrawlDelay = minCrawlDelay;
            // ready to start
            setEndTime(System.currentTimeMillis() - crawlDelay);
        }

        public int getQueueSize() {
            return queue.size();
        }

        public int getInProgressSize() {
            return inProgress.size();
        }

        public void finishFetchItem(FetchItem it, boolean asap) {
            if (it != null) {
                inProgress.remove(it);
                setEndTime(System.currentTimeMillis(), asap);
            }
        }

        public void addFetchItem(FetchItem it) {
            if (it == null)
                return;
            queue.add(it);
        }

        @SuppressWarnings("unused")
        public void addInProgressFetchItem(FetchItem it) {
            if (it == null)
                return;
            inProgress.add(it);
        }

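        /**
         * Returns the next eligible item and moves it to the in-progress set,
         * or null if the queue is empty, all permitted threads are busy, or
         * the crawl delay has not yet elapsed.
         */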
        public FetchItem getFetchItem() {
            if (inProgress.size() >= maxThreads)
                return null;
            final long now = System.currentTimeMillis();
            if (nextFetchTime.get() > now)
                return null;
            FetchItem it = null;
            if (queue.size() == 0)
                return null;
            try {
                it = queue.remove(0);
                inProgress.add(it);
            } catch (final Exception e) {
                LOG.error("Cannot remove FetchItem from queue or cannot add it to inProgress queue", e);
            }
            return it;
        }

        public synchronized void dump() {
            LOG.info("  maxThreads    = " + maxThreads);
            LOG.info("  inProgress    = " + inProgress.size());
            LOG.info("  crawlDelay    = " + crawlDelay);
            LOG.info("  minCrawlDelay = " + minCrawlDelay);
            LOG.info("  nextFetchTime = " + nextFetchTime.get());
            LOG.info("  now           = " + System.currentTimeMillis());
            for (int i = 0; i < queue.size(); i++) {
                final FetchItem it = queue.get(i);
                LOG.info("  " + i + ". " + it.url);
            }
        }

        private void setEndTime(long endTime) {
            setEndTime(endTime, false);
        }

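        // asap == true skips the politeness delay (used when the item was not
        // actually fetched, e.g. denied by robots.txt), so the queue unblocks
        // immediately.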
        private void setEndTime(long endTime, boolean asap) {
            if (!asap)
                nextFetchTime.set(endTime + (maxThreads > 1 ? minCrawlDelay : crawlDelay));
            else
                nextFetchTime.set(endTime);
        }

        public synchronized int emptyQueue() {
            int presize = queue.size();
            queue.clear();
            return presize;
        }
    }

    /**
     * Convenience class - a collection of queues that keeps track of the total
     * number of items, and provides items eligible for fetching from any queue.
     */
    private static class FetchItemQueues {
        @SuppressWarnings("unused")
        public static final String DEFAULT_ID = "default";
        Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
        AtomicInteger totalSize = new AtomicInteger(0);
        int maxThreads;
        String queueMode;
        long crawlDelay;
        long minCrawlDelay;
        Configuration conf;
        long timelimit = -1;

        boolean useHostSettings = false;
        HostDb hostDb = null;

        public static final String QUEUE_MODE_HOST = "byHost";
        public static final String QUEUE_MODE_DOMAIN = "byDomain";
        public static final String QUEUE_MODE_IP = "byIP";

        public FetchItemQueues(Configuration conf) throws IOException {
            this.conf = conf;
            this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
            queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
            // check that the mode is known
            if (!queueMode.equals(QUEUE_MODE_IP) && !queueMode.equals(QUEUE_MODE_DOMAIN)
                    && !queueMode.equals(QUEUE_MODE_HOST)) {
                LOG.error("Unknown partition mode : " + queueMode + " - forcing to byHost");
                queueMode = QUEUE_MODE_HOST;
            }
            LOG.info("Using queue mode : " + queueMode);

            // Optionally enable host specific queue behavior
            if (queueMode.equals(QUEUE_MODE_HOST)) {
                useHostSettings = conf.getBoolean("fetcher.queue.use.host.settings", false);
                if (useHostSettings) {
                    LOG.info("Host specific queue settings enabled.");
                    // Initialize the HostDb if we need it.
                    hostDb = new HostDb(conf);
                }
            }

            this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
            this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000);
            this.timelimit = conf.getLong("fetcher.timelimit", -1);
        }

        public int getTotalSize() {
            return totalSize.get();
        }

        public int getQueueCount() {
            return queues.size();
        }

        public void addFetchItem(String url, WebPage page) {
            final FetchItem it = FetchItem.create(url, page, queueMode);
            if (it != null)
                addFetchItem(it);
        }

        public synchronized void addFetchItem(FetchItem it) {
            final FetchItemQueue fiq = getFetchItemQueue(it.queueID);
            fiq.addFetchItem(it);
            totalSize.incrementAndGet();
        }

        public void finishFetchItem(FetchItem it) {
            finishFetchItem(it, false);
        }

        public void finishFetchItem(FetchItem it, boolean asap) {
            final FetchItemQueue fiq = queues.get(it.queueID);
            if (fiq == null) {
                LOG.warn("Attempting to finish item from unknown queue: " + it);
                return;
            }
            fiq.finishFetchItem(it, asap);
        }

        public synchronized FetchItemQueue getFetchItemQueue(String id) {
            FetchItemQueue fiq = queues.get(id);
            if (fiq == null) {
                // Create a new queue
                if (useHostSettings) {
                    // Use host specific queue settings (if defined in the host table)
                    try {
                        String hostname = id.substring(id.indexOf("://") + 3);
                        Host host = hostDb.getByHostName(hostname);
                        if (host != null) {
                            fiq = new FetchItemQueue(conf, host.getInt("q_mt", maxThreads),
                                    host.getLong("q_cd", crawlDelay), host.getLong("q_mcd", minCrawlDelay));
                        }

                    } catch (IOException e) {
                        LOG.error("Error while trying to access host settings", e);
                    }
                }
                if (fiq == null) {
                    // Use queue defaults
                    fiq = new FetchItemQueue(conf, maxThreads, crawlDelay, minCrawlDelay);
                }
                queues.put(id, fiq);
            }
            return fiq;
        }

        public synchronized FetchItem getFetchItem() {
            final Iterator<Map.Entry<String, FetchItemQueue>> it = queues.entrySet().iterator();
            while (it.hasNext()) {
                final FetchItemQueue fiq = it.next().getValue();
                // reap empty queues
                if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
                    it.remove();
                    continue;
                }
                final FetchItem fit = fiq.getFetchItem();
                if (fit != null) {
                    totalSize.decrementAndGet();

                    return fit;
                }
            }
            return null;
        }

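        // Empties all queues once the configured fetcher.timelimit has passed
        // and returns the number of items dropped.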
        public synchronized int checkTimelimit() {
            if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
                return emptyQueues();
            }
            return 0;
        }

        public synchronized void dump() {
            for (final String id : queues.keySet()) {
                final FetchItemQueue fiq = queues.get(id);
                if (fiq.getQueueSize() == 0)
                    continue;
                LOG.info("* queue: " + id);
                fiq.dump();
            }
        }

        // empties the queues (used by the time limit and the throughput threshold)
        public synchronized int emptyQueues() {
            int count = 0;

            // emptying the queues
            for (String id : queues.keySet()) {
                FetchItemQueue fiq = queues.get(id);
                if (fiq.getQueueSize() == 0)
                    continue;
                LOG.info("* queue: " + id + " >> dropping! ");
                int deleted = fiq.emptyQueue();
                for (int i = 0; i < deleted; i++) {
                    totalSize.decrementAndGet();
                }
                count += deleted;
            }
            // There might also be a case where totalSize != 0 but the number
            // of queues == 0, in which case we force totalSize to 0 to avoid
            // blocking.
            if (totalSize.get() != 0 && queues.size() == 0)
                totalSize.set(0);

            return count;
        }
    }

    /**
     * This class picks items from queues and fetches the pages.
     */
    private class FetcherThread extends Thread {
        private final URLFilters urlFilters;
        private final URLNormalizers normalizers;
        private final ProtocolFactory protocolFactory;
        private final long maxCrawlDelay;
        @SuppressWarnings("unused")
        private final boolean byIP;
        private String reprUrl;

        private final Context context;
        private final boolean ignoreExternalLinks;

        public FetcherThread(Context context, int num) {
            this.setDaemon(true); // don't hang JVM on exit
            this.setName("FetcherThread" + num); // use an informative name
            this.context = context;
            Configuration conf = context.getConfiguration();
            this.urlFilters = new URLFilters(conf);
            this.protocolFactory = new ProtocolFactory(conf);
            this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
            this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
            // backward-compatible default setting
            this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
            this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
        }

        @Override
        @SuppressWarnings("fallthrough")
        public void run() {
            activeThreads.incrementAndGet(); // count threads

            FetchItem fit = null;
            try {

                while (true) {
                    fit = fetchQueues.getFetchItem();
                    if (fit == null) {
                        if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug(getName() + " fetchQueues.getFetchItem() was null, spin-waiting ...");
                            }
                            // spin-wait.
                            spinWaiting.incrementAndGet();
                            try {
                                Thread.sleep(500);
                            } catch (final Exception e) {
                            }
                            spinWaiting.decrementAndGet();
                            continue;
                        } else {
                            // all done, finish this thread
                            return;
                        }
                    }
                    lastRequestStart.set(System.currentTimeMillis());
                    if (fit.page.getReprUrl() == null) {
                        reprUrl = fit.url;
                    } else {
                        reprUrl = TableUtil.toString(fit.page.getReprUrl());
                    }
                    try {
                        LOG.info("fetching " + fit.url + " (queue crawl delay="
                                + fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)");

                        // fetch the page
                        final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
                        final BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.page);
                        if (!rules.isAllowed(fit.u.toString())) {
                            // unblock
                            fetchQueues.finishFetchItem(fit, true);
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Denied by robots.txt: " + fit.url);
                            }
                            output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
                            continue;
                        }
                        if (rules.getCrawlDelay() > 0) {
                            if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                                // unblock
                                fetchQueues.finishFetchItem(fit, true);
                                LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay()
                                        + "), skipping");
                                output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
                                        CrawlStatus.STATUS_GONE);
                                continue;
                            } else {
                                final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                                fiq.crawlDelay = rules.getCrawlDelay();
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to "
                                            + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                                }
                            }
                        }
                        final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
                        final ProtocolStatus status = output.getStatus();
                        final Content content = output.getContent();
                        // unblock queue
                        fetchQueues.finishFetchItem(fit);

                        context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode()))
                                .increment(1);

                        int length = 0;
                        if (content != null && content.getContent() != null)
                            length = content.getContent().length;
                        updateStatus(length);

                        switch (status.getCode()) {

                        case ProtocolStatusCodes.WOULDBLOCK:
                            // retry ?
                            fetchQueues.addFetchItem(fit);
                            break;

                        case ProtocolStatusCodes.SUCCESS: // got a page
                            output(fit, content, status, CrawlStatus.STATUS_FETCHED);
                            break;

                        case ProtocolStatusCodes.MOVED: // redirect
                        case ProtocolStatusCodes.TEMP_MOVED:
                            byte code;
                            boolean temp;
                            if (status.getCode() == ProtocolStatusCodes.MOVED) {
                                code = CrawlStatus.STATUS_REDIR_PERM;
                                temp = false;
                            } else {
                                code = CrawlStatus.STATUS_REDIR_TEMP;
                                temp = true;
                            }
                            final String newUrl = ProtocolStatusUtils.getMessage(status);
                            handleRedirect(fit.url, newUrl, temp, FetcherJob.PROTOCOL_REDIR, fit.page);
                            output(fit, content, status, code);
                            break;
                        case ProtocolStatusCodes.EXCEPTION:
                            logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
                            /* FALLTHROUGH */
                        case ProtocolStatusCodes.RETRY: // retry
                        case ProtocolStatusCodes.BLOCKED:
                            output(fit, null, status, CrawlStatus.STATUS_RETRY);
                            break;

                        case ProtocolStatusCodes.GONE: // gone
                        case ProtocolStatusCodes.NOTFOUND:
                        case ProtocolStatusCodes.ACCESS_DENIED:
                        case ProtocolStatusCodes.ROBOTS_DENIED:
                            output(fit, null, status, CrawlStatus.STATUS_GONE);
                            break;

                        case ProtocolStatusCodes.NOTMODIFIED:
                            output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
                            break;

                        default:
                            if (LOG.isWarnEnabled()) {
                                LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                            }
                            output(fit, null, status, CrawlStatus.STATUS_RETRY);
                        }

                    } catch (final Throwable t) { // unexpected exception
                        // unblock
                        fetchQueues.finishFetchItem(fit);
                        LOG.error("Unexpected error for " + fit.url, t);
                        output(fit, null, ProtocolStatusUtils.STATUS_FAILED, CrawlStatus.STATUS_RETRY);
                    }
                }

            } catch (final Throwable e) {
                LOG.error("fetcher throwable caught", e);
            } finally {
                if (fit != null)
                    fetchQueues.finishFetchItem(fit);
                activeThreads.decrementAndGet(); // count threads
                LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads);
            }
        }

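        /**
         * Normalizes and filters the redirect target, records it as an
         * outlink with a redirect-discovered marker, and updates the page's
         * representative URL.
         */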
        private void handleRedirect(String url, String newUrl, boolean temp, String redirType, WebPage page)
                throws URLFilterException, IOException, InterruptedException {
            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
            newUrl = urlFilters.filter(newUrl);
            if (newUrl == null || newUrl.equals(url)) {
                return;
            }

            if (ignoreExternalLinks) {
                String toHost = new URL(newUrl).getHost();
                String fromHost = new URL(url).getHost();
                if (toHost == null || !toHost.equalsIgnoreCase(fromHost)) {
                    // external links
                    return;
                }
            }

            page.getOutlinks().put(new Utf8(newUrl), new Utf8());
            page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
            reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
            if (reprUrl == null) {
                LOG.warn("reprUrl==null");
            } else {
                page.setReprUrl(new Utf8(reprUrl));
                if (LOG.isDebugEnabled()) {
                    LOG.debug(" - " + redirType + " redirect to " + reprUrl + " (fetching later)");
                }
            }
        }

        private void updateStatus(int bytesInPage) throws IOException {
            pages.incrementAndGet();
            bytes.addAndGet(bytesInPage);
        }

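        /**
         * Writes the fetch result: updates status and fetch times, stores the
         * raw content, optionally runs the parser in-line, and emits the page
         * keyed by its reversed URL.
         */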
        private void output(FetchItem fit, Content content, ProtocolStatus pstatus, byte status)
                throws IOException, InterruptedException {
            fit.page.setStatus((int) status);
            final long prevFetchTime = fit.page.getFetchTime();
            fit.page.setPrevFetchTime(prevFetchTime);
            fit.page.setFetchTime(System.currentTimeMillis());
            if (pstatus != null) {
                fit.page.setProtocolStatus(pstatus);
            }

            if (content != null) {
                fit.page.setContent(ByteBuffer.wrap(content.getContent()));
                fit.page.setContentType(new Utf8(content.getContentType()));
                fit.page.setBaseUrl(new Utf8(content.getBaseUrl()));
            }
            Mark.FETCH_MARK.putMark(fit.page, Mark.GENERATE_MARK.checkMark(fit.page));
            String key = TableUtil.reverseUrl(fit.url);

            if (parse) {
                if (!skipTruncated || !ParserJob.isTruncated(fit.url, fit.page)) {
                    parseUtil.process(key, fit.page);
                }
            }
            // Remove the content if storingContent is false. The content was
            // added to fit.page above so that ParseUtil could parse it.
            if (content != null && !storingContent) {
                fit.page.setContent(ByteBuffer.wrap(new byte[0]));
            }
            context.write(key, fit.page);
        }

        private void logFetchFailure(String url, String message) {
            LOG.warn("fetch of " + url + " failed with: " + message);
            errors.incrementAndGet();
        }
    }

    /**
     * This class feeds the queues with input items and refills them as items
     * are consumed by the FetcherThreads.
     */
    private static class QueueFeeder extends Thread {
        private final Context context;
        private final FetchItemQueues queues;
        private final int size;
        private Iterator<FetchEntry> currentIter;
        boolean hasMore;
        private long timelimit = -1;

        public QueueFeeder(Context context, FetchItemQueues queues, int size)
                throws IOException, InterruptedException {
            this.context = context;
            this.queues = queues;
            this.size = size;
            this.setDaemon(true);
            this.setName("QueueFeeder");
            hasMore = context.nextKey();
            if (hasMore) {
                currentIter = context.getValues().iterator();
            }
            // the time limit is either -1 (disabled) or the wall-clock time
            // at which fetching should stop
            timelimit = context.getConfiguration().getLong("fetcher.timelimit", -1);
        }

        @Override
        public void run() {
            int cnt = 0;
            int timelimitcount = 0;
            try {
                while (hasMore) {
                    if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
                        // time limit reached: drain the remaining input
                        // entries without queuing them
                        while (currentIter.hasNext()) {
                            currentIter.next();
                            timelimitcount++;
                        }
                        hasMore = context.nextKey();
                        if (hasMore) {
                            currentIter = context.getValues().iterator();
                        }
                        continue;
                    }
                    int feed = size - queues.getTotalSize();
                    if (feed <= 0) {
                        // queues are full - spin-wait until they have some free space
                        try {
                            Thread.sleep(1000);
                        } catch (final Exception e) {
                        }
                        continue;
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("-feeding " + feed + " input urls ...");
                    }
                    while (feed > 0 && currentIter.hasNext()) {
                        FetchEntry entry = currentIter.next();
                        final String url = TableUtil.unreverseUrl(entry.getKey());
                        queues.addFetchItem(url, entry.getWebPage());
                        feed--;
                        cnt++;
                    }
                    if (currentIter.hasNext()) {
                        continue; // finish items in current list before reading next key
                    }
                    hasMore = context.nextKey();
                    if (hasMore) {
                        currentIter = context.getValues().iterator();
                    }
                }
            } catch (Exception e) {
                LOG.error("QueueFeeder error reading input, record " + cnt, e);
                return;
            }
            LOG.info("QueueFeeder finished: total " + cnt + " records. Hit by time limit :" + timelimitcount);
            context.getCounter("FetcherStatus", "HitByTimeLimit-QueueFeeder").increment(timelimitcount);
        }
    }

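    /**
     * Reports progress to the MapReduce framework and the log: thread state,
     * page and error totals, average and current pages/s and kb/s, and queue
     * sizes.
     */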
    private void reportAndLogStatus(Context context, float actualPages, int actualBytes, int totalSize)
            throws IOException {
        StringBuilder status = new StringBuilder();
        long elapsed = (System.currentTimeMillis() - start) / 1000;
        status.append(spinWaiting).append("/").append(activeThreads).append(" spinwaiting/active, ");
        status.append(pages).append(" pages, ").append(errors).append(" errors, ");
        status.append(Math.round((((float) pages.get()) * 10) / elapsed) / 10.0).append(" ");
        status.append(Math.round((actualPages * 10) / 10.0)).append(" pages/s, ");
        status.append(Math.round((((float) bytes.get()) * 8) / 1024) / elapsed).append(" ");
        status.append(Math.round(((float) actualBytes) * 8) / 1024).append(" kb/s, ");
        status.append(totalSize).append(" URLs in ");
        status.append(this.fetchQueues.getQueueCount()).append(" queues");
        String toString = status.toString();
        context.setStatus(toString);
        LOG.info(toString);
    }

    @Override
    public void run(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        this.fetchQueues = new FetchItemQueues(conf);
        int threadCount = conf.getInt("fetcher.threads.fetch", 10);
        parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
        storingContent = conf.getBoolean("fetcher.store.content", true);
        if (parse) {
            skipTruncated = conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
            parseUtil = new ParseUtil(conf);
        }
        LOG.info("Fetcher: threads: " + threadCount);

        int maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 50);
        feeder = new QueueFeeder(context, fetchQueues, threadCount * maxFeedPerThread);
        feeder.start();

        for (int i = 0; i < threadCount; i++) { // spawn threads
            FetcherThread ft = new FetcherThread(context, i);
            fetcherThreads.add(ft);
            ft.start();
        }
        // select a timeout that avoids a task timeout
        final long timeout = conf.getInt("mapred.task.timeout", 10 * 60 * 1000) / 2;

        // Used for the throughput threshold check: per-second page and byte
        // rates over the last reporting interval
        float pagesLastSec;
        int bytesLastSec;

        int throughputThresholdCurrentSequence = 0;

        int throughputThresholdPages = conf.getInt("fetcher.throughput.threshold.pages", -1);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: throughput threshold: " + throughputThresholdPages);
        }
        int throughputThresholdSequence = conf.getInt("fetcher.throughput.threshold.sequence", 5);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: throughput threshold sequence: " + throughputThresholdSequence);
        }
        long throughputThresholdTimeLimit = conf.getLong("fetcher.throughput.threshold.check.after", -1);

        do { // wait for threads to exit
            pagesLastSec = pages.get();
            bytesLastSec = (int) bytes.get();
            final int secondsToSleep = 5;
            try {
                Thread.sleep(secondsToSleep * 1000);
            } catch (InterruptedException e) {
            }

            pagesLastSec = (pages.get() - pagesLastSec) / secondsToSleep;
            bytesLastSec = ((int) bytes.get() - bytesLastSec) / secondsToSleep;

            int fetchQueuesTotalSize = fetchQueues.getTotalSize();
            reportAndLogStatus(context, pagesLastSec, bytesLastSec, fetchQueuesTotalSize);

            boolean feederAlive = feeder.isAlive();
            if (!feederAlive && fetchQueuesTotalSize < 5) {
                fetchQueues.dump();
            }

            // check timelimit
            if (!feederAlive) {
                int hitByTimeLimit = fetchQueues.checkTimelimit();
                if (hitByTimeLimit != 0) {
                    context.getCounter("FetcherStatus", "HitByTimeLimit-Queues").increment(hitByTimeLimit);
                }
            }

            // if throughput threshold is enabled
            if (throughputThresholdTimeLimit < System.currentTimeMillis() && throughputThresholdPages != -1) {
                // Check if we're dropping below the threshold
                if (pagesLastSec < throughputThresholdPages) {
                    throughputThresholdCurrentSequence++;
                    LOG.warn(Integer.toString(throughputThresholdCurrentSequence)
                            + ": dropping below configured threshold of "
                            + Integer.toString(throughputThresholdPages) + " pages per second");

                    // Quit if we dropped below threshold too many times
                    if (throughputThresholdCurrentSequence > throughputThresholdSequence) {
                        LOG.warn("Dropped below threshold too many times in a row, killing!");

                        // Disable the threshold checker
                        throughputThresholdPages = -1;

                        // Empty the queues cleanly and get number of items that were
                        // dropped
                        int hitByThroughputThreshold = fetchQueues.emptyQueues();

                        if (hitByThroughputThreshold != 0)
                            context.getCounter("FetcherStatus", "hitByThroughputThreshold")
                                    .increment(hitByThroughputThreshold);
                    }
                } else {
                    throughputThresholdCurrentSequence = 0;
                }
            }

            // some requests seem to hang, despite all intentions
            if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
                if (LOG.isWarnEnabled() && activeThreads.get() > 0) {
                    LOG.warn("Aborting with " + activeThreads + " hung threads.");
                    for (int i = 0; i < fetcherThreads.size(); i++) {
                        FetcherThread thread = fetcherThreads.get(i);
                        if (thread.isAlive()) {
                            LOG.warn("Thread #" + i + " hung while processing " + thread.reprUrl);
                            if (LOG.isDebugEnabled()) {
                                StackTraceElement[] stack = thread.getStackTrace();
                                StringBuilder sb = new StringBuilder();
                                sb.append("Stack of thread #").append(i).append(":\n");
                                for (StackTraceElement s : stack) {
                                    sb.append(s.toString()).append('\n');
                                }
                                LOG.debug(sb.toString());
                            }
                        }
                    }
                }
                return;
            }

        } while (activeThreads.get() > 0);
        LOG.info("-activeThreads=" + activeThreads);
    }
}
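
Example

The queue ID computed in FetchItem.create determines how politeness is enforced: all URLs with the same ID share one FetchItemQueue and therefore one crawl delay. Below is a minimal standalone sketch of that derivation; the QueueIdDemo class and the URL are illustrative only, not part of Nutch.

import java.net.InetAddress;
import java.net.URL;

/** Illustrative sketch of the queue-ID derivation in FetchItem.create. */
public class QueueIdDemo {
    public static void main(String[] args) throws Exception {
        URL u = new URL("HTTP://News.Example.com/page.html");
        String proto = u.getProtocol().toLowerCase();

        // byHost (the default): protocol + lowercased hostname
        System.out.println(proto + "://" + u.getHost().toLowerCase());
        // prints: http://news.example.com

        // byIP: protocol + resolved address (costs a DNS lookup per URL,
        // and throws UnknownHostException for an unresolvable host)
        InetAddress addr = InetAddress.getByName(u.getHost());
        System.out.println(proto + "://" + addr.getHostAddress());

        // byDomain uses URLUtil.getDomainName(u) instead, so that
        // news.example.com and www.example.com share a single queue.
    }
}

Queue behavior is tuned with fetcher.queue.mode, fetcher.threads.per.queue, fetcher.server.delay and fetcher.server.min.delay, all of which are read in the FetchItemQueues constructor above.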