org.apache.nutch.fetcher.FetcherReducer.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.fetcher.FetcherReducer.java

Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.fetcher;

import java.io.IOException;
import java.util.List;

import org.apache.commons.lang.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.nutch.api.NutchServer;
import org.apache.nutch.fetcher.data.FetchEntry;
import org.apache.nutch.fetcher.data.FetchItemQueues;
import org.apache.nutch.fetcher.server.FetcherServer;
import org.apache.nutch.mapreduce.NutchReducer;
import org.apache.nutch.mapreduce.NutchUtil;
import org.apache.nutch.net.proxy.ProxyUpdateThread;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NetUtil;
import org.slf4j.Logger;

import com.google.common.collect.Lists;

/**
 * Reducer of the fetch job. It starts a {@link QueueFeederThread} that feeds the fetch queues
 * from the reduce input, starts a pool of {@link FetchThread}s, and then blocks in a monitoring
 * loop until the work is finished or a timeout is hit.
 *
 * <p>In {@link FetchMode#CROWDSOURCING} mode a {@link FetcherServer} is additionally started on
 * the port acquired during {@link #setup(Context)}; in {@link FetchMode#PROXY} mode a
 * {@link ProxyUpdateThread} is started before the fetch threads.
 */
public class FetcherReducer extends NutchReducer<IntWritable, FetchEntry, String, WebPage> {

    public static final Logger LOG = FetcherJob.LOG;

    /** Milliseconds per minute; declared as a long so minute-based timeouts use 64-bit arithmetic. */
    private static final long MILLIS_PER_MINUTE = 60 * 1000L;

    /** Feeder thread who feeds fetch queues. */
    private QueueFeederThread queueFeederThread;
    /** Port for the fetcher server, or {@code null} when no port could be acquired. */
    private Integer fetchServerPort;
    /** Only started in crowdsourcing mode; {@code null} otherwise. */
    private FetcherServer fetcherServer;
    private final List<FetchThread> fetchThreads = Lists.newArrayList();
    private int fetchThreadCount = 5;
    private int maxFeedPerThread = 100;

    private FetchManager fetchManager;
    private FetchMode fetchMode = FetchMode.NATIVE;

    /** Timeout in milliseconds after which the monitoring loops give up. */
    private long fetchJobTimeout;
    /** Maximum time in milliseconds between two reviews of the pending queue. */
    private long pendingQueueCheckInterval;
    /** Timestamp (ms) of the last pending queue review. */
    private long pendingQueueLastCheckTime;
    /** Idle time in milliseconds after which the pending queue is reviewed. */
    private long pendingTimeout;
    /** Interval in seconds passed to {@code FetchManager.waitAndReport}. */
    private int reportIntervalSec;

    /**
     * Reads the fetcher configuration, acquires the fetch server port and registers a new
     * {@link FetchManager} for this job in the {@link FetchManagerPool}.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);

        fetchMode = FetchMode.fromString(conf.get("fetcher.fetch.mode", "native"));

        fetchServerPort = FetcherServer.acquirePort(conf);
        fetchManager = new FetchManager(context.getJobID().getId(), getCounter(), context);
        FetchManagerPool.getInstance().put(fetchManager);

        getCounter().register(FetchManager.Counter.class);
        getReporter().silence();

        // Multiply in long arithmetic: an int product would overflow for large minute values.
        fetchJobTimeout = MILLIS_PER_MINUTE * conf.getInt("mapred.task.timeout.mins", 10);
        pendingQueueCheckInterval = MILLIS_PER_MINUTE * conf.getLong("fetcher.pending.queue.check.time.mins", 8);
        pendingTimeout = MILLIS_PER_MINUTE * conf.getLong("fetcher.pending.timeout.mins", 3);
        // NOTE(review): the report interval is read from a "pending timeout" key, which looks like
        // a copy-paste slip; the key is kept unchanged because existing configurations may rely on it.
        reportIntervalSec = conf.getInt("fetcher.pending.timeout.secs", 20);
        pendingQueueLastCheckTime = startTime;
        fetchThreadCount = conf.getInt("fetcher.threads.fetch", 5);
        maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 100);

        LOG.info(NutchUtil.printArgMap("fetchMode", fetchMode, "fetchJobTimeout", fetchJobTimeout,
                "pendingQueueCheckInterval", pendingQueueCheckInterval, "pendingTimeout", pendingTimeout,
                "reportIntervalSec", reportIntervalSec, "pendingQueueLastCheckTime", pendingQueueLastCheckTime,
                "fetchThreadCount", fetchThreadCount, "maxFeedPerThread", maxFeedPerThread, "fetchServerPort",
                fetchServerPort));
    }

    /**
     * Starts the feeder thread and the fetch threads for the configured {@link FetchMode}, then
     * blocks in the status loop until fetching is finished or timed out.
     *
     * <p>If no fetch server port was acquired during setup, the reducer is stopped and nothing
     * is started.
     */
    @Override
    protected void doRun(Context context) throws IOException, InterruptedException {
        if (fetchServerPort == null) {
            LOG.error("Failed to acquire fetch server port");
            stop();
            // Do not fall through: starting threads after stop() is pointless, and unboxing the
            // null port for startFetchServer() would throw a NullPointerException.
            return;
        }

        // Queue feeder thread
        startQueueFeederThread(context);

        if (FetchMode.CROWDSOURCING.equals(fetchMode)) {
            startCrowdsourcingThreads(context);

            startFetchServer(conf, fetchServerPort);
        } else {
            if (FetchMode.PROXY.equals(fetchMode)) {
                ProxyUpdateThread proxyUpdateThread = new ProxyUpdateThread(conf);
                proxyUpdateThread.start();
            }

            // Threads for native or proxy mode
            startNativeFetcherThreads(context);
        }

        checkAndReportFetcherStatus(context);
    }

    /**
     * Removes this job's {@link FetchManager} from the pool and stops the fetcher server if it
     * is running.
     */
    @Override
    protected void cleanup(Context context) {
        FetchManagerPool.getInstance().remove(context.getJobID().getId());
        if (fetcherServer != null && fetcherServer.isRunning()) {
            fetcherServer.stop(true);
        }

        super.cleanup(context);
    }

    /**
     * Start queue feeder thread. The thread fetches webpages from the reduce result
     * and adds them into the fetch queue. Non-blocking: the thread is started and this
     * method returns immediately.
     *
     * <p>The third constructor argument passed to the feeder is
     * {@code fetchThreadCount * maxFeedPerThread}.
     *
     * @param context the reducer context handed to the feeder thread
     * @throws InterruptedException declared for callers; not thrown by the visible code
     * @throws IOException declared for callers; not thrown by the visible code
     */
    public void startQueueFeederThread(Context context) throws IOException, InterruptedException {
        FetchItemQueues queues = fetchManager.getFetchItemQueues();
        queueFeederThread = new QueueFeederThread(context, queues, fetchThreadCount * maxFeedPerThread);
        queueFeederThread.start();
    }

    /** Returns whether the feeder thread is still alive; the feeder must have been created. */
    private boolean isFeederAlive() {
        Validate.notNull(queueFeederThread);

        return queueFeederThread.isAlive();
    }

    /** Returns true once the feeder has finished and no ready or pending fetch items remain. */
    private boolean isMissionComplete() {
        return !isFeederAlive() && fetchManager.getReadyItemCount() == 0 && fetchManager.getPendingItemCount() == 0;
    }

    /** Starts the fetcher server on the given port and remembers it so cleanup can stop it. */
    private void startFetchServer(final Configuration conf, final int port) {
        fetcherServer = FetcherServer.startInDaemonThread(conf, port);
    }

    /**
     * Starts the fetch threads that handle fetch results from fetch clients.
     * Non-blocking: the threads are started and this method returns.
     */
    private void startCrowdsourcingThreads(Context context) {
        startFetchThreads(context);
    }

    /**
     * Monitoring loop for crowdsourcing mode. Blocks until either the time since the last
     * finished task exceeds the fetch job timeout or all fetch tasks are done.
     */
    private void checkAndReportCrowdsourcingFetcherStatus(Context context) throws IOException {
        while (true) {
            fetchManager.waitAndReport(context, reportIntervalSec, isFeederAlive());

            long now = System.currentTimeMillis();
            long idleTime = now - fetchManager.getLastTaskFinishTime();

            checkPendingQueue(now, idleTime);

            if (idleTime > fetchJobTimeout) {
                LOG.info("Hit fetch job timeout {}s, exit the job...", idleTime / 1000);
                return;
            }

            // All fetch tasks are finished
            if (isMissionComplete()) {
                LOG.info("All done, exit the job...");
                return;
            }
        }
    }

    /** Starts the fetch threads for native or proxy mode. Non-blocking. */
    private void startNativeFetcherThreads(Context context) {
        startFetchThreads(context);
    }

    /** Creates, registers and starts {@code fetchThreadCount} fetch threads. */
    private void startFetchThreads(Context context) {
        for (int i = 0; i < fetchThreadCount; i++) {
            FetchThread fetchThread = new FetchThread(queueFeederThread, fetchManager, context);
            fetchThreads.add(fetchThread);
            fetchThread.start();
        }
    }

    /** Blocking: runs the monitoring loop that matches the configured fetch mode. */
    private void checkAndReportFetcherStatus(Context context) throws IOException {
        if (FetchMode.CROWDSOURCING.equals(fetchMode)) {
            checkAndReportCrowdsourcingFetcherStatus(context);
        } else {
            checkAndReportNativeFetcherStatus(context);
        }
    }

    /**
     * Monitoring loop for native and proxy mode. Blocks while fetch threads are active and
     * returns early when no task has been started for longer than the fetch job timeout.
     *
     * <p>When the throughput threshold is enabled ({@code fetcher.throughput.threshold.pages}
     * is not -1) and the reported rate stays below it more than
     * {@code fetcher.throughput.threshold.sequence} times in a row, the fetch queues are
     * cleared and the number of dropped items is added to a counter.
     */
    private void checkAndReportNativeFetcherStatus(Context context) throws IOException {
        // Number of consecutive reports below the throughput threshold
        int belowThresholdStreak = 0;

        int throughputThresholdPages = conf.getInt("fetcher.throughput.threshold.pages", -1);
        LOG.info("Fetcher: throughput threshold: {}", throughputThresholdPages);

        int throughputThresholdSequence = conf.getInt("fetcher.throughput.threshold.sequence", 5);
        LOG.info("Fetcher: throughput threshold sequence: {}", throughputThresholdSequence);

        // Compared against the current time below, so it is treated as a timestamp in ms
        long throughputThresholdTimeLimit = conf.getLong("fetcher.throughput.threshold.check.after", -1);

        do {
            float pagesLastSec = fetchManager.waitAndReport(context, reportIntervalSec, isFeederAlive());

            // if throughput threshold is enabled
            if (throughputThresholdTimeLimit < System.currentTimeMillis() && throughputThresholdPages != -1) {
                // Check if we're dropping below the threshold
                if (pagesLastSec < throughputThresholdPages) {
                    belowThresholdStreak++;

                    LOG.warn("{}: dropping below configured threshold of {} pages per second",
                            belowThresholdStreak, throughputThresholdPages);

                    // Quit if we dropped below threshold too many times
                    if (belowThresholdStreak > throughputThresholdSequence) {
                        LOG.warn("Dropped below threshold too many times in a row, killing!");

                        // Disable the threshold checker
                        throughputThresholdPages = -1;

                        // Empty the queues cleanly and get number of items that were dropped
                        int droppedItems = fetchManager.clearFetchItemQueues();

                        if (droppedItems != 0) {
                            // Counter name (including its spelling) is kept as-is for compatibility
                            context.getCounter("FetcherStatus", "hitByThrougputThreshold").increment(droppedItems);
                        }
                    }
                } else {
                    belowThresholdStreak = 0;
                }
            }

            // some requests seem to hang, despite all intentions
            if ((System.currentTimeMillis() - fetchManager.getLastTaskStartTime()) > fetchJobTimeout) {
                int activeThreads = fetchManager.activeFetcherThreads.get();
                if (activeThreads > 0) {
                    LOG.warn("Aborting with {} hung threads.", activeThreads);
                    logHungThreads();
                }

                return;
            }
        } while (fetchManager.activeFetcherThreads.get() > 0);
    }

    /**
     * Logs every fetch thread that is still alive together with the url it reports, and its
     * stack trace when debug logging is enabled.
     */
    private void logHungThreads() {
        for (int i = 0; i < fetchThreads.size(); i++) {
            FetchThread thread = fetchThreads.get(i);
            if (!thread.isAlive()) {
                continue;
            }

            LOG.warn("Thread #{} hung while processing {}", i, thread.reprUrl());

            // Building the stack dump is only worth it when it will actually be logged
            if (LOG.isDebugEnabled()) {
                StringBuilder sb = new StringBuilder();
                sb.append("Stack of thread #").append(i).append(":\n");
                for (StackTraceElement element : thread.getStackTrace()) {
                    sb.append(element.toString()).append('\n');
                }

                LOG.debug(sb.toString());
            }
        }
    }

    /**
     * Check pending queue to see if some item is expired,
     * which often means the fetch client ran into trouble.
     *
     * <p>A review happens only if more than two report intervals have passed since the last
     * review, and either the idle time exceeds the pending timeout or the pending queue check
     * interval has elapsed.
     *
     * TODO : may move this method into FetchManager
     *
     * @param now current time in milliseconds
     * @param idleTime milliseconds since the last fetch task finished
     */
    private void checkPendingQueue(long now, long idleTime) {
        // With fewer than 10 items left, the pending timeout is set to two minutes
        if (fetchManager.getReadyItemCount() + fetchManager.getPendingItemCount() < 10) {
            pendingTimeout = 2 * MILLIS_PER_MINUTE;
        }

        long sinceLastCheck = now - pendingQueueLastCheckTime;
        // 2L forces long arithmetic for the seconds-to-milliseconds conversion
        boolean minGapElapsed = sinceLastCheck > 2L * reportIntervalSec * 1000;
        boolean reviewDue = idleTime > pendingTimeout || sinceLastCheck > pendingQueueCheckInterval;

        if (minGapElapsed && reviewDue) {
            LOG.info("Check pending items");
            fetchManager.reviewPendingFetchItems(false);
            pendingQueueLastCheckTime = now;
        }
    }
}