Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package net.peacesoft.nutch.crawl; import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URL; import java.net.UnknownHostException; import java.text.SimpleDateFormat; import java.util.*; import java.util.Map.Entry; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; // Slf4j Logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.*; import org.apache.nutch.protocol.*; import org.apache.nutch.parse.*; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.*; import org.apache.nutch.fetcher.FetcherOutputFormat; /** * A 
queue-based ReFetcher. * * <p>This fetcher uses a well-known model of one producer (a QueueFeeder) and * many consumers (FetcherThread-s). * * <p>QueueFeeder reads input fetchlists and populates a set of * FetchItemQueue-s, which hold FetchItem-s that describe the items to be * fetched. There are as many queues as there are unique hosts, but at any given * time the total number of fetch items in all queues is less than a fixed * number (currently set to a multiple of the number of threads). * * <p>As items are consumed from the queues, the QueueFeeder continues to add * new input items, so that their total count stays fixed (FetcherThread-s may * also add new items to the queues e.g. as a result of redirection) - until * all input items are exhausted, at which point the number of items in the * queues begins to decrease. When this number reaches 0 the fetcher will finish. * * <p>This fetcher implementation handles per-host blocking itself, instead of * delegating this work to protocol-specific plugins. Each per-host queue * handles its own "politeness" settings, such as the maximum number of * concurrent requests and crawl delay between consecutive requests - and also a * list of requests in progress, and the time the last request was finished. As * FetcherThread-s ask for new items to be fetched, queues may return eligible * items or null if for "politeness" reasons this host's queue is not yet ready. * * <p>If there are still unfetched items in the queues, but none of the items * are ready, FetcherThread-s will spin-wait until either some items become * available, or a timeout is reached (at which point the Fetcher will abort, * assuming the task is hung). 
* * @author Andrzej Bialecki */ public class ReFetcher extends Configured implements Tool, MapRunnable<Text, CrawlDatum, Text, NutchWritable> { public static final int PERM_REFRESH_TIME = 5; public static final String CONTENT_REDIR = "content"; public static final String PROTOCOL_REDIR = "protocol"; public static final Logger LOG = LoggerFactory.getLogger(ReFetcher.class); public static class InputFormat extends SequenceFileInputFormat<Text, CrawlDatum> { /** * Don't split inputs, to keep things polite. */ public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException { FileStatus[] files = listStatus(job); FileSplit[] splits = new FileSplit[files.length]; for (int i = 0; i < files.length; i++) { FileStatus cur = files[i]; splits[i] = new FileSplit(cur.getPath(), 0, cur.getLen(), (String[]) null); } return splits; } } private OutputCollector<Text, NutchWritable> output; private Reporter reporter; private String segmentName; private AtomicInteger activeThreads = new AtomicInteger(0); private AtomicInteger spinWaiting = new AtomicInteger(0); private long start = System.currentTimeMillis(); // start time of fetcher run private AtomicLong lastRequestStart = new AtomicLong(start); private AtomicLong bytes = new AtomicLong(0); // total bytes fetched private AtomicInteger pages = new AtomicInteger(0); // total pages fetched private AtomicInteger errors = new AtomicInteger(0); // total pages errored private boolean storingContent; private boolean parsing; ReFetcher.FetchItemQueues fetchQueues; ReFetcher.QueueFeeder feeder; /** * This class described the item to be fetched. 
*/ private static class FetchItem { int outlinkDepth = 0; String queueID; Text url; URL u; CrawlDatum datum; public FetchItem(Text url, URL u, CrawlDatum datum, String queueID) { this(url, u, datum, queueID, 0); } public FetchItem(Text url, URL u, CrawlDatum datum, String queueID, int outlinkDepth) { this.url = url; this.u = u; this.datum = datum; this.queueID = queueID; this.outlinkDepth = outlinkDepth; } /** * Create an item. Queue id will be created based on * <code>queueMode</code> argument, either as a protocol + hostname * pair, protocol + IP address pair or protocol+domain pair. */ public static ReFetcher.FetchItem create(Text url, CrawlDatum datum, String queueMode) { return create(url, datum, queueMode, 0); } public static ReFetcher.FetchItem create(Text url, CrawlDatum datum, String queueMode, int outlinkDepth) { String queueID; URL u = null; try { u = new URL(url.toString()); } catch (Exception e) { LOG.warn("Cannot parse url: " + url, e); return null; } final String proto = u.getProtocol().toLowerCase(); String key; if (ReFetcher.FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) { try { final InetAddress addr = InetAddress.getByName(u.getHost()); key = addr.getHostAddress(); } catch (final UnknownHostException e) { // unable to resolve it, so don't fall back to host name LOG.warn("Unable to resolve: " + u.getHost() + ", skipping."); return null; } } else if (ReFetcher.FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) { key = URLUtil.getDomainName(u); if (key == null) { LOG.warn("Unknown domain for url: " + url + ", using URL string as key"); key = u.toExternalForm(); } } else { key = u.getHost(); if (key == null) { LOG.warn("Unknown host for url: " + url + ", using URL string as key"); key = u.toExternalForm(); } } queueID = proto + "://" + key.toLowerCase(); return new ReFetcher.FetchItem(url, u, datum, queueID, outlinkDepth); } public CrawlDatum getDatum() { return datum; } public String getQueueID() { return queueID; } public 
Text getUrl() { return url; } public URL getURL2() { return u; } } /** * This class handles FetchItems which come from the same host ID (be it a * proto/hostname or proto/IP pair). It also keeps track of requests in * progress and elapsed time between requests. */ private static class FetchItemQueue { List<ReFetcher.FetchItem> queue = Collections.synchronizedList(new LinkedList<ReFetcher.FetchItem>()); Set<ReFetcher.FetchItem> inProgress = Collections.synchronizedSet(new HashSet<ReFetcher.FetchItem>()); AtomicLong nextFetchTime = new AtomicLong(); AtomicInteger exceptionCounter = new AtomicInteger(); long crawlDelay; long minCrawlDelay; int maxThreads; Configuration conf; public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay, long minCrawlDelay) { this.conf = conf; this.maxThreads = maxThreads; this.crawlDelay = crawlDelay; this.minCrawlDelay = minCrawlDelay; // ready to start setEndTime(System.currentTimeMillis() - crawlDelay); } public synchronized int emptyQueue() { int presize = queue.size(); queue.clear(); return presize; } public int getQueueSize() { return queue.size(); } public int getInProgressSize() { return inProgress.size(); } public int incrementExceptionCounter() { return exceptionCounter.incrementAndGet(); } public void finishFetchItem(ReFetcher.FetchItem it, boolean asap) { if (it != null) { inProgress.remove(it); setEndTime(System.currentTimeMillis(), asap); } } public void addFetchItem(ReFetcher.FetchItem it) { if (it == null) { return; } queue.add(it); } public void addInProgressFetchItem(ReFetcher.FetchItem it) { if (it == null) { return; } inProgress.add(it); } public ReFetcher.FetchItem getFetchItem() { if (inProgress.size() >= maxThreads) { return null; } long now = System.currentTimeMillis(); if (nextFetchTime.get() > now) { return null; } ReFetcher.FetchItem it = null; if (queue.size() == 0) { return null; } try { it = queue.remove(0); inProgress.add(it); } catch (Exception e) { LOG.error("Cannot remove FetchItem from 
queue or cannot add it to inProgress queue", e); } return it; } public synchronized void dump() { LOG.info(" maxThreads = " + maxThreads); LOG.info(" inProgress = " + inProgress.size()); LOG.info(" crawlDelay = " + crawlDelay); LOG.info(" minCrawlDelay = " + minCrawlDelay); LOG.info(" nextFetchTime = " + nextFetchTime.get()); LOG.info(" now = " + System.currentTimeMillis()); for (int i = 0; i < queue.size(); i++) { ReFetcher.FetchItem it = queue.get(i); LOG.info(" " + i + ". " + it.url); } } private void setEndTime(long endTime) { setEndTime(endTime, false); } private void setEndTime(long endTime, boolean asap) { if (!asap) { nextFetchTime.set(endTime + (maxThreads > 1 ? minCrawlDelay : crawlDelay)); } else { nextFetchTime.set(endTime); } } } /** * Convenience class - a collection of queues that keeps track of the total * number of items, and provides items eligible for fetching from any queue. */ private static class FetchItemQueues { public static final String DEFAULT_ID = "default"; Map<String, ReFetcher.FetchItemQueue> queues = new HashMap<String, ReFetcher.FetchItemQueue>(); AtomicInteger totalSize = new AtomicInteger(0); int maxThreads; long crawlDelay; long minCrawlDelay; long timelimit = -1; int maxExceptionsPerQueue = -1; Configuration conf; public static final String QUEUE_MODE_HOST = "byHost"; public static final String QUEUE_MODE_DOMAIN = "byDomain"; public static final String QUEUE_MODE_IP = "byIP"; String queueMode; public FetchItemQueues(Configuration conf) { this.conf = conf; this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1); queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST); // check that the mode is known if (!queueMode.equals(QUEUE_MODE_IP) && !queueMode.equals(QUEUE_MODE_DOMAIN) && !queueMode.equals(QUEUE_MODE_HOST)) { LOG.error("Unknown partition mode : " + queueMode + " - forcing to byHost"); queueMode = QUEUE_MODE_HOST; } LOG.info("Using queue mode : " + queueMode); this.crawlDelay = (long) 
(conf.getFloat("fetcher.server.delay", 1.0f) * 1000); this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000); this.timelimit = conf.getLong("fetcher.timelimit", -1); this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1); } public int getTotalSize() { return totalSize.get(); } public int getQueueCount() { return queues.size(); } public void addFetchItem(Text url, CrawlDatum datum) { ReFetcher.FetchItem it = ReFetcher.FetchItem.create(url, datum, queueMode); if (it != null) { addFetchItem(it); } } public synchronized void addFetchItem(ReFetcher.FetchItem it) { ReFetcher.FetchItemQueue fiq = getFetchItemQueue(it.queueID); fiq.addFetchItem(it); totalSize.incrementAndGet(); } public void finishFetchItem(ReFetcher.FetchItem it) { finishFetchItem(it, false); } public void finishFetchItem(ReFetcher.FetchItem it, boolean asap) { ReFetcher.FetchItemQueue fiq = queues.get(it.queueID); if (fiq == null) { LOG.warn("Attempting to finish item from unknown queue: " + it); return; } fiq.finishFetchItem(it, asap); } public synchronized ReFetcher.FetchItemQueue getFetchItemQueue(String id) { ReFetcher.FetchItemQueue fiq = queues.get(id); if (fiq == null) { // initialize queue fiq = new ReFetcher.FetchItemQueue(conf, maxThreads, crawlDelay, minCrawlDelay); queues.put(id, fiq); } return fiq; } public synchronized ReFetcher.FetchItem getFetchItem() { Iterator<Map.Entry<String, ReFetcher.FetchItemQueue>> it = queues.entrySet().iterator(); while (it.hasNext()) { ReFetcher.FetchItemQueue fiq = it.next().getValue(); // reap empty queues if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) { it.remove(); continue; } ReFetcher.FetchItem fit = fiq.getFetchItem(); if (fit != null) { totalSize.decrementAndGet(); return fit; } } return null; } // called only once the feeder has stopped public synchronized int checkTimelimit() { int count = 0; if (System.currentTimeMillis() >= timelimit && timelimit != -1) { // emptying the queues 
count = emptyQueues(); // there might also be a case where totalsize !=0 but number of queues // == 0 // in which case we simply force it to 0 to avoid blocking if (totalSize.get() != 0 && queues.isEmpty()) { totalSize.set(0); } } return count; } // empties the queues (used by timebomb and throughput threshold) public synchronized int emptyQueues() { int count = 0; for (String id : queues.keySet()) { ReFetcher.FetchItemQueue fiq = queues.get(id); if (fiq.getQueueSize() == 0) { continue; } LOG.info("* queue: " + id + " >> dropping! "); int deleted = fiq.emptyQueue(); for (int i = 0; i < deleted; i++) { totalSize.decrementAndGet(); } count += deleted; } return count; } /** * Increment the exception counter of a queue in case of an exception * e.g. timeout; when higher than a given threshold simply empty the * queue. * * @param queueid * @return number of purged items */ public synchronized int checkExceptionThreshold(String queueid) { ReFetcher.FetchItemQueue fiq = queues.get(queueid); if (fiq == null) { return 0; } if (fiq.getQueueSize() == 0) { return 0; } int excCount = fiq.incrementExceptionCounter(); if (maxExceptionsPerQueue != -1 && excCount >= maxExceptionsPerQueue) { // too many exceptions for items in this queue - purge it int deleted = fiq.emptyQueue(); LOG.info("* queue: " + queueid + " >> removed " + deleted + " URLs from queue because " + excCount + " exceptions occurred"); for (int i = 0; i < deleted; i++) { totalSize.decrementAndGet(); } return deleted; } return 0; } public synchronized void dump() { for (String id : queues.keySet()) { ReFetcher.FetchItemQueue fiq = queues.get(id); if (fiq.getQueueSize() == 0) { continue; } LOG.info("* queue: " + id); fiq.dump(); } } } /** * This class feeds the queues with input items, and re-fills them as items * are consumed by FetcherThread-s. 
*/ private static class QueueFeeder extends Thread { private RecordReader<Text, CrawlDatum> reader; private ReFetcher.FetchItemQueues queues; private int size; private long timelimit = -1; public QueueFeeder(RecordReader<Text, CrawlDatum> reader, ReFetcher.FetchItemQueues queues, int size) { this.reader = reader; this.queues = queues; this.size = size; this.setDaemon(true); this.setName("QueueFeeder"); } public void setTimeLimit(long tl) { timelimit = tl; } public void run() { boolean hasMore = true; int cnt = 0; int timelimitcount = 0; while (hasMore) { if (System.currentTimeMillis() >= timelimit && timelimit != -1) { // enough .. lets' simply // read all the entries from the input without processing them try { Text url = new Text(); CrawlDatum datum = new CrawlDatum(); hasMore = reader.next(url, datum); timelimitcount++; } catch (IOException e) { LOG.error("QueueFeeder error reading input, record " + cnt, e); return; } continue; } int feed = size - queues.getTotalSize(); if (feed <= 0) { // queues are full - spin-wait until they have some free space try { Thread.sleep(1000); } catch (Exception e) { } ; continue; } else { LOG.debug("-feeding " + feed + " input urls ..."); while (feed > 0 && hasMore) { try { Text url = new Text(); CrawlDatum datum = new CrawlDatum(); hasMore = reader.next(url, datum); if (hasMore) { queues.addFetchItem(url, datum); cnt++; feed--; } } catch (IOException e) { LOG.error("QueueFeeder error reading input, record " + cnt, e); return; } } } } LOG.info("QueueFeeder finished: total " + cnt + " records + hit by time limit :" + timelimitcount); } } /** * This class picks items from queues and fetches the pages. 
*/
    private class FetcherThread extends Thread {

        private Configuration conf;
        private URLFilters urlFilters;
        private ScoringFilters scfilters;
        private ParseUtil parseUtil;
        private URLNormalizers normalizers;
        private ProtocolFactory protocolFactory;
        // Largest robots.txt crawl delay accepted, in milliseconds.
        private long maxCrawlDelay;
        private String queueMode;
        private int maxRedirect;
        // Per-item state of the redirect chain currently being followed.
        private String reprUrl;
        private boolean redirecting;
        private int redirectCount;
        private boolean ignoreExternalLinks;
        // Used by ReFetcher.follow.outlinks.depth in parse
        private int maxOutlinksPerPage;
        private final int maxOutlinks;
        private final int interval;
        private int maxOutlinkDepth;
        private int maxOutlinkDepthNumLinks;
        private boolean outlinksIgnoreExternal;
        private int outlinksDepthDivisor;
        private boolean skipTruncated;

        /**
         * Builds the filters, normalizers, parser and protocol factory from
         * the configuration and reads the fetcher settings.
         */
        public FetcherThread(Configuration conf) {
            this.setDaemon(true); // don't hang JVM on exit
            this.setName("FetcherThread"); // use an informative name
            this.conf = conf;
            this.urlFilters = new URLFilters(conf);
            this.scfilters = new ScoringFilters(conf);
            this.parseUtil = new ParseUtil(conf);
            this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
            this.protocolFactory = new ProtocolFactory(conf);
            this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
            // multiply as long so that large configured values cannot overflow int
            this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000L;
            queueMode = conf.get("fetcher.queue.mode", ReFetcher.FetchItemQueues.QUEUE_MODE_HOST);
            // check that the mode is known
            if (!queueMode.equals(ReFetcher.FetchItemQueues.QUEUE_MODE_IP)
                    && !queueMode.equals(ReFetcher.FetchItemQueues.QUEUE_MODE_DOMAIN)
                    && !queueMode.equals(ReFetcher.FetchItemQueues.QUEUE_MODE_HOST)) {
                LOG.error("Unknown partition mode : " + queueMode + " - forcing to byHost");
                queueMode = ReFetcher.FetchItemQueues.QUEUE_MODE_HOST;
            }
            LOG.info("Using queue mode : " + queueMode);
            this.maxRedirect = conf.getInt("http.redirect.max", 3);
            this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
            maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
            // a negative setting means "no limit"
            maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
            interval = conf.getInt("db.fetch.interval.default", 2592000);
            maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
            outlinksIgnoreExternal = conf.getBoolean("fetcher.follow.outlinks.ignore.external", false);
            maxOutlinkDepthNumLinks = conf.getInt("fetcher.follow.outlinks.num.links", 4);
            outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);
        }

        /**
         * Fetch loop: takes items from the queues until the feeder has died
         * and no items are waiting, fetching each item and following its
         * redirects up to the configured maximum.
         */
        public void run() {
            activeThreads.incrementAndGet(); // count threads

            ReFetcher.FetchItem fit = null;
            try {
                while (true) {
                    fit = fetchQueues.getFetchItem();
                    if (fit == null) {
                        if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
                            LOG.trace(getName() + " spin-waiting ...");
                            // spin-wait.
                            spinWaiting.incrementAndGet();
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException ignored) {
                                // deliberately ignored: simply poll the queues again
                            }
                            spinWaiting.decrementAndGet();
                            continue;
                        } else {
                            // all done, finish this thread
                            return;
                        }
                    }
                    lastRequestStart.set(System.currentTimeMillis());
                    Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
                    if (reprUrlWritable == null) {
                        reprUrl = fit.url.toString();
                    } else {
                        reprUrl = reprUrlWritable.toString();
                    }
                    try {
                        // fetch the page
                        redirecting = false;
                        redirectCount = 0;
                        do {
                            Text category = (Text) fit.datum.getMetaData().get(new Text("category"));
                            if (LOG.isInfoEnabled()) {
                                if (category == null) {
                                    LOG.info("fetching " + fit.url);
                                } else {
                                    LOG.info("fetching category " + category.toString() + " url " + fit.url);
                                }
                            }
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("redirectCount=" + redirectCount);
                            }
                            redirecting = false;
                            Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
                            RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
                            if (!rules.isAllowed(fit.u)) {
                                // unblock
                                fetchQueues.finishFetchItem(fit, true);
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Denied by robots.txt: " + fit.url);
                                }
                                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                                        CrawlDatum.STATUS_FETCH_GONE);
                                reporter.incrCounter("FetcherStatus", "robots_denied", 1);
                                // redirecting is false here, so this leaves the do-while
                                continue;
                            }
                            if (rules.getCrawlDelay() > 0) {
                                if (rules.getCrawlDelay() > maxCrawlDelay) {
                                    // unblock
                                    fetchQueues.finishFetchItem(fit, true);
                                    LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay()
                                            + "), skipping");
                                    output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                                            CrawlDatum.STATUS_FETCH_GONE);
                                    reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
                                    continue;
                                } else {
                                    ReFetcher.FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                                    fiq.crawlDelay = rules.getCrawlDelay();
                                }
                            }
                            ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
                            ProtocolStatus status = output.getStatus();
                            Content content = output.getContent();
                            ParseStatus pstatus = null;
                            // unblock queue
                            fetchQueues.finishFetchItem(fit);

                            String urlString = fit.url.toString();

                            reporter.incrCounter("FetcherStatus", status.getName(), 1);

                            switch (status.getCode()) {

                                case ProtocolStatus.WOULDBLOCK:
                                    // retry ?
                                    fetchQueues.addFetchItem(fit);
                                    break;

                                case ProtocolStatus.SUCCESS: // got a page
                                    if (category != null) {
                                        content.getMetadata().add("category", category.toString());
                                    }
                                    pstatus = output(fit.url, fit.datum, content, status,
                                            CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                                    updateStatus(content.getContent().length);
                                    if (pstatus != null && pstatus.isSuccess()
                                            && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                        String newUrl = pstatus.getMessage();
                                        int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
                                        Text redirUrl = handleRedirect(fit.url, fit.datum, urlString, newUrl,
                                                refreshTime < ReFetcher.PERM_REFRESH_TIME, ReFetcher.CONTENT_REDIR);
                                        if (redirUrl != null) {
                                            CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
                                                    fit.datum.getFetchInterval(), fit.datum.getScore());
                                            // transfer existing metadata to the redir
                                            newDatum.getMetaData().putAll(fit.datum.getMetaData());
                                            scfilters.initialScore(redirUrl, newDatum);
                                            if (reprUrl != null) {
                                                newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                                                        new Text(reprUrl));
                                            }
                                            fit = ReFetcher.FetchItem.create(redirUrl, newDatum, queueMode);
                                            if (fit != null) {
                                                ReFetcher.FetchItemQueue fiq = fetchQueues
                                                        .getFetchItemQueue(fit.queueID);
                                                fiq.addInProgressFetchItem(fit);
                                            } else {
                                                // stop redirecting
                                                redirecting = false;
                                                reporter.incrCounter("FetcherStatus",
                                                        "FetchItem.notCreated.redirect", 1);
                                            }
                                        }
                                    }
                                    break;

                                case ProtocolStatus.MOVED: // redirect
                                case ProtocolStatus.TEMP_MOVED:
                                    int code;
                                    boolean temp;
                                    if (status.getCode() == ProtocolStatus.MOVED) {
                                        code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                        temp = false;
                                    } else {
                                        code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                        temp = true;
                                    }
                                    output(fit.url, fit.datum, content, status, code);
                                    String newUrl = status.getMessage();
                                    Text redirUrl = handleRedirect(fit.url, fit.datum, urlString, newUrl, temp,
                                            ReFetcher.PROTOCOL_REDIR);
                                    if (redirUrl != null) {
                                        CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
                                                fit.datum.getFetchInterval(), fit.datum.getScore());
                                        // transfer existing metadata
                                        newDatum.getMetaData().putAll(fit.datum.getMetaData());
                                        scfilters.initialScore(redirUrl, newDatum);
                                        if (reprUrl != null) {
                                            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                                                    new Text(reprUrl));
                                        }
                                        fit = ReFetcher.FetchItem.create(redirUrl, newDatum, queueMode);
                                        if (fit != null) {
                                            ReFetcher.FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                                            fiq.addInProgressFetchItem(fit);
                                        } else {
                                            // stop redirecting
                                            redirecting = false;
                                            reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
                                        }
                                    } else {
                                        // stop redirecting
                                        redirecting = false;
                                    }
                                    break;

                                case ProtocolStatus.EXCEPTION:
                                    logError(fit.url, status.getMessage());
                                    int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
                                    if (killedURLs != 0) {
                                        reporter.incrCounter("FetcherStatus", "AboveExceptionThresholdInQueue",
                                                killedURLs);
                                    }
                                /* FALLTHROUGH */
                                case ProtocolStatus.RETRY: // retry
                                case ProtocolStatus.BLOCKED:
                                    output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                                    break;

                                case ProtocolStatus.GONE: // gone
                                case ProtocolStatus.NOTFOUND:
                                case ProtocolStatus.ACCESS_DENIED:
                                case ProtocolStatus.ROBOTS_DENIED:
                                    output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                                    break;

                                case ProtocolStatus.NOTMODIFIED:
                                    output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                                    break;

                                default:
                                    if (LOG.isWarnEnabled()) {
                                        LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                                    }
                                    output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                            }

                            if (redirecting && redirectCount > maxRedirect) {
                                fetchQueues.finishFetchItem(fit);
                                if (LOG.isInfoEnabled()) {
                                    LOG.info(" - redirect count exceeded " + fit.url);
                                }
                                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED,
                                        CrawlDatum.STATUS_FETCH_GONE);
                            }

                        } while (redirecting && (redirectCount <= maxRedirect));

                    } catch (Throwable t) { // unexpected exception
                        if (fit != null) {
                            // unblock
                            fetchQueues.finishFetchItem(fit);
                            logError(fit.url, StringUtils.stringifyException(t));
                            output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
                                    CrawlDatum.STATUS_FETCH_RETRY);
                        } else {
                            // fit is null after a redirect target could not be turned
                            // into an item; there is nothing to unblock or to output
                            LOG.error("fetcher caught:" + t.toString(), t);
                        }
                    }
                }

            } catch (Throwable e) {
                if (LOG.isErrorEnabled()) {
                    LOG.error("fetcher caught:" + e.toString(), e);
                }
            } finally {
                if (fit != null) {
                    fetchQueues.finishFetchItem(fit);
                }
                activeThreads.decrementAndGet(); // count threads
                LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads);
            }
        }

        /**
         * Normalizes and filters a redirect target and decides what to do
         * with it.
         *
         * @param url URL that redirected
         * @param datum datum of the redirecting URL
         * @param urlString string form of the redirecting URL
         * @param newUrl redirect target as reported
         * @param temp whether the redirect is temporary
         * @param redirType label used in log messages
         * @return the target to fetch immediately, or <code>null</code> when
         *         the target is filtered, external while external links are
         *         ignored, identical to the source, or (when
         *         <code>http.redirect.max</code> is not positive) only
         *         emitted as a linked datum for a later fetch
         */
        private Text handleRedirect(Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp,
                String redirType) throws MalformedURLException, URLFilterException {
            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
            newUrl = urlFilters.filter(newUrl);

            // a filtered (null) target is handled by the final else branch
            if (ignoreExternalLinks && newUrl != null) {
                try {
                    String origHost = new URL(urlString).getHost().toLowerCase();
                    String newHost = new URL(newUrl).getHost().toLowerCase();
                    if (!origHost.equals(newHost)) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(" - ignoring redirect " + redirType + " from " + urlString + " to " + newUrl
                                    + " because external links are ignored");
                        }
                        return null;
                    }
                } catch (MalformedURLException ignored) {
                    // hosts cannot be compared; fall through and treat the
                    // target like any other redirect
                }
            }

            if (newUrl != null && !newUrl.equals(urlString)) {
                reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
                url = new Text(newUrl);
                if (maxRedirect > 0) {
                    redirecting = true;
                    redirectCount++;
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)");
                    }
                    return url;
                } else {
                    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(),
                            datum.getScore());
                    // transfer existing metadata
                    newDatum.getMetaData().putAll(datum.getMetaData());
                    try {
                        scfilters.initialScore(url, newDatum);
                    } catch (ScoringFilterException e) {
                        LOG.warn("Couldn't set initial score, url " + url + " (" + e + ")", e);
                    }
                    if (reprUrl != null) {
                        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                    }
                    output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)");
                    }
                    return null;
                }
            } else {
                if (LOG.isDebugEnabled()) {
                    LOG.debug(" - " + redirType + " redirect skipped: "
                            + (newUrl != null ? "to same url" : "filtered"));
                }
                return null;
            }
        }

        /** Logs a failed fetch at info level and counts it as an error. */
        private void logError(Text url, String message) {
            if (LOG.isInfoEnabled()) {
                LOG.info("fetch of " + url + " failed with: " + message);
            }
            errors.incrementAndGet();
        }

        private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status) {
            return output(key, datum, content, pstatus, status, 0);
        }

        /**
         * Stamps the datum with status and fetch time, optionally parses the
         * content, and emits datum, content (if stored) and parse results.
         * When following outlinks is enabled, accepted outlinks are also
         * queued as new fetch items.
         *
         * @param content fetched content, or <code>null</code> if there is none
         * @param pstatus protocol status to record on the datum, may be <code>null</code>
         * @param status crawl status to set on the datum
         * @param outlinkDepth outlink depth of the item being emitted
         * @return the parse status of the content's own URL, or
         *         <code>null</code> if nothing was parsed or no parse exists
         *         for that URL
         */
        private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status,
                int outlinkDepth) {

            datum.setStatus(status);
            datum.setFetchTime(System.currentTimeMillis());
            if (pstatus != null) {
                datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
            }

            ParseResult parseResult = null;
            if (content != null) {
                Metadata metadata = content.getMetadata();
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Fetcher content url " + content.getUrl());
                    LOG.debug("Fetcher content metadata " + Arrays.toString(metadata.names()));
                }
                // store the guessed content type in the crawldatum
                if (content.getContentType() != null) {
                    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
                }

                // add segment to metadata
                metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
                // add score to content metadata so that ParseSegment can pick it up.
                try {
                    scfilters.passScoreBeforeParsing(key, datum, content);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
                    }
                }
                /* Note: Fetcher will only follow meta-redirects coming from the
                 * original URL. */
                if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
                    if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
                        try {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Fetcher start parse url " + content.getUrl());
                            }
                            parseResult = this.parseUtil.parse(content);
                        } catch (Exception e) {
                            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
                        }
                    }

                    if (parseResult == null) {
                        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
                                new ParseStatus().getEmptyParse(conf));
                        datum.setSignature(signature);
                    }
                }

                /* Store status code in content So we can read this value during
                 * parsing (as a separate job) and decide to parse or not. */
                content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
            }

            try {
                output.collect(key, new NutchWritable(datum));
                if (content != null && storingContent) {
                    output.collect(key, new NutchWritable(content));
                }
                if (parseResult != null) {
                    for (Entry<Text, Parse> entry : parseResult) {
                        Text url = entry.getKey();
                        Parse parse = entry.getValue();
                        ParseStatus parseStatus = parse.getData().getStatus();
                        ParseData parseData = parse.getData();

                        if (!parseStatus.isSuccess()) {
                            LOG.warn("Error parsing: " + key + ": " + parseStatus);
                            parse = parseStatus.getEmptyParse(getConf());
                        }

                        // Calculate page signature. For non-parsing fetchers this will
                        // be done in ParseSegment
                        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
                        // Ensure segment name and score are in parseData metadata
                        parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                        parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                        // Pass fetch time to content meta
                        parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                        if (url.equals(key)) {
                            datum.setSignature(signature);
                        }
                        try {
                            scfilters.passScoreAfterParsing(url, content, parse);
                        } catch (Exception e) {
                            if (LOG.isWarnEnabled()) {
                                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
                            }
                        }

                        String fromHost;

                        // collect outlinks for subsequent db update
                        Outlink[] links = parseData.getOutlinks();
                        int outlinksToStore = Math.min(maxOutlinks, links.length);
                        if (ignoreExternalLinks) {
                            try {
                                fromHost = new URL(url.toString()).getHost().toLowerCase();
                            } catch (MalformedURLException e) {
                                fromHost = null;
                            }
                        } else {
                            fromHost = null;
                        }

                        int validCount = 0;

                        // Process all outlinks, normalize, filter and deduplicate
                        List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
                        HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
                        for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                            String toUrl = links[i].getToUrl();

                            toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, fromHost,
                                    ignoreExternalLinks, urlFilters, normalizers);
                            if (toUrl == null) {
                                continue;
                            }

                            validCount++;
                            links[i].setUrl(toUrl);
                            outlinkList.add(links[i]);
                            outlinks.add(toUrl);
                        }

                        // Only process depth N outlinks
                        if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                            reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());

                            // Counter to limit num outlinks to follow per page
                            int outlinkCounter = 0;

                            // Calculate variable number of outlinks by depth using the divisor (outlinks = Math.floor(divisor / depth * num.links))
                            // NOTE(review): this value is computed (with integer division)
                            // but never read; the loop below is bounded by
                            // maxOutlinkDepthNumLinks instead - confirm which limit is intended.
                            int maxOutlinksByDepth = (int) Math
                                    .floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);

                            String followUrl;

                            // Walk over the outlinks and add as new FetchItem to the queues
                            Iterator<String> iter = outlinks.iterator();
                            while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                                followUrl = iter.next();

                                // Check whether we'll follow external outlinks
                                if (outlinksIgnoreExternal) {
                                    if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                        continue;
                                    }
                                }

                                reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

                                CrawlDatum fCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                                fCrawlDatum.getMetaData().putAll(datum.getMetaData());
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Create new fetcher metadata: "
                                            + Arrays.toString(fCrawlDatum.getMetaData().values().toArray()));
                                }
                                // Create new FetchItem with depth incremented
                                ReFetcher.FetchItem fit = ReFetcher.FetchItem.create(new Text(followUrl), fCrawlDatum,
                                        queueMode, outlinkDepth + 1);
                                // create() returns null when the outlink cannot be parsed or
                                // resolved; such an outlink is skipped instead of queued
                                if (fit != null) {
                                    fetchQueues.addFetchItem(fit);
                                    outlinkCounter++;
                                }
                            }
                        }

                        // Overwrite the outlinks in ParseData with the normalized and filtered set
                        parseData.setOutlinks((Outlink[]) outlinkList.toArray(new Outlink[outlinkList.size()]));
                        Text category = (Text) datum.getMetaData().get(new Text("category"));
                        if (category != null) {
                            parseData.getContentMeta().add("category", category.toString());
                        }
                        output.collect(url, new NutchWritable(
                                new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
                    }
                }
            } catch (IOException e) {
                if (LOG.isErrorEnabled()) {
                    LOG.error("fetcher caught:" + e.toString(), e);
                }
            }

            // return parse status if it exists
            if (parseResult != null && !parseResult.isEmpty()) {
                Parse p = parseResult.get(content.getUrl());
                if (p != null) {
                    reporter.incrCounter("ParserStatus",
                            ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
                    return p.getData().getStatus();
                }
            }
            return null;
        }
    }

    public ReFetcher() {
        super(null);
    }

    public ReFetcher(Configuration conf) {
super(conf); } private void updateStatus(int bytesInPage) throws IOException { pages.incrementAndGet(); bytes.addAndGet(bytesInPage); } private void reportStatus(int pagesLastSec, int bytesLastSec) throws IOException { String status; long elapsed = (System.currentTimeMillis() - start) / 1000; float avgPagesSec = Math.round(((float) pages.get() * 10) / elapsed) / 10; float avgBytesSec = Math.round(((((float) bytes.get()) * 8) / 1000) / elapsed); status = activeThreads + " threads, " + fetchQueues.getQueueCount() + " queues, " + fetchQueues.getTotalSize() + " URLs queued, " + pages + " pages, " + errors + " errors, " + avgPagesSec + " (" + pagesLastSec + ") pages/s, " + avgBytesSec + " (" + bytesLastSec + ") kbits/s, "; reporter.setStatus(status); } public void configure(JobConf job) { setConf(job); this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY); this.storingContent = isStoringContent(job); this.parsing = isParsing(job); } public void close() { } public static boolean isParsing(Configuration conf) { return conf.getBoolean("fetcher.parse", true); } public static boolean isStoringContent(Configuration conf) { return conf.getBoolean("fetcher.store.content", true); } public void run(RecordReader<Text, CrawlDatum> input, OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { this.output = output; this.reporter = reporter; this.fetchQueues = new ReFetcher.FetchItemQueues(getConf()); int threadCount = getConf().getInt("fetcher.threads.fetch", 10); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: threads: " + threadCount); } int timeoutDivisor = getConf().getInt("fetcher.threads.timeout.divisor", 2); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: time-out divisor: " + timeoutDivisor); } int queueDepthMuliplier = getConf().getInt("fetcher.queue.depth.multiplier", 50); feeder = new ReFetcher.QueueFeeder(input, fetchQueues, threadCount * queueDepthMuliplier); //feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2); // the value of 
the time limit is either -1 or the time where it should finish long timelimit = getConf().getLong("fetcher.timelimit", -1); if (timelimit != -1) { feeder.setTimeLimit(timelimit); } feeder.start(); // set non-blocking & no-robots mode for HTTP protocol plugins. getConf().setBoolean(Protocol.CHECK_BLOCKING, false); getConf().setBoolean(Protocol.CHECK_ROBOTS, false); for (int i = 0; i < threadCount; i++) { // spawn threads new ReFetcher.FetcherThread(getConf()).start(); } // select a timeout that avoids a task timeout long timeout = getConf().getInt("mapred.task.timeout", 10 * 60 * 1000) / timeoutDivisor; // Used for threshold check, holds pages and bytes processed in the last second int pagesLastSec; int bytesLastSec; // Set to true whenever the threshold has been exceeded for the first time boolean throughputThresholdExceeded = false; int throughputThresholdNumRetries = 0; int throughputThresholdPages = getConf().getInt("fetcher.throughput.threshold.pages", -1); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: throughput threshold: " + throughputThresholdPages); } int throughputThresholdMaxRetries = getConf().getInt("fetcher.throughput.threshold.retries", 5); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: throughput threshold retries: " + throughputThresholdMaxRetries); } long throughputThresholdTimeLimit = getConf().getLong("fetcher.throughput.threshold.check.after", -1); do { // wait for threads to exit pagesLastSec = pages.get(); bytesLastSec = (int) bytes.get(); try { Thread.sleep(1000); } catch (InterruptedException e) { } pagesLastSec = pages.get() - pagesLastSec; bytesLastSec = (int) bytes.get() - bytesLastSec; reporter.incrCounter("FetcherStatus", "bytes_downloaded", bytesLastSec); reportStatus(pagesLastSec, bytesLastSec); LOG.info("-activeThreads=" + activeThreads + ", spinWaiting=" + spinWaiting.get() + ", fetchQueues.totalSize=" + fetchQueues.getTotalSize()); if (!feeder.isAlive() && fetchQueues.getTotalSize() < 5) { fetchQueues.dump(); } // if throughput 
threshold is enabled if (throughputThresholdTimeLimit < System.currentTimeMillis() && throughputThresholdPages != -1) { // Check if we're dropping below the threshold if (pagesLastSec < throughputThresholdPages) { throughputThresholdNumRetries++; LOG.warn(Integer.toString(throughputThresholdNumRetries) + ": dropping below configured threshold of " + Integer.toString(throughputThresholdPages) + " pages per second"); // Quit if we dropped below threshold too many times if (throughputThresholdNumRetries == throughputThresholdMaxRetries) { LOG.warn("Dropped below threshold too many times, killing!"); // Disable the threshold checker throughputThresholdPages = -1; // Empty the queues cleanly and get number of items that were dropped int hitByThrougputThreshold = fetchQueues.emptyQueues(); if (hitByThrougputThreshold != 0) { reporter.incrCounter("FetcherStatus", "hitByThrougputThreshold", hitByThrougputThreshold); } } } } // check timelimit if (!feeder.isAlive()) { int hitByTimeLimit = fetchQueues.checkTimelimit(); if (hitByTimeLimit != 0) { reporter.incrCounter("FetcherStatus", "hitByTimeLimit", hitByTimeLimit); } } // some requests seem to hang, despite all intentions if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) { if (LOG.isWarnEnabled()) { LOG.warn("Aborting with " + activeThreads + " hung threads."); } return; } } while (activeThreads.get() > 0); LOG.info("-activeThreads=" + activeThreads); } public void fetch(Path segment, int threads) throws IOException { checkConfiguration(); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: starting at " + sdf.format(start)); LOG.info("Fetcher: segment: " + segment); } // set the actual time for the timelimit relative // to the beginning of the whole job and not of a specific task // otherwise it keeps trying again if a task fails long timelimit = getConf().getLong("ReFetcher.timelimit.mins", -1); if 
(timelimit != -1) { timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000); LOG.info("Fetcher Timelimit set for : " + timelimit); getConf().setLong("fetcher.timelimit", timelimit); } // Set the time limit after which the throughput threshold feature is enabled timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10); timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000); getConf().setLong("fetcher.throughput.threshold.check.after", timelimit); int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1); if (maxOutlinkDepth > 0) { LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth)); int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4); int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2); int totalOutlinksToFollow = 0; for (int i = 0; i < maxOutlinkDepth; i++) { totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks); } LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow)); } JobConf job = new NutchJob(getConf()); job.setJobName("fetch " + segment); job.setInt("fetcher.threads.fetch", threads); job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); // for politeness, don't permit parallel execution of a single task job.setSpeculativeExecution(false); FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(ReFetcher.InputFormat.class); job.setMapRunnerClass(ReFetcher.class); FileOutputFormat.setOutputPath(job, segment); job.setOutputFormat(FetcherOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job); long end = System.currentTimeMillis(); LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } /** * Run the ReFetcher. 
*/
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new ReFetcher(), args);
    System.exit(res);
  }

  /**
   * Parses the command line and fetches the given segment.
   *
   * <p>Expected arguments: {@code <segment> [-threads n]}. When {@code -threads}
   * is absent the thread count is read from {@code fetcher.threads.fetch}
   * (10 when unset).
   *
   * @param args command-line arguments
   * @return 0 on success; -1 if the arguments are unusable or the fetch throws
   * @throws Exception declared in the signature; failures of the fetch itself
   *         are caught, logged and reported as -1
   */
  public int run(String[] args) throws Exception {

    String usage = "Usage: Fetcher <segment> [-threads n]";

    if (args.length < 1) {
      System.err.println(usage);
      return -1;
    }

    Path segment = new Path(args[0]);

    int threads = getConf().getInt("fetcher.threads.fetch", 10);

    // parse command line
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-threads")) {
        // -threads needs a value after it; reject a dangling option instead of
        // running off the end of the array.
        if (i + 1 >= args.length) {
          System.err.println(usage);
          return -1;
        }
        try {
          threads = Integer.parseInt(args[++i]);
        } catch (NumberFormatException e) {
          // A non-numeric thread count is a usage error, not a crash.
          System.err.println(usage);
          return -1;
        }
      }
    }

    getConf().setInt("fetcher.threads.fetch", threads);

    try {
      fetch(segment, threads);
      return 0;
    } catch (Exception e) {
      LOG.error("Fetcher: " + StringUtils.stringifyException(e));
      return -1;
    }
  }

  /**
   * Validates the agent-related configuration.
   *
   * <p>{@code http.agent.name} must be set and non-blank. A warning is logged
   * when that name is not the first comma-separated entry of
   * {@code http.robots.agents}, including when that property is unset or has
   * no entries.
   *
   * @throws IllegalArgumentException if {@code http.agent.name} is missing or blank
   */
  private void checkConfiguration() {
    // ensure that a value has been set for the agent name
    String agentName = getConf().get("http.agent.name");
    if (agentName == null || agentName.trim().length() == 0) {
      String message = "Fetcher: No agents listed in 'http.agent.name'" + " property.";
      if (LOG.isErrorEnabled()) {
        LOG.error(message);
      }
      throw new IllegalArgumentException(message);
    }

    // get all of the agents that we advertise; the property may be unset
    String agentNames = getConf().get("http.robots.agents");
    List<String> agents = new ArrayList<String>();
    if (agentNames != null) {
      StringTokenizer tok = new StringTokenizer(agentNames, ",");
      while (tok.hasMoreTokens()) {
        agents.add(tok.nextToken().trim());
      }
    }

    // if the first one is not equal to our agent name (or there is none), warn;
    // this is advisory only and does not abort the fetch
    if (agents.isEmpty() || !agents.get(0).equalsIgnoreCase(agentName)) {
      String message = "Fetcher: Your 'http.agent.name' value should be "
          + "listed first in 'http.robots.agents' property.";
      if (LOG.isWarnEnabled()) {
        LOG.warn(message);
      }
    }
  }
}