Source code for org.apache.nutch.crawl.TODOTestCrawlDbStates (TODOTestCrawlDbStates.java)

Java tutorial

Introduction

Here is the source code for org.apache.nutch.crawl.TODOTestCrawlDbStates.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.lang.invoke.MethodHandles;

import static org.apache.nutch.crawl.CrawlDatum.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.TimingUtil;

import org.apache.hadoop.mapreduce.Reducer.Context;

import static org.junit.Assert.*;

import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Work-in-progress CrawlDb state tests (hence the TODO prefix): long-running
 * continuous-crawl simulations for retry-counter handling (NUTCH-578) and for
 * the AdaptiveFetchSchedule SYNC_DELTA calculation (NUTCH-1564).
 */
public class TODOTestCrawlDbStates extends TestCrawlDbStates {

    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /**
     * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max
     * is reached. Retry counter has to be reset appropriately.
     */
    @Test
    public void testCrawlDbReducerPageRetrySchedule() {
        LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry");
        ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry();
        // keep going for long, to "provoke" a retry counter overflow
        try {
            if (!crawlUtil.run(150)) {
                fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)");
            }
        } catch (IOException e) {
            // Fix: was e.printStackTrace(), which let the test pass silently on
            // an I/O error. A test must fail loudly instead.
            fail("Continuous crawl failed with " + e);
        }
    }

    /**
     * Continuous-crawl simulation in which every fetch ends in fetch_retry,
     * used to verify retry counter handling and overflow behavior (NUTCH-578).
     */
    private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil {

        // maximum retries before a page should become db_gone
        // (overridden from configuration in the constructor)
        private int retryMax = 3;
        // retries counted by this test itself, independent of the counter
        // inside CrawlDatum, which may legitimately be reset
        private int totalRetries = 0;

        ContinuousCrawlTestFetchRetry() {
            super();
            fetchStatus = STATUS_FETCH_RETRY;
            retryMax = context.getConfiguration().getInt("db.fetch.retry.max", retryMax);
        }

        /**
         * Simulate a fetch that always results in fetch_retry.
         *
         * @param datum       the CrawlDatum being "fetched" (mutated in place)
         * @param currentTime simulated fetch time in milliseconds
         * @return the mutated datum
         */
        @Override
        protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
            datum.setStatus(fetchStatus);
            datum.setFetchTime(currentTime);
            totalRetries++;
            return datum;
        }

        /**
         * Verify the CrawlDb state after one round: below retryMax the page
         * must stay db_unfetched, at or above retryMax it must become db_gone.
         * A negative (overflowed) retry counter is a hard failure.
         */
        @Override
        protected boolean check(CrawlDatum result) {
            // Fix: branches reordered. Previously (> retryMax) was tested first,
            // which made the Byte.MAX_VALUE branch unreachable (retryMax is far
            // below 127). Return values are unchanged: only the overflow case
            // (< 0) fails here, and it is still reached in both orderings.
            if (result.getRetriesSinceFetch() < 0) {
                LOG.error("Retry counter overflow: {}", result);
                return false;
            } else if (result.getRetriesSinceFetch() == Byte.MAX_VALUE) {
                LOG.warn("Retry counter max. value reached (overflow imminent): {}", result);
            } else if (result.getRetriesSinceFetch() > retryMax) {
                LOG.warn("Retry counter > db.fetch.retry.max: {}", result);
            }
            // use retry counter bound to this class (totalRetries)
            // instead of result.getRetriesSinceFetch() because the retry counter
            // in CrawlDatum could be reset (eg. NUTCH-578_v5.patch)
            if (totalRetries < retryMax) {
                if (result.getStatus() == STATUS_DB_UNFETCHED) {
                    // Fix: removed a stray no-op call to
                    // result.getRetriesSinceFetch() that was here.
                    LOG.info("ok: {}", result);
                    return true;
                }
            } else {
                if (result.getStatus() == STATUS_DB_GONE) {
                    LOG.info("ok: {}", result);
                    return true;
                }
            }
            LOG.warn("wrong: {}", result);
            return false;
        }

    }

    /**
     * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
     * documents not modified
     * <p>
     * Problem: documents not modified for a longer time are fetched in every
     * cycle because of an error in the SYNC_DELTA calculation of
     * {@link AdaptiveFetchSchedule}. <br>
     * The next fetch time should always be in the future, never in the past.
     * </p>
     */
    @Test
    public void testAdaptiveFetchScheduleSyncDelta() {
        LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
        Context context = CrawlDBTestUtil.createContext();
        Configuration conf = context.getConfiguration();
        conf.setLong("db.fetch.interval.default", 172800); // 2 days
        conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
        conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
        conf.setLong("db.fetch.interval.max", 604800); // 7 days
        conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule");
        ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(context);
        crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
        try {
            if (!crawlUtil.run(100)) {
                fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
            }
        } catch (IOException e) {
            // Fix: was e.printStackTrace(), which let the test pass silently on
            // an I/O error. A test must fail loudly instead.
            fail("Continuous crawl failed with " + e);
        }
    }

    /**
     * Continuous-crawl simulation for not-modified documents that records the
     * time of each fetch so {@link #check(CrawlDatum)} can verify the scheduled
     * next fetch time is in the future and within the configured interval
     * bounds (NUTCH-1564).
     */
    private class CrawlTestFetchScheduleNotModifiedFetchTime extends CrawlTestFetchNotModified {

        // time of current fetch
        private long fetchTime;

        // configured adaptive interval bounds (seconds)
        private long minInterval;
        private long maxInterval;

        CrawlTestFetchScheduleNotModifiedFetchTime(Context context) {
            super(context);
            Configuration conf = context.getConfiguration();
            minInterval = conf.getLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
            maxInterval = conf.getLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
            // the effective maximum is also capped by db.fetch.interval.max
            // (Math.min replaces an if + duplicated getLong lookup)
            maxInterval = Math.min(maxInterval, conf.getLong("db.fetch.interval.max", 604800));
        }

        /** Remember the fetch time, then delegate to the parent's fetch. */
        @Override
        protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
            // remember time of fetching
            fetchTime = currentTime;
            return super.fetch(datum, currentTime);
        }

        /**
         * For db_notmodified results, verify the next fetch time is not in the
         * past and (with 60s tolerance) lies within [minInterval, maxInterval].
         * Only the in-the-past case is a hard failure; the others are logged.
         */
        @Override
        protected boolean check(CrawlDatum result) {
            if (result.getStatus() == STATUS_DB_NOTMODIFIED) {
                // check only status notmodified here
                long secondsUntilNextFetch = (result.getFetchTime() - fetchTime) / 1000L;
                if (secondsUntilNextFetch < -1) {
                    // next fetch time is in the past (more than one second)
                    LOG.error("Next fetch time is in the past: {}", result);
                    return false;
                }
                if (secondsUntilNextFetch < 60) {
                    // next fetch time is in less than one minute
                    // (critical: Nutch can hardly be so fast)
                    // Fix: typo "Less then" corrected to "Less than"
                    LOG.error("Less than one minute until next fetch: {}", result);
                }
                // Next fetch time should be within min. and max. (tolerance: 60 sec.)
                if (secondsUntilNextFetch + 60 < minInterval || secondsUntilNextFetch - 60 > maxInterval) {
                    LOG.error("Interval until next fetch time ({}) is not within min. and max. interval: {}",
                            TimingUtil.elapsedTime(fetchTime, result.getFetchTime()), result);
                    // TODO: is this a failure?
                }
            }
            return true;
        }

    }

}