org.apache.nutch.util.hostdb.HostDb.java Source code

Introduction

Here is the source code for org.apache.nutch.util.hostdb.HostDb.java, a Nutch tool that builds a HostDB from the CrawlDB by aggregating fetch status counts per host and checking DNS entries for each host.
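
The class is a standard Hadoop Tool and is normally driven through ToolRunner, exactly as its own main() method does. The short sketch below is illustrative only: the paths crawl/hostdb and crawl/crawldb are placeholder arguments, and the flags are those documented by the usage() method near the end of the listing.

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.hostdb.HostDb;

public class HostDbExample {
    public static void main(String[] args) throws Exception {
        // Illustrative arguments: update the HostDB in crawl/hostdb, aggregating
        // statistics from the CrawlDB in crawl/crawldb and DNS-checking all hosts.
        String[] hostDbArgs = { "crawl/hostdb", "-crawldb", "crawl/crawldb", "-checkAll" };
        int res = ToolRunner.run(NutchConfiguration.create(), new HostDb(), hostDbArgs);
        System.exit(res);
    }
}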

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.util.hostdb;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.Date;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tool to create a HostDB from the CrawlDB. It aggregates fetch status values by host and checks
 * DNS entries for hosts.
 */
public class HostDb extends Configured implements Tool {
    public static final Logger LOG = LoggerFactory.getLogger(HostDb.class);
    public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    public static final String LOCK_NAME = ".locked";
    public static final String CURRENT_NAME = "current";

    public static final String HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD = "hostdb.purge.failed.hosts.threshold";
    public static final String HOSTDB_NUM_RESOLVER_THREADS = "hostdb.num.resolvers.threads";
    public static final String HOSTDB_RECHECK_INTERVAL = "hostdb.recheck.interval";
    public static final String HOSTDB_CHECK_FAILED = "hostdb.check.failed";
    public static final String HOSTDB_CHECK_NEW = "hostdb.check.new";
    public static final String HOSTDB_CHECK_KNOWN = "hostdb.check.known";
    public static final String HOSTDB_FORCE_CHECK = "hostdb.force.check";
    public static final String HOSTDB_URL_FILTERING = "hostdb.url.filter";
    public static final String HOSTDB_URL_NORMALIZING = "hostdb.url.normalize";

    /**
     * Mapper ingesting HostDB and CrawlDB entries. It can also read host score info
     * from a plain-text key/value file generated by the Webgraph's NodeDumper tool.
     */
    public static class HostDbMapper extends Mapper<Text, Writable, Text, NutchWritable> {
        private Text host = new Text();
        private HostDatum hostDatum = null;
        private CrawlDatum crawlDatum = null;
        private String reprUrl = null;
        private String buffer = null;
        private boolean filter = false;
        private boolean normalize = false;
        private boolean readingCrawlDb = false;
        private URLFilters filters = null;
        private URLNormalizers normalizers = null;

        public void setup(Context context) {
            Configuration conf = context.getConfiguration();
            readingCrawlDb = conf.getBoolean("hostdb.reading.crawldb", false);
            filter = conf.getBoolean(HOSTDB_URL_FILTERING, false);
            normalize = conf.getBoolean(HOSTDB_URL_NORMALIZING, false);

            if (filter)
                filters = new URLFilters(conf);

            if (normalize)
                normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
        }

        /* Filters and/or normalizes the input URL. */
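        // Illustrative example: filterNormalize("nutch.apache.org") first tries
        // "http://nutch.apache.org/"; if the URL filters reject the HTTP form it
        // retries "https://nutch.apache.org/"; null is returned when both forms
        // are filtered out or an exception occurs.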
        private String filterNormalize(String u) {
            boolean isHost = false;
            String url = u;

            if (!u.startsWith("http://") && !u.startsWith("https://")) {
                // We received a hostname here so let's make a URL
                url = "http://" + u + "/";
                isHost = true;
            }

            try {
                if (normalizers != null)
                    url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);

                if (filters != null)
                    url = filters.filter(url);

                if (isHost && url == null) {
                    // Some hosts may not allow the HTTP scheme and only accept HTTPS.
                    // So, retry with HTTPS for hosts whose HTTP URL was filtered out.
                    // Note that this is a workaround and does not cover the FTP and
                    // FILE schemes.
                    String httpsUrl = "https://" + u + "/";
                    if (normalizers != null)
                        httpsUrl = normalizers.normalize(httpsUrl, URLNormalizers.SCOPE_DEFAULT);

                    if (filters != null)
                        httpsUrl = filters.filter(httpsUrl);

                    url = httpsUrl;
                }
            } catch (Exception e) {
                return null;
            }
            return url;
        }

        /**
         * Ingests records from the HostDB, the CrawlDB and the plain-text host scores
         * file. Statistics and scores are passed on.
         */
        public void map(Text key, Writable value, Context context) throws IOException, InterruptedException {

            if (value instanceof CrawlDatum) {
                // This is a record from the CrawlDB
                // Get the normalized and filtered host of this URL
                buffer = filterNormalize(URLUtil.getHost(key.toString()));

                // Filtered out?
                if (buffer == null) {
                    context.getCounter("HostDb", "filtered_records").increment(1);
                    LOG.info(URLUtil.getHost(key.toString()) + " crawldatum has been filtered");
                    return;
                }

                // Set the host of this URL
                host.set(buffer);
                crawlDatum = (CrawlDatum) value;
                hostDatum = new HostDatum();

                /*
                 * Known limitation:
                 * multi-hop redirects: host_a => host_b/page => host_c/page/whatever
                 *
                 * We cannot re-resolve redirects for host objects because the CrawlDatum
                 * metadata is not available, and we cannot reliably handle this in the
                 * reducer either, since redirects may cross hosts or even domains. This
                 * is left for future work, as multi-hop redirects are not very common
                 * on the web.
                 */

                // Check whether the current key equals the host's homepage URL
                if (key.toString().equals("http://" + buffer + "/")) {
                    // Check if this is a redirect to the real home page
                    if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
                            || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {

                        // Obtain the repr url for this redirect via protocol status from the metadata
                        ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData()
                                .get(Nutch.WRITABLE_PROTO_STATUS_KEY);

                        // Get the protocol status' arguments
                        reprUrl = z.getArgs()[0];

                        if (reprUrl != null) {
                            LOG.info("Homepage: " + key.toString() + " redirects to: " + reprUrl);
                            hostDatum.setHomepageUrl(reprUrl);
                        } else {
                            LOG.info("Homepage: " + key.toString() + " redirects to: " + reprUrl
                                    + " but has been filtered out");
                        }
                    } else {
                        hostDatum.setHomepageUrl("http://" + buffer + "/");
                        LOG.info("Homepage: " + "http://" + buffer + "/");
                    }
                }

                hostDatum.setStat(crawlDatum.getStatus(), 1);
                context.write(host, new NutchWritable(hostDatum));
            } else if (value instanceof HostDatum) { // we got a record from the hostdb
                buffer = filterNormalize(key.toString());

                // Filtered out?
                if (buffer == null) {
                    context.getCounter("HostDb", "filtered_records").increment(1);
                    LOG.info(key.toString() + " hostdatum has been filtered");
                    return;
                }

                // Get a HostDatum
                hostDatum = (HostDatum) value;
                key.set(buffer);

                // If we're also reading CrawlDb entries, reset db_* statistics because
                // we're aggregating them from CrawlDB anyway
                if (readingCrawlDb)
                    hostDatum.resetStatistics();

                context.write(key, new NutchWritable(hostDatum));
            } else if (value instanceof Text) { // we got a record with host scores
                buffer = filterNormalize(key.toString());

                // Filtered out?
                if (buffer == null) {
                    context.getCounter("HostDb", "filtered_records").increment(1);
                    LOG.info(key.toString() + " score has been filtered");
                    return;
                }

                key.set(buffer);
                context.write(key, new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
            }
        }
    }

    static class HostDbReducer extends Reducer<Text, NutchWritable, Text, HostDatum> {
        private ResolverThread resolverThread = null;

        private Integer numResolverThreads = 10;
        private static Integer purgeFailedHostsThreshold = -1;
        private static Integer recheckInterval = 86400000;
        private static boolean checkFailed = false;
        private static boolean checkNew = false;
        private static boolean checkKnown = false;
        private static boolean force = false;
        private static long now = new Date().getTime();

        private BlockingQueue<Runnable> queue = new SynchronousQueue<Runnable>();
        private ThreadPoolExecutor executor = null;

        /**
         * Configures the thread pool and prestarts all resolver threads.
         */
        public void setup(Context context) {
            Configuration conf = context.getConfiguration();
            purgeFailedHostsThreshold = conf.getInt(HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD, -1);
            numResolverThreads = conf.getInt(HOSTDB_NUM_RESOLVER_THREADS, 10);
            recheckInterval = conf.getInt(HOSTDB_RECHECK_INTERVAL, 86400) * 1000;
            checkFailed = conf.getBoolean(HOSTDB_CHECK_FAILED, false);
            checkNew = conf.getBoolean(HOSTDB_CHECK_NEW, false);
            checkKnown = conf.getBoolean(HOSTDB_CHECK_KNOWN, false);
            force = conf.getBoolean(HOSTDB_FORCE_CHECK, false);

            // Initialize the thread pool with our queue
            executor = new ThreadPoolExecutor(numResolverThreads, numResolverThreads, 5, TimeUnit.SECONDS, queue);
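            // Note: a SynchronousQueue has no capacity, so queue.put() in reduce()
            // blocks until one of the prestarted resolver threads is free, capping
            // concurrent DNS lookups at numResolverThreads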

            // Run all threads in the pool
            executor.prestartAllCoreThreads();
        }

        public void reduce(Text key, Iterable<NutchWritable> values, Context context)
                throws IOException, InterruptedException {

            HostDatum hostDatum = new HostDatum();
            float score = 0;

            // Loop through all values; merge an existing HostDatum if one is present,
            // otherwise keep the empty datum created above for a new host
            for (Writable value : values) {
                if (value instanceof HostDatum) {
                    HostDatum buffer = (HostDatum) value;

                    // Aggregate statistics only once, while the accumulated HostDatum is still empty
                    if (hostDatum.isEmpty()) {
                        hostDatum.addStat(CrawlDatum.STATUS_DB_UNFETCHED, buffer);
                        hostDatum.addStat(CrawlDatum.STATUS_DB_FETCHED, buffer);
                        hostDatum.addStat(CrawlDatum.STATUS_DB_GONE, buffer);
                        hostDatum.addStat(CrawlDatum.STATUS_DB_REDIR_PERM, buffer);
                        hostDatum.addStat(CrawlDatum.STATUS_DB_REDIR_TEMP, buffer);
                        hostDatum.addStat(CrawlDatum.STATUS_DB_NOTMODIFIED, buffer);
                    }

                    // Check homepage URL
                    if (buffer.hasHomepageUrl())
                        hostDatum.setHomepageUrl(buffer.getHomepageUrl());

                    // Check lastCheck timestamp
                    if (!buffer.isEmpty())
                        hostDatum.setLastCheck(buffer.getLastCheck());

                    // Check and set DNS failures
                    if (buffer.getDnsFailures() > 0)
                        hostDatum.setDnsFailures(buffer.getDnsFailures());

                    // Check and set connection failures
                    if (buffer.getConnectionFailures() > 0)
                        hostDatum.setConnectionFailures(buffer.getConnectionFailures());

                    // Check and set score (score from Web Graph has precedence)
                    if (buffer.getScore() > 0)
                        hostDatum.setScore(buffer.getScore());
                }

                // Check for the score
                if (value instanceof FloatWritable) {
                    FloatWritable buffer = (FloatWritable) value;
                    score = buffer.get();
                }
            }

            // Check if score was set from Web Graph
            if (score > 0)
                hostDatum.setScore(score);

            context.getCounter("HostDb", "total_hosts").increment(1);

            // See if this record is to be checked
            if (shouldCheck(hostDatum)) {
                // Make an entry
                resolverThread = new ResolverThread(key.toString(), hostDatum, context);

                // Add the entry to the queue (blocking)
                try {
                    queue.put(resolverThread);
                } catch (InterruptedException e) {
                    LOG.error("HostDb: " + StringUtils.stringifyException(e));
                }

                // Do not proceed; the datum will be written by the resolver thread
                return;
            } else {
                context.getCounter("HostDb", "skipped_not_eligible").increment(1);
                LOG.info(key.toString() + ": skipped_not_eligible");
            }

            // Write the host datum if it wasn't written by the resolver thread
            context.write(key, hostDatum);
        }

        /**
         * Determines whether a record should be checked.
         */
        private boolean shouldCheck(HostDatum datum) {
            // Whether a new record is to be checked
            if (checkNew && datum.isEmpty()) {
                return true;
            }

            // Whether existing known hosts should be rechecked
            if (checkKnown && !datum.isEmpty() && datum.getDnsFailures() == 0) {
                return isEligibleForCheck(datum);
            }

            // Whether failed records are forced to be rechecked
            if (checkFailed && datum.getDnsFailures() > 0) {
                return isEligibleForCheck(datum);
            }

            // This record is not to be checked
            return false;
        }

        /**
         * Determines whether a record is eligible for recheck
         */
        private boolean isEligibleForCheck(HostDatum datum) {
            // An existing host, known or unknown, is rechecked when forced or when
            // enough time has passed since its last check
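            // The allowed wait grows with the number of previous DNS failures: the host
            // is eligible once lastCheck + (recheckInterval * dnsFailures + 1) ms lies in
            // the past, or immediately when a check is forced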
            return (force || datum.getLastCheck().getTime() + (recheckInterval * datum.getDnsFailures() + 1) < now);
        }

        /**
         * Shut down all running threads and wait for completion.
         */
        public void close() {
            LOG.info("Feeder finished, waiting for shutdown");

            // If we're here, all keys have been fed and we can issue a shutdown
            executor.shutdown();

            boolean finished = false;

            while (!finished) {
                try {
                    // Wait for the executor to shut down completely
                    if (!executor.isTerminated()) {
                        LOG.info("Threads waiting: " + Integer.toString(executor.getPoolSize()));
                        Thread.sleep(1000);
                    } else {
                        // All is well, get out
                        finished = true;
                    }
                } catch (InterruptedException e) {
                    LOG.warn(StringUtils.stringifyException(e));
                }
            }
        }

        static class ResolverThread implements Runnable {
            private String host = null;
            private HostDatum datum = null;
            private Text hostText = new Text();
            private Context context = null;

            public ResolverThread(String host, HostDatum datum, Context context) {
                hostText.set(host);
                this.host = host;
                this.datum = datum;
                this.context = context;
            }

            public void run() {
                // Resolve the host and act appropriately
                datum.setLastCheck();
                try {
                    // Throws an exception if host is not found
                    InetAddress.getByName(host);

                    if (datum.isEmpty()) {
                        context.getCounter("HostDb", "new_known_host").increment(1);
                        LOG.info(host + ": new_known_host " + datum);
                    } else if (datum.getDnsFailures() > 0) {
                        context.getCounter("HostDb", "rediscovered_host").increment(1);
                        datum.setDnsFailures(0);
                        LOG.info(host + ": rediscovered_host " + datum);
                    } else {
                        context.getCounter("HostDb", "existing_known_host").increment(1);
                        LOG.info(host + ": existing_known_host " + datum);
                    }
                    // Write the host datum
                    context.write(hostText, datum);
                } catch (UnknownHostException e) {
                    try {
                        // If the datum is empty we'll initialize it with date = today and 1 failure
                        if (datum.isEmpty()) {
                            datum.setDnsFailures(1);
                            context.write(hostText, datum);
                            context.getCounter("HostDb", "new_unknown_host").increment(1);
                            LOG.info(host + ": new_unknown_host " + datum);
                        } else {
                            datum.incDnsFailures();

                            // Check if this host should be forgotten
                            if (purgeFailedHostsThreshold == -1
                                    || purgeFailedHostsThreshold < datum.getDnsFailures()) {
                                context.write(hostText, datum);
                                context.getCounter("HostDb", "existing_unknown_host").increment(1);
                                LOG.info(host + ": existing_unknown_host " + datum);
                            } else {
                                context.getCounter("HostDb", "purged_unknown_host").increment(1);
                                LOG.info(host + ": purged_unknown_host " + datum);
                            }
                        }

                        context.getCounter("HostDb", Integer.toString(datum.numFailures()) + "_times_failed")
                                .increment(1);
                    } catch (Exception ioe) {
                        LOG.warn(StringUtils.stringifyException(ioe));
                    }
                } catch (Exception e) {
                    LOG.warn(StringUtils.stringifyException(e));
                }
                context.getCounter("HostDb", "checked_hosts").increment(1);
            }
        }
    }

    private void hostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew,
            boolean checkKnown, boolean force, boolean filter, boolean normalize) throws Exception {

        long start = System.currentTimeMillis();
        LOG.info("HostDb: starting at " + sdf.format(start));

        Configuration conf = getConf();
        conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        conf.setBoolean(HOSTDB_CHECK_FAILED, checkFailed);
        conf.setBoolean(HOSTDB_CHECK_NEW, checkNew);
        conf.setBoolean(HOSTDB_CHECK_KNOWN, checkKnown);
        conf.setBoolean(HOSTDB_FORCE_CHECK, force);
        conf.setBoolean(HOSTDB_URL_FILTERING, filter);
        conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);

        // Check whether the urlfilter-domainblacklist plugin is loaded
        if (filter && "urlfilter-domainblacklist".matches(conf.get("plugin.includes"))) {
            throw new Exception("domainblacklist-urlfilter must not be enabled");
        }

        // Check whether the urlnormalizer-host plugin is loaded
        if (normalize && "urlnormalizer-host".matches(conf.get("plugin.includes"))) {
            throw new Exception("urlnormalizer-host must not be enabled");
        }

        FileSystem fs = FileSystem.get(conf);
        Path old = new Path(hostDb, "old");
        Path current = new Path(hostDb, CURRENT_NAME);
        Path tempHostDb = new Path(hostDb, "hostdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // lock an existing hostdb to prevent multiple simultaneous updates
        Path lock = new Path(hostDb, LOCK_NAME);
        if (!fs.exists(current)) {
            fs.mkdirs(current);
        }
        LockUtil.createLockFile(fs, lock, false);

        Job job = new Job(conf, "HostDb " + hostDb);
        job.setJarByClass(HostDb.class);
        job.setSpeculativeExecution(false);

        MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

        if (topHosts != null) {
            MultipleInputs.addInputPath(job, topHosts, KeyValueTextInputFormat.class);
        }
        if (crawlDb != null) {
            // Tell the job we read from CrawlDB
            conf.setBoolean("hostdb.reading.crawldb", true);
            MultipleInputs.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME),
                    SequenceFileInputFormat.class);
        }

        FileOutputFormat.setOutputPath(job, tempHostDb);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NutchWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(HostDatum.class);
        job.setMapperClass(HostDbMapper.class);
        job.setReducerClass(HostDbReducer.class);

        try {
            job.waitForCompletion(true);
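            // Rotate the directories: 'current' is moved to 'old', then the temporary
            // job output is promoted to 'current'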

            FSUtils.replace(fs, old, current, true);
            FSUtils.replace(fs, current, tempHostDb, true);

            boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
            if (!preserveBackup && fs.exists(old))
                fs.delete(old, true);
        } catch (Exception e) {
            if (fs.exists(tempHostDb)) {
                fs.delete(tempHostDb, true);
            }
            LockUtil.removeLockFile(fs, lock);
            throw e;
        }

        LockUtil.removeLockFile(fs, lock);
        long end = System.currentTimeMillis();
        LOG.info("HostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    public static void main(String args[]) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new HostDb(), args);
        System.exit(res);
    }

    public static void usage() {
        System.err.println(
                "Usage: HostDb <hostdb> " + "[-crawldb <crawldb>] [-tophosts <tophosts>] [-checkAll] [-checkFailed]"
                        + " [-checkNew] [-checkKnown] [-force] [-noFilter] [-noNormalize]");
        System.err.println("\t<hostdb>\tdirectory name where hostdb is located");
        System.err.println("\t-crawldb <crawldb>\tpath to a crawldb directory");
        System.err.println(
                "\t-tophosts <tophosts>\tkey-value text file from the Webgraph's NodeDumper tool having score");
        System.err.println("\t-checkAll\tApply DNS check to resolve all hosts");
        System.err.println(
                "\t-checkFailed\tApply DNS check to resolve only on hosts which had failed DNS check earlier");
        System.err.println("\t-checkNew\tApply DNS check to resolve only new hosts");
        System.err.println("\t-checkKnown\tApply DNS check to resolve only known hosts");
        System.err.println("\t-force\t\tforce hosts to be rechecked. With earlier args, check "
                + "is done on host only if 'recheckInterval' has elapsed.");
        System.err.println("\t-noFilter\tturn off URLFilters on urls");
        System.err.println("\t-noNormalize\tturn off URLNormalizer on urls");
    }

    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            usage();
            return -1;
        }

        Path hostDb = new Path(args[0]);
        Path crawlDb = null;
        Path topHosts = null;

        boolean checkFailed = false;
        boolean checkNew = false;
        boolean checkKnown = false;
        boolean force = false;

        boolean filter = true;
        boolean normalize = true;

        for (int i = 1; i < args.length; i++) {
            if (args[i].equals("-crawldb")) {
                crawlDb = new Path(args[++i]);
                LOG.info("HostDb: crawldb: " + crawlDb);
            } else if (args[i].equals("-tophosts")) {
                topHosts = new Path(args[++i]);
                LOG.info("HostDb: tophosts: " + topHosts);
            } else if (args[i].equals("-checkFailed")) {
                LOG.info("HostDb: checking failed hosts");
                checkFailed = true;
            } else if (args[i].equals("-checkNew")) {
                LOG.info("HostDb: checking new hosts");
                checkNew = true;
            } else if (args[i].equals("-checkKnown")) {
                LOG.info("HostDb: checking known hosts");
                checkKnown = true;
            } else if (args[i].equals("-checkAll")) {
                LOG.info("HostDb: checking all hosts");
                checkFailed = true;
                checkNew = true;
                checkKnown = true;
            } else if (args[i].equals("-force")) {
                LOG.info("HostDb: forced check");
                force = true;
            } else if (args[i].equals("-noFilter")) {
                LOG.info("HostDb: filtering disabled");
                filter = false;
            } else if (args[i].equals("-noNormalize")) {
                LOG.info("HostDb: normalizing disabled");
                normalize = false;
            } else {
                LOG.info("HostDb: Found invalid argument \"" + args[i] + "\"\n");
                usage();
                return -1;
            }
        }

        try {
            hostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew, checkKnown, force, filter, normalize);
            return 0;
        } catch (Exception e) {
            LOG.error("HostDb: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}
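
For reference, the reducer's resolver threads rely on a ThreadPoolExecutor backed by a SynchronousQueue: because the queue has no capacity, queue.put() in reduce() blocks until one of the prestarted threads is free, which caps concurrent DNS lookups at hostdb.num.resolvers.threads. The self-contained sketch below (not part of Nutch; the class and task names are made up for illustration) demonstrates the same hand-off pattern in isolation.

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class HandOffPoolDemo {
    public static void main(String[] args) throws InterruptedException {
        int numThreads = 2; // stands in for hostdb.num.resolvers.threads
        BlockingQueue<Runnable> queue = new SynchronousQueue<Runnable>();
        ThreadPoolExecutor executor =
                new ThreadPoolExecutor(numThreads, numThreads, 5, TimeUnit.SECONDS, queue);
        executor.prestartAllCoreThreads();

        for (int i = 0; i < 6; i++) {
            final int task = i;
            // put() blocks until a prestarted worker takes the task, so at most
            // numThreads tasks run at any time, just like the resolver threads
            queue.put(new Runnable() {
                public void run() {
                    System.out.println(Thread.currentThread().getName() + " handling task " + task);
                    try {
                        Thread.sleep(500); // stands in for a DNS lookup
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            });
        }

        executor.shutdown();
        executor.awaitTermination(1, TimeUnit.MINUTES);
    }
}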