org.archive.access.nutch.Nutchwax.java Source code

Introduction

Here is the source code for org.archive.access.nutch.Nutchwax.java
Source

/* Nutchwax
 * 
 * $Id: Nutchwax.java 1896 2007-08-01 21:44:31Z jlee-archive $
 * 
 * Created on Feb 14, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 * 
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 * 
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 * 
 * Heritrix is distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.access.nutch;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.indexer.IndexMerger;
import org.archive.access.nutch.jobs.ImportArcs;
import org.archive.access.nutch.jobs.NutchwaxCrawlDb;
import org.archive.access.nutch.jobs.NutchwaxIndexer;
import org.archive.access.nutch.jobs.NutchwaxLinkDb;
import org.archive.access.nutch.jobs.NutchwaxPagerank;
import org.archive.util.ArchiveUtils;
import org.apache.nutch.global.Global;

/**
 * Script to run all indexing jobs from index through merge of final index.
 */
public class Nutchwax {
    public static final Log LOG = LogFactory.getLog(Nutchwax.class.getName());

    private static final String KEY_COLLECTION_PREFIX = "c=";
    private static final String KEY_COLLECTION_SUFFIX = ",u=";
    private static final Pattern COLLECTION = Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL);

    private final static List JOBS = Arrays.asList(new String[] { "import", "update", "invert", "pagerank", "index",
            "dedup", "merge", "all", "class", "search", "multiple", "version" });

    // Lazy initialize these two variables to delay complaint about hadoop not
    // being present -- if its not.  Meantime I get command-line processing
    // done.
    private FileSystem fs = null;
    private JobConf conf = null;

    /**
     * Default constructor.
     * @throws IOException 
     */
    public Nutchwax() throws IOException {
        super();
    }

    public synchronized JobConf getJobConf() {
        if (this.conf == null) {
            this.conf = new JobConf(NutchwaxConfiguration.getConfiguration());
        }

        return this.conf;
    }

    public synchronized FileSystem getFS() throws IOException {
        if (this.fs == null) {
            this.fs = FileSystem.get(getJobConf());
        }

        return this.fs;
    }

    public class OutputDirectories {
        private final Path output;
        private final Path crawlDb;
        private final Path linkDb;
        private final Path pagerank;
        private final Path segments;
        private final Path indexes;
        private final Path index;
        private final Path tmpDir;

        public OutputDirectories(final Path output) throws IOException {
            this.output = output;
            this.crawlDb = new Path(output + "/crawldb");
            this.linkDb = new Path(output + "/linkdb");
            this.pagerank = new Path(output + "/pagerank");
            this.segments = new Path(output + "/segments");
            this.indexes = new Path(output + "/indexes");
            this.index = new Path(output + "/index");
            this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir", Generator.generateSegmentName());
        }

        public Path getCrawlDb() {
            return crawlDb;
        }

        public Path getIndexes() {
            return indexes;
        }

        public Path getLinkDb() {
            return linkDb;
        }

        public Path getPagerank() {
            return pagerank;
        }

        public Path getSegments() {
            return segments;
        }

        public Path getTmpDir() {
            return tmpDir;
        }

        public Path getIndex() {
            return index;
        }

        public Path getOutput() {
            return output;
        }
    }

    /**
     * Run passed list of mapreduce indexing jobs. Jobs are always run in
     * order: import, update, etc.
     * 
     * @throws Exception
     */
    protected void doAll(final Path input, final String collectionName, final OutputDirectories od)
            throws Exception {
        doImport(input, collectionName, od);
        doUpdate(od);
        doInvert(od);
        doPagerank(od);
        doIndexing(od);
        doDedup(od);
        doMerge(od);

        LOG.info("Nutchwax finished.");
    }

    protected void doImport(final Path input, String collectionName, final OutputDirectories od)
            throws IOException {
        Path segment = new Path(od.getSegments(), Generator.generateSegmentName()
                + ((collectionName == null || collectionName.length() <= 0) ? "" : "-" + collectionName));

        new ImportArcs(getJobConf()).importArcs(input, segment, collectionName);
    }

    protected void doUpdate(final OutputDirectories od) throws IOException {
        doUpdate(od, null);
    }

    protected void doUpdate(final OutputDirectories od, final String[] segments) throws IOException {
        LOG.info("updating crawldb " + od.getCrawlDb());

        // Need to make sure the db dir exists before progressing.
        Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME);

        if (!getFS().exists(dbPath)) {
            getFS().mkdirs(dbPath);
        }

        CrawlDb cdb = new NutchwaxCrawlDb(getJobConf());

        if (segments != null) {
            List<Path> paths = new ArrayList<Path>(segments.length);

            for (int i = 0; i < segments.length; i++) {
                Path p = new Path(segments[i]);

                if (!getFS().exists(p)) {
                    throw new FileNotFoundException(p.toString());
                }

                paths.add(p);
            }

            cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), true, true);
        } else {
            Path[] allSegments = getSegments(od);

            // This just does the last segment created.
            cdb.update(od.getCrawlDb(), new Path[] { allSegments[allSegments.length - 1] }, true, true);
        }
    }

    protected Path[] getSegments(final OutputDirectories od) throws IOException {
        Path[] allSegments = getFS().listPaths(od.getSegments());

        if (allSegments == null || allSegments.length <= 0) {
            throw new FileNotFoundException(od.getSegments().toString());
        }

        return allSegments;
    }

    protected void doInvert(final OutputDirectories od, final Path[] segments) throws IOException {
        createLinkdb(od);

        new NutchwaxLinkDb(getJobConf()).invert(od.getLinkDb(), segments, true, true, false);
    }

    protected void doInvert(final OutputDirectories od) throws IOException {
        LOG.info("inverting links in " + od.getSegments());

        new NutchwaxLinkDb(getJobConf()).invert(od.getLinkDb(), getSegments(od), true, true, false);
    }

    protected boolean createLinkdb(final OutputDirectories od) throws IOException {
        boolean result = false;

        // Make sure the linkdb exists.  Otherwise the install where
        // the temporary location gets moved to the permanent fails.
        if (getFS().mkdirs(new Path(od.getLinkDb(), NutchwaxLinkDb.CURRENT_NAME))) {
            LOG.info("Created " + od.getLinkDb());

            result = true;
        }

        return result;
    }

    protected void doPagerank(final OutputDirectories od) throws IOException {
        LOG.info("computing pagerank scores in " + od.getPagerank());

        new NutchwaxPagerank(getJobConf()).process(getSegments(od), od.getPagerank());
    }

    protected void doIndexing(final OutputDirectories od) throws IOException {
        doIndexing(od, getFS().listPaths(od.getSegments()));
    }

    protected void doIndexing(final OutputDirectories od, final Path[] segments) throws IOException {
        LOG.info(" indexing " + segments);

        new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), od.getPagerank(), od.getCrawlDb(), od.getLinkDb(),
                segments);
    }

    protected void doDedup(final OutputDirectories od) throws IOException {
        LOG.info("dedup " + od.getIndex());

        new DeleteDuplicates(getJobConf()).dedup(new Path[] { od.getIndexes() });
    }

    protected void doMerge(final OutputDirectories od) throws IOException {
        LOG.info("index merge " + od.getOutput() + " using tmpDir=" + od.getTmpDir());

        new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), od.getIndex(), od.getTmpDir());
    }

    static String[] rewriteArgs(final String[] args, final int offset) {
        final String[] newArgs = new String[args.length - offset];

        for (int i = 0; i < args.length; i++) {
            if (i < offset) {
                continue;
            }

            newArgs[i - offset] = args[i];
        }

        return newArgs;
    }

    static Object doClassMain(final String[] args) {
        // Redo args so absent our nutchwax 'class' command.
        final String className = args[1];
        String[] newArgs = rewriteArgs(args, 2);

        // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html
        Class[] argTypes = new Class[1];
        argTypes[0] = String[].class;
        Object result = null;

        try {
            Method mainMethod = Class.forName(className).getDeclaredMethod("main", argTypes);
            result = mainMethod.invoke(newArgs, new Object[] { newArgs });
        } catch (Throwable t) {
            t.printStackTrace();
        }

        return result;
    }

    protected Object doSearch(final String[] args) {
        String[] newArgs = new String[args.length + 1];
        newArgs[0] = args[0];
        newArgs[1] = NutchwaxBean.class.getName();

        for (int i = 1; i < args.length; i++) {
            newArgs[i + 1] = args[i];
        }

        return doClassMain(newArgs);
    }

    protected void doMultiple(final String[] args) throws Exception {
        (new Multiple()).run(rewriteArgs(args, 1));
    }

    protected void doVersion(final String[] args) throws Exception {
        JobConf job = getJobConf();
        String collectionType = job.get(Global.COLLECTION_TYPE);
        System.out.println("Collection type:" + collectionType);
    }

    protected void doJob(final String jobName, final String[] args) throws Exception {
        if (jobName.equals("import")) {
            // Usage: hadoop jar nutchwax.jar import input output name
            if (args.length != 4) {
                ImportArcs.doImportUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            final Path input = new Path(args[1]);
            final Path output = new Path(args[2]);
            final String collectionName = args[3];

            checkArcsDir(input);
            OutputDirectories od = new OutputDirectories(output);
            doImport(input, collectionName, od);
        } else if (jobName.equals("update")) {
            // Usage: hadoop jar nutchwax.jar update output
            if (args.length < 2) {
                doUpdateUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            OutputDirectories od = new OutputDirectories(new Path(args[1]));

            if (args.length == 2) {
                doUpdate(od);
            } else {
                for (int i = 2; i < args.length; i++) {
                    doUpdate(od, new String[] { args[i] });
                }
            }
        } else if (jobName.equals("invert")) {
            // Usage: hadoop jar nutchwax.jar invert output
            if (args.length < 2) {
                doInvertUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            OutputDirectories od = new OutputDirectories(new Path(args[1]));

            if (args.length == 2) {
                doInvert(od);
            } else {
                final int offset = 2;
                Path[] segments = new Path[args.length - offset];

                for (int i = offset; i < args.length; i++) {
                    Path f = new Path(args[i]);

                    if (!getFS().exists(f)) {
                        throw new FileNotFoundException(f.toString());
                    }

                    segments[i - offset] = f;
                }

                doInvert(od, segments);
            }
        }
        /* TODO MC */
        else if (jobName.equals("pagerank")) {
            // Usage: hadoop jar nutchwax.jar pagerank output
            if (args.length != 2) {
                doPagerankUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            OutputDirectories od = new OutputDirectories(new Path(args[1]));
            doPagerank(od);
        }
        /* TODO MC */
        else if (jobName.equals("index")) {
            // Usage: hadoop jar nutchwax.jar index output
            if (args.length < 2) {
                doIndexUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            OutputDirectories od = new OutputDirectories(new Path(args[1]));

            if (args.length == 2) {
                doIndexing(od);
            } else {
                final int offset = 2;
                Path[] segments = new Path[args.length - offset];

                for (int i = offset; i < args.length; i++) {
                    Path f = new Path(args[i]);

                    if (!getFS().exists(f)) {
                        throw new FileNotFoundException(f.toString());
                    }

                    segments[i - offset] = f;
                }

                doIndexing(od, segments);
            }
        } else if (jobName.equals("dedup")) {
            // Usage: hadoop jar nutchwax.jar dedup output
            if (args.length != 2) {
                doDedupUsage("Wrong number of arguments passed.", 2);
            }

            doDedup(new OutputDirectories(new Path(args[1])));
        } else if (jobName.equals("merge")) {
            // Usage: hadoop jar nutchwax.jar merge output");
            if (args.length != 2) {
                doMergeUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            doMerge(new OutputDirectories(new Path(args[1])));
        } else if (jobName.equals("all")) {
            // Usage: hadoop jar nutchwax.jar import input output name
            if (args.length != 4) {
                doAllUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            final Path input = new Path(args[1]);
            final Path output = new Path(args[2]);
            final String collectionName = args[3];

            checkArcsDir(input);

            OutputDirectories od = new OutputDirectories(output);

            doAll(input, collectionName, od);
        } else if (jobName.equals("class")) {
            if (args.length < 2) {
                doClassUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            doClassMain(args);
        } else if (jobName.equals("search")) {
            if (args.length < 1) {
                doClassUsage("ERROR: Wrong number of arguments passed.", 2);
            }

            doSearch(args);
        } else if (jobName.equals("multiple")) {
            doMultiple(args);
        } else if (jobName.equals("version")) {
            doVersion(args);
        } else {
            usage("ERROR: No handler for job name " + jobName, 4);
            System.exit(0);
        }
    }

    /**
     * Check the arcs dir exists and looks like it has files that list ARCs
     * (rather than ARCs themselves).
     * 
     * @param arcsDir Directory to examine.
     * @throws IOException
     */
    protected void checkArcsDir(final Path arcsDir) throws IOException {
        if (!getFS().exists(arcsDir)) {
            throw new IOException(arcsDir + " does not exist.");
        }

        if (!fs.isDirectory(arcsDir)) {
            throw new IOException(arcsDir + " is not a directory.");
        }

        final Path[] files = getFS().listPaths(arcsDir);

        for (int i = 0; i < files.length; i++) {
            if (!getFS().isFile(files[i])) {
                throw new IOException(files[i] + " is not a file.");
            }

            if (files[i].getName().toLowerCase().endsWith(".arc.gz")) {
                throw new IOException(files[i] + " is an ARC file (ARCSDIR "
                        + "should contain text file listing ARCs rather than " + "actual ARCs).");
            }
        }
    }

    public static Text generateWaxKey(WritableComparable key, final String collection) {
        return generateWaxKey(key.toString(), collection);
    }

    public static Text generateWaxKey(final String keyStr, final String collection) {
        if (collection == null) {
            throw new NullPointerException("Collection is null for " + keyStr);
        }

        if (keyStr == null) {
            throw new NullPointerException("keyStr is null");
        }

        if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) {
            LOG.warn("Key already has collection prefix: " + keyStr + ". Skipping.");

            return new Text(keyStr);
        }

        return new Text(KEY_COLLECTION_PREFIX + collection.trim() + KEY_COLLECTION_SUFFIX + keyStr.trim());
    }

    public static String getCollectionFromWaxKey(final WritableComparable key) throws IOException {
        Matcher m = COLLECTION.matcher(key.toString());

        if (m == null || !m.matches()) {
            throw new IOException("Key doesn't have collection " + "prefix <" + key.toString() + ">");
        }

        return m.group(1);
    }

    public static String getUrlFromWaxKey(final WritableComparable key) throws IOException {
        Matcher m = COLLECTION.matcher(key.toString());

        if (m == null || !m.matches()) {
            throw new IOException("Key doesn't have collection " + " prefix: " + key);
        }

        return m.group(2);
    }

    public static long getDate(String d) throws IOException {
        long date = 0;

        try {
            date = ArchiveUtils.getDate(d).getTime();
        } catch (final java.text.ParseException e) {
            throw new IOException("Failed parse of date: " + d + ": " + e.getMessage());
        }

        // Date can be < 0 if pre-1970 (Seen in some old ARCs).
        return date >= 0 ? date : 0;
    }

    public static void usage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]");
        System.out.println("Launch NutchWAX job(s) on a hadoop platform.");
        System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + " help on a specific job.");
        System.out.println("Jobs (usually) must be run in the order " + "listed below.");
        System.out.println("Available jobs:");
        System.out.println(" import   Import ARCs.");
        System.out.println(" update   Update dbs with recent imports.");
        System.out.println(" invert   Invert links.");
        System.out.println(" pagerank Compute pagerank."); // TODO MC
        System.out.println(" index    Index segments.");
        System.out.println(" dedup    Deduplicate by URL or content MD5.");
        System.out.println(" merge    Merge segment indices into one.");
        System.out.println(" all      Runs all above jobs in order.");
        System.out.println(" class    Run the passed class's main.");
        System.out.println(" search   Run a query against index under " + "property 'searcher.dir'");
        System.out.println(" multiple Run multiple concurrent tasks.");
        System.out.println(" version Indicates the software version.");

        System.exit(exitCode);
    }

    public static void doUpdateUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + "[<segments>...]");
        System.out.println("Arguments:");
        System.out.println(" output    Directory to write crawldb under.");
        System.out.println("Options:");
        System.out.println(" segments  List of segments to update crawldb " + "with. If none supplied, updates");
        System.out.println("            using latest segment found.");

        System.exit(exitCode);
    }

    public static void doInvertUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + "[<segments>...]");
        System.out.println("Arguments:");
        System.out.println(" output    Directory to write linkdb under.");
        System.out.println("Options:");
        System.out.println(" segments  List of segments to update linkdb " + "with. If none supplied, all under");
        System.out.println("           '<output>/segments/' " + "are passed.");

        System.exit(exitCode);
    }

    /* TODO MC */
    public static void doPagerankUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar pagerank <output> ");
        System.out.println("Arguments:");
        System.out.println(" output    Directory to write pagerank under.");
        System.exit(exitCode);
    }
    /* TODO MC */

    public static void doIndexUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + "[<segments>...]");
        System.out.println("Arguments:");
        System.out.println(" output    Directory to write indexes under.");
        System.out.println("Options:");
        System.out.println(" segments  List of segments to index. " + "If none supplied, all under");
        System.out.println("           '<output>/segments/' " + "are indexed.");

        System.exit(exitCode);
    }

    public static void doDedupUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>");
        System.out.println("Arguments:");
        System.out.println(" output  Directory in which indices" + " to dedup reside.");

        System.exit(exitCode);
    }

    public static void doMergeUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar merge <output>");
        System.out.println("Arguments:");
        System.out.println(" output  Directory in which indices" + " to merge reside.");

        System.exit(exitCode);
    }

    public static void doMultipleUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        Multiple.usage();

        System.exit(exitCode);
    }

    public static void doSearchUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar search <query>");
        System.out.println("Arguments:");
        System.out.println(" query  Query string to run against index under " + "property 'searcher.dir'");

        System.exit(exitCode);
    }

    public static void doAllUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar import <input>" + " <output> <collection>");
        System.out.println("Arguments:");
        System.out.println(" input       Directory of files" + " listing ARC URLs to import");
        System.out.println(" output      Directory to import to. Inport is " + "written to a subdir named");
        System.out.println("             for current date plus collection " + "under '<output>/segments/'");
        System.out.println(" collection  Collection name. Added to" + " each resource.");

        System.exit(exitCode);
    }

    public static void doClassUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar class CLASS ...");
        System.out.println("Arguments:");
        System.out.println(" CLASS    Name of class to run. Invokes main " + "passing command-line arguments.");
        System.out.println("          For example, use to run nutch " + "commands. Below is list of command");
        System.out.println("          name and implementing class. " + "Pass name of class only and emits usage.");
        System.out.println();
        System.out.println("          readdb      " + "org.apache.nutch.crawl.CrawlDbReader");
        System.out.println("          mergedb     " + "org.apache.nutch.crawl.CrawlDbMerger");
        System.out.println("          readlinkdb  " + "org.apache.nutch.crawl.LinkDbReader");
        System.out.println("          segread     " + "org.apache.nutch.segment.SegmentReader");
        System.out.println("          mergesegs   " + "org.apache.nutch.segment.SegmentMerger");
        System.out.println("          mergelinkdb " + "org.apache.nutch.crawl.LinkDbMerger");
        System.exit(exitCode);
    }

    static void doJobHelp(final String jobName) {
        if (!JOBS.contains(jobName)) {
            usage("ERROR: Unknown job " + jobName, 1);
        }

        if (jobName.equals("import")) {
            ImportArcs.doImportUsage(null, 1);
        } else if (jobName.equals("update")) {
            doUpdateUsage(null, 1);
        } else if (jobName.equals("invert")) {
            doInvertUsage(null, 1);
        }
        /* TODO MC */
        else if (jobName.equals("pagerank")) {
            doPagerankUsage(null, 1);
        }
        /* TODO MC */
        else if (jobName.equals("index")) {
            doIndexUsage(null, 1);
        } else if (jobName.equals("dedup")) {
            doDedupUsage(null, 1);
        } else if (jobName.equals("merge")) {
            doMergeUsage(null, 1);
        } else if (jobName.equals("all")) {
            doAllUsage(null, 1);
        } else if (jobName.equals("search")) {
            doSearchUsage(null, 1);
        } else if (jobName.equals("multiple")) {
            doMultipleUsage(null, 1);
        } else if (jobName.equals("class")) {
            doClassUsage(null, 1);
        } else {
            usage("ERROR: No help for job name " + jobName, 4);
        }
    }

    public static void main(String args[]) throws Exception {
        if (args.length < 1) {
            usage(null, 0);
            return;
        }

        if (args[0].toLowerCase().equals("help")) {
            if (args.length == 1) {
                usage("ERROR: Add command you need help on.", 0);
                return;
            }

            doJobHelp(args[1].toLowerCase());
        }

        final String jobName = args[0].toLowerCase();

        if (!JOBS.contains(jobName)) {
            usage("ERROR: Unknown <job> " + jobName, 1);
        }

        Nutchwax ia = new Nutchwax();
        ia.doJob(jobName, args);
    }
}