org.archive.access.nutch.jobs.ImportArcs.java Source code

Introduction

Here is the source code for org.archive.access.nutch.jobs.ImportArcs.java
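
The job is normally launched from the command line, as the doImportUsage() method near the bottom of the listing describes:

    hadoop jar nutchwax.jar import <input> <output> <collection>

The same entry point can also be driven programmatically. Below is a minimal sketch mirroring main() and run(); the driver class name, paths, and collection name are placeholders, not values from the source:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.archive.access.nutch.NutchwaxConfiguration;
    import org.archive.access.nutch.jobs.ImportArcs;

    public class ImportArcsDriver {  // hypothetical driver
        public static void main(String[] args) throws Exception {
            Configuration conf = NutchwaxConfiguration.getConfiguration();
            ImportArcs importer = new ImportArcs(conf);
            // Directory of files listing ARC URLs, the segment directory to
            // write, and the collection name stamped onto each resource.
            importer.importArcs(new Path("arc-urls"),
                    new Path("segments/20070227-test"), "test");
        }
    }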

Source

/*
 * $Id: ImportArcs.java 1521 2007-02-27 18:01:29Z stack-sf $
 * 
 * Copyright (C) 2003 Internet Archive.
 * 
 * This file is part of the archive-access tools project
 * (http://sourceforge.net/projects/archive-access).
 * 
 * The archive-access tools are free software; you can redistribute them and/or
 * modify them under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or any
 * later version.
 * 
 * The archive-access tools are distributed in the hope that they will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License along with
 * the archive-access tools; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.access.nutch.jobs;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.Header;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypeException;
import org.apache.nutch.util.mime.MimeTypes;
import org.archive.access.nutch.Nutchwax;
import org.archive.access.nutch.NutchwaxConfiguration;
import org.archive.access.nutch.jobs.sql.SqlSearcher;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.mapred.ARCMapRunner;
import org.archive.mapred.ARCRecordMapper;
import org.archive.mapred.ARCReporter;
import org.archive.util.Base32;
import org.archive.util.MimetypeUtils;
import org.archive.util.TextUtils;
import org.apache.nutch.global.Global;

/**
 * Ingests ARCs, writing each ARC record's parse as Nutch FetcherOutputFormat
 * (FOF) output.  FOF has five outputs:
 * <ul><li>crawl_fetch holds a fat CrawlDatum of all vitals including metadata.
 * It's written below by our {@link WaxFetcherOutputFormat} (in nutch by
 * {@link FetcherOutputFormat}).  Here is an example CD: <pre>  Version: 4
 *  Status: 5 (fetch_success)
 *  Fetch time: Wed Mar 15 12:38:49 PST 2006
 *  Modified time: Wed Dec 31 16:00:00 PST 1969
 *  Retries since fetch: 0
 *  Retry interval: 0.0 days
 *  Score: 1.0
 *  Signature: null
 *  Metadata: collection:test arcname:IAH-20060315203614-00000-debord arcoffset:5127 
 * </pre></li>
 * <li>crawl_parse has CrawlDatums of MD5s.  Used in making the CrawlDB.
 * It's obtained from the fat crawl_fetch CrawlDatum above and written
 * out as part of the parse output done by {@link WaxParseOutputFormat}.
 * That class writes three outputs: this crawl_parse plus both of the
 * following, parse_text and parse_data.</li>
 * <li>parse_text has the text from the parse.</li>
 * <li>parse_data has other metadata found by the parse (depends on the
 * parser).  This is the only input to the linkdb.  The html parser
 * adds discovered outlinks here, plus the content-type and discovered
 * encoding as well as the advertised encoding, etc.</li>
 * <li>cdx has a summary line for every record processed.</li>
 * </ul>
 */
public class ImportArcs extends ToolBase implements ARCRecordMapper {
    public final Log LOG = LogFactory.getLog(ImportArcs.class);
    private final NumberFormat numberFormatter = NumberFormat.getInstance();

    private static final String WHITESPACE = "\\s+";

    public static final String ARCFILENAME_KEY = "arcname";
    public static final String ARCFILEOFFSET_KEY = "arcoffset";
    private static final String CONTENT_TYPE_KEY = "content-type";
    private static final String TEXT_TYPE = "text/";
    private static final String APPLICATION_TYPE = "application/";
    public static final String ARCCOLLECTION_KEY = "collection";
    public static final String WAX_SUFFIX = "wax.";
    public static final String WAX_COLLECTION_KEY = WAX_SUFFIX + ARCCOLLECTION_KEY;

    private static final String PDF_TYPE = "application/pdf";

    private boolean indexAll;
    private int contentLimit;
    private int pdfContentLimit;
    private MimeTypes mimeTypes;
    private String segmentName;
    private String collectionName;
    private int parseThreshold = -1;
    private boolean indexRedirects;
    private boolean sha1 = false;
    private boolean arcNameFromFirstRecord = true;
    private String arcName;
    private String collectionType;
    private int timeoutIndexingDocument;

    /**
     * Usually the URL in first record looks like this:
     * filedesc://IAH-20060315203614-00000-debord.arc.  But in old
     * ARCs, it can look like this: filedesc://19961022/IA-000001.arc.
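     * In both forms, group 1 of the pattern captures the bare ARC name
     * ("IAH-20060315203614-00000-debord" and "IA-000001" respectively).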
     */
    private static final Pattern FILEDESC_PATTERN = Pattern
            .compile("^(?:filedesc://)(?:[0-9]+\\/)?(.+)(?:\\.arc)$");

    private static final Pattern TAIL_PATTERN = Pattern.compile("(?:.*(?:/|\\\\))?(.+)(?:\\.arc|\\.arc\\.gz)$");
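    // TAIL_PATTERN's group 1 is the bare ARC name at the tail of a file path;
    // e.g. it captures "IAH-0001" from the hypothetical "/tmp/IAH-0001.arc.gz".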

    /**
     * Buffer to reuse on each ARCRecord indexing.
     */
    private final byte[] buffer = new byte[1024 * 16];

    private final ByteArrayOutputStream contentBuffer = new ByteArrayOutputStream(1024 * 16);

    private URLNormalizers urlNormalizers;
    private URLFilters filters;

    private ParseUtil parseUtil;

    private static final Text CDXKEY = new Text("cdx");

    private TimeoutParsingThreadPool threadPool = new TimeoutParsingThreadPool(); // one pool holding a single thread per map task; it does not need to be static

    public ImportArcs() {
        super();
    }

    public ImportArcs(Configuration conf) {
        setConf(conf);
    }

    public void importArcs(final Path arcUrlsDir, final Path segment, final String collection) throws IOException {
        LOG.info("ImportArcs segment: " + segment + ", src: " + arcUrlsDir);

        final JobConf job = new JobConf(getConf(), this.getClass());

        job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

        job.setInputPath(arcUrlsDir);

        //job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class));
        //job.setMapperClass(job.getClass("wax.import.mapper", this.getClass()));
        job.setMapRunnerClass(ARCMapRunner.class); // compatible with hadoop 0.14 TODO MC
        job.setMapperClass(this.getClass());

        job.setInputFormat(TextInputFormat.class);

        job.setOutputPath(segment);
        job.setOutputFormat(WaxFetcherOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FetcherOutput.class);

        // Pass the collection name out to the tasks IF non-null.
        if ((collection != null) && (collection.length() > 0)) {
            job.set(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY, collection);
        }
        job.setJobName("import " + arcUrlsDir + " " + segment);

        JobClient.runJob(job);
        LOG.info("ImportArcs: done");
    }

    public void configure(final JobConf job) {
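        // Settings come from the job configuration.  The hard-coded fallbacks
        // read below are: wax.index.all=false, http.content.limit=102400,
        // wax.pdf.size.multiplicand=10, wax.parse.rate.threshold=-1,
        // wax.index.redirects=false, wax.digest.sha1=false,
        // wax.arcname.from.first.record=true, and
        // Global.TIMEOUT_INDEXING_DOCUMENT=-1.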
        setConf(job);
        this.indexAll = job.getBoolean("wax.index.all", false);

        this.contentLimit = job.getInt("http.content.limit", 1024 * 100);
        final int pdfMultiplicand = job.getInt("wax.pdf.size.multiplicand", 10);
        this.pdfContentLimit = (this.contentLimit == -1) ? this.contentLimit : pdfMultiplicand * this.contentLimit;
        this.mimeTypes = MimeTypes.get(job.get("mime.types.file"));
        this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);

        // Get the rsync protocol handler into the mix.
        System.setProperty("java.protocol.handler.pkgs", "org.archive.net");

        // Format numbers output by parse rate logging.
        this.numberFormatter.setMaximumFractionDigits(2);
        this.numberFormatter.setMinimumFractionDigits(2);
        this.parseThreshold = job.getInt("wax.parse.rate.threshold", -1);

        this.indexRedirects = job.getBoolean("wax.index.redirects", false);

        this.sha1 = job.getBoolean("wax.digest.sha1", false);

        this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER);
        this.filters = new URLFilters(job);

        this.parseUtil = new ParseUtil(job);

        this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY);

        // Get the ARC name by reading the first record in the ARC?  Otherwise,
        // we parse the name of the file we've been passed to find an ARC name.
        this.arcNameFromFirstRecord = job.getBoolean("wax.arcname.from.first.record", true);

        this.collectionType = job.get(Global.COLLECTION_TYPE);
        this.timeoutIndexingDocument = job.getInt(Global.TIMEOUT_INDEXING_DOCUMENT, -1);

        LOG.info("ImportArcs collectionType: " + collectionType);
    }

    public Configuration getConf() {
        return this.conf;
    }

    public void setConf(Configuration c) {
        this.conf = c;
    }

    public void onARCOpen() {
        // Nothing to do.
    }

    public void onARCClose() {
        threadPool.closeAll(); // close the only thread created for this map
    }

    public void map(final WritableComparable key, final Writable value, final OutputCollector output,
            final Reporter r) throws IOException {
        // Assumption is that this map is being run by ARCMapRunner.
        // Otherwise, the below casts fail.
        String url = key.toString();

        ARCRecord rec = (ARCRecord) ((ObjectWritable) value).get();
        ARCReporter reporter = (ARCReporter) r;

        // The ARC name is null the first time map is called on an ARC.
        checkArcName(rec);
        if (!isIndex(rec)) {
            return;
        }
        checkCollectionName();

        final ARCRecordMetaData arcData = rec.getMetaData();
        String oldUrl = url;

        try {
            url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
            url = filters.filter(url); // filter the url
        } catch (Exception e) {
            LOG.warn("Skipping record. Didn't pass normalization/filter " + oldUrl + ": " + e.toString());

            return;
        }

        final long b = arcData.getContentBegin();
        final long l = arcData.getLength();
        final long recordLength = (l > b) ? (l - b) : l;
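        // contentBegin marks where the record's body starts, so recordLength
        // is the body length with any response header excluded.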

        // Look at the ARCRecord metadata line's mimetype.  It can be empty.
        // If so, we get two more chances at figuring it out: from the HTTP
        // headers or from the first couple of bytes of the file.  See below.
        String mimetype = getMimetype(arcData.getMimetype(), this.mimeTypes, url);

        if (skip(mimetype)) {
            return;
        }

        // Copy http headers to nutch metadata.
        final Metadata metaData = new Metadata();
        final Header[] headers = rec.getHttpHeaders();
        for (int j = 0; j < headers.length; j++) {
            final Header header = headers[j];

            if (mimetype == null) {
                // Special handling: if the mimetype is still null, try getting
                // it from the http header.  I've seen arc record lines with an
                // empty content-type and a MIME-unparseable file ending, e.g. .MID.
                if ((header.getName() != null)
                        && header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY)) {
                    mimetype = getMimetype(header.getValue(), null, null);

                    if (skip(mimetype)) {
                        return;
                    }
                }
            }

            metaData.set(header.getName(), header.getValue());
        }

        // This call to reporter.setStatus pings the tasktracker, reporting our
        // status and letting it know we're still alive (so it doesn't time
        // us out).
        final String noSpacesMimetype = TextUtils.replaceAll(ImportArcs.WHITESPACE,
                ((mimetype == null || mimetype.length() <= 0) ? "TODO" : mimetype), "-");
        final String recordLengthAsStr = Long.toString(recordLength);

        reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));

        // This is a nutch 'more' field.
        metaData.set("contentLength", recordLengthAsStr);

        rec.skipHttpHeader();
        reporter.setStatusIfElapse("read headers on " + url);

        // TODO: Skip if unindexable type.
        int total = 0;

        // Read in first block. If mimetype still null, look for MAGIC.
        int len = rec.read(this.buffer, 0, this.buffer.length);

        if (mimetype == null) {
            MimeType mt = this.mimeTypes.getMimeType(this.buffer);

            if (mt == null || mt.getName() == null) {
                LOG.warn("Failed to get mimetype for: " + url);

                return;
            }

            mimetype = mt.getName();
        }

        metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);

        // How much do we read in total?  If pdf, we read more.  If the limit
        // is -1, read everything.
        int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype)) ? this.pdfContentLimit : this.contentLimit;

        // Reset our contentBuffer so we can reuse it.  Over the life of an
        // ARC, it grows to the maximum record size processed.
        this.contentBuffer.reset();

        while ((len != -1) && ((readLimit == -1) || (total < readLimit))) {
            total += len;
            this.contentBuffer.write(this.buffer, 0, len);
            len = rec.read(this.buffer, 0, this.buffer.length);
            reporter.setStatusIfElapse("reading " + url);
        }

        // Close the Record.  We're done with it.  Side-effect is calculation
        // of digest -- if we're digesting.
        rec.close();
        reporter.setStatusIfElapse("closed " + url);

        final byte[] contentBytes = this.contentBuffer.toByteArray();
        final CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);

        // Calculate digest or use precalculated sha1.
        String digest = (this.sha1) ? rec.getDigestStr() : MD5Hash.digest(contentBytes).toString();
        metaData.set(Nutch.SIGNATURE_KEY, digest);

        // Set digest back into the arcData so available later when we write
        // CDX line.
        arcData.setDigest(digest);

        metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);

        // Score at this stage is 1.0f.
        metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));

        final long startTime = System.currentTimeMillis();
        final Content content = new Content(url, url, contentBytes, mimetype, metaData, getConf());
        datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));

        MapWritable mw = datum.getMetaData();

        if (mw == null) {
            mw = new MapWritable();
        }

        if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
            mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY),
                    new Text(SqlSearcher.getCollectionNameWithTimestamp(collectionName, arcData.getDate())));
        } else {
            mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
        }
        mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
        mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY), new Text(Long.toString(arcData.getOffset())));
        datum.setMetaData(mw);

        TimeoutParsingThread tout = threadPool.getThread(Thread.currentThread().getId(), timeoutIndexingDocument);
        tout.setUrl(url);
        tout.setContent(content);
        tout.setParseUtil(parseUtil);
        tout.wakeupAndWait();
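        // The parse runs on the pool's single worker thread; wakeupAndWait()
        // presumably blocks until the parse completes or the
        // timeoutIndexingDocument limit set in configure() is hit
        // (TimeoutParsingThread itself is not part of this listing).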

        ParseStatus parseStatus = tout.getParseStatus();
        Parse parse = tout.getParse();
        reporter.setStatusIfElapse("parsed " + url);

        if (!parseStatus.isSuccess()) {
            final String status = formatToOneLine(parseStatus.toString());
            LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
            parse = null;
        } else {
            // Was it a slow parse?
            final double kbPerSecond = getParseRate(startTime, (contentBytes != null) ? contentBytes.length : 0);

            if (LOG.isDebugEnabled()) {
                LOG.debug(getParseRateLogMessage(url, noSpacesMimetype, kbPerSecond));
            } else if (kbPerSecond < this.parseThreshold) {
                LOG.warn(getParseRateLogMessage(url, noSpacesMimetype, kbPerSecond));
            }
        }

        Writable v = new FetcherOutput(datum, null, parse != null ? new ParseImpl(parse) : null);
        if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
            LOG.info("multiple: "
                    + SqlSearcher.getCollectionNameWithTimestamp(this.collectionName, arcData.getDate()) + " "
                    + url);
            output.collect(Nutchwax.generateWaxKey(url,
                    SqlSearcher.getCollectionNameWithTimestamp(this.collectionName, arcData.getDate())), v);
        } else {
            output.collect(Nutchwax.generateWaxKey(url, this.collectionName), v);
        }
    }

    public void setCollectionName(String collectionName) {
        this.collectionName = collectionName;
        checkCollectionName();
    }

    public String getArcName() {
        return this.arcName;
    }

    public void checkArcName(ARCRecord rec) {
        this.arcName = rec.getMetaData().getArcFile().getName();
        this.arcName = this.arcName.replace(".arc.gz", "");
    }

    protected boolean checkCollectionName() {
        if ((this.collectionName != null) && this.collectionName.length() > 0) {
            return true;
        }

        throw new NullPointerException("Collection name can't be empty");
    }

    /**
     * @param rec ARC Record to test.
     * @return True if we are to index this record.
     */
    protected boolean isIndex(final ARCRecord rec) {
        return ((rec.getStatusCode() >= 200) && (rec.getStatusCode() < 300))
                || (this.indexRedirects && ((rec.getStatusCode() >= 300) && (rec.getStatusCode() < 400)));
    }

    protected String getStatus(final String url, String oldUrl, final String recordLengthAsStr,
            final String noSpacesMimetype) {
        // If oldUrl is the same as url, don't log it.  Otherwise, log the
        // original so we keep the url as it was originally imported.
        if (oldUrl.equals(url)) {
            oldUrl = "-";
        }

        StringBuilder sb = new StringBuilder(128);
        sb.append("adding ");
        sb.append(url);
        sb.append(" ");
        sb.append(oldUrl);
        sb.append(" ");
        sb.append(recordLengthAsStr);
        sb.append(" ");
        sb.append(noSpacesMimetype);

        return sb.toString();
    }

    protected String formatToOneLine(final String s) {
        final StringBuffer sb = new StringBuffer(s.length());

        for (final StringTokenizer st = new StringTokenizer(s, "\t\n\r"); st.hasMoreTokens();) {
            sb.append(st.nextToken());
        }

        return sb.toString();
    }

    protected String getParseRateLogMessage(final String url, final String mimetype, final double kbPerSecond) {
        return url + " " + mimetype + " parse KB/Sec " + this.numberFormatter.format(kbPerSecond);
    }

    protected double getParseRate(final long startTime, final long len) {
        // Get indexing rate:
        long elapsedTime = System.currentTimeMillis() - startTime;
        elapsedTime = (elapsedTime == 0) ? 1 : elapsedTime;

        return (len != 0) ? ((double) len / 1024) / ((double) elapsedTime / 1000) : 0;
    }

    protected boolean skip(final String mimetype) {
        boolean decision = false;

        // Are we to index all content?
        if (!this.indexAll) {
            if ((mimetype == null) || (!mimetype.startsWith(ImportArcs.TEXT_TYPE)
                    && !mimetype.startsWith(ImportArcs.APPLICATION_TYPE))) {
                // Skip any but basic types.
                decision = true;
            }
        }

        return decision;
    }

    protected String getMimetype(final String mimetype, final MimeTypes mts, final String url) {
        if (mimetype != null && mimetype.length() > 0) {
            return checkMimetype(mimetype.toLowerCase());
        }

        if (mts != null && url != null) {
            final MimeType mt = mts.getMimeType(url);

            if (mt != null) {
                return checkMimetype(mt.getName().toLowerCase());
            }
        }

        return null;
    }

    protected static String checkMimetype(String mimetype) {
        if ((mimetype == null) || (mimetype.length() <= 0) || mimetype.startsWith(MimetypeUtils.NO_TYPE_MIMETYPE)) {
            return null;
        }

        // Test the mimetype makes sense. If not, clear it.
        try {
            new MimeType(mimetype);
        } catch (final MimeTypeException e) {
            mimetype = null;
        }

        return mimetype;
    }

    /**
     * Override of nutch FetcherOutputFormat so I can substitute my own
     * ParseOutputFormat, {@link WaxParseOutputFormat}.  While I'm here,
     * removed content references.  NutchWAX doesn't save content.
     * @author stack
     */
    public static class WaxFetcherOutputFormat extends FetcherOutputFormat {
        public RecordWriter getRecordWriter(final FileSystem fs, final JobConf job, final String name,
                Progressable progress) throws IOException {
            Path f = new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME);
            final Path fetch = new Path(f, name);
            final MapFile.Writer fetchOut = new MapFile.Writer(job, fs, fetch.toString(), Text.class,
                    CrawlDatum.class);

            // Write a cdx file.  Write w/o compression.
            Path cdx = new Path(new Path(job.getOutputPath(), "cdx"), name);
            final SequenceFile.Writer cdxOut = SequenceFile.createWriter(fs, job, cdx, Text.class, Text.class,
                    SequenceFile.CompressionType.NONE);

            return new RecordWriter() {
                private RecordWriter parseOut;

                // Initialization
                {
                    if (Fetcher.isParsing(job)) {
                        // Here is the nutchwax change: using WaxParseOutputFormat
                        // instead of ParseOutputFormat.
                        this.parseOut = new WaxParseOutputFormat().getRecordWriter(fs, job, name, null);
                    }
                }

                public void write(WritableComparable key, Writable value) throws IOException {
                    FetcherOutput fo = (FetcherOutput) value;
                    MapWritable mw = fo.getCrawlDatum().getMetaData();
                    Text cdxLine = (Text) mw.get(ImportArcs.CDXKEY);

                    if (cdxLine != null) {
                        cdxOut.append(key, cdxLine);
                    }

                    mw.remove(ImportArcs.CDXKEY);
                    fetchOut.append(key, fo.getCrawlDatum());

                    if (fo.getParse() != null) {
                        parseOut.write(key, fo.getParse());
                    }
                }

                public void close(Reporter reporter) throws IOException {
                    fetchOut.close();
                    cdxOut.close();

                    if (parseOut != null) {
                        parseOut.close(reporter);
                    }
                }
            };
        }
    }

    /**
     * Copy so I can add collection prefix to produced signature and link
     * CrawlDatums.
     * @author stack
     */
    public static class WaxParseOutputFormat extends ParseOutputFormat {
        public final Log LOG = LogFactory.getLog(WaxParseOutputFormat.class);

        private URLNormalizers urlNormalizers;
        private URLFilters filters;
        private ScoringFilters scfilters;

        public RecordWriter getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
                throws IOException {
            // Extract collection prefix from key to use later when adding
            // signature and link crawldatums.

            this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
            this.filters = new URLFilters(job);
            this.scfilters = new ScoringFilters(job);

            final float interval = job.getFloat("db.default.fetch.interval", 30f);
            final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
            final boolean sha1 = job.getBoolean("wax.digest.sha1", false);

            Path text = new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name);
            Path data = new Path(new Path(job.getOutputPath(), ParseData.DIR_NAME), name);
            Path crawl = new Path(new Path(job.getOutputPath(), CrawlDatum.PARSE_DIR_NAME), name);

            final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
                    CompressionType.RECORD);

            final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class,
                    ParseData.class);

            final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class,
                    CrawlDatum.class);

            return new RecordWriter() {
                public void write(WritableComparable key, Writable value) throws IOException {
                    // Test that I can parse the key before I do anything
                    // else. If not, write nothing for this record.
                    String collection = null;
                    String fromUrl = null;
                    String fromHost = null;
                    String toHost = null;

                    try {
                        collection = Nutchwax.getCollectionFromWaxKey(key);
                        fromUrl = Nutchwax.getUrlFromWaxKey(key);
                    } catch (IOException ioe) {
                        LOG.warn("Skipping record. Can't parse " + key, ioe);

                        return;
                    }

                    if (fromUrl == null || collection == null) {
                        LOG.warn("Skipping record. Null from or collection " + key);

                        return;
                    }

                    Parse parse = (Parse) value;

                    textOut.append(key, new ParseText(parse.getText()));
                    ParseData parseData = parse.getData();

                    // recover the signature prepared by Fetcher or ParseSegment
                    String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);

                    if (sig != null) {
                        byte[] signature = (sha1) ? Base32.decode(sig) : StringUtil.fromHexString(sig);

                        if (signature != null) {
                            // append a CrawlDatum with a signature
                            CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
                            d.setSignature(signature);
                            crawlOut.append(key, d);
                        }
                    }

                    // collect outlinks for subsequent db update
                    Outlink[] links = parseData.getOutlinks();
                    if (ignoreExternalLinks) {
                        try {
                            fromHost = new URL(fromUrl).getHost().toLowerCase();
                        } catch (MalformedURLException e) {
                            fromHost = null;
                        }
                    } else {
                        fromHost = null;
                    }

                    String[] toUrls = new String[links.length];
                    int validCount = 0;

                    for (int i = 0; i < links.length; i++) {
                        String toUrl = links[i].getToUrl();

                        try {
                            toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
                            toUrl = filters.filter(toUrl); // filter the url
                            if (toUrl == null) {
                                LOG.warn("Skipping url (target) because is null."); // TODO MC remove 
                            }
                        } catch (Exception e) {
                            toUrl = null;
                        }

                        // ignore links to self (or anchors within the page)
                        if (fromUrl.equals(toUrl)) {
                            toUrl = null;
                        }

                        if (toUrl != null) {
                            validCount++;
                        }

                        toUrls[i] = toUrl;
                    }

                    CrawlDatum adjust = null;

                    // compute score contributions and adjustment to the
                    // original score          
                    for (int i = 0; i < toUrls.length; i++) {
                        if (toUrls[i] == null) {
                            continue;
                        }

                        if (ignoreExternalLinks) {
                            try {
                                toHost = new URL(toUrls[i]).getHost().toLowerCase();
                            } catch (MalformedURLException e) {
                                toHost = null;
                            }

                            if (toHost == null || !toHost.equals(fromHost)) {
                                // external links
                                continue; // skip it
                            }
                        }

                        CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                        Text fromURLUTF8 = new Text(fromUrl);
                        Text targetUrl = new Text(toUrls[i]);
                        adjust = null;

                        try {
                            // Scoring now expects first two arguments to be
                            // URLs (More reason to do our own scoring).
                            // St.Ack
                            adjust = scfilters.distributeScoreToOutlink(fromURLUTF8, targetUrl, parseData, target,
                                    null, links.length, validCount);
                        } catch (ScoringFilterException e) {
                            if (LOG.isWarnEnabled()) {
                                LOG.warn("Cannot distribute score from " + key + " to " + target + " - skipped ("
                                        + e.getMessage());
                            }

                            continue;
                        }

                        Text targetKey = Nutchwax.generateWaxKey(targetUrl, collection);
                        crawlOut.append(targetKey, target);
                        if (adjust != null) {
                            crawlOut.append(key, adjust);
                        }
                    }

                    dataOut.append(key, parseData);
                }

                public void close(Reporter reporter) throws IOException {
                    textOut.close();
                    dataOut.close();
                    crawlOut.close();
                }
            };
        }
    }

    public void close() {
        // Nothing to close.
    }

    public static void doImportUsage(final String message, final int exitCode) {
        if (message != null && message.length() > 0) {
            System.out.println(message);
        }

        System.out.println("Usage: hadoop jar nutchwax.jar import <input>" + " <output> <collection>");
        System.out.println("Arguments:");
        System.out.println(" input       Directory of files" + " listing ARC URLs to import");
        System.out.println(" output      Directory to import to. Inport is " + "written to a subdir named");
        System.out.println("             for current date plus collection " + "under '<output>/segments/'");
        System.out.println(" collection  Collection name. Added to" + " each resource.");
        System.exit(exitCode);
    }

    public static void main(String[] args) throws Exception {
        int res = new ImportArcs().doMain(NutchwaxConfiguration.getConfiguration(), args);

        System.exit(res);
    }

    public int run(final String[] args) throws Exception {
        if (args.length != 3) {
            doImportUsage("ERROR: Wrong number of arguments passed.", 2);
        }

        // Assume list of ARC urls is first arg and output dir the second.
        try {
            importArcs(new Path(args[0]), new Path(args[1]), args[2]);
            return 0;
        } catch (Exception e) {
            LOG.fatal("ImportARCs: " + StringUtils.stringifyException(e));

            return -1;
        }
    }
}