org.archive.mapred.ARCMapRunner.java Source code

Introduction

Here is the source code for org.archive.mapred.ARCMapRunner.java.
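For context, here is a minimal sketch of how a job might be configured to use this class with the old org.apache.hadoop.mapred API that the listing imports. MyArcMapper is a hypothetical ARCRecordMapper implementation (a sketch of it appears after the listing), and the input is assumed to be a text file of ARC locations, one per line, since run() treats each input value as an ARC URL.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class ArcIndexJob {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(ArcIndexJob.class);
        job.setJobName("arc-index");

        // Drive the map phase with ARCMapRunner instead of the default MapRunner.
        job.setMapRunnerClass(org.archive.mapred.ARCMapRunner.class);
        // MyArcMapper is a hypothetical ARCRecordMapper implementation.
        job.setMapperClass(MyArcMapper.class);

        // Indexing timeout read by ARCMapRunner.configure(), in minutes.
        job.setLong("wax.index.timeout", 60);

        // Each input line is an ARC location; ARCMapRunner downloads and iterates it.
        job.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        JobClient.runJob(job);
    }
}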

Source

/*
 * $Id: ImportArcs.java 1494 2007-02-15 17:47:58Z stack-sf $
 * 
 * Copyright (C) 2007 Internet Archive.
 * 
 * This file is part of the archive-access tools project
 * (http://sourceforge.net/projects/archive-access).
 * 
 * The archive-access tools are free software; you can redistribute them and/or
 * modify them under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or any
 * later version.
 * 
 * The archive-access tools are distributed in the hope that they will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License along with
 * the archive-access tools; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.mapred;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.arc.ARCConstants;
import org.archive.io.arc.ARCRecord;

/**
 * MapRunner that passes an ARCRecord to the configured mapper.
 * The configured mapper must be an implementation of {@link ARCRecordMapper}.
 * @author stack
 */
public class ARCMapRunner implements MapRunnable {
    public final Log LOG = LogFactory.getLog(this.getClass().getName());
    protected ARCRecordMapper mapper;

    private enum Counter {
        ARCS_COUNT, ARCRECORDS_COUNT, BAD_ARC_PARSE_COUNT, ARC_FAILED_DOWNLOAD, LONG_ARCRECORDS_COUNT
    }

    /**
     * How long to spend indexing.
     */
    protected long maxtime;

    public void configure(JobConf job) {
        this.mapper = (ARCRecordMapper) ReflectionUtils.newInstance(job.getMapperClass(), job);
        // Configured value is in minutes; convert to milliseconds.
        this.maxtime = job.getLong("wax.index.timeout", 60) * 60 * 1000;
    }

    public void run(RecordReader input, OutputCollector output, Reporter reporter) throws IOException {
        try {
            WritableComparable key = input.createKey();
            Writable value = input.createValue();
            while (input.next(key, value)) {
                doArchive(value.toString(), output, new ARCReporter(reporter));
            }
        } finally {
            this.mapper.close();
        }
    }

    protected void doArchive(final String arcurl, final OutputCollector output, final ARCReporter reporter)
            throws IOException {
        if ((arcurl == null) || arcurl.endsWith("work")) {
            reporter.setStatus("skipping " + arcurl, true);
            return;
        }

        // Run indexing in a thread so I can cover it with a timer.
        final Thread thread = new IndexingThread(arcurl, output, reporter);
        startIndexingThread(thread, reporter);
    }

    protected void startIndexingThread(Thread thread, ARCReporter reporter) throws IOException {
        thread.setDaemon(true);
        thread.start();
        final long start = System.currentTimeMillis();
        try {
            for (long period = this.maxtime; thread.isAlive()
                    && (period > 0); period = this.maxtime - (System.currentTimeMillis() - start)) {
                try {
                    thread.join(period);
                } catch (final InterruptedException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            cleanup(thread, reporter);
        }
    }

    protected void cleanup(final Thread thread, final ARCReporter reporter) throws IOException {
        if (!thread.isAlive()) {
            return;
        }
        reporter.setStatus("Killing indexing thread " + thread.getName(), true);
        thread.interrupt();
        try {
            // Give it some time to die.
            thread.join(1000);
        } catch (final InterruptedException e) {
            e.printStackTrace();
        }
        if (thread.isAlive()) {
            LOG.info(thread.getName() + " will not die");
        }
    }

    protected class IndexingThread extends Thread {
        protected final String location;
        protected final OutputCollector output;
        protected final ARCReporter reporter;

        public IndexingThread(final String loc, final OutputCollector o, final ARCReporter r) {
            // Name this thread same as ARC location.
            super(loc);
            this.location = loc;
            this.output = o;
            this.reporter = r;
        }

        /**
         * @return Null if fails download.
         */
        protected ArchiveReader getArchiveReader() {
            ArchiveReader arc = null;
            // Need a thread that will keep updating the TaskTracker during
            // long downloads, else the tasktracker will kill us.
            Thread reportingDuringDownload = null;
            try {
                this.reporter.setStatus("opening " + this.location, true);
                reportingDuringDownload = new Thread("reportDuringDownload") {
                    public void run() {
                        while (!this.isInterrupted()) {
                            try {
                                synchronized (this) {
                                    sleep(1000 * 60); // Sleep a minute.
                                }
                                reporter.setStatus("downloading " + location);
                                /* TODO MC - to be compatible with hadoop 0.14
                                } catch (final IOException e) {
                                    e.printStackTrace();
                                    // No point hanging around if we're failing
                                    // status.
                                    break;
                                */
                            } catch (final InterruptedException e) {
                                // Interrupt flag is cleared. Just fall out.
                                break;
                            }
                        }
                    }
                };
                reportingDuringDownload.setDaemon(true);
                reportingDuringDownload.start();
                arc = ArchiveReaderFactory.get(this.location);
            } catch (final Throwable e) {
                //try {
                final String msg = "Error opening " + this.location + ": " + e.toString();
                this.reporter.setStatus(msg, true);
                this.reporter.incrCounter(Counter.ARC_FAILED_DOWNLOAD, 1);
                LOG.info(msg);
                /* TODO MC - to be compatible with hadoop 0.14
                } catch (final IOException ioe) {
                LOG.warn(this.location, ioe);
                }
                */
            } finally {
                if ((reportingDuringDownload != null) && reportingDuringDownload.isAlive()) {
                    reportingDuringDownload.interrupt();
                }
            }
            return arc;
        }

        public void run() {
            if (this.location == null || this.location.length() <= 0) {
                return;
            }

            ArchiveReader arc = getArchiveReader();
            if (arc == null) {
                return;
            }

            try {
                ARCMapRunner.this.mapper.onARCOpen();
                this.reporter.incrCounter(Counter.ARCS_COUNT, 1);

                // Iterate over each ARCRecord.
                for (final Iterator i = arc.iterator(); i.hasNext() && !currentThread().isInterrupted();) {
                    final ARCRecord rec = (ARCRecord) i.next();
                    this.reporter.incrCounter(Counter.ARCRECORDS_COUNT, 1);

                    try {
                        ARCMapRunner.this.mapper.map(new Text(rec.getMetaData().getUrl()), new ObjectWritable(rec),
                                this.output, this.reporter);

                        final long b = rec.getMetaData().getContentBegin();
                        final long l = rec.getMetaData().getLength();
                        final long recordLength = (l > b) ? (l - b) : l;
                        if (recordLength > ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE) {
                            // Now, if the content length is larger than a
                            // standard ARC, then it is most likely the last
                            // record in the ARC because ARC is closed after we
                            // exceed 100MB (DEFAULT_MAX_ARC...). Calling
                            // hasNext above will make us read through the
                            // whole record, even if it's a 1.7G video. On a
                            // loaded machine, this might cause us to time out
                            // with the tasktracker -- so, just skip out here.
                            this.reporter.setStatus(
                                    "skipping " + this.location + " -- very long record " + rec.getMetaData());
                            this.reporter.incrCounter(Counter.LONG_ARCRECORDS_COUNT, 1);
                            break;
                        }
                    } catch (final Throwable e) {
                        // Failed parse of record. Keep going.
                        LOG.warn("Error processing " + rec.getMetaData(), e);
                    }
                }
                if (currentThread().isInterrupted()) {
                    LOG.info(currentThread().getName() + " interrupted");
                }
                this.reporter.setStatus("closing " + this.location, true);

            } catch (final Throwable e) {
                // Problem parsing arc file.
                this.reporter.incrCounter(Counter.BAD_ARC_PARSE_COUNT, 1);
                final String msg = "Error parsing " + this.location;
                //try {
                this.reporter.setStatus(msg, true);
                /* TODO MC - to be compatible with hadoop 0.14
                } catch (final IOException ioe) {
                ioe.printStackTrace();
                }
                */
                LOG.warn("ARCMapRunner - Throwable:" + msg, e);
            } finally {
                try {
                    arc.close();
                    ARCMapRunner.this.mapper.onARCClose();
                } catch (final IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
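For completeness, here is a rough sketch of what an ARCRecordMapper implementation could look like. The actual ARCRecordMapper interface is not shown in this listing, so the method set and signatures below are inferred from the calls ARCMapRunner makes on the mapper (configure, map with a Text key and an ObjectWritable wrapping an ARCRecord, onARCOpen, onARCClose, close); treat it as an illustration, not the project's real interface.

import java.io.IOException;

import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.archive.io.arc.ARCRecord;

// Hypothetical mapper: emits each record's URL paired with its mime type.
public class MyArcMapper implements ARCRecordMapper {
    public void configure(JobConf job) {
        // Read any job-specific settings here.
    }

    public void onARCOpen() {
        // Called by ARCMapRunner once per ARC, before the first record.
    }

    public void map(WritableComparable key, Writable value,
            OutputCollector output, Reporter reporter) throws IOException {
        // ARCMapRunner passes the record URL as the key and the ARCRecord
        // wrapped in an ObjectWritable as the value.
        ARCRecord rec = (ARCRecord) ((ObjectWritable) value).get();
        output.collect(new Text(rec.getMetaData().getUrl()),
                new Text(rec.getMetaData().getMimetype()));
    }

    public void onARCClose() {
        // Called by ARCMapRunner after the ARC has been fully processed.
    }

    public void close() throws IOException {
        // Flush any per-task state.
    }
}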