/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 **/
package org.commoncrawl.util;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.Semaphore;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

import org.commoncrawl.async.Callback;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;

import com.hadoop.compression.lzo.LzoCodec;

/**
 * Used to push stats to HDFS in a consistent manner. Stats are written to a
 * local temp file in the form of a Hadoop sequence file, and are flushed to
 * HDFS when the writer is closed. Each stat entry consists of a key and a
 * value pair.
 * The key has to implement WritableComparable, and the value has to
 * implement Writable.
 *
 * @author rana
 */
public class MapReduceJobStatsWriter<KeyType extends WritableComparable, ValueType extends Writable> {

  private static final Log LOG = LogFactory.getLog(MapReduceJobStatsWriter.class);

  /** log family type **/
  private String _logFamily;
  /** grouping key **/
  private String _groupingKey;
  /** log file key **/
  private long _uniqueKey;
  /** the temp file stats writer object **/
  private SequenceFile.Writer _writer = null;
  /** remote file system instance **/
  FileSystem _remoteFileSystem;
  /** temp file name **/
  private File _tempFileName;
  /** output stream the sequence file is writing to **/
  private FSDataOutputStream _outputStream;
  /** hadoop config **/
  Configuration _config;
  /** last log write exception **/
  private IOException _lastLogWriteException;
  /** log file entry count **/
  private int _entryCount = 0;

  /** internal class used to queue up log file write requests **/
  private static class LogFileItem<KeyType extends WritableComparable, ValueType extends Writable> {

    LogFileItem(KeyType key, ValueType value) {
      _key = key;
      _value = value;
    }

    LogFileItem() {
      _key = null;
      _value = null;
    }

    public KeyType _key;
    public ValueType _value;
  }

  /** the log writer thread event loop **/
  EventLoop _eventLoop = new EventLoop();

  /**
   * Constructor
   *
   * @param keyClass key type
   * @param valueClass value type
   * @param familyKey
   * @param groupingKey
   * @param uniqueKey
   */
  public MapReduceJobStatsWriter(FileSystem remoteFileSystem, Configuration config,
      Class<KeyType> keyClass, Class<ValueType> valueClass, String familyKey,
      String groupingKey, long uniqueKey) throws IOException {

    _logFamily = familyKey;
    _groupingKey = groupingKey;
    _uniqueKey = uniqueKey;
    _remoteFileSystem = remoteFileSystem;
    _config = config;

    // temp file
    _tempFileName = File.createTempFile("statsWriter", "seq");
    // create the output stream the sequence file writer will output to
    _outputStream = FileSystem.getLocal(_config).create(new Path(_tempFileName.getAbsolutePath()));

    LzoCodec codec = new LzoCodec();

    // create the sequence file writer
    _writer = SequenceFile.createWriter(config, _outputStream, keyClass, valueClass,
        CompressionType.BLOCK, codec);

    // start the event loop
    _eventLoop.start();
  }

  /** append an item to the log file **/
  public void appendLogEntry(final KeyType key, final ValueType value) throws IOException {
    if (_lastLogWriteException == null) {
      // send an async message to the writer thread ...
      _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

        @Override
        public void timerFired(Timer timer) {
          // this executes in the writer thread's context ...
          try {
            _writer.append(key, value);
            ++_entryCount;
          } catch (IOException e) {
            LOG.error("Failed to Write Log File Entry for:" + _logFamily + "/"
                + _groupingKey + "/" + Long.toString(_uniqueKey) + " Exception:"
                + CCStringUtils.stringifyException(e));
            _lastLogWriteException = e;
          }
        }
      }));
    } else {
      IOException e = _lastLogWriteException;
      _lastLogWriteException = null;
      throw e;
    }
  }

  /** close and flush the log file **/
  public void close(final Callback optionalAsyncCallback) {

    if (_eventLoop != null) {

      // allocate a blocking semaphore in case an async callback was not specified
      final Semaphore blockingCallSemaphore = new Semaphore(0);

      // perform shutdown in the worker thread ...
      _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

        @Override
        public void timerFired(Timer timer) {
          try {
            try {
              if (_writer != null) {
                _writer.close();
              }
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              _lastLogWriteException = e;
            } finally {
              _writer = null;
              try {
                if (_outputStream != null) {
                  _outputStream.flush();
                  _outputStream.close();
                }
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                _lastLogWriteException = e;
              } finally {
                _outputStream = null;
              }
            }

            // now figure out if everything went smoothly or not
            if (_entryCount != 0 && _lastLogWriteException == null) {
              // ok, so far so good... time to copy the local log file to hdfs ...
              Path hdfsPath = new Path(Environment.HDFS_LOGCOLLECTOR_BASEDIR,
                  _logFamily + "/" + _groupingKey + "/" + Long.toString(_uniqueKey));
              try {
                // delete the remote file if it exists
                _remoteFileSystem.delete(hdfsPath, false);
                // ensure the parent path exists
                _remoteFileSystem.mkdirs(hdfsPath.getParent());
                // now, if the local file exists and has data ...
                if (_tempFileName.exists() && _tempFileName.length() != 0) {
                  // copy the file to hdfs
                  _remoteFileSystem.copyFromLocalFile(new Path(_tempFileName.getAbsolutePath()), hdfsPath);
                }
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                _lastLogWriteException = e;
              }
            }
          } finally {
            // always delete the temp file ...
            _tempFileName.delete();
            // release the semaphore
            blockingCallSemaphore.release();
            // if a callback was specified, call it now
            if (optionalAsyncCallback != null) {
              optionalAsyncCallback.execute();
            }
            // stop the event loop ...
            _eventLoop.stop();
            _eventLoop = null;
          }
        }
      }));

      // if a callback was not specified, wait on the blocking semaphore ...
      if (optionalAsyncCallback == null) {
        blockingCallSemaphore.acquireUninterruptibly();
      }
    }
  }

  public static void main(String[] args) {

    LOG.info("Initializing Hadoop Config");

    Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("hadoop-default.xml");
    conf.addResource("hadoop-site.xml");
    conf.addResource("commoncrawl-default.xml");
    conf.addResource("commoncrawl-site.xml");

    CrawlEnvironment.setHadoopConfig(conf);
    CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");

    // test the stats writer ...
try { LOG.info("Opening Stats Writer"); MapReduceJobStatsWriter<IntWritable, Text> statsWriter = new MapReduceJobStatsWriter<IntWritable, Text>( CrawlEnvironment.getDefaultFileSystem(), conf, IntWritable.class, Text.class, "test", "group1", 12345L); LOG.info("Writing Entries"); for (int i = 0; i < 1000; ++i) { statsWriter.appendLogEntry(new IntWritable(i), new Text("Log Entry #" + i)); } LOG.info("Flushing / Closing"); final Semaphore blockingSempahore = new Semaphore(0); statsWriter.close(new Callback() { @Override public void execute() { LOG.info("Completion Callback Triggered"); blockingSempahore.release(); } }); LOG.info("Waiting on Semaphore"); blockingSempahore.acquireUninterruptibly(); LOG.info("Acquired Semaphore"); LOG.info("Closed"); Path hdfsPath = new Path(Environment.HDFS_LOGCOLLECTOR_BASEDIR, "test" + "/" + "group1" + "/" + Long.toString(12345L)); LOG.info("Opening Reader"); SequenceFile.Reader reader = new SequenceFile.Reader(CrawlEnvironment.getDefaultFileSystem(), hdfsPath, conf); IntWritable key = new IntWritable(); Text value = new Text(); while (reader.next(key, value)) { LOG.info("Key:" + key.get() + " Value:" + value.toString()); } reader.close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }