org.commoncrawl.hadoop.mergeutils.SequenceFileReader.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.hadoop.mergeutils.SequenceFileReader.java

Source

package org.commoncrawl.hadoop.mergeutils;

/*
 *    Copyright 2010 - CommonCrawl Foundation
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.commoncrawl.util.shared.CCStringUtils;

/**
 * An input source that reads from a SequenceFile
 * 
 * @author rana
 * 
 * @param <KeyType>
 * @param <ValueType>
 */
public class SequenceFileReader<KeyType extends WritableComparable, ValueType extends Writable> {

    public static final Log LOG = LogFactory.getLog(SequenceFileReader.class);

    FileSystem _sourceFileSystem;
    Configuration _config;
    SpillWriter<KeyType, ValueType> _writer = null;
    Vector<Path> _inputSegments = null;
    Constructor<KeyType> _keyConstructor = null;
    Constructor<ValueType> _valConstructor = null;
    long _recordCount = 0;
    private static final Class[] emptyArray = new Class[] {};

    public SequenceFileReader(FileSystem fileSystem, Configuration conf, Vector<Path> inputSegments,
            SpillWriter<KeyType, ValueType> spillWriter, Class<KeyType> keyClass, Class<ValueType> valueClass)
            throws IOException {

        _sourceFileSystem = fileSystem;
        _config = conf;
        _inputSegments = inputSegments;
        _writer = spillWriter;

        try {
            this._keyConstructor = keyClass.getDeclaredConstructor(emptyArray);
            this._keyConstructor.setAccessible(true);
            this._valConstructor = valueClass.getDeclaredConstructor(emptyArray);
            this._valConstructor.setAccessible(true);
        } catch (SecurityException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw new RuntimeException(e);
        } catch (NoSuchMethodException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw new RuntimeException(e);
        }

    }

    public void close() throws IOException {
    }

    @SuppressWarnings("unchecked")
    public void readAndSpill() throws IOException {
        long cumilativeReadTimeStart = System.currentTimeMillis();

        for (Path sequenceFilePath : _inputSegments) {

            long individualReadTimeStart = System.currentTimeMillis();

            // LOG.info("Reading Contents for File:" + sequenceFilePath);
            // allocate a reader for the current path
            SequenceFile.Reader reader = new SequenceFile.Reader(_sourceFileSystem, sequenceFilePath, _config);

            try {
                boolean eos = false;

                while (!eos) {

                    KeyType key = null;
                    ValueType value = null;

                    try {
                        key = _keyConstructor.newInstance();
                        value = _valConstructor.newInstance();
                    } catch (Exception e) {
                        LOG.error("Failed to create key or value type with Exception:"
                                + CCStringUtils.stringifyException(e));
                        throw new RuntimeException(e);
                    }

                    eos = !reader.next(key, value);

                    if (!eos) {
                        _recordCount++;
                        _writer.spillRecord(key, value);
                    }
                }
                while (!eos)
                    ;

                // LOG.info("Read and Spill of File:" + sequenceFilePath +" took:" +
                // (System.currentTimeMillis() - individualReadTimeStart));
            } finally {

                if (reader != null) {
                    reader.close();
                }
            }
        }
        // LOG.info("Cumilative Read and Spill took:" + (System.currentTimeMillis()
        // - cumilativeReadTimeStart) + " Spilled RecordCount:" + _recordCount);
    }
}