org.archive.bacon.io.SequenceFileStorage.java Source code

Introduction

Here is the source code for org.archive.bacon.io.SequenceFileStorage.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.archive.bacon.io;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;

/**
 * Pig StoreFunc which stores Tuples in a Hadoop SequenceFile.  Hadoop
 * SequenceFiles are made up of unordered (key,value) pairs.  So, you
 * can add (key,value) pairs in any order, the inputs do not need to
 * be sorted.
 *
 * In a Hadoop SequenceFile, the key and the value are each typed
 * according to the Hadoop type system.  The type of the key and the
 * value are specified when the SequenceFile is created.  Thus, when
 * this StoreFunc is initialized, the key and value types must be
 * given.  By default, if the types are not specified, they are
 * assumed to be Text (i.e. Strings).
 *
 * For example, in a Pig script:
 *
 *   STORE foo INTO 'foo' USING SequenceFileStorage();  -- Default (Text,Text)
 *
 * or
 *
 *   STORE foo INTO 'foo' USING SequenceFileStorage( 'org.apache.hadoop.io.LongWritable',
 *                                                   'org.apache.hadoop.io.BytesWritable' );
 * 
 */
public class SequenceFileStorage extends StoreFunc {
    /*
     * It's possible that we can actually get the Class objects in the
     * constructor; however, I'm not sure that the whole environment is
     * setup when this class is instantiated, so I think it's safer to
     * defer getting the Class objects until they are actually needed in
     * the RecordWriter.
     */
    String keyType = "org.apache.hadoop.io.Text";
    String valueType = "org.apache.hadoop.io.Text";

    /*
     * We create a null object for the key and value types.  If we need
     * to write a null to the sequence file, then we just use these
     * instances.
     */
    Writable nullKey;
    Writable nullValue;

    RecordWriter writer;

    public SequenceFileStorage() {
    }

    public SequenceFileStorage(String keyType, String valueType) {
        this.keyType = keyType;
        this.valueType = valueType;
    }

    /**
     * Most of this method is cut/pasted from the Hadoop
     * SequenceFileOutputFormat.  The big difference is that we use the
     * key and value types given to this Pig storage class rather than
     * using the ones set by the job configuration.
     */
    public OutputFormat getOutputFormat() throws IOException {
        return new SequenceFileOutputFormat() {
            public RecordWriter getRecordWriter(TaskAttemptContext context)
                    throws IOException, InterruptedException {
                Configuration conf = context.getConfiguration();

                Class keyClass, valueClass;
                try {
                    keyClass = conf.getClassByName(keyType);
                    valueClass = conf.getClassByName(valueType);
                } catch (ClassNotFoundException cnfe) {
                    throw new IOException(cnfe);
                }

                // Instantiate null objects for the key and value types.
                // See getWritable() for their use.
                try {
                    nullKey = (Writable) keyClass.newInstance();
                    nullValue = (Writable) valueClass.newInstance();
                } catch (ReflectiveOperationException roe) {
                    throw new IOException(roe);
                }

                CompressionCodec codec = null;
                CompressionType compressionType = CompressionType.NONE;
                if (getCompressOutput(context)) {
                    // find the kind of compression to do
                    compressionType = getOutputCompressionType(context);

                    // find the right codec
                    Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
                    codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
                }
                // get the path of the temporary output file 
                Path file = getDefaultWorkFile(context, "");
                FileSystem fs = file.getFileSystem(conf);
                final SequenceFile.Writer out = SequenceFile.createWriter(fs, conf, file, keyClass, valueClass,
                        compressionType, codec, context);

                return new RecordWriter() {

                    public void write(Object key, Object value) throws IOException {

                        out.append(key, value);
                    }

                    public void close(TaskAttemptContext context) throws IOException {
                        out.close();
                    }
                };
            }
        };
    }

    public void setStoreLocation(String location, Job job) throws IOException {
        FileOutputFormat.setOutputPath(job, new Path(location));
    }

    public void prepareToWrite(RecordWriter writer) throws IOException {
        this.writer = writer;
    }

    /**
     * SequenceFiles require a key/value pair.
     *
     * For convenience, we allow an incoming tuple ot have either one or
     * two entries.
     *
     * If it has two entries, then we take them as the key and value.
     *
     * If there is only one entry, then it is taken as the value and a
     * 'null' key is used.
     */
    public void putNext(Tuple tuple) throws IOException {
        try {
            Writable key, value;

            int size = tuple.size();

            if (size == 1) {
                key = this.nullKey;
                value = getWritable(tuple.get(0), this.nullValue);
            } else if (size == 2) {
                key = getWritable(tuple.get(0), this.nullKey);
                value = getWritable(tuple.get(1), this.nullValue);
            } else {
                throw new IOException("Invalid tuple size, must be 1 or 2: " + size);
            }

            this.writer.write(key, value);
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        }
    }

    /**
     * Convert the Pig tupleValue to the corresponding Hadoop object.
     */
    public Writable getWritable(Object tupleValue, Writable nullWritable) throws IOException {
        switch (DataType.findType(tupleValue)) {
        case DataType.BOOLEAN:
            return new BooleanWritable((boolean) tupleValue);

        case DataType.BYTE:
            return new ByteWritable((byte) tupleValue);

        case DataType.CHARARRAY:
            return new Text((String) tupleValue);

        case DataType.INTEGER:
            return new IntWritable((int) tupleValue);

        case DataType.LONG:
            return new LongWritable((long) tupleValue);

        case DataType.DOUBLE:
            return new DoubleWritable((double) tupleValue);

        case DataType.FLOAT:
            return new FloatWritable((float) tupleValue);

        case DataType.BYTEARRAY:
            return new BytesWritable((byte[]) tupleValue);

        // If we get a 'null' from Pig, just pass through the
        // already-instantiated Hadoop nullWritable.
        case DataType.NULL:
            return nullWritable;

        // Don't know what to do with these complex data types.
        case DataType.BAG:
        case DataType.ERROR:
        case DataType.MAP:
        case DataType.TUPLE:
        case DataType.UNKNOWN:
        default:
            throw new IOException("Cannot write values of type: " + DataType.findTypeName(tupleValue));
        }
    }

}