Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.bacon.io;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;

/**
 * Pig StoreFunc which stores Tuples in a Hadoop SequenceFile.  Hadoop
 * SequenceFiles are made up of unordered (key,value) pairs, so you can
 * add (key,value) pairs in any order; the inputs do not need to be
 * sorted.
 *
 * In a Hadoop SequenceFile, the key and the value are each typed
 * according to the Hadoop type system.  The types of the key and the
 * value are specified when the SequenceFile is created.  Thus, when
 * this StoreFunc is initialized, the key and value types must be
 * given.  By default, if the types are not specified, they are
 * assumed to be Text (i.e. Strings).
 *
 * For example, in a Pig script:
 *
 *   STORE foo INTO 'foo' USING SequenceFileStorage();  -- Default (Text,Text)
 *
 * or
 *
 *   STORE foo INTO 'foo' USING SequenceFileStorage( 'org.apache.hadoop.io.LongWritable',
 *                                                   'org.apache.hadoop.io.BytesWritable' );
 */
public class SequenceFileStorage extends StoreFunc {

    /*
     * It's possible that we can actually get the Class objects in the
     * constructor; however, I'm not sure that the whole environment is
     * set up when this class is instantiated, so I think it's safer to
     * defer getting the Class objects until they are actually needed in
     * the RecordWriter.
     */
    String keyType   = "org.apache.hadoop.io.Text";
    String valueType = "org.apache.hadoop.io.Text";

    /*
     * We create a null object for the key and value types.  If we need
     * to write a null to the sequence file, then we just use these
     * instances.
     */
    Writable nullKey;
    Writable nullValue;

    RecordWriter writer;

    public SequenceFileStorage() {
    }

    public SequenceFileStorage(String keyType, String valueType) {
        this.keyType   = keyType;
        this.valueType = valueType;
    }

    /**
     * Most of this method is cut/pasted from the Hadoop
     * SequenceFileOutputFormat.  The big difference is that we use the
     * key and value types given to this Pig storage class rather than
     * using the ones set by the job configuration.
     */
    public OutputFormat getOutputFormat() throws IOException {
        return new SequenceFileOutputFormat() {
            public RecordWriter getRecordWriter(TaskAttemptContext context)
                    throws IOException, InterruptedException {
                Configuration conf = context.getConfiguration();

                Class keyClass, valueClass;
                try {
                    keyClass   = conf.getClassByName(keyType);
                    valueClass = conf.getClassByName(valueType);
                } catch (ClassNotFoundException cnfe) {
                    throw new IOException(cnfe);
                }

                // Instantiate null objects for the key and value types.
                // See getWritable() for their use.
                try {
                    nullKey   = (Writable) keyClass.newInstance();
                    nullValue = (Writable) valueClass.newInstance();
                } catch (ReflectiveOperationException roe) {
                    throw new IOException(roe);
                }

                CompressionCodec codec = null;
                CompressionType compressionType = CompressionType.NONE;
                if (getCompressOutput(context)) {
                    // find the kind of compression to do
                    compressionType = getOutputCompressionType(context);

                    // find the right codec
                    Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
                    codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
                }

                // get the path of the temporary output file
                Path file = getDefaultWorkFile(context, "");
                FileSystem fs = file.getFileSystem(conf);

                final SequenceFile.Writer out =
                    SequenceFile.createWriter(fs, conf, file, keyClass, valueClass,
                                              compressionType, codec, context);

                return new RecordWriter() {
                    public void write(Object key, Object value) throws IOException {
                        out.append(key, value);
                    }

                    public void close(TaskAttemptContext context) throws IOException {
                        out.close();
                    }
                };
            }
        };
    }

    public void setStoreLocation(String location, Job job) throws IOException {
        FileOutputFormat.setOutputPath(job, new Path(location));
    }

    public void prepareToWrite(RecordWriter writer) throws IOException {
        this.writer = writer;
    }

    /**
     * SequenceFiles require a key/value pair.
     *
     * For convenience, we allow an incoming tuple to have either one or
     * two entries.
     *
     * If it has two entries, then we take them as the key and value.
     *
     * If there is only one entry, then it is taken as the value and a
     * 'null' key is used.
     */
    public void putNext(Tuple tuple) throws IOException {
        try {
            Writable key, value;

            int size = tuple.size();

            if (size == 1) {
                key   = this.nullKey;
                value = getWritable(tuple.get(0), this.nullValue);
            } else if (size == 2) {
                key   = getWritable(tuple.get(0), this.nullKey);
                value = getWritable(tuple.get(1), this.nullValue);
            } else {
                throw new IOException("Invalid tuple size, must be 1 or 2: " + size);
            }

            this.writer.write(key, value);
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        }
    }

    /**
     * Convert the Pig tupleValue to the corresponding Hadoop object.
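     *
     * Informal summary of the mapping implemented below:
     *   chararray -> Text, int -> IntWritable, long -> LongWritable,
     *   float -> FloatWritable, double -> DoubleWritable,
     *   boolean -> BooleanWritable, byte -> ByteWritable,
     *   bytearray -> BytesWritable, and a Pig null -> the
     *   pre-instantiated null Writable passed in by the caller.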
     */
    public Writable getWritable(Object tupleValue, Writable nullWritable) throws IOException {
        switch (DataType.findType(tupleValue)) {
            case DataType.BOOLEAN:
                return new BooleanWritable((boolean) tupleValue);

            case DataType.BYTE:
                return new ByteWritable((byte) tupleValue);

            case DataType.CHARARRAY:
                return new Text((String) tupleValue);

            case DataType.INTEGER:
                return new IntWritable((int) tupleValue);

            case DataType.LONG:
                return new LongWritable((long) tupleValue);

            case DataType.DOUBLE:
                return new DoubleWritable((double) tupleValue);

            case DataType.FLOAT:
                return new FloatWritable((float) tupleValue);

            case DataType.BYTEARRAY:
                // Pig hands bytearrays to UDFs as DataByteArray objects,
                // not raw byte[], so unwrap before building the Writable.
                return new BytesWritable(((DataByteArray) tupleValue).get());

            // If we get a 'null' from Pig, just pass through the
            // already-instantiated Hadoop nullWritable.
            case DataType.NULL:
                return nullWritable;

            // Don't know what to do with these complex data types.
            case DataType.BAG:
            case DataType.ERROR:
            case DataType.MAP:
            case DataType.TUPLE:
            case DataType.UNKNOWN:
            default:
                throw new IOException("Cannot write values of type: "
                                      + DataType.findTypeName(tupleValue));
        }
    }

}
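To see what the StoreFunc actually wrote, you can dump the resulting SequenceFile from plain Java. The sketch below is illustrative only and is not part of the class above: the class name SequenceFileDumpExample and the single path argument (e.g. a part file inside the STORE output directory) are assumptions for this example. It uses the same older SequenceFile.Reader API family that the writer code uses, and it reads the key and value classes straight from the file header, which will be whatever types were passed to SequenceFileStorage (Text,Text by default).

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileDumpExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);            // a part file written by SequenceFileStorage
        FileSystem fs = path.getFileSystem(conf);

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            // The key/value classes are recorded in the SequenceFile header,
            // so we can instantiate them without knowing the types up front.
            Writable key   = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

            // Print each (key,value) pair, one per line.
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}

Note that a single-field tuple stored through putNext() shows up here with the pre-instantiated "null" key object (an empty Text by default), since SequenceFiles always require some key instance.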