Java tutorial: GAMultiStorage, a custom Apache Pig StoreFunc that splits output by key
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */
package org.apache.pig.piggybank.storage;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.StorageUtil;

/**
 * This UDF is useful for dynamically splitting the output data into a bunch of
 * directories and files based on a user-specified key field in the output tuple.
 */
public class GAMultiStorage extends StoreFunc {

  private Path outputPath;            // User-specified output path
  private boolean statsOnly = false;
  private Compression comp;           // Compression type of output data.

  // Default size for the byte buffer; should fit most tuples.
  private static final int BUF_SIZE = 1024;

  // Compression types supported by this store
  enum Compression {
    none, bz2, bz, gz;
  };

  public GAMultiStorage(String parentPathStr, String statsOnly) {
    this(parentPathStr, statsOnly, "none");
  }

  /**
   * Constructor
   *
   * @param parentPathStr
   *          Parent output directory path
   * @param statsOnly
   *          generate only tuple statistics (game_id;length;memory_size): 'true' or 'false'
   * @param compression
   *          'bz2', 'bz', 'gz' or 'none'
   */
  public GAMultiStorage(String parentPathStr, String statsOnly, String compression) {
    this.outputPath = new Path(parentPathStr);
    this.statsOnly = Boolean.parseBoolean(statsOnly);
    try {
      this.comp = (compression == null) ? Compression.none
          : Compression.valueOf(compression.toLowerCase());
    } catch (IllegalArgumentException e) {
      System.err.println("Exception when converting compression string: " + compression
          + " to enum. No compression will be used");
      this.comp = Compression.none;
    }
  }

  // --------------------------------------------------------------------------
  // Implementation of StoreFunc

  private RecordWriter<String, Text> writer;

  @Override
  public void putNext(Tuple t) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream(BUF_SIZE);
    // this limit is tied to the reducer's memory size, to prevent OOM while
    // trying to write data for big games
    int limit = 67108864 / 8;
    boolean partialSave = false;
    try {
      DataBag bag = (DataBag) t.get(1);
      String gameId = String.valueOf(((Tuple) t.get(0)).get(0));
      if (this.statsOnly) {
        // produces debug memory/size stats in CSV format
        writer.write(gameId, new Text(gameId + ";" + String.valueOf(bag.size()) + ";"
            + String.valueOf(t.getMemorySize())));
      } else {
        Iterator<Tuple> iterator = bag.iterator();
        while (iterator.hasNext()) {
          Tuple t1 = (Tuple) iterator.next();
          String line = t1.get(2).toString();
          // we are concatenating multiple records into one file and collectors
          // do not append an end-of-line character
          if (!line.endsWith("\n")) {
            line += "\n";
          }
          baos.write(line.getBytes(Charset.forName("UTF-8")));
          // save partial result to prevent VM array limit exceptions
          if (baos.size() >= limit) {
            writer.write(String.valueOf(gameId), removeLastByte(baos.toByteArray()));
            baos.reset();
            partialSave = true;
          } else {
            partialSave = false;
          }
        }
        if (!partialSave) {
          writer.write(String.valueOf(gameId), removeLastByte(baos.toByteArray()));
        }
      }
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    } catch (Exception e) {
      throw new IOException(e);
    }
  }

  // Drops the last byte (the trailing newline appended above) before handing
  // the chunk to the record writer.
  private Text removeLastByte(byte[] input) {
    return new Text(ArrayUtils.remove(input, input.length - 1));
  }

  @SuppressWarnings("unchecked")
  @Override
  public OutputFormat getOutputFormat() throws IOException {
    return new MultiStorageOutputFormat();
  }

  @SuppressWarnings("unchecked")
  @Override
  public void prepareToWrite(RecordWriter writer) throws IOException {
    this.writer = writer;
  }

  @Override
  public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
  }

  // --------------------------------------------------------------------------
  // Implementation of OutputFormat

  public static class MultiStorageOutputFormat extends TextOutputFormat<String, Text> {

    private String keyValueSeparator = "";

    @Override
    public RecordWriter<String, Text> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {

      final TaskAttemptContext ctx = context;

      return new RecordWriter<String, Text>() {

        private Map<String, MyLineRecordWriter> storeMap =
            new HashMap<String, MyLineRecordWriter>();

        private static final int BUFFER_SIZE = 1024;

        private ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);

        @Override
        public void write(String key, Text val) throws IOException {
          StorageUtil.putField(mOut, val.toString());
          getStore(key).write(null, new Text(mOut.toByteArray()));
          mOut.reset();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException {
          for (MyLineRecordWriter out : storeMap.values()) {
            out.close(context);
          }
        }

        private MyLineRecordWriter getStore(String fieldValue) throws IOException {
          MyLineRecordWriter store = storeMap.get(fieldValue);
          if (store == null) {
            DataOutputStream os = createOutputStream(fieldValue);
            store = new MyLineRecordWriter(os, keyValueSeparator);
            storeMap.put(fieldValue, store);
          }
          return store;
        }

        private DataOutputStream createOutputStream(String fieldValue) throws IOException {
          Configuration conf = ctx.getConfiguration();
          TaskID taskId = ctx.getTaskAttemptID().getTaskID();

          // Check whether compression is enabled; if so, get the
          // extension and add it to the path
          boolean isCompressed = getCompressOutput(ctx);
          CompressionCodec codec = null;
          String extension = "";
          if (isCompressed) {
            Class<? extends CompressionCodec> codecClass =
                getOutputCompressorClass(ctx, GzipCodec.class);
            codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass,
                ctx.getConfiguration());
            extension = codec.getDefaultExtension();
          }

          NumberFormat nf = NumberFormat.getInstance();
          nf.setMinimumIntegerDigits(4);

          Path path = new Path(fieldValue, nf.format(taskId.getId()) + ".json" + extension);
          Path workOutputPath = ((FileOutputCommitter) getOutputCommitter(ctx)).getWorkPath();
          Path file = new Path(workOutputPath, path);
          FileSystem fs = file.getFileSystem(conf);
          // Overwrite existing file (second argument is 'true').
          // This seems to be necessary if one reducer failed and was restarted.
          FSDataOutputStream fileOut = fs.create(file, true);
          if (isCompressed)
            return new DataOutputStream(codec.createOutputStream(fileOut));
          else
            return fileOut;
        }
      };
    }

    // ------------------------------------------------------------------------

    protected static class MyLineRecordWriter
        extends TextOutputFormat.LineRecordWriter<WritableComparable, Text> {

      public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        super(out, keyValueSeparator);
      }
    }
  }
}
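The class above only defines the store function; the sketch below shows one hypothetical way to drive it from an embedded Pig job. The class name GAMultiStorageExample, the jar and file paths, the aliases and the FOREACH/TOTUPLE reshaping step are illustrative assumptions, not part of the original code. The only hard requirement, read off putNext(), is the tuple layout: field 0 must be a tuple whose first element is the game id, and field 1 a bag whose tuples carry the line to write at index 2.

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

// Minimal, hypothetical driver for GAMultiStorage; paths and aliases are placeholders.
public class GAMultiStorageExample {
  public static void main(String[] args) throws Exception {
    PigServer pig = new PigServer(ExecType.LOCAL);
    // Hypothetical jar containing the compiled GAMultiStorage class.
    pig.registerJar("piggybank-gamultistorage.jar");

    // Illustrative input schema: field 'json' (index 2) is what putNext() writes out.
    pig.registerQuery("events = LOAD 'input/events.tsv'"
        + " AS (game_id:chararray, ts:long, json:chararray);");
    pig.registerQuery("by_game = GROUP events BY game_id;");
    // Wrap the group key in a tuple so field 0 matches ((Tuple) t.get(0)).get(0) in putNext().
    pig.registerQuery("shaped = FOREACH by_game GENERATE TOTUPLE(group) AS key, events;");

    // Second constructor argument toggles stats-only mode; the third picks the codec
    // ('none', 'gz', 'bz' or 'bz2'), which setStoreLocation() maps to a Hadoop codec.
    pig.store("shaped", "output/by_game",
        "org.apache.pig.piggybank.storage.GAMultiStorage('output/by_game', 'false', 'gz')");
  }
}

With a setup like this, each group would land in its own directory under the store location, as <gameId>/<zero-padded task id>.json plus the codec extension, because createOutputStream() builds the file path from the key value and the task id.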