org.apache.pig.piggybank.storage.GAMultiStorage.java Source code

Introduction

Here is the source code for org.apache.pig.piggybank.storage.GAMultiStorage.java; a short usage sketch follows the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */
package org.apache.pig.piggybank.storage;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.StorageUtil;

/**
 * A store UDF that dynamically splits the output data into a set of directories
 * and files, based on a user-specified key field in the output tuple.
 */
public class GAMultiStorage extends StoreFunc {

    private Path outputPath; // User specified output Path
    private boolean statsOnly = false;
    private Compression comp; // Compression type of output data.

    // Default size for the byte buffer, should fit most tuples.
    private static final int BUF_SIZE = 1024;

    // Compression types supported by this store
    enum Compression {
        none, bz2, bz, gz
    }

    public GAMultiStorage(String parentPathStr, String statsOnly) {
        this(parentPathStr, statsOnly, "none");
    }

    /**
     * Constructor
     * 
     * @param parentPathStr
     *            Parent output directory path
     * @param statsOnly
     *            'true' to emit only per-game statistics (game_id;number_of_tuples;memory_size)
     *            instead of the tuple data, 'false' to write the data itself
     * @param compression
     *            'bz2', 'bz', 'gz' or 'none'
     */
    public GAMultiStorage(String parentPathStr, String statsOnly, String compression) {
        this.outputPath = new Path(parentPathStr);
        this.statsOnly = Boolean.parseBoolean(statsOnly);

        try {
            this.comp = (compression == null) ? Compression.none : Compression.valueOf(compression.toLowerCase());
        } catch (IllegalArgumentException e) {
            System.err.println("Exception when converting compression string: " + compression
                    + " to enum. No compression will be used");
            this.comp = Compression.none;
        }
    }

    // --------------------------------------------------------------------------
    // Implementation of StoreFunc

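    // Record writer supplied by Pig through prepareToWrite(); used by putNext() below.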
    private RecordWriter<String, Text> writer;

    @Override
    public void putNext(Tuple t) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(BUF_SIZE);

        // This limit (64 MB / 8 = 8 MB) bounds the in-memory buffer so the reducer
        // does not run out of memory while writing data for large games.
        int limit = 67108864 / 8;
        boolean partialSave = false;

        try {
            DataBag bag = (DataBag) t.get(1);
            String gameId = String.valueOf(((Tuple) t.get(0)).get(0));

            if (this.statsOnly) {
                // produces debug stats as a semicolon-separated line: game_id;number_of_tuples;memory_size
                writer.write(gameId, new Text(
                        gameId + ";" + String.valueOf(bag.size()) + ";" + String.valueOf(t.getMemorySize())));
            } else {
                Iterator<Tuple> iterator = bag.iterator();
                while (iterator.hasNext()) {
                    Tuple t1 = iterator.next();
                    String line = t1.get(2).toString();

                    // Lines from multiple source files are written into one output and the
                    // collectors do not append a trailing newline, so add one here.
                    if (!line.endsWith("\n")) {
                        line += "\n";
                    }
                    baos.write(line.getBytes(Charset.forName("UTF-8")));

                    // Flush a partial result to avoid exceeding the JVM array size limit.
                    if (baos.size() >= limit) {
                        writer.write(gameId, removeLastByte(baos.toByteArray()));
                        baos.reset();
                        partialSave = true;
                    } else {
                        partialSave = false;
                    }
                }

                // Flush whatever remains in the buffer, unless the last
                // iteration already wrote everything out.
                if (!partialSave) {
                    writer.write(gameId, removeLastByte(baos.toByteArray()));
                }
            }
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    // Strip the trailing newline added in putNext(); the line record writer appends its own.
    private Text removeLastByte(byte[] input) {
        if (input.length == 0) return new Text(); // guard against an empty bag
        return new Text(ArrayUtils.remove(input, input.length - 1));
    }

    @SuppressWarnings("unchecked")
    @Override
    public OutputFormat getOutputFormat() throws IOException {
        return new MultiStorageOutputFormat();
    }

    @SuppressWarnings("unchecked")
    @Override
    public void prepareToWrite(RecordWriter writer) throws IOException {
        this.writer = writer;
    }

    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
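        // Use an empty separator so TextOutputFormat does not insert a tab between key and value.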
        job.getConfiguration().set("mapred.textoutputformat.separator", "");
        FileOutputFormat.setOutputPath(job, new Path(location));
        if (comp == Compression.bz2 || comp == Compression.bz) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        } else if (comp == Compression.gz) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        }
    }

    // --------------------------------------------------------------------------
    // Implementation of OutputFormat

    public static class MultiStorageOutputFormat extends TextOutputFormat<String, Text> {

        private String keyValueSeparator = "";

        @Override
        public RecordWriter<String, Text> getRecordWriter(TaskAttemptContext context)
                throws IOException, InterruptedException {

            final TaskAttemptContext ctx = context;

            return new RecordWriter<String, Text>() {

                private Map<String, MyLineRecordWriter> storeMap = new HashMap<String, MyLineRecordWriter>();

                private static final int BUFFER_SIZE = 1024;

                private ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);

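                // Serialize the value into the scratch buffer, then hand it to the writer for this key.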
                @Override
                public void write(String key, Text val) throws IOException {
                    StorageUtil.putField(mOut, val.toString());
                    getStore(key).write(null, new Text(mOut.toByteArray()));
                    mOut.reset();
                }

                @Override
                public void close(TaskAttemptContext context) throws IOException {
                    for (MyLineRecordWriter out : storeMap.values()) {
                        out.close(context);
                    }
                }

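                // Lazily create and cache one line writer per key; each key value gets its own output file.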
                private MyLineRecordWriter getStore(String fieldValue) throws IOException {
                    MyLineRecordWriter store = storeMap.get(fieldValue);
                    if (store == null) {
                        DataOutputStream os = createOutputStream(fieldValue);
                        store = new MyLineRecordWriter(os, keyValueSeparator);
                        storeMap.put(fieldValue, store);
                    }
                    return store;
                }

                private DataOutputStream createOutputStream(String fieldValue) throws IOException {
                    Configuration conf = ctx.getConfiguration();
                    TaskID taskId = ctx.getTaskAttemptID().getTaskID();

                    // Check whether compression is enabled; if so, look up the codec
                    // and append its default extension to the file name.
                    boolean isCompressed = getCompressOutput(ctx);
                    CompressionCodec codec = null;
                    String extension = "";
                    if (isCompressed) {
                        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(ctx,
                                GzipCodec.class);
                        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, ctx.getConfiguration());
                        extension = codec.getDefaultExtension();
                    }

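                    // Zero-pad the task id to four digits so part files sort naturally, e.g. 0003.json.gz.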
                    NumberFormat nf = NumberFormat.getInstance();
                    nf.setMinimumIntegerDigits(4);

                    Path path = new Path(fieldValue, nf.format(taskId.getId()) + ".json" + extension);
                    Path workOutputPath = ((FileOutputCommitter) getOutputCommitter(ctx)).getWorkPath();
                    Path file = new Path(workOutputPath, path);
                    FileSystem fs = file.getFileSystem(conf);
                    // Overwrite any existing file (second argument is 'true'); this appears
                    // to be necessary when a failed reducer attempt is restarted.
                    FSDataOutputStream fileOut = fs.create(file, true);

                    if (isCompressed)
                        return new DataOutputStream(codec.createOutputStream(fileOut));
                    else
                        return fileOut;
                }

            };
        }

        // ------------------------------------------------------------------------
        //

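        // Thin subclass used to instantiate TextOutputFormat's protected LineRecordWriter;
        // the parent writer appends a newline after each value, which is why putNext()
        // strips the trailing newline before writing.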
        protected static class MyLineRecordWriter
                extends TextOutputFormat.LineRecordWriter<WritableComparable, Text> {

            public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
                super(out, keyValueSeparator);
            }
        }
    }
}
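
Usage example

A minimal sketch, assuming Pig and Hadoop are on the classpath, of how the class above is driven. The wrapper class GAMultiStorageSketch, the /tmp/ga-output path and the sample game id and event line are illustrative only; in a real job Pig itself calls setStoreLocation(), prepareToWrite() and putNext(), and the record writer comes from MultiStorageOutputFormat rather than the stub used here.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.piggybank.storage.GAMultiStorage;

public class GAMultiStorageSketch {

    public static void main(String[] args) throws Exception {
        // Constructor arguments mirror the listing: parent output path,
        // statsOnly flag and compression type ('bz2', 'bz', 'gz' or 'none').
        GAMultiStorage store = new GAMultiStorage("/tmp/ga-output", "false", "gz");

        // Front-end call: sets the output path and the gzip codec on the job.
        Job job = Job.getInstance(new Configuration());
        store.setStoreLocation("/tmp/ga-output", job);

        // Stub record writer standing in for MultiStorageOutputFormat's writer,
        // so the tuple handling in putNext() can be observed in isolation.
        RecordWriter<String, Text> collector = new RecordWriter<String, Text>() {
            @Override
            public void write(String gameId, Text line) {
                System.out.println(gameId + " -> " + line);
            }

            @Override
            public void close(TaskAttemptContext context) {
            }
        };
        store.prepareToWrite(collector);

        // Build an input tuple of the shape putNext() expects:
        // field 0 = a tuple whose first element is the game id,
        // field 1 = a bag of tuples whose third field (index 2) holds the output line.
        TupleFactory tf = TupleFactory.getInstance();
        DataBag bag = BagFactory.getInstance().newDefaultBag();
        bag.add(tf.newTuple(Arrays.asList("game42", "2015-01-01", "{\"event\":\"login\"}")));

        Tuple input = tf.newTuple(2);
        input.set(0, tf.newTuple("game42"));
        input.set(1, bag);

        store.putNext(input); // prints: game42 -> {"event":"login"}
    }
}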