Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership.  The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * Copyright 2013 Endgame Inc.
 */
package com.endgame.binarypig.loaders;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
import org.apache.pig.FileInputLoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import com.endgame.binarypig.util.StreamUtils;
import com.google.common.base.Stopwatch;

/**
 * Base Pig loader that reads key/value records from a sequence file, drops each
 * value to a local file, and hands that file to a subclass for processing.
 */
public abstract class AbstractFileDroppingLoader extends FileInputLoadFunc {

    long timeoutMS = Long.MAX_VALUE;
    File dataDir;
    File workingDir; // might not need this
    SequenceFileRecordReader<Text, BytesWritable> reader;
    Text key;
    BytesWritable value;
    ArrayList<Object> protoTuple = new ArrayList<>();
    TupleFactory tupleFactory = TupleFactory.getInstance();
    long numRecords = 0;

    Stopwatch copyOverhead = new Stopwatch();
    Stopwatch execOverhead = new Stopwatch();
    Stopwatch deleteOverhead = new Stopwatch();
    Stopwatch tupleCreationOverhead = new Stopwatch();
    Stopwatch totalOverhead = new Stopwatch();

    boolean useDevShm;

    public AbstractFileDroppingLoader() {
        this(Long.toString(Long.MAX_VALUE));
    }

    public AbstractFileDroppingLoader(String timeoutMS) {
        this(timeoutMS, "false");
    }

    public AbstractFileDroppingLoader(String timeoutMS, String useDevShm) {
        this.timeoutMS = Long.parseLong(timeoutMS);
        this.useDevShm = Boolean.parseBoolean(useDevShm);
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
        // Cast the reader to a SequenceFileRecordReader. We don't care about the split itself;
        // the base file type is always a sequence file, and all the fanciness happens after reading it.
        try {
            FileSplit filesplit = (FileSplit) split.getWrappedSplit();
            System.out.println("filesplit: " + filesplit.getPath());
        } catch (NullPointerException e) {
            // ignore: the split path is only logged
        } catch (Exception e) {
            e.printStackTrace();
        }

        this.reader = (SequenceFileRecordReader) reader;
        workingDir = new File(".").getAbsoluteFile();
        makeDataDir();
        init();
    }

    public void makeDataDir() {
        if (useDevShm) {
            dataDir = new File("/dev/shm/" + UUID.randomUUID().toString());
        } else {
            // the data dir is here so the input file doesn't clobber any of the files in the CWD
            dataDir = new File(workingDir, "_data");
        }

        if (!dataDir.mkdir() && !dataDir.exists()) {
            throw new RuntimeException("Could not create data dir: " + dataDir);
        }
        dataDir.deleteOnExit();
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        // Define the kind of file in HDFS that we are dealing with: a sequence file.
        return new SequenceFileInputFormat();
    }

    boolean shouldContinue() throws IOException {
        // determine if there is another key/value
        try {
            return reader.nextKeyValue();
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    public static void dumpStat(String name, Stopwatch stat, long totalUsec, long records) {
        long uSec = stat.elapsedTime(TimeUnit.MICROSECONDS);
        double percentTotal = (100.0 * uSec) / totalUsec;
        double usecPerRecord = (double) uSec / (double) records;
        System.err.printf("STAT:%s: %.4f uSec/rec, %s, %.2f percent total overhead\n",
                name, usecPerRecord, stat.toString(4), percentTotal);
    }

    public void dumpStats() {
        System.err.println("workDir = " + workingDir);
        System.err.println("dataDir = " + dataDir);
        System.err.println("useDevShm = " + useDevShm);
        System.err.println("numRecords = " + numRecords);

        long usec = totalOverhead.elapsedTime(TimeUnit.MICROSECONDS);
        dumpStat("copyOverhead", copyOverhead, usec, numRecords);
        dumpStat("deleteOverhead", deleteOverhead, usec, numRecords);
        dumpStat("execOverhead", execOverhead, usec, numRecords);
        dumpStat("tupleCreationOverhead", tupleCreationOverhead, usec, numRecords);
        dumpStat("totalOverhead", totalOverhead, usec, numRecords);
    }

    @Override
    public Tuple getNext() throws IOException {
        if (!shouldContinue()) {
            dumpStats();
            cleanUp();
            return null;
        }

        totalOverhead.start();
        key = reader.getCurrentKey();
        value = reader.getCurrentValue();
        ++numRecords;

        copyOverhead.start();
        File binaryFile = new File(dataDir, key.toString());
        binaryFile.deleteOnExit();
        try {
            StreamUtils.writeToFile((BytesWritable) value, binaryFile);
            copyOverhead.stop();

            execOverhead.start();
            Tuple tuple = processFile(key, value, binaryFile);
            execOverhead.stop();
            return tuple;
        } catch (Exception e) {
            throw new IOException(e);
        } finally {
            // do our best to prevent orphan files (important when using /dev/shm)
            deleteOverhead.start();
            binaryFile.delete();
            deleteOverhead.stop();
            totalOverhead.stop();
        }
    }

    public void cleanUp() {
        try {
            FileUtils.deleteDirectory(dataDir);
        } catch (NullPointerException e) {
            // ignore: dataDir may never have been created
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void init() throws IOException {
    }

    public abstract Tuple processFile(Text key, BytesWritable value, File binaryFile) throws IOException;

    @Override
    public void setLocation(String location, Job job) throws IOException {
        // implement distributed cache loading here
        FileInputFormat.setInputPaths(job, location);
    }

    public File getDataDir() {
        return dataDir;
    }

    public File getWorkingDir() {
        return workingDir;
    }

    public ArrayList<Object> getProtoTuple() {
        return protoTuple;
    }

    public TupleFactory getTupleFactory() {
        return tupleFactory;
    }

    public long getTimeoutMS() {
        return timeoutMS;
    }

    public void setTimeoutMS(long timeoutMS) {
        this.timeoutMS = timeoutMS;
    }
}
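To see how the base class is meant to be extended, here is a minimal sketch of a concrete loader. The class name ExampleSizeLoader and its (key, size) output schema are assumptions made for this illustration, not part of BinaryPig; the only required work is implementing processFile(), since getNext() in the base class already copies each record's bytes to a local file and deletes it afterwards.

package com.endgame.binarypig.loaders;

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.pig.data.Tuple;

// Hypothetical subclass used only to illustrate the AbstractFileDroppingLoader
// contract; it is not part of the BinaryPig codebase.
public class ExampleSizeLoader extends AbstractFileDroppingLoader {

    @Override
    public Tuple processFile(Text key, BytesWritable value, File binaryFile) throws IOException {
        // binaryFile already holds the record's bytes, written to the local
        // data dir (or /dev/shm) by getNext(); emit a (key, file size) tuple.
        getProtoTuple().clear();
        getProtoTuple().add(key.toString());
        getProtoTuple().add(binaryFile.length());
        return getTupleFactory().newTuple(getProtoTuple());
    }
}

A real subclass would typically run an external tool or parser against binaryFile here (honoring getTimeoutMS()) and build the output tuple from its results, which is where the /dev/shm option and the copy/exec/delete stopwatches in the base class pay off.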