org.apache.hadoop.tools.rumen.Folder.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.tools.rumen.Folder.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools.rumen;

import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Random;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Folder extends Configured implements Tool {
    private static final Log LOG = LogFactory.getLog(Folder.class);

    // Durations and submit times are handled in ticks; this many ticks make
    // one second.
    private static final long TICKS_PER_SECOND = 1000L;

    // error return codes
    private static final int NON_EXISTENT_FILES = 1;
    private static final int NO_INPUT_CYCLE_LENGTH = 2;
    private static final int EMPTY_JOB_TRACE = 3;
    private static final int OUT_OF_ORDER_JOBS = 4;
    private static final int ALL_JOBS_SIMULTANEOUS = 5;
    private static final int IO_ERROR = 6;
    private static final int OTHER_ERROR = 7;

    // ---- settings filled in from the command line by initialize() ----

    // -output-duration, in ticks; replaced by one hour when left non-positive
    private long outputDuration = -1;
    // -input-cycle, in ticks; initialize() refuses to continue without it
    private long inputCycle = -1;
    // -concentration; a factor in the transcription probability
    private double concentration = 1.0;
    // -seed; irrelevant if seeded == false
    private long randomSeed = 0;
    private boolean seeded = false;
    // -debug; enables extra logging and keeps the temp segment files
    private boolean debug = false;
    // -allow-missorting; its negation is handed to the deskewed reader
    private boolean allowMissorting = false;
    // -skew-buffer-length; handed to the deskewed reader
    private int skewBufferLength = 0;
    // -starts-after, in ticks; jobs submitted earlier than this offset from the
    // first job are skipped
    private long startsAfter = -1;

    // ---- streams and working state ----

    // source of input jobs
    private DeskewedJobTraceReader reader = null;
    // sink for the folded trace
    private Outputter<LoggedJob> outGen = null;

    // directory in which the temp segment files are created
    private Path tempDir = null;
    // the temp segment files, in creation order
    private List<Path> tempPaths = new LinkedList<Path>();

    // resources and files that run() releases when it finishes
    private Set<Closeable> closees = new HashSet<Closeable>();
    private Set<Path> deletees = new HashSet<Path>();

    // submit time of the first job that is kept
    private long firstJobSubmitTime;

    // outputDuration / inputCycle
    private double timeDilation;

    // integer and fractional parts of the transcription probability: every job
    // is written the integer number of times, plus once more with the
    // fractional probability
    private int transcriptionRateInteger;
    private double transcriptionRateFraction;

    private Random random;

    /**
     * Converts a duration such as {@code 30s}, {@code 15m}, {@code 2h} or
     * {@code 1d} into ticks.
     *
     * The string is a positive whole number followed by exactly one unit
     * letter: {@code d} (days), {@code h} (hours), {@code m} (minutes) or
     * {@code s} (seconds), in either case.
     *
     * @param durationString the text to parse
     * @return the duration in ticks ({@code TICKS_PER_SECOND} ticks per second)
     * @throws IllegalArgumentException if the string is null or shorter than
     *           two characters, the number is not a valid long (reported as
     *           the {@link NumberFormatException} subclass), the number is
     *           zero or negative, the unit letter is unknown, or the result
     *           does not fit in a long
     */
    static long parseDuration(String durationString) {
        // A valid duration needs at least one digit and one unit letter.
        if (durationString == null || durationString.length() < 2) {
            throw new IllegalArgumentException(
                    "A duration is a positive number followed by one of d, h, m or s, but was: "
                            + durationString);
        }

        int codeIndex = durationString.length() - 1;
        String numeral = durationString.substring(0, codeIndex);
        char durationCode = durationString.charAt(codeIndex);

        // The result is a long, so the numeral is parsed as one; parsing it as
        // an int would needlessly reject large second or minute counts.
        long result = Long.parseLong(numeral);

        if (result <= 0) {
            throw new IllegalArgumentException("Zero or negative durations are not allowed: " + durationString);
        }

        long ticksPerUnit;

        switch (durationCode) {
        case 'D':
        case 'd':
            ticksPerUnit = 24L * 60L * 60L * TICKS_PER_SECOND;
            break;
        case 'H':
        case 'h':
            ticksPerUnit = 60L * 60L * TICKS_PER_SECOND;
            break;
        case 'M':
        case 'm':
            ticksPerUnit = 60L * TICKS_PER_SECOND;
            break;
        case 'S':
        case 's':
            ticksPerUnit = TICKS_PER_SECOND;
            break;
        default:
            throw new IllegalArgumentException("Missing or invalid duration code");
        }

        // Refuse values whose tick count would silently wrap around.
        if (result > Long.MAX_VALUE / ticksPerUnit) {
            throw new IllegalArgumentException("Duration is too large: " + durationString);
        }

        return ticksPerUnit * result;
    }

    /**
     * Parses the command line, then opens the input trace, the output trace
     * and the temp directory.
     *
     * Recognized switches (case-insensitive): {@code -starts-after},
     * {@code -output-duration}, {@code -input-cycle}, {@code -concentration},
     * {@code -debug}, {@code -allow-missorting}, {@code -seed},
     * {@code -skew-buffer-length} and {@code -temp-directory}. The last two
     * arguments must be the input path and the output path.
     *
     * The input cycle is checked before anything is opened, so a missing
     * {@code -input-cycle} neither creates the output file nor leaves a stream
     * open.
     *
     * @param args the command-line arguments
     * @return 0 on success, {@code NO_INPUT_CYCLE_LENGTH} if no positive
     *         {@code -input-cycle} was given, or {@code NON_EXISTENT_FILES} if
     *         opening the input, the output or the temp directory failed
     * @throws IllegalArgumentException on an unknown switch, a switch without
     *           its value, an unparsable value, or missing or surplus path
     *           arguments
     */
    private int initialize(String[] args) throws IllegalArgumentException {
        String tempDirName = null;
        String inputPathName = null;
        String outputPathName = null;

        for (int i = 0; i < args.length; ++i) {
            String thisArg = args[i];
            if (thisArg.equalsIgnoreCase("-starts-after")) {
                startsAfter = parseDuration(valueOfSwitch(args, ++i));
            } else if (thisArg.equalsIgnoreCase("-output-duration")) {
                outputDuration = parseDuration(valueOfSwitch(args, ++i));
            } else if (thisArg.equalsIgnoreCase("-input-cycle")) {
                inputCycle = parseDuration(valueOfSwitch(args, ++i));
            } else if (thisArg.equalsIgnoreCase("-concentration")) {
                concentration = Double.parseDouble(valueOfSwitch(args, ++i));
            } else if (thisArg.equalsIgnoreCase("-debug")) {
                debug = true;
            } else if (thisArg.equalsIgnoreCase("-allow-missorting")) {
                allowMissorting = true;
            } else if (thisArg.equalsIgnoreCase("-seed")) {
                seeded = true;
                randomSeed = Long.parseLong(valueOfSwitch(args, ++i));
            } else if (thisArg.equalsIgnoreCase("-skew-buffer-length")) {
                skewBufferLength = Integer.parseInt(valueOfSwitch(args, ++i));
            } else if (thisArg.equalsIgnoreCase("-temp-directory")) {
                tempDirName = valueOfSwitch(args, ++i);
            } else if (thisArg.equals("") || thisArg.startsWith("-")) {
                throw new IllegalArgumentException("Illegal switch argument, " + thisArg + " at position " + i);
            } else {
                // The first non-switch argument is the input path and must be
                // followed by exactly one more argument, the output path.
                inputPathName = thisArg;

                if (i + 1 >= args.length) {
                    throw new IllegalArgumentException("Missing output path after input path " + thisArg);
                }

                outputPathName = args[++i];

                if (i != args.length - 1) {
                    throw new IllegalArgumentException("Too many non-switch arguments");
                }
            }
        }

        // outputPathName is always set together with inputPathName.
        if (inputPathName == null) {
            throw new IllegalArgumentException("An input path and an output path must be given");
        }

        // Checked before any stream is opened so that this early return has
        // nothing to release.
        if (inputCycle <= 0) {
            LOG.error("You must have an input cycle length.");
            return NO_INPUT_CYCLE_LENGTH;
        }

        // Becomes true once outGen.init() has returned normally; only then is
        // it closed on the failure path.
        boolean outputOpened = false;

        try {
            Configuration conf = getConf();
            Path inPath = new Path(inputPathName);
            reader = new DeskewedJobTraceReader(new JobTraceReader(inPath, conf), skewBufferLength,
                    !allowMissorting);
            Path outPath = new Path(outputPathName);

            outGen = new DefaultOutputter<LoggedJob>();
            outGen.init(outPath, conf);
            outputOpened = true;

            // Without -temp-directory the segments go next to the output file.
            tempDir = tempDirName == null ? outPath.getParent() : new Path(tempDirName);

            FileSystem fs = tempDir.getFileSystem(conf);
            if (!fs.getFileStatus(tempDir).isDirectory()) {
                throw new IOException("Your temp directory is not a directory");
            }
        } catch (IOException e) {
            LOG.error("Could not open the input trace, the output trace or the temp directory", e);

            // Release whatever was opened before the failure.
            IOUtils.cleanup(LOG, reader);
            reader = null;

            if (outputOpened) {
                try {
                    outGen.close();
                } catch (IOException closeFailure) {
                    LOG.warn("Could not close the output trace after a failed start", closeFailure);
                }
            }
            outGen = null;

            return NON_EXISTENT_FILES;
        }

        // The output covers one hour unless told otherwise.
        if (outputDuration <= 0) {
            outputDuration = 60L * 60L * TICKS_PER_SECOND;
        }

        timeDilation = (double) outputDuration / (double) inputCycle;

        random = seeded ? new Random(randomSeed) : new Random();

        if (debug) {
            // Draw and report a seed so that a debug run can be repeated with -seed.
            randomSeed = random.nextLong();

            LOG.warn("This run effectively has a -seed of " + randomSeed);

            random = new Random(randomSeed);

            seeded = true;
        }

        return 0;
    }

    /**
     * Returns the value that follows a switch on the command line.
     *
     * @param args the command-line arguments
     * @param valueIndex index at which the value is expected; the switch itself
     *          is at {@code valueIndex - 1}
     * @return {@code args[valueIndex]}
     * @throws IllegalArgumentException if the switch is the last argument
     */
    private static String valueOfSwitch(String[] args, int valueIndex) {
        if (valueIndex >= args.length) {
            throw new IllegalArgumentException("Missing value for switch " + args[valueIndex - 1]);
        }

        return args[valueIndex];
    }

    /**
     * Tool entry point: sets this instance up from the command line and, if
     * that succeeds, performs the fold.
     *
     * @param args the command-line arguments
     * @return the non-zero status from {@code initialize} if setup failed,
     *         otherwise the status of {@link #run()}
     * @throws IOException if the fold itself fails on I/O
     */
    @Override
    public int run(String[] args) throws IOException {
        final int setupStatus = initialize(args);

        return (setupStatus == 0) ? run() : setupStatus;
    }

    /**
     * Folds the input trace into the output trace.
     *
     * Pass one reads the input through {@code reader} and writes every job to
     * a temp segment file, starting a new segment whenever a job falls into a
     * later input-cycle-long interval (counted from the first kept job). Pass
     * two opens all segments at once, shifts the times of each job with
     * {@code adjustJobTimes}, merges the segments by adjusted submit time and
     * hands each job to {@code maybeOutput}.
     *
     * On the way out the reader, the output, the segment readers and (unless
     * {@code -debug} is set) the segment files are released, whether or not
     * the fold succeeded.
     *
     * @return 0 on success, or {@code EMPTY_JOB_TRACE},
     *         {@code OUT_OF_ORDER_JOBS} or {@code ALL_JOBS_SIMULTANEOUS}
     * @throws IOException if reading the input, writing a segment or writing
     *           the output fails
     */
    public int run() throws IOException {
        // Orders heap entries by the submit time of the job they carry.
        class JobEntryComparator implements Comparator<Pair<LoggedJob, JobTraceReader>> {
            public int compare(Pair<LoggedJob, JobTraceReader> p1, Pair<LoggedJob, JobTraceReader> p2) {
                long t1 = p1.first().getSubmitTime();
                long t2 = p2.first().getSubmitTime();

                return (t1 < t2) ? -1 : ((t1 == t2) ? 0 : 1);
            }
        }

        // we initialize an empty heap so if we take an error before establishing
        // a real one the finally code goes through
        Queue<Pair<LoggedJob, JobTraceReader>> heap = new PriorityQueue<Pair<LoggedJob, JobTraceReader>>();

        try {
            LoggedJob job = reader.nextJob();

            if (job == null) {
                LOG.error("The job trace is empty");

                return EMPTY_JOB_TRACE;
            }

            // If starts-after time is specified, skip the number of jobs till we reach
            // the starting time limit.
            if (startsAfter > 0) {
                LOG.info("starts-after time is specified. Initial job submit time : " + job.getSubmitTime());

                long approximateTime = job.getSubmitTime() + startsAfter;

                // The very first job is dropped here, so the count starts at one.
                job = reader.nextJob();
                long skippedCount = 1;
                while (job != null && job.getSubmitTime() < approximateTime) {
                    job = reader.nextJob();
                    skippedCount++;
                }

                LOG.debug("Considering jobs with submit time greater than " + startsAfter + " ms. Skipped "
                        + skippedCount + " jobs.");

                if (job == null) {
                    LOG.error("No more jobs to process in the trace with 'starts-after'" + " set to " + startsAfter
                            + "ms.");
                    return EMPTY_JOB_TRACE;
                }
                LOG.info("The first job has a submit time of " + job.getSubmitTime());
            }

            firstJobSubmitTime = job.getSubmitTime();
            long lastJobSubmitTime = firstJobSubmitTime;

            int numberJobs = 0;

            // Smaller than any submit time, so the first job always opens a segment.
            long currentIntervalEnd = Long.MIN_VALUE;

            Outputter<LoggedJob> tempGen = null;

            if (debug) {
                LOG.debug("The first job has a submit time of " + firstJobSubmitTime);
            }

            final Configuration conf = getConf();

            // One generator serves all segment names of this run.
            final Random tempNameGenerator = new Random();

            try {
                while (job != null) {
                    lastJobSubmitTime = job.getSubmitTime();

                    ++numberJobs;

                    if (job.getSubmitTime() >= currentIntervalEnd) {
                        if (tempGen != null) {
                            // Forget the segment before closing it so that the
                            // finally block below cannot close it a second time.
                            Outputter<LoggedJob> finishedSegment = tempGen;
                            tempGen = null;
                            finishedSegment.close();
                        }

                        Path nextSegment = chooseUnusedSegmentPath(tempNameGenerator, conf);

                        if (debug) {
                            LOG.debug("Creating " + nextSegment + " for a job with a submit time of "
                                    + job.getSubmitTime());
                        }

                        deletees.add(nextSegment);

                        tempPaths.add(nextSegment);

                        // Only a fully initialized outputter becomes tempGen, so
                        // the finally block never closes a half-opened one.
                        Outputter<LoggedJob> openedSegment = new DefaultOutputter<LoggedJob>();
                        openedSegment.init(nextSegment, conf);
                        tempGen = openedSegment;

                        long currentIntervalNumber = (job.getSubmitTime() - firstJobSubmitTime) / inputCycle;

                        currentIntervalEnd = firstJobSubmitTime + ((currentIntervalNumber + 1) * inputCycle);
                    }

                    // the temp files contain unadjusted times, but each temp file's
                    // content is in the same input cycle interval.
                    if (tempGen != null) {
                        tempGen.output(job);
                    }

                    job = reader.nextJob();
                }
            } catch (DeskewedJobTraceReader.OutOfOrderException e) {
                LOG.error("The input trace has jobs that are out of order", e);

                return OUT_OF_ORDER_JOBS;
            } finally {
                if (tempGen != null) {
                    tempGen.close();
                }
            }

            if (lastJobSubmitTime <= firstJobSubmitTime) {
                LOG.error("All of your job[s] have the same submit time." + "  Please just use your input file.");

                return ALL_JOBS_SIMULTANEOUS;
            }

            double submitTimeSpan = lastJobSubmitTime - firstJobSubmitTime;

            LOG.warn("Your input trace spans " + (lastJobSubmitTime - firstJobSubmitTime) + " ticks.");

            // Number of input cycles the trace covers, with the span stretched
            // by (numberJobs + 1) / numberJobs.
            double foldingRatio = submitTimeSpan * (numberJobs + 1) / numberJobs / inputCycle;

            if (debug) {
                LOG.warn("run: submitTimeSpan = " + submitTimeSpan + ", numberJobs = " + numberJobs
                        + ", inputCycle = " + inputCycle);
            }

            if (reader.neededSkewBufferSize() > 0) {
                LOG.warn("You needed a -skew-buffer-length of " + reader.neededSkewBufferSize()
                        + " but no more, for this input.");
            }

            double tProbability = timeDilation * concentration / foldingRatio;

            if (debug) {
                LOG.warn("run: timeDilation = " + timeDilation + ", concentration = " + concentration
                        + ", foldingRatio = " + foldingRatio);
                LOG.warn("The transcription probability is " + tProbability);
            }

            // maybeOutput() writes each job the integer part many times and
            // once more with the fractional part as probability.
            transcriptionRateInteger = (int) Math.floor(tProbability);
            transcriptionRateFraction = tProbability - Math.floor(tProbability);

            // Now read all the inputs in parallel
            heap = new PriorityQueue<Pair<LoggedJob, JobTraceReader>>(tempPaths.size(), new JobEntryComparator());

            for (Path tempPath : tempPaths) {
                JobTraceReader thisReader = new JobTraceReader(tempPath, conf);

                closees.add(thisReader);

                LoggedJob streamFirstJob = thisReader.getNext();

                long thisIndex = (streamFirstJob.getSubmitTime() - firstJobSubmitTime) / inputCycle;

                if (debug) {
                    LOG.debug("A job with submit time of " + streamFirstJob.getSubmitTime() + " is in interval # "
                            + thisIndex);
                }

                adjustJobTimes(streamFirstJob);

                if (debug) {
                    LOG.debug("That job's submit time is adjusted to " + streamFirstJob.getSubmitTime());
                }

                heap.add(new Pair<LoggedJob, JobTraceReader>(streamFirstJob, thisReader));
            }

            // Merge: take the earliest job, then refill the heap from the
            // segment that job came from.
            Pair<LoggedJob, JobTraceReader> next = heap.poll();

            while (next != null) {
                maybeOutput(next.first());

                if (debug) {
                    LOG.debug("The most recent job has an adjusted submit time of " + next.first().getSubmitTime());
                    LOG.debug(" Its replacement in the heap will come from input engine " + next.second());
                }

                LoggedJob replacement = next.second().getNext();

                if (replacement == null) {
                    next.second().close();

                    if (debug) {
                        LOG.debug("That input engine is depleted.");
                    }
                } else {
                    adjustJobTimes(replacement);

                    if (debug) {
                        LOG.debug("The replacement has an adjusted submit time of " + replacement.getSubmitTime());
                    }

                    heap.add(new Pair<LoggedJob, JobTraceReader>(replacement, next.second()));
                }

                next = heap.poll();
            }
        } finally {
            try {
                IOUtils.cleanup(null, reader);
                if (outGen != null) {
                    // A failure to close the output still propagates.
                    outGen.close();
                }
            } finally {
                // Runs even if closing the output failed. The segment readers
                // are closed on a best-effort basis so that one failure
                // neither hides another nor stops the temp files from being
                // removed.
                for (Pair<LoggedJob, JobTraceReader> heapEntry : heap) {
                    IOUtils.cleanup(LOG, heapEntry.second());
                }
                for (Closeable closee : closees) {
                    IOUtils.cleanup(LOG, closee);
                }
                if (!debug) {
                    deleteTempSegments();
                }
            }
        }

        return 0;
    }

    /**
     * Picks a random segment file name in the temp directory that is not in
     * use yet, trying up to three names.
     *
     * @param nameGenerator source of the random part of the name
     * @param conf configuration used to reach the file system
     * @return a path for which the file system reported that nothing exists
     * @throws IOException if a candidate could not be checked and none of the
     *           three turned out to be usable
     * @throws RuntimeException if all three candidates already exist
     */
    private Path chooseUnusedSegmentPath(Random nameGenerator, Configuration conf) throws IOException {
        IOException lastFailure = null;

        for (int attempt = 0; attempt < 3; ++attempt) {
            Path candidate = new Path(tempDir, "segment-" + nameGenerator.nextLong() + ".json.gz");

            if (debug) {
                LOG.debug("The next segment name is " + candidate);
            }

            try {
                if (!candidate.getFileSystem(conf).exists(candidate)) {
                    return candidate;
                }
                // The name is taken; try another one.
            } catch (IOException e) {
                // The name could not be checked; remember why and try another one.
                lastFailure = e;
            }
        }

        if (lastFailure != null) {
            throw lastFailure;
        }

        throw new RuntimeException("Failed to create a new file!");
    }

    /**
     * Deletes the temp segment files of this run. A file that cannot be
     * deleted is reported and skipped.
     */
    private void deleteTempSegments() {
        Configuration conf = getConf();

        for (Path deletee : deletees) {
            try {
                deletee.getFileSystem(conf).delete(deletee, false);
            } catch (IOException e) {
                LOG.warn("Could not delete the temp file " + deletee, e);
            }
        }
    }

    /**
     * Writes a job to the output a randomized number of times: first
     * {@code transcriptionRateInteger} copies, then one more copy if a fresh
     * draw from {@code random} is below {@code transcriptionRateFraction}.
     *
     * @param job the job to write
     * @throws IOException if the output rejects the job
     */
    private void maybeOutput(LoggedJob job) throws IOException {
        // the guaranteed copies
        for (int remaining = transcriptionRateInteger; remaining > 0; --remaining) {
            outGen.output(job);
        }

        // the copy that depends on chance
        final boolean extraCopy = random.nextDouble() < transcriptionRateFraction;

        if (extraCopy) {
            outGen.output(job);
        }
    }

    /**
     * Moves a job to its place in the folded trace.
     *
     * The job's offset inside its input cycle is scaled by
     * {@code timeDilation} and added to the first job's submit time; the
     * difference between that target and the job's current submit time is
     * passed to {@code adjustTimes} (presumably shifting the job's times by
     * that amount; see LoggedJob).
     *
     * @param adjustee the job to move; changed in place
     */
    private void adjustJobTimes(LoggedJob adjustee) {
        final long sinceFirstJob = adjustee.getSubmitTime() - firstJobSubmitTime;

        // position inside the cycle, stretched or squeezed to the output scale
        final double dilatedOffset = (double) (sinceFirstJob % inputCycle) * timeDilation;

        final long foldedSubmitTime = firstJobSubmitTime + (long) dilatedOffset;

        adjustee.adjustTimes(foldedSubmitTime - adjustee.getSubmitTime());
    }

    /**
     * Command-line entry point. Runs a new {@code Folder} through
     * {@code ToolRunner} and exits the JVM with a non-zero status on failure:
     * the tool's own status, {@code IO_ERROR} for an {@link IOException}, or
     * {@code OTHER_ERROR} for any other exception.
     *
     * @param args the command-line arguments, passed on to the tool
     */
    public static void main(String[] args) {
        int exitStatus;

        try {
            exitStatus = ToolRunner.run(new Folder(), args);
        } catch (IOException e) {
            e.printStackTrace(System.err);
            exitStatus = IO_ERROR;
        } catch (Exception e) {
            e.printStackTrace(System.err);
            exitStatus = OTHER_ERROR;
        }

        // A zero status simply lets main return.
        if (exitStatus != 0) {
            System.exit(exitStatus);
        }
    }
}