/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package datafu.hourglass.jobs;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.TaskCompletionEvent;
import org.apache.hadoop.mapred.TaskReport;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;

/**
 * A derivation of {@link Job} that stages its output in another location and only
 * moves it to the final destination if the job completes successfully.
 * It also writes a counters file to the file system containing the counters fetched
 * from Hadoop and other task statistics.
 */
public class StagedOutputJob extends Job implements Callable<Boolean>
{
  private final String _stagingPrefix;
  private final Logger _log;
  private Path _countersPath;
  private Path _countersParentPath;
  private boolean _writeCounters = false;

  /**
   * Creates a job that uses a temporary staging location for the output data.
   * The data is only copied to the final output directory on successful completion
   * of the job.  This prevents existing output data from being overwritten unless
   * the job completes successfully.
   *
   * @param conf configuration
   * @param jobName job name
   * @param inputPaths input paths
   * @param stagingLocation where to stage output temporarily
   * @param outputPath output path
   * @param log logger
   * @return job
   */
  public static StagedOutputJob createStagedJob(Configuration conf,
                                                String jobName,
                                                List<String> inputPaths,
                                                String stagingLocation,
                                                String outputPath,
                                                final Logger log)
  {
    final StagedOutputJob retVal;
    try
    {
      retVal = new StagedOutputJob(conf, stagingLocation, log);
      retVal.setJobName(jobName);
      retVal.setJarByClass(getCallersClass());
      FileInputFormat.setInputPathFilter(retVal, HiddenFilePathFilter.class);
    }
    catch (IOException e)
    {
      log.error("IOException when making a job", e);
      throw new RuntimeException(e);
    }

    if (inputPaths != null)
    {
      try
      {
        FileInputFormat.setInputPaths(retVal, StringUtils.join(inputPaths.iterator(), ","));
      }
      catch (IOException e)
      {
        log.error("Unable to set up input paths.", e);
        throw new RuntimeException(e);
      }
    }

    FileOutputFormat.setOutputPath(retVal, new Path(outputPath));

    return retVal;
  }
  /**
   * Initializes the job.
   *
   * @param conf configuration
   * @param stagingPrefix where to stage output temporarily
   * @param log logger
   * @throws IOException if the job cannot be initialized
   */
  public StagedOutputJob(Configuration conf, String stagingPrefix, Logger log) throws IOException
  {
    super(conf);
    this._stagingPrefix = stagingPrefix;
    this._log = log;
  }

  /**
   * Gets the path where the counters will be stored.  If this is not set then by default
   * the counters will be stored in the output directory.
   *
   * @return parent path for counters
   */
  public Path getCountersParentPath()
  {
    return _countersParentPath;
  }

  /**
   * Sets the path where the counters will be stored.  If this is not set then by default
   * the counters will be stored in the output directory.
   *
   * @param path parent path for counters
   */
  public void setCountersParentPath(Path path)
  {
    _countersParentPath = path;
  }

  /**
   * Gets the path the counters were written to.
   *
   * @return counters path
   */
  public Path getCountersPath()
  {
    return _countersPath;
  }

  /**
   * Gets whether counters should be written.
   *
   * @return true if counters should be written
   */
  public boolean getWriteCounters()
  {
    return _writeCounters;
  }

  /**
   * Sets whether counters should be written.
   *
   * @param writeCounters true if counters should be written
   */
  public void setWriteCounters(boolean writeCounters)
  {
    this._writeCounters = writeCounters;
  }

  /**
   * Runs the job.
   */
  @Override
  public Boolean call() throws Exception
  {
    try
    {
      boolean success = waitForCompletion(false);

      String jobId = "?";
      if (getJobID() != null)
      {
        jobId = String.format("job_%s_%d", getJobID().getJtIdentifier(), getJobID().getId());
      }

      if (success)
      {
        _log.info(String.format("Job %s with ID %s succeeded! Tracking URL: %s",
                                getJobName(), jobId, this.getTrackingURL()));
      }
      else
      {
        _log.error(String.format("Job %s with ID %s failed! Tracking URL: %s",
                                 getJobName(), jobId, this.getTrackingURL()));
      }

      return success;
    }
    catch (Exception e)
    {
      _log.error("Exception: " + e.toString());
      throw new Exception(e);
    }
  }

  /**
   * Runs the job and waits for it to complete.  Output will be temporarily stored
   * under the staging path.  If the job is successful it will be moved to the final
   * location.
   */
  @Override
  public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException
  {
    final Path actualOutputPath = FileOutputFormat.getOutputPath(this);
    final Path stagedPath = new Path(String.format("%s/%s/staged", _stagingPrefix, System.currentTimeMillis()));

    FileOutputFormat.setOutputPath(this, stagedPath);

    // Kill the job if the process shuts down while the job is still running.
    final Thread hook = new Thread(new Runnable()
    {
      @Override
      public void run()
      {
        try
        {
          killJob();
        }
        catch (IOException e)
        {
          e.printStackTrace();
        }
      }
    });

    Runtime.getRuntime().addShutdownHook(hook);

    final boolean retVal = super.waitForCompletion(verbose);

    Runtime.getRuntime().removeShutdownHook(hook);

    if (retVal)
    {
      FileSystem fs = actualOutputPath.getFileSystem(getConfiguration());

      fs.mkdirs(actualOutputPath);

      _log.info(String.format("Deleting data at old path[%s]", actualOutputPath));
      fs.delete(actualOutputPath, true);

      _log.info(String.format("Moving from staged path[%s] to final resting place[%s]", stagedPath, actualOutputPath));
      boolean renamed = fs.rename(stagedPath, actualOutputPath);

      if (renamed && _writeCounters)
      {
        writeCounters(fs);
      }

      return renamed;
    }
    else
    {
      FileSystem fs = actualOutputPath.getFileSystem(getConfiguration());
      _log.info(String.format("Job failed, deleting staged path[%s]", stagedPath));
      try
      {
        fs.delete(stagedPath, true);
      }
      catch (IOException e)
      {
        // Cleanup of the staging directory is best effort; ignore failures here.
      }
    }

    _log.warn("retVal was false for some reason...");
    return retVal;
  }

  /**
   * Gets the class of the caller.
   *
   * @return caller class
   */
  private static Class<?> getCallersClass()
  {
    StackTraceElement[] stack = Thread.currentThread().getStackTrace();
    boolean foundSelf = false;
    for (StackTraceElement element : stack)
    {
      if (foundSelf && !StagedOutputJob.class.getName().equals(element.getClassName()))
      {
        try
        {
          return Class.forName(element.getClassName());
        }
        catch (ClassNotFoundException e)
        {
          throw new RuntimeException(e);
        }
      }
      else if (StagedOutputJob.class.getName().equals(element.getClassName())
               && "getCallersClass".equals(element.getMethodName()))
      {
        foundSelf = true;
      }
    }
    return StagedOutputJob.class;
  }

  /**
   * Writes the Hadoop counters and other task statistics to a file in the file system.
   *
   * @param fs file system
   * @throws IOException if the counters cannot be written
   */
  private void writeCounters(final FileSystem fs) throws IOException
  {
    final Path actualOutputPath = FileOutputFormat.getOutputPath(this);

    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss");

    String suffix = timestampFormat.format(new Date());

    if (_countersParentPath != null)
    {
      if (!fs.exists(_countersParentPath))
      {
        _log.info("Creating counter parent path " + _countersParentPath);
        fs.mkdirs(_countersParentPath, FsPermission.valueOf("-rwxrwxr-x"));
      }
      // make the name as unique as possible in this case because this may be a directory
      // where other counter files will be dropped
      _countersPath = new Path(_countersParentPath, ".counters." + suffix);
    }
    else
    {
      _countersPath = new Path(actualOutputPath, ".counters." + suffix);
    }
    _log.info(String.format("Writing counters to %s", _countersPath));

    FSDataOutputStream counterStream = fs.create(_countersPath);
    BufferedOutputStream buffer = new BufferedOutputStream(counterStream, 256 * 1024);
    OutputStreamWriter writer = new OutputStreamWriter(buffer);

    for (String groupName : getCounters().getGroupNames())
    {
      for (Counter counter : getCounters().getGroup(groupName))
      {
        writeAndLog(writer, String.format("%s=%d", counter.getName(), counter.getValue()));
      }
    }

    JobID jobID = this.getJobID();

    org.apache.hadoop.mapred.JobID oldJobId = new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(), jobID.getId());

    long minStart = Long.MAX_VALUE;
    long maxFinish = 0;
    long setupStart = Long.MAX_VALUE;
    long cleanupFinish = 0;
    DescriptiveStatistics mapStats = new DescriptiveStatistics();
    DescriptiveStatistics reduceStats = new DescriptiveStatistics();
    boolean success = true;

    JobClient jobClient = new JobClient(this.conf);

    Map<String, String> taskIdToType = new HashMap<String, String>();

    TaskReport[] setupReports = jobClient.getSetupTaskReports(oldJobId);
    if (setupReports.length > 0)
    {
      _log.info("Processing setup reports");
      for (TaskReport report : setupReports)
      {
        taskIdToType.put(report.getTaskID().toString(), "SETUP");
        if (report.getStartTime() == 0)
        {
          _log.warn("Skipping report with zero start time");
          continue;
        }
        setupStart = Math.min(setupStart, report.getStartTime());
      }
    }
    else
    {
      _log.error("No setup reports");
    }

    TaskReport[] mapReports = jobClient.getMapTaskReports(oldJobId);
    if (mapReports.length > 0)
    {
      _log.info("Processing map reports");
      for (TaskReport report : mapReports)
      {
        taskIdToType.put(report.getTaskID().toString(), "MAP");
        if (report.getFinishTime() == 0 || report.getStartTime() == 0)
        {
          _log.warn("Skipping report with zero start or finish time");
          continue;
        }
        minStart = Math.min(minStart, report.getStartTime());
        mapStats.addValue(report.getFinishTime() - report.getStartTime());
      }
    }
    else
    {
      _log.error("No map reports");
    }

    TaskReport[] reduceReports = jobClient.getReduceTaskReports(oldJobId);
    if (reduceReports.length > 0)
    {
      _log.info("Processing reduce reports");
      for (TaskReport report : reduceReports)
      {
        taskIdToType.put(report.getTaskID().toString(), "REDUCE");
        if (report.getFinishTime() == 0 || report.getStartTime() == 0)
        {
          _log.warn("Skipping report with zero start or finish time");
          continue;
        }
        maxFinish = Math.max(maxFinish, report.getFinishTime());
        reduceStats.addValue(report.getFinishTime() - report.getStartTime());
      }
    }
    else
    {
      _log.error("No reduce reports");
    }

    TaskReport[] cleanupReports = jobClient.getCleanupTaskReports(oldJobId);
    if (cleanupReports.length > 0)
    {
      _log.info("Processing cleanup reports");
      for (TaskReport report : cleanupReports)
      {
        taskIdToType.put(report.getTaskID().toString(), "CLEANUP");
        if (report.getFinishTime() == 0)
        {
          _log.warn("Skipping report with finish time of zero");
          continue;
        }
        cleanupFinish = Math.max(cleanupFinish, report.getFinishTime());
      }
    }
    else
    {
      _log.error("No cleanup reports");
    }

    if (minStart == Long.MAX_VALUE)
    {
      _log.error("Could not determine map-reduce start time");
      success = false;
    }
    if (maxFinish == 0)
    {
      _log.error("Could not determine map-reduce finish time");
      success = false;
    }
    if (setupStart == Long.MAX_VALUE)
    {
      _log.error("Could not determine setup start time");
      success = false;
    }
    if (cleanupFinish == 0)
    {
      _log.error("Could not determine cleanup finish time");
      success = false;
    }

    // Collect statistics on successful/failed/killed task attempts, categorized by setup/map/reduce/cleanup.
    // Unfortunately the job client doesn't have an easier way to get these statistics.
    Map<String, Integer> attemptStats = new HashMap<String, Integer>();
    _log.info("Processing task attempts");
    for (TaskCompletionEvent event : getTaskCompletionEvents(jobClient, oldJobId))
    {
      String type = taskIdToType.get(event.getTaskAttemptId().getTaskID().toString());
      String status = event.getTaskStatus().toString();

      String key = String.format("%s_%s_ATTEMPTS", status, type);
      if (!attemptStats.containsKey(key))
      {
        attemptStats.put(key, 0);
      }
      attemptStats.put(key, attemptStats.get(key) + 1);
    }

    if (success)
    {
      writeAndLog(writer, String.format("SETUP_START_TIME_MS=%d", setupStart));
      writeAndLog(writer, String.format("CLEANUP_FINISH_TIME_MS=%d", cleanupFinish));
      writeAndLog(writer, String.format("COMPLETE_WALL_CLOCK_TIME_MS=%d", cleanupFinish - setupStart));

      writeAndLog(writer, String.format("MAP_REDUCE_START_TIME_MS=%d", minStart));
      writeAndLog(writer, String.format("MAP_REDUCE_FINISH_TIME_MS=%d", maxFinish));
      writeAndLog(writer, String.format("MAP_REDUCE_WALL_CLOCK_TIME_MS=%d", maxFinish - minStart));

      writeAndLog(writer, String.format("MAP_TOTAL_TASKS=%d", (long) mapStats.getN()));
      writeAndLog(writer, String.format("MAP_MAX_TIME_MS=%d", (long) mapStats.getMax()));
      writeAndLog(writer, String.format("MAP_MIN_TIME_MS=%d", (long) mapStats.getMin()));
      writeAndLog(writer, String.format("MAP_AVG_TIME_MS=%d", (long) mapStats.getMean()));
      writeAndLog(writer, String.format("MAP_STD_TIME_MS=%d", (long) mapStats.getStandardDeviation()));
      writeAndLog(writer, String.format("MAP_SUM_TIME_MS=%d", (long) mapStats.getSum()));

      writeAndLog(writer, String.format("REDUCE_TOTAL_TASKS=%d", (long) reduceStats.getN()));
      writeAndLog(writer, String.format("REDUCE_MAX_TIME_MS=%d", (long) reduceStats.getMax()));
      writeAndLog(writer, String.format("REDUCE_MIN_TIME_MS=%d", (long) reduceStats.getMin()));
      writeAndLog(writer, String.format("REDUCE_AVG_TIME_MS=%d", (long) reduceStats.getMean()));
      writeAndLog(writer, String.format("REDUCE_STD_TIME_MS=%d", (long) reduceStats.getStandardDeviation()));
      writeAndLog(writer, String.format("REDUCE_SUM_TIME_MS=%d", (long) reduceStats.getSum()));

      writeAndLog(writer, String.format("MAP_REDUCE_SUM_TIME_MS=%d", (long) mapStats.getSum() + (long) reduceStats.getSum()));

      for (Map.Entry<String, Integer> attemptStat : attemptStats.entrySet())
      {
        writeAndLog(writer, String.format("%s=%d", attemptStat.getKey(), attemptStat.getValue()));
      }
    }

    writer.close();
    buffer.close();
    counterStream.close();
  }

  /**
   * Gets all task completion events for a particular job.
   *
   * @param jobClient job client
   * @param jobId job ID
   * @return task completion events
   * @throws IOException if the events cannot be fetched
   */
  private List<TaskCompletionEvent> getTaskCompletionEvents(JobClient jobClient, org.apache.hadoop.mapred.JobID jobId) throws IOException
  {
    List<TaskCompletionEvent> events = new ArrayList<TaskCompletionEvent>();

    // Tries to use reflection to get access to the getTaskCompletionEvents method from the
    // private jobSubmitClient field.  This method has a parameter for the size, which defaults
    // to 10 for the top-level methods and can therefore be extremely slow if the goal is to
    // get all events.
    // Reflection failures are deliberately swallowed; in that case the code falls back
    // to the slow public API below.
    Method getTaskCompletionEventsMethod = null;
    Object jobSubmitClient = null;
    try
    {
      Field f = JobClient.class.getDeclaredField("jobSubmitClient");
      f.setAccessible(true);
      jobSubmitClient = f.get(jobClient);
      if (jobSubmitClient != null)
      {
        getTaskCompletionEventsMethod = jobSubmitClient.getClass().getDeclaredMethod(
            "getTaskCompletionEvents", org.apache.hadoop.mapred.JobID.class, int.class, int.class);
        getTaskCompletionEventsMethod.setAccessible(true);
      }
    }
    catch (NoSuchMethodException e) { }
    catch (SecurityException e) { }
    catch (NoSuchFieldException e) { }
    catch (IllegalArgumentException e) { }
    catch (IllegalAccessException e) { }

    if (getTaskCompletionEventsMethod != null)
    {
      _log.info("Will call getTaskCompletionEvents via reflection since it's faster");
    }
    else
    {
      _log.info("Will call getTaskCompletionEvents via the slow method");
    }

    int index = 0;
    while (true)
    {
      TaskCompletionEvent[] currentEvents;
      if (getTaskCompletionEventsMethod != null)
      {
        try
        {
          // Grab events 250 at a time, which is faster than the other method,
          // which defaults to 10 at a time with no ability to override the size.
          currentEvents = (TaskCompletionEvent[]) getTaskCompletionEventsMethod.invoke(jobSubmitClient, jobId, index, 250);
        }
        catch (IllegalArgumentException e)
        {
          _log.error("Failed to call getTaskCompletionEventsMethod via reflection, switching to slow direct method", e);
          getTaskCompletionEventsMethod = null;
          continue;
        }
        catch (IllegalAccessException e)
        {
          _log.error("Failed to call getTaskCompletionEventsMethod via reflection, switching to slow direct method", e);
          getTaskCompletionEventsMethod = null;
          continue;
        }
        catch (InvocationTargetException e)
        {
          _log.error("Failed to call getTaskCompletionEventsMethod via reflection, switching to slow direct method", e);
          getTaskCompletionEventsMethod = null;
          continue;
        }
      }
      else
      {
        currentEvents = this.getTaskCompletionEvents(index);
      }

      if (currentEvents.length == 0)
      {
        break;
      }

      for (TaskCompletionEvent event : currentEvents)
      {
        events.add(event);
      }

      index += currentEvents.length;
    }

    return events;
  }

  private void writeAndLog(OutputStreamWriter writer, String line) throws IOException
  {
    writer.append(line);
    writer.append("\n");
    _log.info(line);
  }

  /**
   * A path filter that rejects hidden files and directories (names starting with "_" or ".").
   */
  static class HiddenFilePathFilter implements PathFilter
  {
    @Override
    public boolean accept(Path path)
    {
      String name = path.getName();
      return !name.startsWith("_")
             && !name.startsWith(".");
    }
  }
}
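For context, here is a minimal sketch (not part of the DataFu source) of how the class above is typically driven. The job name, the input, staging, and output paths, and the ExampleMapper mentioned in the comments are all hypothetical placeholders; only the StagedOutputJob API calls come from the class itself.

// Hypothetical driver class, shown as a separate source file.
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;

import datafu.hourglass.jobs.StagedOutputJob;

public class StagedOutputJobExample
{
  private static final Logger _log = Logger.getLogger(StagedOutputJobExample.class);

  public static void main(String[] args) throws Exception
  {
    Configuration conf = new Configuration();

    // Output is first written under /tmp/staging/<timestamp>/staged and only
    // renamed to /data/output if the job completes successfully.
    StagedOutputJob job = StagedOutputJob.createStagedJob(
        conf,
        "example-staged-job",         // job name (hypothetical)
        Arrays.asList("/data/input"), // input paths (hypothetical)
        "/tmp/staging",               // staging prefix (hypothetical)
        "/data/output",               // final output path (hypothetical)
        _log);

    // Mapper, reducer, and output formats would be configured here as with any
    // org.apache.hadoop.mapreduce.Job, e.g. job.setMapperClass(ExampleMapper.class).

    // Also write the .counters file next to the output once the job succeeds.
    job.setWriteCounters(true);

    // call() runs the job, logs the outcome, and returns true on success.
    boolean success = job.call();
    System.exit(success ? 0 : 1);
  }
}

Because the staged output is only renamed into place after super.waitForCompletion returns true, a failed or killed run leaves any previous contents of /data/output untouched, which is the main point of the staging scheme.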