jadoop.HadoopGridJob.java Source code

Introduction

Here is the source code for jadoop.HadoopGridJob.java

Source

// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// This file is part of Jadoop
// Copyright (c) 2016 Grant Braught. All rights reserved.
// 
// Jadoop is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License,
// or (at your option) any later version.
// 
// Jadoop is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public
// License along with Jadoop.
// If not, see <http://www.gnu.org/licenses/>.
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

package jadoop;

import jadoop.util.SingleRecordSplitSequenceFileInputFormat;
import jadoop.util.TextArrayWritable;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

import javax.management.openmbean.KeyAlreadyExistsException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

/**
 * A collection of HadoopGridTasks to be run on a Hadoop cluster.
 * 
 * A HadoopGridJob has several flags indicating its current state:
 * <UL>
 * <LI>started: true if the tasks in the job have been started on the cluster.
 * <LI>running: true if the tasks in the job are currently running on the
 * cluster.
 * <LI>finished: true if all of the tasks in the job were started but are no
 * longer running.
 * <LI>terminated: true if one or more tasks in the job did not finish on
 * their own (i.e. a task timed out or was killed because the job it was a
 * part of was terminated or timed out).
 * <LI>timedout: true if the job was killed because it exceeded the time limit.
 * <LI>successful: true if all of the tasks in the job finished with an exit
 * code of 0.
 * </UL>
 * 
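 * <P>
 * A minimal usage sketch (exception handling omitted; construction of the
 * HadoopGridTask objects is not shown here, see HadoopGridTask):
 * 
 * <PRE>
 * HadoopGridJob job = new HadoopGridJob("hostnames");
 * job.addTask(task1); // HadoopGridTask objects built elsewhere
 * job.addTask(task2);
 * job.setJobTimeout(60 * 60 * 1000); // optional: terminate the whole job after 1 hour
 * job.runJob(true); // block until every task has finished
 * boolean allExitedZero = job.wasSuccessful();
 * </PRE>
 * 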
 * @see HadoopGridTask
 * 
 * @author Grant Braught
 * @author Xiang Wei
 * @author Dickinson College
 * @version Jun 9, 2015
 */
public class HadoopGridJob {

    private String jobName;
    private long timeout;

    private Job job;

    private HashMap<String, HadoopGridTask> taskMap;
    private ArrayList<File> files;
    private ArrayList<File> archives;

    private boolean started;
    private boolean finished;
    private volatile boolean terminated;
    private boolean timedout;
    private boolean successful;

    /*
     * Some flags used for testing. These control whether particular parts of
     * the job processing occur or not.
     */
    private boolean testing;
    private boolean makeTempDir;
    private boolean copyFiles;
    private boolean copyArchives;
    private boolean makeInputDir;
    private boolean writeTasksFile;
    private boolean configureJob;
    private boolean submitJob;

    /*
     * Wait 3 seconds between each poll of the cluster to determine if the job
     * has completed.
     */
    private static final int JOB_MONITOR_DELAY = 3000;

    private Thread jobMonitorThread;

    /**
     * Construct a new HadoopGridJob with the specified name. A default
     * org.apache.hadoop.conf.Configuration object will be created and used for
     * the job by passing it to the 2-arg constructor. By default a new
     * HadoopGridJob has no timeout (see setJobTimeout).
     * 
     * @param name
     *            the name for the HadoopGridJob. This name will also be the
     *            name of the job submitted to Hadoop and thus may be useful if
     *            observing the Hadoop cluster with other tools. This is also
     *            the name of the temporary working directory on the HDFS, and
     *            thus must be a valid directory name.
     * @throws IOException
     *             propagated from the org.apache.hadoop.conf.Configuration
     *             constructor.
     */
    public HadoopGridJob(String name) throws IOException {
        this(name, new Configuration());
    }

    /**
     * Construct a new HadoopGridJob with the specified name and the provided
     * org.apache.hadoop.conf.Configuration. By default a HadoopGridJob has no
     * timeout (see setJobTimeout).
     * 
     * The following properties in the Configuration object will be overwritten,
     * thus any values set in the provided object will be ignored:
     * <UL>
     * <LI>number of reduce tasks
     * <LI>Mapper class
     * <LI>InputFormat class
     * <LI>OutputKey class
     * <LI>OutputValue class
     * <LI>OutputFormat class
     * <LI>InputPaths
     * <LI>OutputPath
     * </UL>
     * 
     * @param name
     *            the name for the HadoopGridJob. This name will also be the
     *            name of the job submitted to Hadoop and thus may be useful if
     *            observing the Hadoop cluster with other tools. This is also
     *            the name of the temporary working directory on the HDFS, and
     *            thus must be a valid directory name.
     * @param config
     *            a Hadoop Configuration object to be used for submission of the
     *            job. The provided Configuration object is cloned for use in
     *            the job. Thus, subsequent changes to the provided
     *            Configuration will not affect the Configuration being used for
     *            the job.
     * @throws IOException
     *             propagated from the org.apache.hadoop.conf.Configuration
     *             constructor used to clone the one provided.
     */
    public HadoopGridJob(String name, Configuration config) throws IOException {
        jobName = name;

        /*
         * NOTE: Job makes its own copy of config, so we use the one from job
         * anytime we need it rather than keeping a reference to config.
         */
        job = Job.getInstance(config, jobName);
        job.setNumReduceTasks(0); // only map tasks

        taskMap = new HashMap<String, HadoopGridTask>();
        files = new ArrayList<File>();
        archives = new ArrayList<File>();

        timeout = Long.MAX_VALUE; // effectively no timeout.

        started = false;
        finished = false;
        successful = false;
        terminated = false;
        timedout = false;

        /*
         * Testing is false so that things run as normal unless this is changed
         * by a call to one of the private helpers at the bottom of the file.
         * Those are used by the tests to set the appropriate values to true for
         * what is being tested by the test.
         */
        testing = false;
        makeTempDir = false;
        copyFiles = false;
        copyArchives = false;
        makeInputDir = false;
        writeTasksFile = false;
        configureJob = false;
        submitJob = false;

        jobMonitorThread = null;
    }

    /*
     * NOTE: All of the following accessors could compute their results but
     * instead rely on fields set during the processing of the results by the
     * JobMonitor and/or processResults method. That way they are only computed
     * once rather than on each call.
     */

    /**
     * Check to see if this job has been started. A job is started when it is
     * submitted to the cluster.
     * 
     * @return true if this job has been started.
     */
    public boolean wasStarted() {
        return started;
    }

    /**
     * Check to see if this job has finished. A job has finished if all of the
     * tasks contained in the job have finished (completed, failed, terminated
     * or timed out).
     * 
     * @return true if this job has finished.
     */
    public boolean hasFinished() {
        return finished;
    }

    /**
     * Check to see if this job is currently running. A job is considered
     * running if it has been started but has not yet finished.
     * 
     * @return true if this job is running, false if not
     */
    public boolean isRunning() {
        return started && !finished;
    }

    /**
     * Check to see if this job was terminated (via a call to the terminate
     * method).
     * 
     * @return true if this job was terminated.
     */
    public boolean wasTerminated() {
        return terminated;
    }

    /**
     * Check to see if this job timed out. A job that has timed out will also be
     * marked as terminated.
     * 
     * @return true if this job timed out.
     */
    public boolean hasTimedout() {
        return timedout;
    }

    /**
     * Check if all of the tasks in this job have completed successfully (i.e.
     * they are finished and the task command gave an exit value of 0.)
     * 
     * @return true if all tasks have completed successfully, false if not.
     */
    public boolean wasSuccessful() {
        return successful;
    }

    /**
     * Get a clone of the configuration being used for this HadoopGridJob.
     * 
     * @return a clone of this job's org.apache.hadoop.conf.Configuration
     *         object.
     */
    public Configuration getConfiguration() {
        Configuration cloneConf = new Configuration(job.getConfiguration());
        return cloneConf;
    }

    /**
     * Get the org.apache.hadoop.mapreduce.Job that this HadoopGridJob is using
     * to interact with hadoop.
     * 
     * @return the Job.
     */
    public Job getJob() {
        return job;
    }

    /**
     * Add a HadoopGridTask to this HadoopGridJob. The task will be retrievable
     * by the key specified in the task. This method throws an exception if
     * another task has already been added with the same key. To replace a task
     * with the same key the existing task must be removed first and then the
     * new task can be added.
     * 
     * @param task
     *            the HadoopGridTask to be added to this HadoopGridJob.
     * 
     * @throws KeyAlreadyExistsException
     *             if this HadoopGridJob already contains a task with the same
     *             key as specified in the given task.
     * @throws IllegalStateException
     *             if this HadoopGridJob has already been started.
     * 
     * @see #removeTask(String)
     */
    public void addTask(HadoopGridTask task) {
        if (started) {
            throw new IllegalStateException("Cannot add a task after a job is started.");
        }

        if (taskMap.get(task.getKey()) != null) {
            throw new KeyAlreadyExistsException(
                    "There is already a task with the same key " + "as the one you are trying to add to the job.");
        } else {
            taskMap.put(task.getKey(), task);
        }
    }

    /**
     * Get the number of tasks in the job.
     * 
     * @return the number of tasks in the job.
     */
    public int size() {
        return taskMap.size();
    }

    /**
     * Get the HadoopGridTask associated with the given key.
     * 
     * @param key
     *            the key of the task to be retrieved.
     * @return the HadoopGridTask with the given key or null if no such task
     *         exists.
     */
    public HadoopGridTask getTask(String key) {
        return taskMap.get(key);
    }

    /**
     * Get a list of all of the HadoopGridTasks contained in this HadoopGridJob.
     * If this HadoopGridJob is complete then each task will contain the results
     * (e.g. exit value, standard output, standard error) generated by the task.
     * 
     * @return a list of the HadoopGridTasks in this HadoopGridJob.
     */
    public List<HadoopGridTask> getAllTasks() {
        List<HadoopGridTask> lHGT = new ArrayList<HadoopGridTask>();
        for (HadoopGridTask task : taskMap.values()) {
            lHGT.add(task);
        }
        return lHGT;
    }

    /**
     * Add all of the HadoopGridTasks contained in the List to this
     * HadoopGridJob. Each task will be retrievable by the key specified in the
     * task object. This method operates by invoking addTask on each of the
     * individual tasks.
     * 
     * @param tasks
     *            a list of HadoopGridTasks to be added to this HadoopGridJob.
     * 
     * @throws KeyAlreadyExistsException
     *             if this HadoopGridJob already contains a task with the same
     *             key as that specified in any of the given tasks.
     * @throws IllegalStateException
     *             if this HadoopGridJob is currently running or has already
     *             been completed.
     * 
     * @see #addTask(HadoopGridTask)
     */
    public void addAllTasks(List<HadoopGridTask> tasks) {
        for (HadoopGridTask task : tasks) {
            addTask(task);
        }
    }

    /**
     * Remove the HadoopGridTask with the specified key from this HadoopGridJob.
     * 
     * @param key
     *            the key of the HadoopGridTask to be removed.
     * 
     * @throws IllegalStateException
     *             if this HadoopGridJob is currently running or has already
     *             been completed.
     */
    public void removeTask(String key) {
        if (started) {
            throw new IllegalStateException("Cannot remove a task after a job is started.");
        }

        taskMap.remove(key);
    }

    /**
     * Set the timeout period for the job. If the job takes longer than this
     * amount of wall clock time it will be terminated. Terminating the job will
     * terminate all tasks and no results from any task will be available.
     * Generally it is preferable to rely on timeouts for the individual
     * HadoopGridTasks instead so that results from completed tasks are
     * available. If no timeout is desired, set this to Long.MAX_VALUE, which is
     * the default value.
     * 
     * @param ms
     *            the task timeout in milliseconds.
     * 
     * @throws IllegalStateException
     *             if this job has already been started.
     */
    public void setJobTimeout(long ms) {
        if (started) {
            throw new IllegalStateException("Cannot set timeout after the job has been started");
        }

        timeout = ms;
    }

    /**
     * Get the current timeout period for the job.
     * 
     * @return the timeout period for the job in milliseconds.
     */
    public long getJobTimeout() {
        return timeout;
    }

    /**
     * Add a file to the list of files that will be available (read-only) to the
     * HadoopGridTasks in their working directory.
     * 
     * When the job is run (i.e. runJob is invoked), the specified file will be
     * copied to a temporary working directory on the hadoop HDFS and then made
     * available in the task's working directory via Hadoop's distributed cache.
     * When the job is complete, the temporary working directory, along with
     * this file will be deleted from the HDFS.
     * 
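     * <P>
     * A short sketch (the file name is illustrative; the task commands are
     * assumed to be set up elsewhere):
     * 
     * <PRE>
     * HadoopGridJob job = new HadoopGridJob("wordCounts");
     * job.addFile(new File("data/words.txt"));
     * // each task's command can then refer to the file simply as
     * // "words.txt", since it appears in the task's working directory
     * // via the distributed cache.
     * </PRE>
     * 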
     * @param dataFile
     *            a File object referring to the file on the local filesystem.
     * 
     * @throws IllegalStateException
     *             if this job has already been started.
     */
    public void addFile(File dataFile) {
        if (started) {
            throw new IllegalStateException("Cannot add a file after the job has been started");
        }

        /*
         * Just hold the file for now. Later it will be placed into the HDFS so
         * that it is available to the map task.
         */
        files.add(dataFile);
    }

    /**
     * Add an archive (jar file) to the list of archives that will be available
     * (read-only) to the HadoopGridTasks in their working directory.
     * 
     * When the job is run (i.e. when runJob is invoked), a sub-directory with
     * the same name as the archive will be created within the temporary working
     * directory on the Hadoop HDFS. The archive will be expanded within that
     * sub-directory. The sub-directory will be made available in the tasks'
     * working directory via Hadoop's distributed cache.
     * 
     * When the job is complete, the temporary working directory, the
     * sub-directory for the archive and any files and directories created by
     * expanding the archive will be deleted.
     * 
     * @param archiveFile
     *            a File object referring to the archive on the local
     *            filesystem.
     * 
     * @throws IllegalStateException
     *             if this job is currently running or has already been
     *             completed.
     */
    public void addArchive(File archiveFile) {
        if (started) {
            throw new IllegalStateException("Cannot add an archive after the job has been started");
        }

        /*
         * Just hold the archive file for now. Later it will be placed into the
         * HDFS so that it is available to the map task.
         */
        archives.add(archiveFile);
    }

    /**
     * Run all of the HadoopGridTasks in this job on the Hadoop cluster. When
     * all of the tasks have finished execution (completed, failed, terminated
     * or timed out) the results of the tasks are parsed and placed into the
     * HadoopGridTask objects. Once all of the results have been processed the
     * job will be marked as finished (and timedout and/or terminated as
     * appropriate).
     * 
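     * <P>
     * A non-blocking sketch (exception handling omitted; assumes the tasks
     * have already been added to this job):
     * 
     * <PRE>
     * job.runJob(false);      // returns as soon as the job is submitted
     * while (job.isRunning()) {
     *     Thread.sleep(5000); // poll; hasFinished() becomes true only after
     * }                       // all of the results have been processed
     * for (HadoopGridTask t : job.getAllTasks()) {
     *     // exit values and captured output are now filled in
     * }
     * </PRE>
     * 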
     * @param wait
     *            true to cause this method to wait until all of the tasks have
     *            completed, failed, terminated or timed out, and all of the
     *            results have been processed, before this method returns. False
     *            to return immediately.
     * 
     * @throws IllegalStateException
     *             if this job has already been started
     * @throws IOException
     *             if there is a problem running the HadoopGridTasks. This is
     *             propagated from the org.apache.hadoop.mapreduce.Job.submit
     *             method.
     * @throws InterruptedException
     *             if interrupted while waiting for the HadoopGridTasks to
     *             complete. This is propagated from the
     *             org.apache.hadoop.mapreduce.Job.submit method.
     * @throws ClassNotFoundException
     *             if a class needed to execute the HadoopGridTasks cannot be
     *             found. This is propagated from the
     *             org.apache.hadoop.mapreduce.Job.submit method.
     * @throws URISyntaxException
     *             if there is a problem generating the URIs used to add the
     *             files and archives to the hadoop distributed cache.
     */
    public void runJob(boolean wait) throws IllegalStateException, IOException, InterruptedException,
            ClassNotFoundException, URISyntaxException {

        if (started) {
            throw new IllegalStateException("Cannot run a job that is already started.");
        }

        /*
         * When a HadoopGridJob is started, a temporary working directory is
         * created on the Hadoop HDFS. This directory will be prefaced with the
         * job's name and will include additional digits to make it unique so
         * that it does not conflict with any other directory on the HDFS. The
         * files listing the tasks to be run are written to this directory.
         * Any files or archives that have been added to the job are also
         * copied to this directory. From there the files and archives are
         * added to the job's hadoop distributed cache. Upon
         * completion of the HadoopGridJob, this working directory and all of
         * its contents are removed from the HDFS.
         */

        FileSystem fs = FileSystem.get(job.getConfiguration());

        /*
         * Mark the job and all of the tasks as started. Their completion status
         * is taken care of in the processResults method.
         */
        started = true;
        for (HadoopGridTask hgt : taskMap.values()) {
            hgt.markAsStarted();
        }

        /*
         * NOTE: Flag checks below are to facilitate incremental focused testing
         * of each part of the run process. Flags are set in the test class as
         * appropriate to each test.
         */

        Path tempHDFSWorkingDir = null;
        if (!testing || (testing && makeTempDir)) {
            tempHDFSWorkingDir = createTemporaryDirectory(fs);
        }

        if (!testing || (testing && copyFiles)) {
            if (!files.isEmpty()) {
                copyLocalFileToHDFS(fs, tempHDFSWorkingDir);
            }
        }

        if (!testing || (testing && copyArchives)) {
            if (!archives.isEmpty()) {
                copyLocalArchiveToHDFS(fs, tempHDFSWorkingDir);
            }
        }

        Path inputDir = null;
        if (!testing || (testing && makeInputDir)) {
            inputDir = createInputDirectory(fs, tempHDFSWorkingDir);
            FileInputFormat.addInputPath(job, inputDir);
        }

        if (!testing || (testing && writeTasksFile)) {
            writeTasksSequenceFiles(inputDir);
        }

        Path outputDir = null;
        if (!testing || (testing && configureJob)) {
            job.setJarByClass(jadoop.HadoopGridTaskRunner.class);
            job.setMapperClass(jadoop.HadoopGridTaskRunner.class);
            job.setInputFormatClass(SingleRecordSplitSequenceFileInputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(MapWritable.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);

            outputDir = new Path(tempHDFSWorkingDir.toString() + "/output");
            FileOutputFormat.setOutputPath(job, outputDir);
        }

        if (!testing || (testing && submitJob)) {
            job.submit();

            jobMonitorThread = (new Thread(new JobMonitor(fs, outputDir, tempHDFSWorkingDir)));
            jobMonitorThread.start();

            if (wait) {
                jobMonitorThread.join();
            }
        }
    }

    /**
     * Terminates any currently running tasks in this job. This method will
     * block until all of the tasks in the job have been terminated. If the job
     * is not currently running this method has no effect.
     */
    public void terminate() {
        if (isRunning()) {
            /*
             * NOTE: We don't do the termination here. Just set a flag and let
             * the JobMonitor thread detect it and then do the termination
             * there.
             */
            terminated = true;
            try {
                jobMonitorThread.join();
            } catch (InterruptedException ie) {
                ie.printStackTrace();
            }
        }
    }

    /**
     * Get the status information from the underlying Hadoop Job that is running
     * the HadoopGridTasks.
     * 
     * @return the JobStatus for the Hadoop Job.
     * 
     * @throws IOException
     *             if there is a problem getting the status of the Hadoop Job.
     *             This is propagated from the
     *             org.apache.hadoop.mapreduce.Job.getStatus method.
     * @throws InterruptedException
     *             if interrupted while waiting for the HadoopGridTasks to
     *             complete. This is propagated from the
     *             org.apache.hadoop.mapreduce.Job.getStatus method.
     */
    public JobStatus getStatus() throws IOException, InterruptedException {
        return job.getStatus();
    }

    /**
     * Print periodic messages regarding the status of the running Hadoop job.
     * This method will monitor the job and print information about completed,
     * failed and killed tasks. It returns when all tasks have been completed,
     * failed or killed.
     * 
     * @throws IOException
     *             if there is a problem getting the status of the Hadoop Job.
     *             This is propagated from the
     *             org.apache.hadoop.mapreduce.Job.monitorAndPrintJob method.
     * @throws InterruptedException
     *             if interrupted while waiting for the HadoopGridTasks to
     *             complete. This is propagated from the
     *             org.apache.hadoop.mapreduce.Job.monitorAndPrintJob method.
     */
    public void monitorAndPrintJob() throws IOException, InterruptedException {
        job.monitorAndPrintJob();
    }

    /**
     * Creates a temporary working directory on the hadoop HDFS for the job that
     * will be running. The name of this temporary directory will be the name
     * given to the job. If a directory with the job's name already exists,
     * this method generates a new name so that the temporary directory does
     * not share a name with another directory on the HDFS.
     * 
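     * <P>
     * For example, if the job is named "sim" and a directory named "sim"
     * already exists in the HDFS home directory, the names "sim1", "sim2",
     * ... are tried in turn until an unused one is found.
     * 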
     * @return the path of the new temporary working directory on the HDFS
     * 
     * @throws IOException
     *             if there is a problem creating the temporary working
     *             directory.
     */
    private Path createTemporaryDirectory(FileSystem fs) throws IOException {
        // path to the HDFS system
        Path hdfsHome = fs.getHomeDirectory();

        // base name of the temporary working directory.
        Path newHDFSDir = new Path("/" + jobName);

        // full path to the temporary working directory on the HDFS.
        Path tempHDFSWorkingDir = Path.mergePaths(hdfsHome, newHDFSDir);

        // append numbers to the job name until there is no conflict...
        int number = 1;
        while (fs.exists(tempHDFSWorkingDir)) {
            Path jobNum = new Path("/" + jobName + number);
            tempHDFSWorkingDir = Path.mergePaths(hdfsHome, jobNum);
            number++;
        }

        // make the directory on the HDFS and return the path to it.
        fs.mkdirs(tempHDFSWorkingDir);
        return tempHDFSWorkingDir;
    }

    /**
     * Copies the file(s) on the local machine onto the temporary HDFS working
     * directory and makes them available in the hadoop distributed cache so
     * that they appear in the working directory of the HadoopGridTask(s) when
     * they are running.
     * 
     * @param fs
     *            the hadoop HDFS filesystem
     * @param hdfsDirectory
     *            the path to the temporary working directory on the HDFS to
     *            which the files are to be copied.
     * @throws IOException
     *             if there is a problem copying the files to the HDFS or adding
     *             them to the hadoop distributed cache.
     * @throws URISyntaxException
     *             if there is a problem generating the URI used to add the file
     *             to the hadoop distributed cache.
     */
    private void copyLocalFileToHDFS(FileSystem fs, Path hdfsDirectory) throws IOException, URISyntaxException {

        for (File localFile : files) {
            // get the path to the file on the local file system.
            Path fileRelativePath = new Path(localFile.getPath());

            /*
             * copy the file from the local file system to the temporary working
             * directory on the HDFS.
             */
            fs.copyFromLocalFile(fileRelativePath, hdfsDirectory);

            /*
             * Build a URI to the file on the HDFS so we can add it to the
             * working cache.
             * 
             * The value before the # gives the name of the file on the HDFS,
             * the value after the # gives the name that the file will have in
             * the cache (i.e. the working directory of the tasks).
             */
            URI uri = new URI(hdfsDirectory + "/" + localFile.getName() + "#" + localFile.getName());

            job.addCacheFile(uri);
        }
    }

    /**
     * Copies the archive file(s) on the local machine into the temporary
     * working directory on the hadoop HDFS and also makes them available in
     * the distributed cache so that the HadoopGridTask(s) can access them in
     * their working directory. Once an archive file has been copied onto the
     * HDFS, a directory with the archive's name will be created and the
     * contents of the archive will be unpacked into that directory.
     * 
     * @param fs
     *            the hadoop HDFS file system
     * @param hdfsDirectory
     *            path to the temporary working directory on the HDFS to which
     *            the archives are to be copied.
     * @throws IOException
     *             if there is a problem copying the archives to the HDFS or
     *             adding them to the hadoop distributed cache.
     * @throws URISyntaxException
     *             if there is a problem generating the URI used to add the
     *             archive to the hadoop distributed cache.
     */
    private void copyLocalArchiveToHDFS(FileSystem fs, Path hdfsDirectory) throws IOException, URISyntaxException {
        for (File localArchive : archives) {
            Path archiveRelativePath = new Path(localArchive.getPath());

            fs.copyFromLocalFile(archiveRelativePath, hdfsDirectory);

            URI uri = new URI(hdfsDirectory + "/" + localArchive.getName() + "#" + localArchive.getName());
            job.addCacheArchive(uri);
        }
    }

    /**
     * Creates a directory named "input" in the temporary working directory on
     * the hadoop HDFS.
     * 
     * @param fs
     *            the hadoop HDFS file system
     * @param hdfsDirectory
     *            path to the temporary working directory on the HDFS in which
     *            the input directory is to be created.
     * 
     * @return a path to the input directory that was created.
     * 
     * @throws IOException
     *             if there is a problem creating the input directory.
     */
    private Path createInputDirectory(FileSystem fs, Path hdfsDirectory) throws IOException {
        String IN_DIR = hdfsDirectory.toString() + "/input";
        Path inDir = new Path(IN_DIR);
        fs.mkdirs(inDir);
        return inDir;
    }

    /**
     * Create a tasksN.seq sequence file in the input directory for each task
     * (tasks0.seq, tasks1.seq, ...). Each file contains the key and command
     * that define the map task. The key is the key that was associated with
     * the task when it was added to the job. The value is a TextArrayWritable
     * object with the following contents:
     * <UL>
     * <LI>true/false - indicating if standard output is to be captured.
     * <LI>true/false - indicating if standard error is to be captured.
     * <LI>timeout - the task timeout in milliseconds.
     * <LI>cmd - the command to be run in the mapper task.
     * <LI>... - each successive element contains an argument to the cmd.
     * </UL>
     * 
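     * <P>
     * For example, a task with key "t1" that captures standard output but not
     * standard error, has a 5000 ms timeout and runs "hostname -f" is written
     * as the entry:
     * 
     * <PRE>
     * key:   t1
     * value: ["true", "false", "5000", "hostname", "-f"]
     * </PRE>
     * 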
     * @see HadoopGridTaskRunner
     * 
     * @param hdfsInputDir
     *            the input directory on the HDFS where the task sequence
     *            files are to be created.
     * 
     * @throws IOException
     *             if there is a problem creating the task sequence files.
     */
    private void writeTasksSequenceFiles(Path hdfsInputDir) throws IOException {
        /*
         * Seems as if we should be able to just write one task file with
         * multiple key/value pairs in it. However, hadoop did not seem to want
         * to send each entry to a different node. Rather one node processed
         * many of the tasks. It seems as if this could be fixed by defining how
         * hadoop is to split up the sequence file, but we were unable to get
         * that to work. Writing a different task file for each task is a bit of
         * a hack solution, but it works. Each task is then run on a different
         * node, as desired.
         */

        Text mapperKey = new Text();
        TextArrayWritable mapperVal = new TextArrayWritable();

        // for each task in the job...
        int index = 0;
        for (HadoopGridTask hgt : taskMap.values()) {

            Path seqFileInDirPath = new Path(hdfsInputDir.toString() + "/tasks" + index + ".seq");

            SequenceFile.Writer writer = SequenceFile.createWriter(job.getConfiguration(),
                    Writer.file(seqFileInDirPath), Writer.keyClass(Text.class),
                    Writer.valueClass(TextArrayWritable.class));

            String taskKey = hgt.getKey();
            String[] taskVal = hgt.getCommand();

            // set the key for sequence file entry for this task
            mapperKey.set(taskKey);

            /*
             * Build an array of Writable holding the flags that indicate if
             * standard output/error are to be captured, the timeout and the
             * command and its arguments.
             */
            Writable[] vals = new Writable[taskVal.length + 3];

            // put the flags in the array.
            vals[0] = new Text(String.valueOf(hgt.captureStandardOutput()));
            vals[1] = new Text(String.valueOf(hgt.captureStandardError()));
            vals[2] = new Text(String.valueOf(hgt.getTimeout()));

            // put the command and its arguments into the array.
            for (int i = 3; i < taskVal.length + 3; i++) {
                vals[i] = new Text(taskVal[i - 3]);
            }

            /*
             * Set the value for the sequence file entry for this task to be the
             * array.
             */
            mapperVal.set(vals);

            writer.append(mapperKey, mapperVal);

            writer.close();

            index++;
        }
    }

    /**
     * Process the results that were returned by the Hadoop job. Each result
     * will be a key value pair with the format specified in the
     * HadoopGridTaskRunner class. The results for each key should be parsed and
     * placed into the HadoopGridTask object with the same key.
     * 
     * @see HadoopGridTaskRunner
     * 
     * @throws IOException
     *             if there is a problem reading the results.
     */
    private void processResults(FileSystem fs, Path outDir) throws IOException {

        FileStatus[] fileStatus = fs.listStatus(outDir);

        /*
         * Process the results for all of the tasks that have completed. Any
         * task that did not complete will not be included in any of the files.
         */
        for (FileStatus file : fileStatus) {
            String fileName = file.getPath().getName();

            if (fileName.contains("part-m-")) {

                Path filePath = new Path(outDir + "/" + fileName);
                SequenceFile.Reader reader = new SequenceFile.Reader(job.getConfiguration(),
                        SequenceFile.Reader.file(filePath));

                Text mapperOutputKey = new Text();
                MapWritable mapperOutputVal = new MapWritable();

                /*
                 * If multiple tasks are sent to the same node then the response
                 * file will contain multiple entries. Be sure to process each
                 * one of them.
                 */
                while (reader.next(mapperOutputKey, mapperOutputVal)) {
                    // Get the value returned from the HadoopGridTaskRunner.
                    byte exitValue = ((ByteWritable) mapperOutputVal.get(new Text("EV"))).get();

                    boolean taskTO = ((BooleanWritable) mapperOutputVal.get(new Text("TO"))).get();

                    String stdOut = ((Text) mapperOutputVal.get(new Text("SO"))).toString();
                    String stdErr = ((Text) mapperOutputVal.get(new Text("SE"))).toString();

                    HadoopGridTask task = getTask(mapperOutputKey.toString());

                    if (taskTO) {
                        task.markAsTimedout();
                    } else {
                        // change the task's exit value.
                        task.markAsFinished(exitValue);
                    }

                    if (task.captureStandardOutput()) {
                        task.setStandardOutput(stdOut);
                    }

                    if (task.captureStandardError()) {
                        task.setStandardError(stdErr);
                    }
                }

                reader.close();
            }
        }
    }

    /**
     * Runnable that will be launched in a thread to monitor the progress of the
     * tasks and process the results.
     */
    private class JobMonitor implements Runnable {

        private FileSystem fs;
        private Path outputDir;
        private Path tempWorkingDir;

        public JobMonitor(FileSystem fileSystem, Path outputDirectory, Path tempWorkingDirectory) {
            this.fs = fileSystem;
            outputDir = outputDirectory;
            tempWorkingDir = tempWorkingDirectory;
        }

        /**
         * This method periodically checks to see if all of the tasks in the job
         * have completed or have been killed. When all of the tasks are
         * complete, failed or killed it calls a method that processes the
         * returned key,value pairs and fills in the fields in the associated
         * HadoopGridTask objects. When all results have been processed the
         * finished and successful flags are set.
         */
        @Override
        public void run() {
            long startTime = System.currentTimeMillis();
            long curTime = System.currentTimeMillis();
            long runTime = curTime - startTime;

            try {
                while (!job.isComplete() && !terminated) {
                    Thread.sleep(JOB_MONITOR_DELAY);
                    curTime = System.currentTimeMillis();
                    runTime = curTime - startTime;

                    if (runTime >= timeout) {
                        timedout = true;
                        terminated = true;
                    }

                }

                if (terminated) {
                    job.killJob(); // blocks until tasks are killed.
                }

                /*
                 * If a job is terminated then we cannot get any results from
                 * hadoop because they are not available on the HDFS; only a
                 * _temporary file exists in the working directory.
                 */
                if (!terminated) {
                    processResults(fs, outputDir);
                }

                // remove the temporary working directory.
                fs.delete(tempWorkingDir, true);
            } catch (Exception e) {
                /*
                 * Don't really want to kill everything... and this cannot be
                 * easily caught... so print it out, mark the unfinished tasks as
                 * appropriate (in finally) and get on with it.
                 */
                e.printStackTrace();
            } finally {
                /*
                 * Any tasks not already marked as finished by the
                 * processResults method should be marked as terminated and
                 * timedout (if appropriate)
                 */
                for (HadoopGridTask task : taskMap.values()) {
                    if (!task.hasFinished()) {
                        if (timedout) {
                            task.markAsTimedout();
                        } else {
                            task.markAsTerminated();
                        }
                    }
                }

                /*
                 * Mark the job as successful if all of the tasks have an exit
                 * code of 0. Could be combined with above loop, but seemed more
                 * clear this way.
                 */
                successful = true;
                for (HadoopGridTask hgt : taskMap.values()) {
                    successful = successful && (hgt.wasSuccessful());
                }

                /*
                 * Do this only at the end so that we can be sure that all of
                 * the results have been processed before a call to
                 * hasFinished() will return true - i.e. if runJob was not asked
                 * to wait and the user code is polling.
                 */
                finished = true;
            }
        }
    }

    /*
     * There are a number of private methods below here that are used by the
     * tests. The tests invoke them via reflection so that they can remain
     * private.
     */

    /**
     * Gets all of the files that are made available to the HadoopGridTasks. For
     * testing purposes.
     * 
     * @return a list of files that were made available to the HadoopGridTasks.
     */
    @SuppressWarnings("unused")
    private List<File> getFiles() {
        return files;
    }

    /**
     * Retrieves all of the archive files that are made available for the
     * HadoopGridTasks. For testing purposes.
     * 
     * @return a list of archive files that were made available for the
     *         HadoopGridTasks.
     */
    @SuppressWarnings("unused")
    private List<File> getArchives() {
        return archives;
    }

    /**
     * Marks the job as started. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void markAsStarted() {
        testing = true;
        started = true;
    }

    /**
     * Marks the job as finished. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void markAsFinished() {
        testing = true;
        finished = true;
    }

    /**
     * Marks the job as terminated. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void markAsTerminated() {
        testing = true;
        terminated = true;
    }

    /**
     * Marks the job as timedout. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void markAsTimedout() {
        testing = true;
        timedout = true;
    }

    /**
     * Marks the job as successfully completed. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void markAsSuccessful() {
        testing = true;
        successful = true;
    }

    /**
     * Mark the makeTempDir variable so that the temporary directory is created.
     * For testing purposes.
     */
    @SuppressWarnings("unused")
    private void makeTempDir() {
        testing = true;
        makeTempDir = true;
    }

    /**
     * Mark the copyFiles variable so that the files are copied to the HDFS. For
     * testing purposes.
     */
    @SuppressWarnings("unused")
    private void copyFiles() {
        testing = true;
        makeTempDir = true;
        copyFiles = true;
    }

    /**
     * Mark the copyArchives variable so that the archives are copied to the
     * HDFS. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void copyArchives() {
        testing = true;
        makeTempDir = true;
        copyArchives = true;
    }

    /**
     * Mark the makeInputDir variable so that the input directory is created on
     * the HDFS. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void makeInputDir() {
        testing = true;
        makeTempDir = true;
        makeInputDir = true;
    }

    /**
     * Mark the writeTasksFile variable so that the tasks sequence file is
     * created in the input directory. For testing purposes.
     */
    @SuppressWarnings("unused")
    private void writeTasksFile() {
        testing = true;
        makeTempDir = true;
        makeInputDir = true;
        writeTasksFile = true;
    }

    /**
     * Mark the configureJob variable so that the job is configured. For testing
     * purposes.
     */
    @SuppressWarnings("unused")
    private void configureJob() {
        testing = true;
        makeTempDir = true;
        configureJob = true;
    }

    /**
     * Mark the submitJob variable so that the job is submitted to the cluster.
     * For testing purposes.
     */
    @SuppressWarnings("unused")
    private void submitJob() {
        testing = true;
        makeTempDir = true;
        makeInputDir = true;
        writeTasksFile = true;
        copyFiles = true;
        copyArchives = true;
        configureJob = true;
        submitJob = true;
    }
}