distributed.hadoop.MapReduceJobConfig.java Source code

Introduction

Here is the source code for distributed.hadoop.MapReduceJobConfig.java, the main job configuration class used by Weka Hadoop jobs.

Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MapReduceJobConfig.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package distributed.hadoop;

import distributed.core.DistributedJobConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import weka.core.Environment;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Map;
import java.util.Vector;

/**
 * The main job configuration used by Weka Hadoop jobs
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class MapReduceJobConfig extends AbstractHadoopJobConfig implements OptionHandler {

    /** Internal key for the number of mappers to use */
    public static final String NUM_MAPPERS = "numMappers";

    /** Internal key for the number of reducers to use */
    public static final String NUM_REDUCERS = "numReducers";

    /**
     * Internal key for the maximum number of mappers that will run on a node
     * concurrently
     */
    public static final String TASK_TRACKER_MAP_MAXIMUM = "taskTrackerMaxMappers";

    /** Internal key for the name of the mapper class */
    public static final String MAPPER_CLASS = "mapperClass";

    /** Internal key for the name of the reducer class */
    public static final String REDUCER_CLASS = "reducerClass";

    /** Internal key for the name of the combiner class */
    public static final String COMBINER_CLASS = "combinerClass";

    /** Internal key for the name of the input format class to use */
    public static final String INPUT_FORMAT_CLASS = "inputFormatClass";

    /** Internal key for the name of the output format class to use */
    public static final String OUTPUT_FORMAT_CLASS = "outputFormatClass";

    /** Internal key for the name of the map output key class to use */
    public static final String MAP_OUTPUT_KEY_CLASS = "mapOutputKeyClass";

    /** Internal key for the name of the map output value class to use */
    public static final String MAP_OUTPUT_VALUE_CLASS = "mapOutputValueClass";

    /** Internal key for the name of the (job/reducer) output key to use */
    public static final String OUTPUT_KEY_CLASS = "outputKeyClass";

    /** Internal key for the name of the (job/reducer) output value to use */
    public static final String OUTPUT_VALUE_CLASS = "outputValueClass";

    /** Internal key for the input path(s) to use for the job */
    public static final String INPUT_PATHS = "inputPaths";

    /** Internal key for the output path to use for the job */
    public static final String OUTPUT_PATH = "outputPath";

    /** Internal key for the maximum block size (for splitting data) to use */
    public static final String MAPRED_MAX_SPLIT_SIZE = "mapredMaxSplitSize";

    /** Internal key for the Hadoop property for the job tracker host */
    public static final String HADOOP_JOB_TRACKER_HOST = "mapred.job.tracker";

    /** Internal key for the Hadoop property for the yarn resource manager address */
    public static final String YARN_RESOURCE_MANAGER_ADDRESS = "yarn.resourcemanager.address";

    /**
     * Internal key for the Hadoop property for the yarn resource manager
     * scheduler address. Weka sets this to the resource manager host plus
     * port 8030. If this is not appropriate for a particular cluster it can be
     * overridden using -user-prop arguments to jobs, or by placing the cluster
     * configuration directory in the classpath when running jobs.
     */
    public static final String YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS = "yarn.resourcemanager.scheduler.address";
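
    // For example (hypothetical values): the derived scheduler address can be
    // overridden when launching a job with a -user-prop argument such as
    //
    //   -user-prop yarn.resourcemanager.scheduler.address=rm.example.com:8030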

    /** Internal key for the Hadoop 1 property for the maximum block size */
    public static final String HADOOP_MAPRED_MAX_SPLIT_SIZE = "mapred.max.split.size";

    /** Internal key for the Hadoop 2 property for the maximum block size */
    public static final String HADOOP2_MAPRED_MAX_SPLIT_SIZE = "mapreduce.input.fileinputformat.split.maxsize";

    /**
     * Internal key for the Hadoop 1 property for the maximum number of reduce
     * tasks to run per node
     */
    public static final String HADOOP_TASKTRACKER_REDUCE_TASKS_MAXIMUM = "mapred.tasktracker.reduce.tasks.maximum";

    /**
     * Internal key for the Hadoop 2 property for the maximum number of reduce
     * tasks to run per node
     */
    public static final String HADOOP2_TASKTRACKER_REDUCE_TASKS_MAXIMUM = "mapreduce.tasktracker.reduce.tasks.maximum";

    /** For serialization */
    private static final long serialVersionUID = -1721850598954532369L;

    /** HDFSConfig for HDFS properties */
    protected HDFSConfig m_hdfsConfig = new HDFSConfig();

    /**
     * Constructor - sets defaults
     */
    public MapReduceJobConfig() {

        // defaults
        if (AbstractHadoopJobConfig.isHadoop2()) {
            // default port for resource manager
            setJobTrackerPort(AbstractHadoopJobConfig.DEFAULT_PORT_YARN);
        } else {
            setJobTrackerPort(AbstractHadoopJobConfig.DEFAULT_PORT);
        }
        setJobTrackerHost("localhost");
        getHDFSConfig().setHDFSHost("localhost");
        getHDFSConfig().setHDFSPort("8020");

        // some defaults for Weka style jobs. Mappers tend to create
        // some sort of object (classifier, instances etc.) to be
        // aggregated in the Reducer. Reducers will probably write
        // the aggregated object via serialization to a custom
        // location in HDFS. Output from the job is probably just
        // some status information
        setProperty(INPUT_FORMAT_CLASS, TextInputFormat.class.getName());
        setProperty(OUTPUT_FORMAT_CLASS, TextOutputFormat.class.getName());

        setProperty(MAP_OUTPUT_KEY_CLASS, Text.class.getName());
        setProperty(MAP_OUTPUT_VALUE_CLASS, BytesWritable.class.getName());

        setProperty(OUTPUT_KEY_CLASS, Text.class.getName());
        setProperty(OUTPUT_VALUE_CLASS, Text.class.getName());
    }

    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();

        // HDFS options
        Enumeration<Option> hdfsOpts = m_hdfsConfig.listOptions();
        while (hdfsOpts.hasMoreElements()) {
            Option o = hdfsOpts.nextElement();
            result.add(o);
        }

        result.addElement(new Option("\tJob tracker/resource manager hostname. (default: localhost)",
                "jobtracker-host", 1, "-jobtracker-host <hostname>"));
        result.addElement(new Option("\tJob tracker/resource manager port. (default 8021/8032)", "jobtracker-port",
                1, "-jobtracker-port <port number>"));
        result.addElement(new Option("\tThe number of maps (hint to MR).", "num-maps", 1, "-num-maps <integer>"));
        result.addElement(new Option("\tMaximum number of map tasks to run " + "concurrently per node.",
                "map-maximum", 1, "-map-maximum"));
        result.addElement(new Option("\tNumber of reducers to use.", "num-reducers", 1, "-num-reducers"));
        result.addElement(
                new Option("\tInput path(s) in HDFS (comma-separated)", "input-paths", 1, "-input-paths"));
        result.addElement(new Option("\tOutput path for the job in HDFS.", "output-path", 1, "-output-path"));
        result.addElement(new Option("\tMaximum split size (in bytes) for each mapper to " + "process.",
                "max-split-size", 1, "-max-split-size"));

        Enumeration<Option> superOpts = super.listOptions();
        while (superOpts.hasMoreElements()) {
            result.add(superOpts.nextElement());
        }

        return result.elements();
    }

    @Override
    public void setOptions(String[] options) throws Exception {
        super.setOptions(options);

        setJobTrackerHost(Utils.getOption("jobtracker-host", options));
        setJobTrackerPort(Utils.getOption("jobtracker-port", options));
        setNumberOfMaps(Utils.getOption("num-maps", options));
        setTaskTrackerMapTasksMaximum(Utils.getOption("map-maximum", options));
        setNumberOfReducers(Utils.getOption("num-reducers", options));
        setInputPaths(Utils.getOption("input-paths", options));
        setOutputPath(Utils.getOption("output-path", options));
        setMapredMaxSplitSize(Utils.getOption("max-split-size", options));

        m_hdfsConfig.setOptions(options);
    }
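
    // Example (hypothetical values): the options parsed above correspond to a
    // command line fragment such as
    //
    //   -jobtracker-host rm.example.com -jobtracker-port 8032
    //   -num-reducers 4 -max-split-size 33554432
    //   -input-paths /user/weka/input -output-path /user/weka/output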

    @Override
    public String[] getOptions() {
        List<String> opts = new ArrayList<String>();

        String[] hdfsOpts = m_hdfsConfig.getOptions();
        for (String o : hdfsOpts) {
            opts.add(o);
        }

        opts.add("-jobtracker-host");
        opts.add(getJobTrackerHost());

        if (!DistributedJobConfig.isEmpty(getJobTrackerPort())) {
            opts.add("-jobtracker-port");
            opts.add(getJobTrackerPort());
        }

        if (!DistributedJobConfig.isEmpty(getNumberOfMaps())) {
            opts.add("-num-maps");
            opts.add(getNumberOfMaps());
        }

        if (!DistributedJobConfig.isEmpty(getTaskTrackerMapTasksMaximum())) {
            opts.add("-map-maximum");
            opts.add(getTaskTrackerMapTasksMaximum());
        }

        if (!DistributedJobConfig.isEmpty(getNumberOfReducers())) {
            opts.add("-num-reducers");
            opts.add(getNumberOfReducers());
        }

        if (!DistributedJobConfig.isEmpty(getInputPaths())) {
            opts.add("-input-paths");
            opts.add(getInputPaths());
        }

        if (!DistributedJobConfig.isEmpty(getOutputPath())) {
            opts.add("-output-path");
            opts.add(getOutputPath());
        }

        if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) {
            opts.add("-max-split-size");
            opts.add(getMapredMaxSplitSize());
        }

        String[] superOpts = super.getOptions();
        for (String s : superOpts) {
            opts.add(s);
        }

        return opts.toArray(new String[opts.size()]);
    }

    /**
     * Set the HDFSConfig to use
     * 
     * @param config the HDFSConfig to use
     */
    public void setHDFSConfig(HDFSConfig config) {
        m_hdfsConfig = config;
    }

    /**
     * Get the HDFSConfig to use
     * 
     * @return the HDFSConfig to use
     */
    public HDFSConfig getHDFSConfig() {
        return m_hdfsConfig;
    }

    /**
     * Get the tool tip text for this property
     * 
     * @return the tool tip text for this property
     */
    public String HDFSHostTipText() {
        return "The HDFS (name node) host to use";
    }

    /**
     * Set the HDFSHost (name node)
     * 
     * @param host the HDFS host
     */
    public void setHDFSHost(String host) {
        m_hdfsConfig.setHDFSHost(host);
    }

    /**
     * Get the HDFS host (name node)
     * 
     * @return the HDFS host
     */
    public String getHDFSHost() {
        return m_hdfsConfig.getHDFSHost();
    }

    /**
     * Get the tool tip text for this property
     * 
     * @return the tool tip text for this property
     */
    public String HDFSPortTipText() {
        return "The HDFS (name node) port";
    }

    /**
     * Set the HDFS port
     * 
     * @param port the HDFS port
     */
    public void setHDFSPort(String port) {
        m_hdfsConfig.setHDFSPort(port);
    }

    /**
     * Get the HDFS port
     * 
     * @return the HDFS port
     */
    public String getHDFSPort() {
        return m_hdfsConfig.getHDFSPort();
    }

    /**
     * Get the tool tip text for this property
     * 
     * @return the tool tip text for this property
     */
    public String numberOfMapsTipText() {
        return "The number of maps to use. This is just a hint to the underlying Hadoop"
                + " framework for how many maps to use. Using setMapredMaxSplitSize(), which"
                + " sets the Hadoop property mapred.max.split.size, gives greater control over"
                + " how many maps will be run (and thus how much data each map processes).";
    }
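
    // Worked example: with 1 GB of input and mapred.max.split.size set to
    // 67108864 (64 MB), Hadoop creates roughly 1024 MB / 64 MB = 16 input
    // splits, and therefore roughly 16 map tasks, whatever hint is given here.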

    /**
     * Set the number of maps to use. This is just a hint to the underlying Hadoop
     * framework for how many maps to use. Using setMapredMaxSplitSize(), which
     * sets the Hadoop property mapred.max.split.size, gives greater control over
     * how many maps will be run (and thus how much data each map processes).
     * 
     * @param nM the number of maps to use
     */
    public void setNumberOfMaps(String nM) {
        setProperty(NUM_MAPPERS, nM);
    }

    /**
     * Get the number of maps to use. This is just a hint to the underlying Hadoop
     * framework for how many maps to use. Using setMapredMaxSplitSize(), which
     * sets the Hadoop property mapred.max.split.size, gives greater control over
     * how many maps will be run (and thus how much data each map processes).
     * 
     * @return the number of maps to use
     */
    public String getNumberOfMaps() {
        return getProperty(NUM_MAPPERS);
    }

    /**
     * Get the tool tip text for this property
     * 
     * @return the tool tip text for this property
     */
    public String taskTrackerMapTasksMaximumTipText() {
        return "The maximum number of map tasks to run concurrently by a task tracker (node)";
    }

    /**
     * Set the maximum number of map tasks that a task tracker (node) will run
     * concurrently. The cluster setting for this will be used if not specified
     * here.
     * 
     * @param mmt the maximum number of map tasks to run concurrently on a task
     *          tracker
     */
    public void setTaskTrackerMapTasksMaximum(String mmt) {
        setProperty(TASK_TRACKER_MAP_MAXIMUM, mmt);
    }

    /**
     * Get the maximum number of map tasks that a task tracker (node) will run
     * concurrently. The cluster setting for this will be used if not specified
     * here.
     * 
     * @return the maximum number of map tasks to run concurrently on a task
     *         tracker
     */
    public String getTaskTrackerMapTasksMaximum() {
        return getProperty(TASK_TRACKER_MAP_MAXIMUM);
    }

    /**
     * Get the tool tip text for this property
     * 
     * @return the tool tip text for this property
     */
    public String numberOfReducersTipText() {
        return "The number of reducers to use. Weka jobs set this property automatically.";
    }

    /**
     * Set the number of reducers to use. Weka jobs set this property
     * automatically
     * 
     * @param nR the number of reducers to use.
     */
    public void setNumberOfReducers(String nR) {
        setProperty(NUM_REDUCERS, nR);
    }

    /**
     * Get the number of reducers to use. Weka jobs set this property
     * automatically
     * 
     * @return the number of reducers to use.
     */
    public String getNumberOfReducers() {
        return getProperty(NUM_REDUCERS);
    }

    /**
     * Set the mapper class name to use. Weka jobs configure this automatically.
     * 
     * @param mapperClass the mapper class name
     */
    public void setMapperClass(String mapperClass) {
        setProperty(MAPPER_CLASS, mapperClass);
    }

    /**
     * Get the mapper class name to use. Weka jobs configure this automatically.
     * 
     * @return the mapper class name
     */
    public String getMapperClass() {
        return getProperty(MAPPER_CLASS);
    }

    /**
     * Set the name of the reducer class to use. Weka jobs set this automatically.
     * 
     * @param reducerClass the name of the reducer class
     */
    public void setReducerClass(String reducerClass) {
        setProperty(REDUCER_CLASS, reducerClass);
    }

    /**
     * Get the name of the reducer class to use. Weka jobs set this automatically.
     * 
     * @return the name of the reducer class
     */
    public String getReducerClass() {
        return getProperty(REDUCER_CLASS);
    }

    /**
     * Set the name of the combiner class (if any) to use. Weka jobs may set
     * this automatically.
     * 
     * @param combinerClass the name of the combiner class to use
     */
    public void setCombinerClass(String combinerClass) {
        setProperty(COMBINER_CLASS, combinerClass);
    }

    /**
     * Get the name of the combiner class (if any) to use. Weka jobs may set
     * this automatically.
     * 
     * @return the name of the combiner class to use
     */
    public String getCombinerClass() {
        return getProperty(COMBINER_CLASS);
    }

    /**
     * Set the name of the input format class to use. Weka jobs set this
     * automatically.
     * 
     * @param inputFormatClass the name of the input format class to use
     */
    public void setInputFormatClass(String inputFormatClass) {
        setProperty(INPUT_FORMAT_CLASS, inputFormatClass);
    }

    /**
     * Get the name of the input format class to use. Weka jobs set this
     * automatically.
     * 
     * @return the name of the input format class to use
     */
    public String getInputFormatClass() {
        return getProperty(INPUT_FORMAT_CLASS);
    }

    /**
     * Set the name of the output format class to use. Weka jobs set this
     * automatically.
     * 
     * @param outputFormatClass the name of the output format class to use.
     */
    public void setOutputFormatClass(String outputFormatClass) {
        setProperty(OUTPUT_FORMAT_CLASS, outputFormatClass);
    }

    /**
     * Get the name of the output format class to use. Weka jobs set this
     * automatically.
     * 
     * @return the name of the output format class to use.
     */
    public String getOutputFormatClass() {
        return getProperty(OUTPUT_FORMAT_CLASS);
    }

    /**
     * Set the name of the map output key class to use. Weka jobs set this
     * automatically.
     * 
     * @param mapOutputKeyClass the name of the map output key class
     */
    public void setMapOutputKeyClass(String mapOutputKeyClass) {
        setProperty(MAP_OUTPUT_KEY_CLASS, mapOutputKeyClass);
    }

    /**
     * Get the name of the map output key class to use. Weka jobs set this
     * automatically.
     * 
     * @return the name of the map output key class
     */
    public String getMapOutputKeyClass() {
        return getProperty(MAP_OUTPUT_KEY_CLASS);
    }

    /**
     * Set the name of the map output value class to use. Weka jobs set this
     * automatically.
     * 
     * @param mapOutputValueClass the name of the map output value class
     */
    public void setMapOutputValueClass(String mapOutputValueClass) {
        setProperty(MAP_OUTPUT_VALUE_CLASS, mapOutputValueClass);
    }

    /**
     * Get the name of the map output value class to use. Weka jobs set this
     * automatically.
     * 
     * @return the name of the map output value class
     */
    public String getMapOutputValueClass() {
        return getProperty(MAP_OUTPUT_VALUE_CLASS);
    }

    /**
     * Set the name of the (reducer) output key class to use
     * 
     * @param outputKeyClass the name of the output key class to use
     */
    public void setOutputKeyClass(String outputKeyClass) {
        setProperty(OUTPUT_KEY_CLASS, outputKeyClass);
    }

    /**
     * Get the name of the (reducer) output key class to use
     * 
     * @return the name of the output key class to use
     */
    public String getOutputKeyClass() {
        return getProperty(OUTPUT_KEY_CLASS);
    }

    /**
     * Set the name of the (reducer) output value class to use
     * 
     * @param outputValueClass the name of the output value class to use
     */
    public void setOutputValueClass(String outputValueClass) {
        setProperty(OUTPUT_VALUE_CLASS, outputValueClass);
    }

    /**
     * Get the name of the (reducer) output value class to use
     * 
     * @return the name of the output value class to use
     */
    public String getOutputValueClass() {
        return getProperty(OUTPUT_VALUE_CLASS);
    }

    /**
     * Get the tip text for this property
     * 
     * @return the tip text for this property
     */
    public String inputPathsTipText() {
        return "The path to the directory in HDFS that contains the input files";
    }

    /**
     * Set the input path(s) to use
     * 
     * @param inputPaths the input paths to use
     */
    public void setInputPaths(String inputPaths) {
        setProperty(INPUT_PATHS, inputPaths);
    }

    /**
     * Get the input path(s) to use
     * 
     * @return the input paths to use
     */
    public String getInputPaths() {
        return getProperty(INPUT_PATHS);
    }

    /**
     * Get the tip text for this property
     * 
     * @return the tip text for this property
     */
    public String outputPathTipText() {
        return "The path in HDFS to the output directory";
    }

    /**
     * Set the output path to use
     * 
     * @param outputPath the output path to use
     */
    public void setOutputPath(String outputPath) {
        setProperty(OUTPUT_PATH, outputPath);
    }

    /**
     * Get the output path to use
     * 
     * @return the output path to use
     */
    public String getOutputPath() {
        return getProperty(OUTPUT_PATH);
    }

    /**
     * Set the maximum split size (in bytes). This can be used to control the
     * number of maps that run. The default block size of 64 MB may be too large
     * for some batch learning tasks depending on data characteristics, choice of
     * learning algorithm and available RAM on the node.
     * 
     * @param maxSize the maximum split size (in bytes)
     */
    public void setMapredMaxSplitSize(String maxSize) {
        setProperty(MAPRED_MAX_SPLIT_SIZE, maxSize);
    }

    /**
     * Get the maximum split size (in bytes). This can be used to control the
     * number of maps that run. The default block size of 64 MB may be too large
     * for some batch learning tasks depending on data characteristics, choice of
     * learning algorithm and available RAM on the node.
     * 
     * @return the maximum split size (in bytes)
     */
    public String getMapredMaxSplitSize() {
        return getProperty(MAPRED_MAX_SPLIT_SIZE);
    }

    /**
     * Substitute environment variables in a given string
     * 
     * @param s the string to substitute in
     * @param env environment variables
     * @return the string with variables replaced
     */
    protected static String environmentSubstitute(String s, Environment env) {
        if (env != null) {
            try {
                s = env.substitute(s);
            } catch (Exception ex) {
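                // substitution is best-effort; if it fails, fall back to the
                // original string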
            }
        }

        return s;
    }

    /**
     * Apply the settings encapsulated in this config and return a Job object
     * ready for execution.
     * 
     * @param jobName the name of the job
     * @param conf the Configuration object that will be wrapped in the Job
     * @param env environment variables
     * @return a configured Job object
     * @throws IOException if a problem occurs
     * @throws ClassNotFoundException if various classes are not found
     */
    public Job configureForHadoop(String jobName, Configuration conf, Environment env)
            throws IOException, ClassNotFoundException {

        String jobTrackerPort = getJobTrackerPort();
        if (DistributedJobConfig.isEmpty(jobTrackerPort)) {
            jobTrackerPort = AbstractHadoopJobConfig.isHadoop2() ? AbstractHadoopJobConfig.DEFAULT_PORT_YARN
                    : AbstractHadoopJobConfig.DEFAULT_PORT;
        }
        String jobTracker = getJobTrackerHost() + ":" + jobTrackerPort;
        // test the host rather than the concatenated host:port string, which
        // can never be empty
        if (DistributedJobConfig.isEmpty(getJobTrackerHost())) {
            System.err.println("No " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager " : "JobTracker ")
                    + "set - running locally...");
        } else {
            jobTracker = environmentSubstitute(jobTracker, env);
            if (AbstractHadoopJobConfig.isHadoop2()) {
                conf.set(YARN_RESOURCE_MANAGER_ADDRESS, jobTracker);
                conf.set(YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS,
                        environmentSubstitute(getJobTrackerHost(), env) + ":8030");
            } else {
                conf.set(HADOOP_JOB_TRACKER_HOST, jobTracker);
            }
        }
        System.err.println("Using " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager: " : "jobtracker: ")
                + jobTracker);

        if (AbstractHadoopJobConfig.isHadoop2()) {
            // a few other properties needed to run against Yarn
            conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");
            conf.set("mapreduce.framework.name", "yarn");
        }

        if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) {
            conf.set(AbstractHadoopJobConfig.isHadoop2() ? HADOOP2_MAPRED_MAX_SPLIT_SIZE
                    : HADOOP_MAPRED_MAX_SPLIT_SIZE, getMapredMaxSplitSize());
        }

        // Do any user supplied properties here before creating the Job
        for (Map.Entry<String, String> e : m_additionalUserSuppliedProperties.entrySet()) {
            conf.set(e.getKey(), e.getValue());
        }

        m_hdfsConfig.configureForHadoop(conf, env);
        Job job = new Job(conf, jobName);

        String numMappers = getNumberOfMaps();
        if (!DistributedJobConfig.isEmpty(numMappers)) {
            numMappers = environmentSubstitute(numMappers, env);
            ((JobConf) job.getConfiguration()).setNumMapTasks(Integer.parseInt(numMappers));
        }

        // The number of map tasks that will be run simultaneously by a task tracker
        String maxConcurrentMapTasks = getTaskTrackerMapTasksMaximum();
        if (!DistributedJobConfig.isEmpty(maxConcurrentMapTasks)) {
            ((JobConf) job.getConfiguration()).set("mapred.tasktracker.map.tasks.maximum", maxConcurrentMapTasks);
        }

        String numReducers = getNumberOfReducers();
        if (!DistributedJobConfig.isEmpty(numReducers)) {
            numReducers = environmentSubstitute(numReducers, env);
            job.setNumReduceTasks(Integer.parseInt(numReducers));

            if (Integer.parseInt(numReducers) == 0) {
                System.err.println("Warning - no reducer class set. Configuring for a map only job");
            }
        } else {
            job.setNumReduceTasks(1);
        }
        String mapperClass = getMapperClass();
        if (DistributedJobConfig.isEmpty(mapperClass)) {
            throw new IOException("No mapper class specified!");
        }
        mapperClass = environmentSubstitute(mapperClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Mapper> mc = (Class<? extends Mapper>) Class.forName(mapperClass);

        job.setMapperClass(mc);

        String reducerClass = getReducerClass();
        // check against the reduce task count already set on the job, as
        // numReducers may be an empty string at this point
        if (DistributedJobConfig.isEmpty(reducerClass) && job.getNumReduceTasks() > 0) {
            throw new IOException("No reducer class specified!");
        } else if (job.getNumReduceTasks() > 0) {
            reducerClass = environmentSubstitute(reducerClass, env);

            @SuppressWarnings("unchecked")
            Class<? extends Reducer> rc = (Class<? extends Reducer>) Class.forName(reducerClass);

            job.setReducerClass(rc);
        }

        String combinerClass = getCombinerClass();
        if (!DistributedJobConfig.isEmpty(combinerClass)) {
            combinerClass = environmentSubstitute(combinerClass, env);

            @SuppressWarnings("unchecked")
            Class<? extends Reducer> cc = (Class<? extends Reducer>) Class.forName(combinerClass);

            job.setCombinerClass(cc);
        }

        String inputFormatClass = getInputFormatClass();
        if (DistributedJobConfig.isEmpty(inputFormatClass)) {
            throw new IOException("No input format class specified");
        }
        inputFormatClass = environmentSubstitute(inputFormatClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends InputFormat> ifc = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

        job.setInputFormatClass(ifc);

        String outputFormatClass = getOutputFormatClass();
        if (DistributedJobConfig.isEmpty(outputFormatClass)) {
            throw new IOException("No output format class specified");
        }
        outputFormatClass = environmentSubstitute(outputFormatClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends OutputFormat> ofc = (Class<? extends OutputFormat>) Class.forName(outputFormatClass);
        job.setOutputFormatClass(ofc);

        String mapOutputKeyClass = getMapOutputKeyClass();
        if (DistributedJobConfig.isEmpty(mapOutputKeyClass)) {
            throw new IOException("No map output key class defined");
        }
        mapOutputKeyClass = environmentSubstitute(mapOutputKeyClass, env);
        Class<?> mokc = Class.forName(mapOutputKeyClass);
        job.setMapOutputKeyClass(mokc);

        String mapOutputValueClass = getMapOutputValueClass();
        if (DistributedJobConfig.isEmpty(mapOutputValueClass)) {
            throw new IOException("No map output value class defined");
        }
        mapOutputValueClass = environmentSubstitute(mapOutputValueClass, env);
        Class<?> movc = Class.forName(mapOutputValueClass);
        job.setMapOutputValueClass(movc);

        String outputKeyClass = getOutputKeyClass();
        if (DistributedJobConfig.isEmpty(outputKeyClass)) {
            throw new IOException("No output key class defined");
        }
        outputKeyClass = environmentSubstitute(outputKeyClass, env);
        Class<?> okc = Class.forName(outputKeyClass);
        job.setOutputKeyClass(okc);

        String outputValueClass = getOutputValueClass();
        if (DistributedJobConfig.isEmpty(outputValueClass)) {
            throw new IOException("No output value class defined");
        }
        outputValueClass = environmentSubstitute(outputValueClass, env);
        Class<?> ovc = Class.forName(outputValueClass);
        job.setOutputValueClass(ovc);

        String inputPaths = getInputPaths();
        // don't complain if there aren't any, as input sources such as HBase
        // require other properties to be set
        if (!DistributedJobConfig.isEmpty(inputPaths)) {
            inputPaths = environmentSubstitute(inputPaths, env);
            FileInputFormat.setInputPaths(job, inputPaths);
        }

        String outputPath = getOutputPath();
        if (DistributedJobConfig.isEmpty(outputPath)) {
            throw new IOException("No output path specified");
        }
        outputPath = environmentSubstitute(outputPath, env);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Clean the output directory specified for the supplied job
     * 
     * @param job the job to clean the output directory for
     * @param env environment variables
     * @throws IOException if a problem occurs
     */
    public void deleteOutputDirectory(Job job, Environment env) throws IOException {
        Configuration conf = job.getConfiguration();
        String outputDir = getOutputPath();

        if (DistributedJobConfig.isEmpty(outputDir)) {
            throw new IOException("Can't delete output path - no path defined!");
        }

        HDFSUtils.deleteDirectory(getHDFSConfig(), conf, outputDir, env);
    }
}
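
Example usage

Below is a minimal, hypothetical driver sketch showing how this configuration class might be used. The class names com.example.ExampleMapper and com.example.ExampleReducer, the host names, and the HDFS paths are placeholders, not part of the Weka API; everything else follows the methods in the listing above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import weka.core.Environment;

import distributed.hadoop.MapReduceJobConfig;

public class ExampleDriver {

    public static void main(String[] args) throws Exception {
        MapReduceJobConfig config = new MapReduceJobConfig();

        // required: a mapper and (for a non map-only job) a reducer
        config.setMapperClass("com.example.ExampleMapper");
        config.setReducerClass("com.example.ExampleReducer");
        config.setNumberOfReducers("1");

        // cluster endpoints (hypothetical hosts); the constructor defaults
        // both to localhost
        config.setJobTrackerHost("rm.example.com");
        config.setHDFSHost("namenode.example.com");

        // input/output locations in HDFS (hypothetical paths)
        config.setInputPaths("/user/weka/input");
        config.setOutputPath("/user/weka/output");

        Environment env = Environment.getSystemWide();
        Configuration conf = new Configuration();

        // build a fully configured Job, clean any previous output and run
        Job job = config.configureForHadoop("example job", conf, env);
        config.deleteOutputDirectory(job, env);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that the constructor's defaults (TextInputFormat/TextOutputFormat, Text keys and a BytesWritable map output value) apply unless overridden, so the hypothetical mapper and reducer above would need matching key/value types.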