it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java Source code

Introduction

Here is the source code for it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java. The same listing also defines the public Submitter tool class that uses it.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package it.crs4.pydoop.mapreduce.pipes;

import java.util.Properties;
import java.io.InputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.security.AccessController;
import java.security.PrivilegedAction;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.Parser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * A command line parser for the CLI-based Pipes job submitter.
 */
class CommandLineParser {
    private Options options = new Options();

    CommandLineParser() {
        addOption("input", false, "input path to the maps", "path");
        addOption("output", false, "output path from the reduces", "path");
        addOption("jar", false, "job jar file", "path");
        addOption("inputformat", false, "java classname of InputFormat", "class");
        addOption("map", false, "java classname of Mapper", "class");
        addOption("partitioner", false, "java classname of Partitioner", "class");
        addOption("reduce", false, "java classname of Reducer", "class");
        addOption("writer", false, "java classname of OutputFormat", "class");
        addOption("program", false, "URI to application executable", "class");
        addOption("reduces", false, "number of reduces", "num");
        addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
        addOption("avroInput", false, "avro input mode", "boolean");
        addOption("avroOutput", false, "avro output mode", "boolean");
    }

    void addOption(String longName, boolean required, String description, String paramName) {
        Option option = OptionBuilder.withArgName(paramName).hasArgs(1).withDescription(description)
                .isRequired(required).create(longName);
        options.addOption(option);
    }

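    // Note: addArgument() below is not called anywhere in this file; the
    // submitter registers its options through addOption() in the constructor.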
    void addArgument(String name, boolean required, String description) {
        Option option = OptionBuilder.withArgName(name).hasArgs(1).withDescription(description).isRequired(required)
                .create();
        options.addOption(option);
    }

    CommandLine parse(Configuration conf, String[] args) throws IOException, ParseException {
        Parser parser = new BasicParser();
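        // Mark generic option parsing as done; Hadoop logs a "use
        // GenericOptionsParser / implement Tool" warning at submission
        // time when this flag is not set.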
        conf.setBoolean("mapreduce.client.genericoptionsparser.used", true);
        GenericOptionsParser genericParser = new GenericOptionsParser(conf, args);
        return parser.parse(options, genericParser.getRemainingArgs());
    }

    void printUsage() {
        // The CLI package should do this for us, but I can't figure out how
        // to make it print something reasonable.
        System.out.println("bin/hadoop pipes");
        System.out.println("  [-input <path>] // Input directory");
        System.out.println("  [-output <path>] // Output directory");
        System.out.println("  [-jar <jar file> // jar filename");
        System.out.println("  [-inputformat <class>] // InputFormat class");
        System.out.println("  [-map <class>] // Java Map class");
        System.out.println("  [-partitioner <class>] // Java Partitioner");
        System.out.println("  [-reduce <class>] // Java Reduce class");
        System.out.println("  [-writer <class>] // Java RecordWriter");
        System.out.println("  [-program <executable>] // executable URI");
        System.out.println("  [-reduces <num>] // number of reduces");
        System.out.println("  [-lazyOutput <true/false>] // createOutputLazily");
        System.out.println("  [-avroInput <k/v/kv>] // avro input");
        System.out.println("  [-avroOutput <k/v/kv>] // avro output");
        System.out.println();
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }
}

public class Submitter extends Configured implements Tool {

    public static enum AvroIO {
        K, // {Input,Output}Format key type is avro record
        V, // {Input,Output}Format value type is avro record
        KV, // {Input,Output}Format {key,value} type is avro record
    }

    protected static final Log LOG = LogFactory.getLog(Submitter.class);
    protected static final String PROP_FILE = "pydoop.properties";
    protected static AvroIO avroInput;
    protected static AvroIO avroOutput;
    protected static boolean explicitInputFormat = false;
    protected static boolean explicitOutputFormat = false;
    // --- pydoop properties ---
    protected static Properties props;

    public static final String PRESERVE_COMMANDFILE = "mapreduce.pipes.commandfile.preserve";
    public static final String EXECUTABLE = "mapreduce.pipes.executable";
    public static final String INTERPRETOR = "mapreduce.pipes.executable.interpretor";
    public static final String IS_JAVA_MAP = "mapreduce.pipes.isjavamapper";
    public static final String IS_JAVA_RR = "mapreduce.pipes.isjavarecordreader";
    public static final String IS_JAVA_RW = "mapreduce.pipes.isjavarecordwriter";
    public static final String IS_JAVA_REDUCE = "mapreduce.pipes.isjavareducer";
    public static final String PARTITIONER = "mapreduce.pipes.partitioner";
    public static final String INPUT_FORMAT = "mapreduce.pipes.inputformat";
    public static final String OUTPUT_FORMAT = "mapreduce.pipes.outputformat";
    public static final String PORT = "mapreduce.pipes.command.port";

    public static Properties getPydoopProperties() {
        Properties properties = new Properties();
        InputStream stream = Submitter.class.getResourceAsStream(PROP_FILE);
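        // getResourceAsStream() returns null when the resource is missing; in
        // that case properties.load(stream) throws the NPE handled below.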
        try {
            properties.load(stream);
            stream.close();
        } catch (NullPointerException e) {
            throw new RuntimeException("Could not find " + PROP_FILE);
        } catch (IOException e) {
            throw new RuntimeException("Could not read " + PROP_FILE);
        }
        return properties;
    }

    public Submitter() {
        super();
        props = getPydoopProperties();
    }

    /**
     * Get the URI of the application's executable.
     * @param conf the configuration to check
     * @return the URI where the application's executable is located
     */
    public static String getExecutable(Configuration conf) {
        return conf.get(Submitter.EXECUTABLE);
    }

    /**
     * Set the URI for the application's executable. Normally this is a hdfs:
     * location.
     * @param conf the configuration to modify
     * @param executable The URI of the application's executable.
     */
    public static void setExecutable(Configuration conf, String executable) {
        conf.set(Submitter.EXECUTABLE, executable);
    }

    /**
     * Set whether the job is using a Java RecordReader.
     * @param conf the configuration to modify
     * @param value the new value
     */
    public static void setIsJavaRecordReader(Configuration conf, boolean value) {
        conf.setBoolean(Submitter.IS_JAVA_RR, value);
    }

    /**
     * Check whether the job is using a Java RecordReader
     * @param conf the configuration to check
     * @return is it a Java RecordReader?
     */
    public static boolean getIsJavaRecordReader(Configuration conf) {
        return conf.getBoolean(Submitter.IS_JAVA_RR, false);
    }

    /**
     * Set whether the Mapper is written in Java.
     * @param conf the configuration to modify
     * @param value the new value
     */
    public static void setIsJavaMapper(Configuration conf, boolean value) {
        conf.setBoolean(Submitter.IS_JAVA_MAP, value);
    }

    /**
     * Check whether the job is using a Java Mapper.
     * @param conf the configuration to check
     * @return is it a Java Mapper?
     */
    public static boolean getIsJavaMapper(Configuration conf) {
        return conf.getBoolean(Submitter.IS_JAVA_MAP, false);
    }

    /**
     * Set whether the Reducer is written in Java.
     * @param conf the configuration to modify
     * @param value the new value
     */
    public static void setIsJavaReducer(Configuration conf, boolean value) {
        conf.setBoolean(Submitter.IS_JAVA_REDUCE, value);
    }

    /**
     * Check whether the job is using a Java Reducer.
     * @param conf the configuration to check
     * @return is it a Java Reducer?
     */
    public static boolean getIsJavaReducer(Configuration conf) {
        return conf.getBoolean(Submitter.IS_JAVA_REDUCE, false);
    }

    /**
     * Set whether the job will use a Java RecordWriter.
     * @param conf the configuration to modify
     * @param value the new value to set
     */
    public static void setIsJavaRecordWriter(Configuration conf, boolean value) {
        conf.setBoolean(Submitter.IS_JAVA_RW, value);
    }

    /**
     * Will the reduce use a Java RecordWriter?
     * @param conf the configuration to check
     * @return true, if the output of the job will be written by Java
     */
    public static boolean getIsJavaRecordWriter(Configuration conf) {
        return conf.getBoolean(Submitter.IS_JAVA_RW, false);
    }

    /**
     * Set the configuration, if it doesn't already have a value for the given
     * key.
     * @param conf the configuration to modify
     * @param key the key to set
     * @param value the new "default" value to set
     */
    private static void setIfUnset(Configuration conf, String key, String value) {
        if (conf.get(key) == null) {
            conf.set(key, value);
        }
    }

    /**
     * Save away the user's original partitioner before we override it.
     * @param conf the configuration to modify
     * @param cls the user's partitioner class
     */
    static void setJavaPartitioner(Configuration conf, Class cls) {
        conf.set(Submitter.PARTITIONER, cls.getName());
    }

    /**
     * Get the user's original partitioner.
     * @param conf the configuration to look in
     * @return the class that the user submitted
     */
    static Class<? extends Partitioner> getJavaPartitioner(Configuration conf) {
        return conf.getClass(Submitter.PARTITIONER, HashPartitioner.class, Partitioner.class);
    }

    private static <InterfaceType> Class<? extends InterfaceType> getClass(CommandLine cl, String key,
            Configuration conf, Class<InterfaceType> cls) throws ClassNotFoundException {
        return conf.getClassByName(cl.getOptionValue(key)).asSubclass(cls);
    }

    /**
     * Does the user want to keep the command file for debugging? If
     * this is true, pipes will write a copy of the command data to a
     * file in the task directory named "downlink.data", which may be
     * used to run the C++ program under the debugger. You probably also
     * want to set Configuration.setKeepFailedTaskFiles(true) to keep
     * the entire directory from being deleted.  To run using the data
     * file, set the environment variable "mapreduce.pipes.commandfile"
     * to point to the file.
     * @param conf the configuration to check
     * @return will the framework save the command file?
     */
    public static boolean getKeepCommandFile(Configuration conf) {
        return conf.getBoolean(Submitter.PRESERVE_COMMANDFILE, false);
    }

    /**
     * Set whether to keep the command file for debugging
     * @param conf the configuration to modify
     * @param keep the new value
     */
    public static void setKeepCommandFile(Configuration conf, boolean keep) {
        conf.setBoolean(Submitter.PRESERVE_COMMANDFILE, keep);
    }

    private static void setupPipesJob(Job job) throws IOException, ClassNotFoundException {
        Configuration conf = job.getConfiguration();
        // default map output types to Text
        if (!getIsJavaMapper(conf)) {
            job.setMapperClass(PipesMapper.class);
            // Save the user's partitioner and hook in ours.
            setJavaPartitioner(conf, job.getPartitionerClass());
            job.setPartitionerClass(PipesPartitioner.class);
        }
        if (!getIsJavaReducer(conf)) {
            job.setReducerClass(PipesReducer.class);
            if (!getIsJavaRecordWriter(conf)) {
                job.setOutputFormatClass(NullOutputFormat.class);
            }
        }
        String textClassname = Text.class.getName();
        setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
        setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
        setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
        setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

        // Use PipesNonJavaInputFormat if necessary to handle progress reporting
        // from C++ RecordReaders ...
        if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
            conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
            job.setInputFormatClass(PipesNonJavaInputFormat.class);
        }

        if (avroInput != null) {
            if (explicitInputFormat) {
                conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
            } // else let the bridge fall back to the appropriate Avro IF
            switch (avroInput) {
            case K:
                job.setInputFormatClass(PydoopAvroInputKeyBridge.class);
                break;
            case V:
                job.setInputFormatClass(PydoopAvroInputValueBridge.class);
                break;
            case KV:
                job.setInputFormatClass(PydoopAvroInputKeyValueBridge.class);
                break;
            default:
                throw new IllegalArgumentException("Bad Avro input type");
            }
        }
        if (avroOutput != null) {
            if (explicitOutputFormat) {
                conf.setClass(Submitter.OUTPUT_FORMAT, job.getOutputFormatClass(), OutputFormat.class);
            } // else let the bridge fall back to the appropriate Avro OF
            conf.set(props.getProperty("AVRO_OUTPUT"), avroOutput.name());
            switch (avroOutput) {
            case K:
                job.setOutputFormatClass(PydoopAvroOutputKeyBridge.class);
                break;
            case V:
                job.setOutputFormatClass(PydoopAvroOutputValueBridge.class);
                break;
            case KV:
                job.setOutputFormatClass(PydoopAvroOutputKeyValueBridge.class);
                break;
            default:
                throw new IllegalArgumentException("Bad Avro output type");
            }
        }

        String exec = getExecutable(conf);
        if (exec == null) {
            String msg = "No application program defined.";
            throw new IllegalArgumentException(msg);
        }
        // add default debug script only when executable is expressed as
        // <path>#<executable>
        // FIXME: this is of little use if the pipes program is not written in C++
        if (exec.contains("#")) {
            // set default gdb commands for map and reduce task
            String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
            setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
            setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
        }
        URI[] fileCache = DistributedCache.getCacheFiles(conf);
        if (fileCache == null) {
            fileCache = new URI[1];
        } else {
            URI[] tmp = new URI[fileCache.length + 1];
            System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
            fileCache = tmp;
        }
        try {
            fileCache[0] = new URI(exec);
        } catch (URISyntaxException e) {
            String msg = "Problem parsing executable URI " + exec;
            IOException ie = new IOException(msg);
            ie.initCause(e);
            throw ie;
        }
        DistributedCache.setCacheFiles(fileCache, conf);
    }

    public int run(String[] args) throws Exception {
        CommandLineParser cli = new CommandLineParser();
        if (args.length == 0) {
            cli.printUsage();
            return 1;
        }
        try {
            Job job = new Job(new Configuration());
            job.setJobName(getClass().getName());
            Configuration conf = job.getConfiguration();
            CommandLine results = cli.parse(conf, args);
            if (results.hasOption("input")) {
                Path path = new Path(results.getOptionValue("input"));
                FileInputFormat.setInputPaths(job, path);
            }
            if (results.hasOption("output")) {
                Path path = new Path(results.getOptionValue("output"));
                FileOutputFormat.setOutputPath(job, path);
            }
            if (results.hasOption("jar")) {
                job.setJar(results.getOptionValue("jar"));
            }
            if (results.hasOption("inputformat")) {
                explicitInputFormat = true;
                setIsJavaRecordReader(conf, true);
                job.setInputFormatClass(getClass(results, "inputformat", conf, InputFormat.class));
            }
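            // Note: "javareader" is not registered in CommandLineParser, so
            // this check cannot match with the current option set.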
            if (results.hasOption("javareader")) {
                setIsJavaRecordReader(conf, true);
            }
            if (results.hasOption("map")) {
                setIsJavaMapper(conf, true);
                job.setMapperClass(getClass(results, "map", conf, Mapper.class));
            }
            if (results.hasOption("partitioner")) {
                job.setPartitionerClass(getClass(results, "partitioner", conf, Partitioner.class));
            }
            if (results.hasOption("reduce")) {
                setIsJavaReducer(conf, true);
                job.setReducerClass(getClass(results, "reduce", conf, Reducer.class));
            }
            if (results.hasOption("reduces")) {
                job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
            }
            if (results.hasOption("writer")) {
                explicitOutputFormat = true;
                setIsJavaRecordWriter(conf, true);
                job.setOutputFormatClass(getClass(results, "writer", conf, OutputFormat.class));
            }
            if (results.hasOption("lazyOutput")) {
                if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                    LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormatClass());
                }
            }
            if (results.hasOption("avroInput")) {
                avroInput = AvroIO.valueOf(results.getOptionValue("avroInput").toUpperCase());
            }
            if (results.hasOption("avroOutput")) {
                avroOutput = AvroIO.valueOf(results.getOptionValue("avroOutput").toUpperCase());
            }

            if (results.hasOption("program")) {
                setExecutable(conf, results.getOptionValue("program"));
            }
            // if they gave us a jar file, include it into the class path
            String jarFile = job.getJar();
            if (jarFile != null) {
                final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() };
                // FindBugs complains that creating a URLClassLoader should be
                // in a doPrivileged() block.
                ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                    public ClassLoader run() {
                        return new URLClassLoader(urls);
                    }
                });
                conf.setClassLoader(loader);
            }
            setupPipesJob(job);
            return job.waitForCompletion(true) ? 0 : 1;
        } catch (ParseException pe) {
            LOG.info("Error : " + pe);
            cli.printUsage();
            return 1;
        }
    }

    public static void main(String[] args) throws Exception {
        int exitCode = new Submitter().run(args);
        ExitUtil.terminate(exitCode);
    }
}
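
Usage

For reference, here is a minimal, illustrative sketch of how a Tool such as the Submitter above can be driven programmatically. ToolRunner is Hadoop's standard helper for running a Tool; the class name, paths, and executable URI below are placeholders, and this snippet is not part of the file shown above.

import org.apache.hadoop.util.ToolRunner;

import it.crs4.pydoop.mapreduce.pipes.Submitter;

public class SubmitterUsageSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder arguments; a real job would pass actual HDFS paths and
        // the URI of the pipes executable (see printUsage() above).
        String[] pipesArgs = {
            "-input", "/user/example/in",
            "-output", "/user/example/out",
            "-program", "hdfs:///user/example/bin/app",
            "-reduces", "2"
        };
        // ToolRunner strips generic Hadoop options (-D, -fs, ...) before
        // delegating the remaining arguments to Submitter.run().
        int exitCode = ToolRunner.run(new Submitter(), pipesArgs);
        System.exit(exitCode);
    }
}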