org.apache.hama.pipes.Submitter.java Source code

Introduction

Here is the source code for org.apache.hama.pipes.Submitter.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hama.pipes;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.StringTokenizer;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.Parser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hama.Constants;
import org.apache.hama.HamaConfiguration;
import org.apache.hama.bsp.BSPJob;
import org.apache.hama.bsp.BSPJobClient;
import org.apache.hama.bsp.FileInputFormat;
import org.apache.hama.bsp.FileOutputFormat;
import org.apache.hama.bsp.HashPartitioner;
import org.apache.hama.bsp.InputFormat;
import org.apache.hama.bsp.OutputFormat;
import org.apache.hama.bsp.Partitioner;
import org.apache.hama.pipes.util.DistributedCacheUtil;

import com.google.common.base.Joiner;

/**
 * The main entry point and job submitter. It may either be used as a command
 * line-based or API-based method to launch Pipes jobs.
 * 
 * Adapted from Hadoop Pipes.
 * 
 */
public class Submitter implements Tool {

    protected static final Log LOG = LogFactory.getLog(Submitter.class);
    private HamaConfiguration conf;

    public Submitter() {
        this.conf = new HamaConfiguration();
    }

    public Submitter(HamaConfiguration conf) {
        setConf(conf);
    }

    @Override
    public HamaConfiguration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = (HamaConfiguration) conf;
    }

    /**
     * Get the URI of the CPU application's executable.
     * 
     * @param conf
     * @return the URI where the application's executable is located
     */
    public static String getExecutable(Configuration conf) {
        return conf.get("hama.pipes.executable");
    }

    /**
     * Set the URI for the CPU application's executable. Normally this is a hdfs:
     * location.
     * 
     * @param conf
     * @param executable The URI of the application's executable.
     */
    public static void setExecutable(Configuration conf, String executable) {
        conf.set("hama.pipes.executable", executable);
    }

    /**
     * Set the configuration, if it doesn't already have a value for the given
     * key.
     * 
     * @param conf the configuration to modify
     * @param key the key to set
     * @param value the new "default" value to set
     */
    private static void setIfUnset(Configuration conf, String key, String value) {
        if (conf.get(key) == null) {
            conf.set(key, value);
        }
    }

    /**
     * Save away the user's original partitioner before we override it.
     * 
     * @param conf the configuration to modify
     * @param cls the user's partitioner class
     */
    static void setJavaPartitioner(Configuration conf, Class<?> cls) {
        conf.set("hama.pipes.partitioner", cls.getName());
    }

    /**
     * Get the user's original partitioner.
     * 
     * @param conf the configuration to look in
     * @return the class that the user submitted
     */
    @SuppressWarnings("rawtypes")
    static Class<? extends Partitioner> getJavaPartitioner(Configuration conf) {
        return conf.getClass("hama.pipes.partitioner", HashPartitioner.class, Partitioner.class);
    }

    /**
     * Does the user want to keep the command file for debugging? If this is true,
     * pipes will write a copy of the command data to a file in the task directory
     * named "downlink.data", which may be used to run the C++ program under the
     * debugger. You probably also want to set
     * JobConf.setKeepFailedTaskFiles(true) to keep the entire directory from
     * being deleted. To run using the data file, set the environment variable
     * "hadoop.pipes.command.file" to point to the file.
     * 
     * @param conf the configuration to check
     * @return will the framework save the command file?
     */
    public static boolean getKeepCommandFile(Configuration conf) {
        return conf.getBoolean("hama.pipes.command-file.keep", false);
    }

    /**
     * Set whether to keep the command file for debugging
     * 
     * @param conf the configuration to modify
     * @param keep the new value
     */
    public static void setKeepCommandFile(Configuration conf, boolean keep) {
        conf.setBoolean("hama.pipes.command-file.keep", keep);
    }

    /**
     * Submit a job to the cluster. All of the necessary modifications to the job
     * to run under pipes are made to the configuration.
     * 
     * @param job to submit to the cluster
     * @throws IOException
     */
    public static void runJob(BSPJob job) throws IOException {
        setupPipesJob(job);
        BSPJobClient.runJob(job);
    }

    private static void setupPipesJob(BSPJob job) throws IOException {
        job.setBspClass(PipesBSP.class);
        job.setJarByClass(PipesBSP.class);

        String textClassname = Text.class.getName();
        setIfUnset(job.getConfiguration(), "bsp.input.key.class", textClassname);
        setIfUnset(job.getConfiguration(), "bsp.input.value.class", textClassname);
        setIfUnset(job.getConfiguration(), "bsp.output.key.class", textClassname);
        setIfUnset(job.getConfiguration(), "bsp.output.value.class", textClassname);
        setIfUnset(job.getConfiguration(), Constants.MESSAGE_CLASS, BytesWritable.class.getName());

        setIfUnset(job.getConfiguration(), "bsp.job.name", "Hama Pipes Job");

        // DEBUG Output
        LOG.debug("BspClass: " + job.getBspClass().getName());
        // conf.setInputFormat(NLineInputFormat.class);
        LOG.debug("InputFormat: " + job.getInputFormat());
        LOG.debug("InputKeyClass: " + job.getInputKeyClass().getName());
        LOG.debug("InputValueClass: " + job.getInputValueClass().getName());
        LOG.debug("InputFormat: " + job.getOutputFormat());
        LOG.debug("OutputKeyClass: " + job.getOutputKeyClass().getName());
        LOG.debug("OutputValueClass: " + job.getOutputValueClass().getName());
        LOG.debug("MessageClass: " + job.get(Constants.MESSAGE_CLASS));

        LOG.debug("bsp.master.address: " + job.getConfiguration().get("bsp.master.address"));
        LOG.debug("bsp.local.tasks.maximum: " + job.getConfiguration().get("bsp.local.tasks.maximum"));
        LOG.debug("NumBspTask: " + job.getNumBspTask());
        LOG.debug("fs.default.name: " + job.getConfiguration().get("fs.default.name"));

        String exec = getExecutable(job.getConfiguration());
        if (exec == null) {
            throw new IllegalArgumentException("No application defined. (Set property hama.pipes.executable)");
        }

        URI[] fileCache = DistributedCache.getCacheFiles(job.getConfiguration());
        if (fileCache == null) {
            fileCache = new URI[1];
        } else {
            URI[] tmp = new URI[fileCache.length + 1];
            System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
            fileCache = tmp;
        }

        try {
            fileCache[0] = new URI(exec);
        } catch (URISyntaxException e) {
            IOException ie = new IOException("Problem parsing executable URI " + exec);
            ie.initCause(e);
            throw ie;
        }
        DistributedCache.setCacheFiles(fileCache, job.getConfiguration());

        // Add libjars to HDFS
        String tmpjars = job.getConfiguration().get("tmpjars");
        LOG.debug("conf.get(tmpjars): " + tmpjars);

        if (tmpjars != null) {
            String hdfsFileUrls = DistributedCacheUtil.addFilesToHDFS(job.getConfiguration(),
                    job.getConfiguration().get("tmpjars"));
            job.getConfiguration().set("tmpjars", hdfsFileUrls);

            LOG.info("conf.get(tmpjars): " + job.getConfiguration().get("tmpjars"));
        }

    }

    /**
     * A command line parser for the CLI-based Pipes job submitter.
     */
    static class CommandLineParser {
        private Options options = new Options();

        void addOption(String longName, boolean required, String description, String paramName) {
            OptionBuilder.withArgName(paramName);
            OptionBuilder.hasArgs(1);
            OptionBuilder.withDescription(description);
            OptionBuilder.isRequired(required);
            Option option = OptionBuilder.create(longName);
            options.addOption(option);
        }

        void addArgument(String name, boolean required, String description) {
            OptionBuilder.withArgName(name);
            OptionBuilder.hasArgs(1);
            OptionBuilder.withDescription(description);
            OptionBuilder.isRequired(required);
            Option option = OptionBuilder.create();
            options.addOption(option);

        }

        Parser createParser() {
            Parser result = new BasicParser();
            return result;
        }

        void printUsage() {
            // The CLI package should do this for us, but I can't figure out how
            // to make it print something reasonable.
            System.out.println("bin/hama pipes [genericOptions] [pipesOptions]");
            System.out.println();

            // Generic options
            // GenericOptionsParser.printGenericCommandUsage(System.out);
            System.out.println("Generic options supported are");
            System.out.println(" -conf <configuration file>\tSpecify an application configuration file");
            System.out.println(" -D <property=value>\tUse value for given property");
            System.out.println();

            // Pipes options
            System.out.println("Pipes options supported are");
            System.out.println(" -input <path>\tInput directory");
            System.out.println(" -output <path>\tOutput directory");
            System.out.println(" -jar <jar file>\tJar filename");
            System.out.println(" -inputformat <class>\tInputFormat class");
            System.out.println(" -bspTasks <number>\tNumber of bsp tasks to launch");
            System.out.println(" -partitioner <class>\tJava Partitioner");
            System.out.println(" -combiner <class>\tJava Combiner class");
            System.out.println(" -outputformat <class>\tJava RecordWriter");
            System.out.println(
                    " -cachefiles <space separated paths>\tAdditional cache files like libs, can be globbed with wildcards");
            System.out.println(" -program <executable>\tExecutable URI");
            System.out.println(" -programArgs <argument>\tArguments for the program");
            System.out.println(" -interpreter <executable>\tInterpreter, like python or bash");
            System.out.println(" -streaming <true|false>\tIf supplied, streaming is used instead of pipes");
            System.out.println(" -jobname <name>\tSets the name of this job");
        }
    }

    private static <InterfaceType> Class<? extends InterfaceType> getClass(CommandLine cl, String key,
            HamaConfiguration conf, Class<InterfaceType> cls) throws ClassNotFoundException {

        return conf.getClassByName(cl.getOptionValue(key)).asSubclass(cls);
    }

    @Override
    public int run(String[] args) throws Exception {
        CommandLineParser cli = new CommandLineParser();
        if (args.length == 0) {
            cli.printUsage();
            return 1;
        }

        LOG.debug("Hama pipes Submitter started!");

        cli.addOption("input", false, "input path for bsp", "path");
        cli.addOption("output", false, "output path from bsp", "path");

        cli.addOption("jar", false, "job jar file", "path");
        cli.addOption("inputformat", false, "java classname of InputFormat", "class");
        // cli.addArgument("javareader", false, "is the RecordReader in Java");

        cli.addOption("partitioner", false, "java classname of Partitioner", "class");
        cli.addOption("outputformat", false, "java classname of OutputFormat", "class");

        cli.addOption("cachefiles", false, "additional cache files to add", "space delimited paths");

        cli.addOption("interpreter", false, "interpreter, like python or bash", "executable");

        cli.addOption("jobname", false, "the jobname", "name");

        cli.addOption("programArgs", false, "program arguments", "arguments");
        cli.addOption("bspTasks", false, "how many bsp tasks to launch", "number");
        cli.addOption("streaming", false, "if supplied, streaming is used instead of pipes", "");

        cli.addOption("jobconf", false,
                "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");

        cli.addOption("program", false, "URI to application executable", "class");
        Parser parser = cli.createParser();
        try {

            // check generic arguments -conf
            GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
            // get other arguments
            CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());

            BSPJob job = new BSPJob(getConf());

            if (results.hasOption("input")) {
                FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
            }
            if (results.hasOption("output")) {
                FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
            }
            if (results.hasOption("jar")) {
                job.setJar(results.getOptionValue("jar"));
            }

            if (results.hasOption("jobname")) {
                job.setJobName(results.getOptionValue("jobname"));
            }

            if (results.hasOption("inputformat")) {
                job.setInputFormat(getClass(results, "inputformat", conf, InputFormat.class));
            }

            if (results.hasOption("partitioner")) {
                job.setPartitioner(getClass(results, "partitioner", conf, Partitioner.class));
            }

            if (results.hasOption("outputformat")) {
                job.setOutputFormat(getClass(results, "outputformat", conf, OutputFormat.class));
            }

            if (results.hasOption("streaming")) {
                LOG.info("Streaming enabled!");
                job.set("hama.streaming.enabled", "true");
            }

            if (results.hasOption("jobconf")) {
                LOG.warn("-jobconf option is deprecated, please use -D instead.");
                String options = results.getOptionValue("jobconf");
                StringTokenizer tokenizer = new StringTokenizer(options, ",");
                while (tokenizer.hasMoreTokens()) {
                    String keyVal = tokenizer.nextToken().trim();
                    String[] keyValSplit = keyVal.split("=", 2);
                    job.set(keyValSplit[0], keyValSplit[1]);
                }
            }

            if (results.hasOption("bspTasks")) {
                int optionValue = Integer.parseInt(results.getOptionValue("bspTasks"));
                conf.setInt("bsp.local.tasks.maximum", optionValue);
                conf.setInt("bsp.peers.num", optionValue);
            }

            if (results.hasOption("program")) {
                String executablePath = results.getOptionValue("program");
                setExecutable(job.getConfiguration(), executablePath);
                DistributedCache.addCacheFile(new Path(executablePath).toUri(), conf);
            }

            if (results.hasOption("interpreter")) {
                job.getConfiguration().set("hama.pipes.executable.interpretor",
                        results.getOptionValue("interpreter"));
            }

            if (results.hasOption("programArgs")) {
                job.getConfiguration().set("hama.pipes.executable.args",
                        Joiner.on(" ").join(results.getOptionValues("programArgs")));
                // job.getConfiguration().set("hama.pipes.resolve.executable.args",
                // "true");
            }

            if (results.hasOption("cachefiles")) {
                FileSystem fs = FileSystem.get(getConf());
                String[] optionValues = results.getOptionValues("cachefiles");
                for (String s : optionValues) {
                    Path path = new Path(s);
                    FileStatus[] globStatus = fs.globStatus(path);
                    for (FileStatus f : globStatus) {
                        if (!f.isDir()) {
                            DistributedCache.addCacheFile(f.getPath().toUri(), job.getConfiguration());
                        } else {
                            LOG.info("Ignoring directory " + f.getPath() + " while globbing.");
                        }
                    }
                }
            }

            // if they gave us a jar file, include it into the class path
            String jarFile = job.getJar();
            if (jarFile != null) {
                @SuppressWarnings("deprecation")
                final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() };
                // FindBugs complains that creating a URLClassLoader should be
                // in a doPrivileged() block.
                ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                    @Override
                    public ClassLoader run() {
                        return new URLClassLoader(urls);
                    }
                });
                conf.setClassLoader(loader);
            }

            runJob(job);
            return 0;
        } catch (ParseException pe) {
            LOG.info("Error : " + pe);
            cli.printUsage();
            return 1;
        }

    }

    /**
     * Submit a pipes job based on the command line arguments.
     */
    public static void main(String[] args) throws Exception {
        int exitCode = new Submitter().run(args);
        System.exit(exitCode);
    }

}