Java tutorial: the Hadoop Streaming client, org.apache.hadoop.streaming.StreamJob
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.streaming;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.LazyOutputFormat;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorCombiner;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorReducer;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.streaming.io.IdentifierResolver;
import org.apache.hadoop.streaming.io.InputWriter;
import org.apache.hadoop.streaming.io.OutputReader;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.RunJar;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;

import static org.apache.hadoop.util.RunJar.MATCH_ANY;
/**
 * All the client-side work happens here.
 * (Jar packaging, MapRed job submission and monitoring)
 */
public class StreamJob implements Tool {

  protected static final Logger LOG = LoggerFactory.getLogger(StreamJob.class.getName());
  final static String REDUCE_NONE = "NONE";

  /** ----------- Streaming CLI Implementation ----------- **/
  private CommandLineParser parser = new BasicParser();
  private Options allOptions;

  /**
   * @deprecated use StreamJob() with ToolRunner or set the
   * Configuration using {@link #setConf(Configuration)} and
   * run with {@link #run(String[])}.
   */
  @Deprecated
  public StreamJob(String[] argv, boolean mayExit) {
    this();
    argv_ = Arrays.copyOf(argv, argv.length);
    this.config_ = new Configuration();
  }

  public StreamJob() {
    setupOptions();
    this.config_ = new Configuration();
  }

  @Override
  public Configuration getConf() {
    return config_;
  }

  @Override
  public void setConf(Configuration conf) {
    this.config_ = conf;
  }

  @Override
  public int run(String[] args) throws Exception {
    try {
      this.argv_ = Arrays.copyOf(args, args.length);
      init();

      preProcessArgs();
      parseArgv();
      if (printUsage) {
        printUsage(detailedUsage_);
        return 0;
      }
      postProcessArgs();

      setJobConf();
    } catch (IllegalArgumentException ex) {
      // The error message has already been printed; log the stack trace in debug mode only.
      LOG.debug("Error in streaming job", ex);
      return 1;
    }
    return submitAndMonitorJob();
  }

  /**
   * This method creates a streaming job from the given argument list.
   * The created object can be used and/or submitted to a jobtracker for
   * execution by a job agent such as JobControl.
   * @param argv the list of args for creating a streaming job
   * @return the created JobConf object
   * @throws IOException
   */
  static public JobConf createJob(String[] argv) throws IOException {
    StreamJob job = new StreamJob();
    job.argv_ = argv;
    job.init();
    job.preProcessArgs();
    job.parseArgv();
    job.postProcessArgs();
    job.setJobConf();
    return job.jobConf_;
  }

  /**
   * This is the method that actually initializes the job conf
   * and submits the job to the jobtracker.
   * @throws IOException
   * @deprecated use {@link #run(String[])} instead.
   */
  @Deprecated
  public int go() throws IOException {
    try {
      return run(argv_);
    } catch (Exception ex) {
      throw new IOException(ex.getMessage());
    }
  }

  protected void init() {
    try {
      env_ = new Environment();
    } catch (IOException io) {
      throw new RuntimeException(io);
    }
  }

  void preProcessArgs() {
    verbose_ = false;
    // Unset HADOOP_ROOT_LOGGER in case the streaming job
    // invokes additional hadoop commands.
    addTaskEnvironment_ = "HADOOP_ROOT_LOGGER=";
  }

  void postProcessArgs() throws IOException {
    if (inputSpecs_.size() == 0) {
      fail("Required argument: -input <name>");
    }
    if (output_ == null) {
      fail("Required argument: -output ");
    }
    msg("addTaskEnvironment=" + addTaskEnvironment_);

    for (final String packageFile : packageFiles_) {
      File f = new File(packageFile);
      if (f.isFile()) {
        shippedCanonFiles_.add(f.getCanonicalPath());
      }
    }
    msg("shippedCanonFiles_=" + shippedCanonFiles_);

    // careful with class names..
    mapCmd_ = unqualifyIfLocalPath(mapCmd_);
    comCmd_ = unqualifyIfLocalPath(comCmd_);
    redCmd_ = unqualifyIfLocalPath(redCmd_);
  }

  String unqualifyIfLocalPath(String cmd) throws IOException {
    if (cmd == null) {
      // nothing to do
    } else {
      String prog = cmd;
      String args = "";
      int s = cmd.indexOf(" ");
      if (s != -1) {
        prog = cmd.substring(0, s);
        args = cmd.substring(s + 1);
      }
      String progCanon;
      try {
        progCanon = new File(prog).getCanonicalPath();
      } catch (IOException io) {
        progCanon = prog;
      }
      boolean shipped = shippedCanonFiles_.contains(progCanon);
      msg("shipped: " + shipped + " " + progCanon);
      if (shipped) {
        // Change the path to a simple filename. That way, when PipeMapRed calls
        // Runtime.exec(), it will look for the executable in the Task's working
        // directory, which is where TaskRunner unjars our job jar.
        prog = new File(prog).getName();
        if (args.length() > 0) {
          cmd = prog + " " + args;
        } else {
          cmd = prog;
        }
      }
    }
    msg("cmd=" + cmd);
    return cmd;
  }

  void parseArgv() {
    CommandLine cmdLine = null;
    try {
      cmdLine = parser.parse(allOptions, argv_);
    } catch (Exception oe) {
      LOG.error(oe.getMessage());
      exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
    }

    if (cmdLine != null) {
      @SuppressWarnings("unchecked")
      List<String> args = cmdLine.getArgList();
      if (args != null && args.size() > 0) {
        fail("Found " + args.size() + " unexpected arguments on the command line " + args);
      }

      detailedUsage_ = cmdLine.hasOption("info");
      if (cmdLine.hasOption("help") || detailedUsage_) {
        printUsage = true;
        return;
      }
      verbose_ = cmdLine.hasOption("verbose");
      background_ = cmdLine.hasOption("background");
      debug_ = cmdLine.hasOption("debug") ? debug_ + 1 : debug_;

      String[] values = cmdLine.getOptionValues("input");
      if (values != null && values.length > 0) {
        for (String input : values) {
          inputSpecs_.add(input);
        }
      }
      output_ = cmdLine.getOptionValue("output");

      mapCmd_ = cmdLine.getOptionValue("mapper");
      comCmd_ = cmdLine.getOptionValue("combiner");
      redCmd_ = cmdLine.getOptionValue("reducer");

      lazyOutput_ = cmdLine.hasOption("lazyOutput");

      values = cmdLine.getOptionValues("file");
      if (values != null && values.length > 0) {
        LOG.warn("-file option is deprecated, please use generic option -files instead.");

        StringBuffer fileList = new StringBuffer();
        for (String file : values) {
          packageFiles_.add(file);
          try {
            Path path = new Path(file);
            FileSystem localFs = FileSystem.getLocal(config_);
            Path qualifiedPath = path.makeQualified(localFs.getUri(), localFs.getWorkingDirectory());
            validate(qualifiedPath);
            String finalPath = qualifiedPath.toString();
            if (fileList.length() > 0) {
              fileList.append(',');
            }
            fileList.append(finalPath);
          } catch (Exception e) {
            throw new IllegalArgumentException(e);
          }
        }
        String tmpFiles = config_.get("tmpfiles", "");
        if (tmpFiles.isEmpty()) {
          tmpFiles = fileList.toString();
        } else {
          tmpFiles = tmpFiles + "," + fileList;
        }
        config_.set("tmpfiles", tmpFiles);
      }

      String fsName = cmdLine.getOptionValue("dfs");
      if (null != fsName) {
        LOG.warn("-dfs option is deprecated, please use -fs instead.");
        config_.set("fs.default.name", fsName);
      }

      additionalConfSpec_ = cmdLine.getOptionValue("additionalconfspec");
      inputFormatSpec_ = cmdLine.getOptionValue("inputformat");
      outputFormatSpec_ = cmdLine.getOptionValue("outputformat");
      numReduceTasksSpec_ = cmdLine.getOptionValue("numReduceTasks");
      partitionerSpec_ = cmdLine.getOptionValue("partitioner");
      inReaderSpec_ = cmdLine.getOptionValue("inputreader");
      mapDebugSpec_ = cmdLine.getOptionValue("mapdebug");
      reduceDebugSpec_ = cmdLine.getOptionValue("reducedebug");
      ioSpec_ = cmdLine.getOptionValue("io");
      String[] car = cmdLine.getOptionValues("cacheArchive");
      if (null != car && car.length > 0) {
        LOG.warn("-cacheArchive option is deprecated, please use -archives instead.");
        for (String s : car) {
          cacheArchives = (cacheArchives == null) ? s : cacheArchives + "," + s;
        }
      }

      String[] caf = cmdLine.getOptionValues("cacheFile");
      if (null != caf && caf.length > 0) {
        LOG.warn("-cacheFile option is deprecated, please use -files instead.");
        for (String s : caf) {
          cacheFiles = (cacheFiles == null) ? s : cacheFiles + "," + s;
        }
      }

      String[] jobconf = cmdLine.getOptionValues("jobconf");
      if (null != jobconf && jobconf.length > 0) {
        LOG.warn("-jobconf option is deprecated, please use -D instead.");
        for (String s : jobconf) {
          String[] parts = s.split("=", 2);
          config_.set(parts[0], parts[1]);
        }
      }

      String[] cmd = cmdLine.getOptionValues("cmdenv");
      if (null != cmd && cmd.length > 0) {
        for (String s : cmd) {
          if (addTaskEnvironment_.length() > 0) {
            addTaskEnvironment_ += " ";
          }
          addTaskEnvironment_ += s;
        }
      }
    } else {
      exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
    }
  }

  protected void msg(String msg) {
    if (verbose_) {
      System.out.println("STREAM: " + msg);
    }
  }

  private Option createOption(String name, String desc, String argName, int max, boolean required) {
    return OptionBuilder.withArgName(argName).hasArgs(max)
        .withDescription(desc).isRequired(required).create(name);
  }

  private Option createBoolOption(String name, String desc) {
    return OptionBuilder.withDescription(desc).create(name);
  }

  private void validate(final Path path) throws IOException {
    try {
      path.getFileSystem(config_).access(path, FsAction.READ);
    } catch (FileNotFoundException e) {
      fail("File: " + path + " does not exist.");
    } catch (AccessControlException e) {
      fail("File: " + path + " is not readable.");
    }
  }

  private void setupOptions() {
    // input and output are not required for the -info and -help options,
    // though they are required for the streaming job to be run.
    Option input = createOption("input", "DFS input file(s) for the Map step", "path", Integer.MAX_VALUE, false);
    Option output = createOption("output", "DFS output directory for the Reduce step", "path", 1, false);
    Option mapper = createOption("mapper", "The streaming command to run", "cmd", 1, false);
    Option combiner = createOption("combiner", "The streaming command to run", "cmd", 1, false);
    // reducer could be NONE
    Option reducer = createOption("reducer", "The streaming command to run", "cmd", 1, false);
    Option file = createOption("file", "File to be shipped in the Job jar file", "file", Integer.MAX_VALUE, false);
    Option dfs = createOption("dfs", "Optional. Override DFS configuration", "<h:p>|local", 1, false);
    Option additionalconfspec = createOption("additionalconfspec", "Optional.", "spec", 1, false);
    Option inputformat = createOption("inputformat", "Optional.", "spec", 1, false);
    Option outputformat = createOption("outputformat", "Optional.", "spec", 1, false);
    Option partitioner = createOption("partitioner", "Optional.", "spec", 1, false);
    Option numReduceTasks = createOption("numReduceTasks", "Optional.", "spec", 1, false);
    Option inputreader = createOption("inputreader", "Optional.", "spec", 1, false);
    Option mapDebug = createOption("mapdebug", "Optional.", "spec", 1, false);
    Option reduceDebug = createOption("reducedebug", "Optional.", "spec", 1, false);
    Option jobconf = createOption("jobconf", "(n=v) Optional. Add or override a JobConf property.", "spec", 1, false);
    Option cmdenv = createOption("cmdenv", "(n=v) Pass env.var to streaming commands.", "spec", 1, false);
    Option cacheFile = createOption("cacheFile", "File name URI", "fileNameURI", Integer.MAX_VALUE, false);
    Option cacheArchive = createOption("cacheArchive", "File name URI", "fileNameURI", Integer.MAX_VALUE, false);
    Option io = createOption("io", "Optional.", "spec", 1, false);

    // boolean properties
    Option background = createBoolOption("background", "Submit the job and don't wait till it completes.");
    Option verbose = createBoolOption("verbose", "print verbose output");
    Option info = createBoolOption("info", "print verbose output");
    Option help = createBoolOption("help", "print this help message");
    Option debug = createBoolOption("debug", "print debug output");
    Option lazyOutput = createBoolOption("lazyOutput", "create outputs lazily");

    allOptions = new Options()
        .addOption(input).addOption(output)
        .addOption(mapper).addOption(combiner).addOption(reducer)
        .addOption(file).addOption(dfs).addOption(additionalconfspec)
        .addOption(inputformat).addOption(outputformat)
        .addOption(partitioner).addOption(numReduceTasks)
        .addOption(inputreader).addOption(mapDebug).addOption(reduceDebug)
        .addOption(jobconf).addOption(cmdenv)
        .addOption(cacheFile).addOption(cacheArchive)
        .addOption(io).addOption(background)
        .addOption(verbose).addOption(info).addOption(debug)
        .addOption(help).addOption(lazyOutput);
  }

  public void exitUsage(boolean detailed) {
    printUsage(detailed);
    fail("");
  }

  private void printUsage(boolean detailed) {
    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar [options]");
    System.out.println("Options:");
    System.out.println(" -input <path> DFS input file(s) for the Map step.");
    System.out.println(" -output <path> DFS output directory for the Reduce step.");
    System.out.println(" -mapper <cmd|JavaClassName> Optional. Command to be run as mapper.");
    System.out.println(" -combiner <cmd|JavaClassName> Optional. Command to be run as combiner.");
    System.out.println(" -reducer <cmd|JavaClassName> Optional. Command to be run as reducer.");
    System.out.println(" -file <file> Optional. File/dir to be shipped in the Job jar file.\n" +
        " Deprecated. Use generic option \"-files\" instead.");
    System.out.println(" -inputformat <TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName>\n" +
        " Optional. The input format class.");
    System.out.println(" -outputformat <TextOutputFormat(default)|JavaClassName>\n" +
        " Optional. The output format class.");
    System.out.println(" -partitioner <JavaClassName> Optional. The partitioner class.");
    System.out.println(" -numReduceTasks <num> Optional. Number of reduce tasks.");
    System.out.println(" -inputreader <spec> Optional. Input recordreader spec.");
    System.out.println(" -cmdenv <n>=<v> Optional. Pass env.var to streaming commands.");
    System.out.println(" -mapdebug <cmd> Optional. To run this script when a map task fails.");
    System.out.println(" -reducedebug <cmd> Optional. To run this script when a reduce task fails.");
    System.out.println(" -io <identifier> Optional. Format to use for input to and output");
    System.out.println(" from mapper/reducer commands");
    System.out.println(" -lazyOutput Optional. Lazily create Output.");
    System.out.println(" -background Optional. Submit the job and don't wait till it completes.");
    System.out.println(" -verbose Optional. Print verbose output.");
    System.out.println(" -info Optional. Print detailed usage.");
    System.out.println(" -help Optional. Print help message.");
    System.out.println();
    GenericOptionsParser.printGenericCommandUsage(System.out);

    if (!detailed) {
      System.out.println();
      System.out.println("For more details about these options:");
      System.out.println("Use $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar -info");
      return;
    }
    System.out.println();
    System.out.println("Usage tips:");
    System.out.println("In -input: globbing on <path> is supported and can have multiple -input");
    System.out.println();
    System.out.println("Default Map input format: a line is a record in UTF-8, the key part ends at first");
    System.out.println(" TAB, the rest of the line is the value");
    System.out.println();
    System.out.println("To pass a Custom input format:");
    System.out.println(" -inputformat package.MyInputFormat");
    System.out.println();
    System.out.println("Similarly, to pass a custom output format:");
    System.out.println(" -outputformat package.MyOutputFormat");
    System.out.println();
    System.out.println("The files with extensions .class and .jar/.zip, specified for the -file");
    System.out.println(" argument[s], end up in \"classes\" and \"lib\" directories respectively inside");
    System.out.println(" the working directory when the mapper and reducer are run. All other files");
    System.out.println(" specified for the -file argument[s] end up in the working directory when the");
    System.out.println(" mapper and reducer are run. The location of this working directory is");
    System.out.println(" unspecified.");
    System.out.println();
    System.out.println("To set the number of reduce tasks (num. of output files) as, say 10:");
    System.out.println(" Use -numReduceTasks 10");
    System.out.println("To skip the sort/combine/shuffle/sort/reduce step:");
    System.out.println(" Use -numReduceTasks 0");
    System.out.println(" Map output then becomes a 'side-effect output' rather than a reduce input.");
    System.out.println(" This speeds up processing. This also feels more like \"in-place\" processing");
    System.out.println(" because the input filename and the map input order are preserved.");
    System.out.println(" This is equivalent to -reducer NONE");
    System.out.println();
    System.out.println("To speed up the last maps:");
    System.out.println(" -D " + MRJobConfig.MAP_SPECULATIVE + "=true");
    System.out.println("To speed up the last reduces:");
    System.out.println(" -D " + MRJobConfig.REDUCE_SPECULATIVE + "=true");
    System.out.println("To name the job (appears in the JobTracker Web UI):");
    System.out.println(" -D " + MRJobConfig.JOB_NAME + "='My Job'");
    System.out.println("To change the local temp directory:");
    System.out.println(" -D dfs.data.dir=/tmp/dfs");
    System.out.println(" -D stream.tmpdir=/tmp/streaming");
    System.out.println("Additional local temp directories with -jt local:");
    System.out.println(" -D " + MRConfig.LOCAL_DIR + "=/tmp/local");
    System.out.println(" -D " + JTConfig.JT_SYSTEM_DIR + "=/tmp/system");
    System.out.println(" -D " + MRConfig.TEMP_DIR + "=/tmp/temp");
    System.out.println("To treat tasks with non-zero exit status as SUCCEEDED:");
    System.out.println(" -D stream.non.zero.exit.is.failure=false");
    System.out.println("Use a custom hadoop streaming build along with standard hadoop install:");
    System.out.println(" $HADOOP_HOME/bin/hadoop jar /path/my-hadoop-streaming.jar [...]\\");
    System.out.println(" [...] -D stream.shipped.hadoopstreaming=/path/my-hadoop-streaming.jar");
    System.out.println("For more details about jobconf parameters see:");
    System.out.println(" http://wiki.apache.org/hadoop/JobConfFile");
    System.out.println("Truncate the values of the job configuration copied to the environment at the given length:");
    System.out.println(" -D stream.jobconf.truncate.limit=-1");
    System.out.println("To set an environment variable in a streaming command:");
    System.out.println(" -cmdenv EXAMPLE_DIR=/home/example/dictionaries/");
    System.out.println();
    System.out.println("Shortcut:");
    System.out.println(" setenv HSTREAMING \"$HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar\"");
    System.out.println();
    System.out.println("Example: $HSTREAMING -mapper \"/usr/local/bin/perl5 filter.pl\"");
    System.out.println(" -file /local/filter.pl -input \"/logs/0604*/*\" [...]");
    System.out.println(" Ships a script, invokes the non-shipped perl interpreter. Shipped files go to");
    System.out.println(" the working directory so filter.pl is found by perl. Input files are all the");
    System.out.println(" daily logs for days in month 2006-04");
  }

  public void fail(String message) {
    System.err.println(message);
    System.err.println("Try -help for more information");
    throw new IllegalArgumentException(message);
  }

  // --------------------------------------------

  protected String getHadoopClientHome() {
    String h = env_.getProperty("HADOOP_HOME"); // standard Hadoop
    if (h == null) {
      //fail("Missing required environment variable: HADOOP_HOME");
      h = "UNDEF";
    }
    return h;
  }

  protected boolean isLocalHadoop() {
    return StreamUtil.isLocalJobTracker(jobConf_);
  }

  @Deprecated
  protected String getClusterNick() {
    return "default";
  }

  /** @return path to the created Jar file or null if no files are necessary. */
  protected String packageJobJar() throws IOException {
    ArrayList<String> unjarFiles = new ArrayList<String>();

    // Runtime code: ship the same version of code as self (the job submitter code),
    // usually found in: build/contrib or build/hadoop-<version>-dev-streaming.jar.
    // First try an explicit spec: it's too hard to find our own location in this case:
    //   $HADOOP_HOME/bin/hadoop jar /not/first/on/classpath/custom-hadoop-streaming.jar
    // where findInClasspath() would find the version of hadoop-streaming.jar in $HADOOP_HOME
    String runtimeClasses = config_.get("stream.shipped.hadoopstreaming"); // jar or class dir

    if (runtimeClasses == null) {
      runtimeClasses = StreamUtil.findInClasspath(StreamJob.class.getName());
    }
    if (runtimeClasses == null) {
      throw new IOException("runtime classes not found: " + getClass().getPackage());
    } else {
      msg("Found runtime classes in: " + runtimeClasses);
    }
    if (isLocalHadoop()) {
      // Don't package class files (they might get unpackaged in "." and then
      // hide the intended CLASSPATH entry).
      // We still package everything else (so that scripts and executables are found in
      // the Task workdir as with distributed Hadoop).
    } else {
      if (new File(runtimeClasses).isDirectory()) {
        packageFiles_.add(runtimeClasses);
      } else {
        unjarFiles.add(runtimeClasses);
      }
    }
    if (packageFiles_.size() + unjarFiles.size() == 0) {
      return null;
    }
    String tmp = jobConf_.get("stream.tmpdir"); // default was: "/tmp/${mapreduce.job.user.name}/"
    File tmpDir = (tmp == null) ? null : new File(tmp); // tmpDir=null means OS default tmp dir
    File jobJar = File.createTempFile("streamjob", ".jar", tmpDir);
    System.out.println("packageJobJar: " + packageFiles_ + " " + unjarFiles + " " + jobJar + " tmpDir=" + tmpDir);
    if (debug_ == 0) {
      jobJar.deleteOnExit();
    }
    JarBuilder builder = new JarBuilder();
    if (verbose_) {
      builder.setVerbose(true);
    }
    String jobJarName = jobJar.getAbsolutePath();
    builder.merge(packageFiles_, unjarFiles, jobJarName);
    return jobJarName;
  }

  /**
   * Get the URIs of all the files/caches.
   */
  protected void getURIs(String lcacheArchives, String lcacheFiles) {
    String archives[] = StringUtils.getStrings(lcacheArchives);
    String files[] = StringUtils.getStrings(lcacheFiles);
    fileURIs = StringUtils.stringToURI(files);
    archiveURIs = StringUtils.stringToURI(archives);
  }

  protected void setJobConf() throws IOException {
    if (additionalConfSpec_ != null) {
      LOG.warn("-additionalconfspec option is deprecated, please use -conf instead.");
      config_.addResource(new Path(additionalConfSpec_));
    }

    // general MapRed job properties
    jobConf_ = new JobConf(config_, StreamJob.class);

    // All streaming jobs get the task timeout value
    // from the configuration settings.

    // The correct FS must be set before this is called!
    // (to resolve local vs. dfs drive letter differences)
    // (mapreduce.job.working.dir will be lazily initialized ONCE and depends on FS)
    for (int i = 0; i < inputSpecs_.size(); i++) {
      FileInputFormat.addInputPaths(jobConf_, (String) inputSpecs_.get(i));
    }

    String defaultPackage = this.getClass().getPackage().getName();

    Class c;
    Class fmt = null;
    if (inReaderSpec_ == null && inputFormatSpec_ == null) {
      fmt = TextInputFormat.class;
    } else if (inputFormatSpec_ != null) {
      if (inputFormatSpec_.equals(TextInputFormat.class.getName())
          || inputFormatSpec_.equals(TextInputFormat.class.getCanonicalName())
          || inputFormatSpec_.equals(TextInputFormat.class.getSimpleName())) {
        fmt = TextInputFormat.class;
      } else if (inputFormatSpec_.equals(KeyValueTextInputFormat.class.getName())
          || inputFormatSpec_.equals(KeyValueTextInputFormat.class.getCanonicalName())
          || inputFormatSpec_.equals(KeyValueTextInputFormat.class.getSimpleName())) {
        if (inReaderSpec_ == null) {
          fmt = KeyValueTextInputFormat.class;
        }
      } else if (inputFormatSpec_.equals(SequenceFileInputFormat.class.getName())
          || inputFormatSpec_.equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getCanonicalName())
          || inputFormatSpec_.equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getSimpleName())) {
        if (inReaderSpec_ == null) {
          fmt = SequenceFileInputFormat.class;
        }
      } else if (inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getName())
          || inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getCanonicalName())
          || inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getSimpleName())) {
        fmt = SequenceFileAsTextInputFormat.class;
      } else {
        c = StreamUtil.goodClassOrNull(jobConf_, inputFormatSpec_, defaultPackage);
        if (c != null) {
          fmt = c;
        } else {
          fail("-inputformat : class not found : " + inputFormatSpec_);
        }
      }
    }
    if (fmt == null) {
      fmt = StreamInputFormat.class;
    }

    jobConf_.setInputFormat(fmt);

    if (ioSpec_ != null) {
      jobConf_.set("stream.map.input", ioSpec_);
      jobConf_.set("stream.map.output", ioSpec_);
      jobConf_.set("stream.reduce.input", ioSpec_);
      jobConf_.set("stream.reduce.output", ioSpec_);
    }
    Class<? extends IdentifierResolver> idResolverClass =
        jobConf_.getClass("stream.io.identifier.resolver.class",
            IdentifierResolver.class, IdentifierResolver.class);
    IdentifierResolver idResolver = ReflectionUtils.newInstance(idResolverClass, jobConf_);

    idResolver.resolve(jobConf_.get("stream.map.input", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.map.input.writer.class",
        idResolver.getInputWriterClass(), InputWriter.class);

    idResolver.resolve(jobConf_.get("stream.reduce.input", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.reduce.input.writer.class",
        idResolver.getInputWriterClass(), InputWriter.class);

    jobConf_.set("stream.addenvironment", addTaskEnvironment_);

    boolean isMapperACommand = false;

    if (mapCmd_ != null) {
      c = StreamUtil.goodClassOrNull(jobConf_, mapCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setMapperClass(c);
      } else {
        isMapperACommand = true;
        jobConf_.setMapperClass(PipeMapper.class);
        jobConf_.setMapRunnerClass(PipeMapRunner.class);
        jobConf_.set("stream.map.streamprocessor", URLEncoder.encode(mapCmd_, "UTF-8"));
      }
    }

    if (comCmd_ != null) {
      c = StreamUtil.goodClassOrNull(jobConf_, comCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setCombinerClass(c);
      } else {
        jobConf_.setCombinerClass(PipeCombiner.class);
        jobConf_.set("stream.combine.streamprocessor", URLEncoder.encode(comCmd_, "UTF-8"));
      }
    }

    if (numReduceTasksSpec_ != null) {
      int numReduceTasks = Integer.parseInt(numReduceTasksSpec_);
      jobConf_.setNumReduceTasks(numReduceTasks);
    }

    boolean isReducerACommand = false;

    if (redCmd_ != null) {
      if (redCmd_.equals(REDUCE_NONE)) {
        jobConf_.setNumReduceTasks(0);
      }
      if (jobConf_.getNumReduceTasks() != 0) {
        if (redCmd_.compareToIgnoreCase("aggregate") == 0) {
          jobConf_.setReducerClass(ValueAggregatorReducer.class);
          jobConf_.setCombinerClass(ValueAggregatorCombiner.class);
        } else {
          c = StreamUtil.goodClassOrNull(jobConf_, redCmd_, defaultPackage);
          if (c != null) {
            jobConf_.setReducerClass(c);
          } else {
            isReducerACommand = true;
            jobConf_.setReducerClass(PipeReducer.class);
            jobConf_.set("stream.reduce.streamprocessor", URLEncoder.encode(redCmd_, "UTF-8"));
          }
        }
      }
    }

    idResolver.resolve(jobConf_.get("stream.map.output", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.map.output.reader.class",
        idResolver.getOutputReaderClass(), OutputReader.class);

    if (isMapperACommand || jobConf_.get("stream.map.output") != null) {
      // if the mapper is a command, then the map output key/value classes come from the idResolver
      jobConf_.setMapOutputKeyClass(idResolver.getOutputKeyClass());
      jobConf_.setMapOutputValueClass(idResolver.getOutputValueClass());

      if (jobConf_.getNumReduceTasks() == 0) {
        jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
        jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
      }
    }

    idResolver.resolve(jobConf_.get("stream.reduce.output", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.reduce.output.reader.class",
        idResolver.getOutputReaderClass(), OutputReader.class);

    if (isReducerACommand || jobConf_.get("stream.reduce.output") != null) {
      // if the reducer is a command, then the output key/value classes come from the idResolver
      jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
      jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
    }

    if (inReaderSpec_ != null) {
      String[] args = inReaderSpec_.split(",");
      String readerClass = args[0];
      // this argument can only be a Java class
      c = StreamUtil.goodClassOrNull(jobConf_, readerClass, defaultPackage);
      if (c != null) {
        jobConf_.set("stream.recordreader.class", c.getName());
      } else {
        fail("-inputreader: class not found: " + readerClass);
      }
      for (int i = 1; i < args.length; i++) {
        String[] nv = args[i].split("=", 2);
        String k = "stream.recordreader." + nv[0];
        String v = (nv.length > 1) ? nv[1] : "";
        jobConf_.set(k, v);
      }
    }

    FileOutputFormat.setOutputPath(jobConf_, new Path(output_));
    fmt = null;
    if (outputFormatSpec_ != null) {
      c = StreamUtil.goodClassOrNull(jobConf_, outputFormatSpec_, defaultPackage);
      if (c != null) {
        fmt = c;
      } else {
        fail("-outputformat : class not found : " + outputFormatSpec_);
      }
    }
    if (fmt == null) {
      fmt = TextOutputFormat.class;
    }
    if (lazyOutput_) {
      LazyOutputFormat.setOutputFormatClass(jobConf_, fmt);
    } else {
      jobConf_.setOutputFormat(fmt);
    }

    if (partitionerSpec_ != null) {
      c = StreamUtil.goodClassOrNull(jobConf_, partitionerSpec_, defaultPackage);
      if (c != null) {
        jobConf_.setPartitionerClass(c);
      } else {
        fail("-partitioner : class not found : " + partitionerSpec_);
      }
    }

    if (mapDebugSpec_ != null) {
      jobConf_.setMapDebugScript(mapDebugSpec_);
    }
    if (reduceDebugSpec_ != null) {
      jobConf_.setReduceDebugScript(reduceDebugSpec_);
    }
    // Last, allow the user to override anything
    // (although this is typically used with properties we didn't touch).

    jar_ = packageJobJar();
    if (jar_ != null) {
      jobConf_.setJar(jar_);
    }

    if ((cacheArchives != null) || (cacheFiles != null)) {
      getURIs(cacheArchives, cacheFiles);
      boolean b = DistributedCache.checkURIs(fileURIs, archiveURIs);
      if (!b) {
        fail(LINK_URI);
      }
    }
    // set the jobconf for the caching parameters
    if (cacheArchives != null) {
      DistributedCache.setCacheArchives(archiveURIs, jobConf_);
    }
    if (cacheFiles != null) {
      DistributedCache.setCacheFiles(fileURIs, jobConf_);
    }

    if (verbose_) {
      listJobConfProperties();
    }

    msg("submitting to jobconf: " + getJobTrackerHostPort());
  }

  /**
   * Prints out the jobconf properties on stdout
   * when verbose is specified.
   */
  protected void listJobConfProperties() {
    msg("==== JobConf properties:");
    TreeMap<String, String> sorted = new TreeMap<String, String>();
    for (final Map.Entry<String, String> en : jobConf_) {
      sorted.put(en.getKey(), en.getValue());
    }
    for (final Map.Entry<String, String> en : sorted.entrySet()) {
      msg(en.getKey() + "=" + en.getValue());
    }
    msg("====");
  }

  protected String getJobTrackerHostPort() {
    return jobConf_.get(JTConfig.JT_IPC_ADDRESS);
  }

  // Based on JobClient
  public int submitAndMonitorJob() throws IOException {

    if (jar_ != null && isLocalHadoop()) {
      // getAbsoluteFile() became required when the shell and the sub-VM have different working dirs...
      File wd = new File(".").getAbsoluteFile();
      RunJar.unJar(new File(jar_), wd, MATCH_ANY);
    }

    // if jobConf_ changes, must recreate a JobClient
    jc_ = new JobClient(jobConf_);
    running_ = null;
    try {
      running_ = jc_.submitJob(jobConf_);
      jobId_ = running_.getID();
      if (background_) {
        LOG.info("Job is running in background.");
      } else if (!jc_.monitorAndPrintJob(jobConf_, running_)) {
        LOG.error("Job not successful!");
        return 1;
      }
      LOG.info("Output directory: " + output_);
    } catch (FileNotFoundException fe) {
      LOG.error("Error launching job, bad input path: " + fe.getMessage());
      return 2;
    } catch (InvalidJobConfException je) {
      LOG.error("Error launching job, invalid job conf: " + je.getMessage());
      return 3;
    } catch (FileAlreadyExistsException fae) {
      LOG.error("Error launching job, output path already exists: " + fae.getMessage());
      return 4;
    } catch (IOException ioe) {
      LOG.error("Error launching job: " + ioe.getMessage());
      return 5;
    } catch (InterruptedException ie) {
      LOG.error("Error monitoring job: " + ie.getMessage());
      return 6;
    } finally {
      jc_.close();
    }
    return 0;
  }

  protected String[] argv_;
  protected boolean background_;
  protected boolean verbose_;
  protected boolean detailedUsage_;
  protected boolean printUsage = false;
  protected int debug_;

  protected Environment env_;

  protected String jar_;
  protected boolean localHadoop_;
  protected Configuration config_;
  protected JobConf jobConf_;
  protected JobClient jc_;

  // command-line arguments
  protected ArrayList<String> inputSpecs_ = new ArrayList<String>();
  protected TreeSet<String> seenPrimary_ = new TreeSet<String>();
  protected boolean hasSimpleInputSpecs_;
  protected ArrayList<String> packageFiles_ = new ArrayList<String>();
  protected ArrayList<String> shippedCanonFiles_ = new ArrayList<String>();
  //protected TreeMap<String, String> userJobConfProps_ = new TreeMap<String, String>();
  protected String output_;
  protected String mapCmd_;
  protected String comCmd_;
  protected String redCmd_;
  protected String cacheFiles;
  protected String cacheArchives;
  protected URI[] fileURIs;
  protected URI[] archiveURIs;
  protected String inReaderSpec_;
  protected String inputFormatSpec_;
  protected String outputFormatSpec_;
  protected String partitionerSpec_;
  protected String numReduceTasksSpec_;
  protected String additionalConfSpec_;
  protected String mapDebugSpec_;
  protected String reduceDebugSpec_;
  protected String ioSpec_;
  protected boolean lazyOutput_;

  // Used to communicate config to the external processes (e.g. env.var.HADOOP_USER),
  // encoded as "a=b c=d".
  protected String addTaskEnvironment_;

  protected boolean outputSingleNode_;
  protected long minRecWrittenToEnableSkip_;

  protected RunningJob running_;
  protected JobID jobId_;

  protected static final String LINK_URI = "You need to specify the uris as scheme://path#linkname," +
      " please specify a different link name for all of your caching URIs";
}
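
To see how this client is typically driven, here is a minimal sketch of a separate driver class. It is not part of StreamJob itself: the class name StreamJobDriver, the input/output paths, and the mapper/reducer commands below are illustrative placeholders, and the sketch assumes the hadoop-client and hadoop-streaming jars are on the classpath. It shows the two entry points the source above exposes: running StreamJob as a Tool (what "hadoop jar hadoop-streaming.jar [options]" does), and building a JobConf with createJob() for submission by your own code.

package org.apache.hadoop.streaming;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ToolRunner;

public class StreamJobDriver {

  // Route 1: run StreamJob as a Tool, exactly like the
  // "$HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar [options]" command line.
  static int runAsTool(String[] streamArgs) throws Exception {
    return ToolRunner.run(new Configuration(), new StreamJob(), streamArgs);
  }

  // Route 2: only build the JobConf, then submit it yourself
  // (e.g. from a job-control agent, as the createJob() javadoc above suggests).
  static void buildAndSubmit(String[] streamArgs) throws Exception {
    JobConf job = StreamJob.createJob(streamArgs);
    JobClient.runJob(job); // blocks until the job completes
  }

  public static void main(String[] args) throws Exception {
    String[] streamArgs = {
        "-input", "/user/me/input",    // placeholder DFS input path
        "-output", "/user/me/output",  // placeholder output dir (must not exist yet)
        "-mapper", "/bin/cat",         // streaming command or Java class name
        "-reducer", "/usr/bin/wc"      // streaming command or Java class name
    };
    System.exit(runAsTool(streamArgs));
  }
}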