Java tutorial: Apache Hadoop MapReduce's job configuration class (org.apache.hadoop.mapred.JobConf)
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapred; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.mapred.lib.HashPartitioner; import org.apache.hadoop.mapred.lib.IdentityMapper; import org.apache.hadoop.mapred.lib.IdentityReducer; import org.apache.hadoop.mapred.lib.KeyFieldBasedComparator; import org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner; import org.apache.hadoop.mapreduce.MRConfig; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.TaskType; import org.apache.hadoop.mapreduce.filecache.DistributedCache; import org.apache.hadoop.mapreduce.util.ConfigUtil; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.util.ClassUtil; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Tool; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A map/reduce job configuration. * * <p><code>JobConf</code> is the primary interface for a user to describe a * map-reduce job to the Hadoop framework for execution. The framework tries to * faithfully execute the job as-is described by <code>JobConf</code>, however: * <ol> * <li> * Some configuration parameters might have been marked as * <a href="{@docRoot}/org/apache/hadoop/conf/Configuration.html#FinalParams"> * final</a> by administrators and hence cannot be altered. * </li> * <li> * While some job parameters are straight-forward to set * (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly * with the rest of the framework and/or job-configuration and is relatively * more complex for the user to control finely * (e.g. {@link #setNumMapTasks(int)}). * </li> * </ol> * * <p><code>JobConf</code> typically specifies the {@link Mapper}, combiner * (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and * {@link OutputFormat} implementations to be used etc. 
* * <p>Optionally <code>JobConf</code> is used to specify other advanced facets * of the job such as <code>Comparator</code>s to be used, files to be put in * the {@link DistributedCache}, whether or not intermediate and/or job outputs * are to be compressed (and how), debugability via user-provided scripts * ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}), * for doing post-processing on task logs, task's stdout, stderr, syslog. * and etc.</p> * * <p>Here is an example on how to configure a job via <code>JobConf</code>:</p> * <p><blockquote><pre> * // Create a new JobConf * JobConf job = new JobConf(new Configuration(), MyJob.class); * * // Specify various job-specific parameters * job.setJobName("myjob"); * * FileInputFormat.setInputPaths(job, new Path("in")); * FileOutputFormat.setOutputPath(job, new Path("out")); * * job.setMapperClass(MyJob.MyMapper.class); * job.setCombinerClass(MyJob.MyReducer.class); * job.setReducerClass(MyJob.MyReducer.class); * * job.setInputFormat(SequenceFileInputFormat.class); * job.setOutputFormat(SequenceFileOutputFormat.class); * </pre></blockquote> * * @see JobClient * @see ClusterStatus * @see Tool * @see DistributedCache */ @InterfaceAudience.Public @InterfaceStability.Stable public class JobConf extends Configuration { private static final Logger LOG = LoggerFactory.getLogger(JobConf.class); private static final Pattern JAVA_OPTS_XMX_PATTERN = Pattern .compile(".*(?:^|\\s)-Xmx(\\d+)([gGmMkK]?)(?:$|\\s).*"); static { ConfigUtil.loadResources(); } /** * @deprecated Use {@link #MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY} and * {@link #MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY} */ @Deprecated public static final String MAPRED_TASK_MAXVMEM_PROPERTY = "mapred.task.maxvmem"; /** * @deprecated */ @Deprecated public static final String UPPER_LIMIT_ON_TASK_VMEM_PROPERTY = "mapred.task.limit.maxvmem"; /** * @deprecated */ @Deprecated public static final String MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY = "mapred.task.default.maxvmem"; /** * @deprecated */ @Deprecated public static final String MAPRED_TASK_MAXPMEM_PROPERTY = "mapred.task.maxpmem"; /** * A value which if set for memory related configuration options, * indicates that the options are turned off. * Deprecated because it makes no sense in the context of MR2. */ @Deprecated public static final long DISABLED_MEMORY_LIMIT = -1L; /** * Property name for the configuration property mapreduce.cluster.local.dir */ public static final String MAPRED_LOCAL_DIR_PROPERTY = MRConfig.LOCAL_DIR; /** * Name of the queue to which jobs will be submitted, if no queue * name is mentioned. 
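 * <p>As an illustrative sketch (the "analytics" queue name below is an
 * assumption, not a queue that necessarily exists), a job can be routed to a
 * specific queue with {@link #setQueueName(String)}; a job that never calls it
 * is submitted to this default queue:</p>
 * <p><blockquote><pre>
 *     JobConf job = new JobConf(MyJob.class);
 *     job.setQueueName("analytics");   // explicit queue
 *     // omitting setQueueName() submits the job to the "default" queue
 * </pre></blockquote>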
*/ public static final String DEFAULT_QUEUE_NAME = "default"; static final String MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY = JobContext.MAP_MEMORY_MB; static final String MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY = JobContext.REDUCE_MEMORY_MB; /** * The variable is kept for M/R 1.x applications, while M/R 2.x applications * should use {@link #MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY} */ @Deprecated public static final String MAPRED_JOB_MAP_MEMORY_MB_PROPERTY = "mapred.job.map.memory.mb"; /** * The variable is kept for M/R 1.x applications, while M/R 2.x applications * should use {@link #MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY} */ @Deprecated public static final String MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY = "mapred.job.reduce.memory.mb"; /** Pattern for the default unpacking behavior for job jars */ public static final Pattern UNPACK_JAR_PATTERN_DEFAULT = Pattern.compile("(?:classes/|lib/).*"); /** * Configuration key to set the java command line options for the child * map and reduce tasks. * * Java opts for the task tracker child processes. * The following symbol, if present, will be interpolated: @taskid@. * It is replaced by current TaskID. Any other occurrences of '@' will go * unchanged. * For example, to enable verbose gc logging to a file named for the taskid in * /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of: * -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc * * The configuration variable {@link #MAPRED_TASK_ENV} can be used to pass * other environment variables to the child processes. * * @deprecated Use {@link #MAPRED_MAP_TASK_JAVA_OPTS} or * {@link #MAPRED_REDUCE_TASK_JAVA_OPTS} */ @Deprecated public static final String MAPRED_TASK_JAVA_OPTS = "mapred.child.java.opts"; /** * Configuration key to set the java command line options for the map tasks. * * Java opts for the task tracker child map processes. * The following symbol, if present, will be interpolated: @taskid@. * It is replaced by current TaskID. Any other occurrences of '@' will go * unchanged. * For example, to enable verbose gc logging to a file named for the taskid in * /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of: * -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc * * The configuration variable {@link #MAPRED_MAP_TASK_ENV} can be used to pass * other environment variables to the map processes. */ public static final String MAPRED_MAP_TASK_JAVA_OPTS = JobContext.MAP_JAVA_OPTS; /** * Configuration key to set the java command line options for the reduce tasks. * * Java opts for the task tracker child reduce processes. * The following symbol, if present, will be interpolated: @taskid@. * It is replaced by current TaskID. Any other occurrences of '@' will go * unchanged. * For example, to enable verbose gc logging to a file named for the taskid in * /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of: * -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc * * The configuration variable {@link #MAPRED_REDUCE_TASK_ENV} can be used to * pass process environment variables to the reduce processes. */ public static final String MAPRED_REDUCE_TASK_JAVA_OPTS = JobContext.REDUCE_JAVA_OPTS; public static final String DEFAULT_MAPRED_TASK_JAVA_OPTS = ""; /** * @deprecated * Configuration key to set the maximum virtual memory available to the child * map and reduce tasks (in kilo-bytes). This has been deprecated and will no * longer have any effect. 
*/ @Deprecated public static final String MAPRED_TASK_ULIMIT = "mapred.child.ulimit"; /** * @deprecated * Configuration key to set the maximum virtual memory available to the * map tasks (in kilo-bytes). This has been deprecated and will no * longer have any effect. */ @Deprecated public static final String MAPRED_MAP_TASK_ULIMIT = "mapreduce.map.ulimit"; /** * @deprecated * Configuration key to set the maximum virtual memory available to the * reduce tasks (in kilo-bytes). This has been deprecated and will no * longer have any effect. */ @Deprecated public static final String MAPRED_REDUCE_TASK_ULIMIT = "mapreduce.reduce.ulimit"; /** * Configuration key to set the environment of the child map/reduce tasks. * * The format of the value is <code>k1=v1,k2=v2</code>. Further it can * reference existing environment variables via <code>$key</code> on * Linux or <code>%key%</code> on Windows. * * Example: * <ul> * <li> A=foo - This will set the env variable A to foo. </li> * </ul> * * @deprecated Use {@link #MAPRED_MAP_TASK_ENV} or * {@link #MAPRED_REDUCE_TASK_ENV} */ @Deprecated public static final String MAPRED_TASK_ENV = "mapred.child.env"; /** * Configuration key to set the environment of the child map tasks. * * The format of the value is <code>k1=v1,k2=v2</code>. Further it can * reference existing environment variables via <code>$key</code> on * Linux or <code>%key%</code> on Windows. * * Example: * <ul> * <li> A=foo - This will set the env variable A to foo. </li> * </ul> * * You can also add environment variables individually by appending * <code>.VARNAME</code> to this configuration key, where VARNAME is * the name of the environment variable. * * Example: * <ul> * <li>mapreduce.map.env.VARNAME=value</li> * </ul> */ public static final String MAPRED_MAP_TASK_ENV = JobContext.MAP_ENV; /** * Configuration key to set the environment of the child reduce tasks. * * The format of the value is <code>k1=v1,k2=v2</code>. Further it can * reference existing environment variables via <code>$key</code> on * Linux or <code>%key%</code> on Windows. * * Example: * <ul> * <li> A=foo - This will set the env variable A to foo. </li> * </ul> * * You can also add environment variables individually by appending * <code>.VARNAME</code> to this configuration key, where VARNAME is * the name of the environment variable. * * Example: * <ul> * <li>mapreduce.reduce.env.VARNAME=value</li> * </ul> */ public static final String MAPRED_REDUCE_TASK_ENV = JobContext.REDUCE_ENV; private Credentials credentials = new Credentials(); /** * Configuration key to set the logging level for the map task. * * The allowed logging levels are: * OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE and ALL. */ public static final String MAPRED_MAP_TASK_LOG_LEVEL = JobContext.MAP_LOG_LEVEL; /** * Configuration key to set the logging level for the reduce task. * * The allowed logging levels are: * OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE and ALL. */ public static final String MAPRED_REDUCE_TASK_LOG_LEVEL = JobContext.REDUCE_LOG_LEVEL; /** * Default logging level for map/reduce tasks. 
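 * <p>A minimal sketch of raising the level for map tasks only (the DEBUG value
 * is just an example taken from the allowed levels listed above):</p>
 * <p><blockquote><pre>
 *     JobConf job = new JobConf(MyJob.class);
 *     job.set(JobConf.MAPRED_MAP_TASK_LOG_LEVEL, "DEBUG");
 *     // reduce tasks keep this default unless
 *     // MAPRED_REDUCE_TASK_LOG_LEVEL is set as well
 * </pre></blockquote>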
*/ public static final String DEFAULT_LOG_LEVEL = JobContext.DEFAULT_LOG_LEVEL; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * use {@link MRJobConfig#WORKFLOW_ID} instead */ @Deprecated public static final String WORKFLOW_ID = MRJobConfig.WORKFLOW_ID; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * use {@link MRJobConfig#WORKFLOW_NAME} instead */ @Deprecated public static final String WORKFLOW_NAME = MRJobConfig.WORKFLOW_NAME; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * use {@link MRJobConfig#WORKFLOW_NODE_NAME} instead */ @Deprecated public static final String WORKFLOW_NODE_NAME = MRJobConfig.WORKFLOW_NODE_NAME; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * use {@link MRJobConfig#WORKFLOW_ADJACENCY_PREFIX_STRING} instead */ @Deprecated public static final String WORKFLOW_ADJACENCY_PREFIX_STRING = MRJobConfig.WORKFLOW_ADJACENCY_PREFIX_STRING; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * use {@link MRJobConfig#WORKFLOW_ADJACENCY_PREFIX_PATTERN} instead */ @Deprecated public static final String WORKFLOW_ADJACENCY_PREFIX_PATTERN = MRJobConfig.WORKFLOW_ADJACENCY_PREFIX_PATTERN; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * use {@link MRJobConfig#WORKFLOW_TAGS} instead */ @Deprecated public static final String WORKFLOW_TAGS = MRJobConfig.WORKFLOW_TAGS; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * not use it */ @Deprecated public static final String MAPREDUCE_RECOVER_JOB = "mapreduce.job.restart.recover"; /** * The variable is kept for M/R 1.x applications, M/R 2.x applications should * not use it */ @Deprecated public static final boolean DEFAULT_MAPREDUCE_RECOVER_JOB = true; /** * Construct a map/reduce job configuration. */ public JobConf() { checkAndWarnDeprecation(); } /** * Construct a map/reduce job configuration. * * @param exampleClass a class whose containing jar is used as the job's jar. */ public JobConf(Class exampleClass) { setJarByClass(exampleClass); checkAndWarnDeprecation(); } /** * Construct a map/reduce job configuration. * * @param conf a Configuration whose settings will be inherited. */ public JobConf(Configuration conf) { super(conf); if (conf instanceof JobConf) { JobConf that = (JobConf) conf; credentials = that.credentials; } checkAndWarnDeprecation(); } /** Construct a map/reduce job configuration. * * @param conf a Configuration whose settings will be inherited. * @param exampleClass a class whose containing jar is used as the job's jar. */ public JobConf(Configuration conf, Class exampleClass) { this(conf); setJarByClass(exampleClass); } /** Construct a map/reduce configuration. * * @param config a Configuration-format XML job description file. */ public JobConf(String config) { this(new Path(config)); } /** Construct a map/reduce configuration. * * @param config a Configuration-format XML job description file. */ public JobConf(Path config) { super(); addResource(config); checkAndWarnDeprecation(); } /** A new map/reduce configuration where the behavior of reading from the * default resources can be turned off. * <p> * If the parameter {@code loadDefaults} is false, the new instance * will not load resources from the default files. 
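 * <p>A minimal sketch, assuming the caller wants a configuration built only
 * from resources it adds itself (the file name "my-job.xml" is illustrative):</p>
 * <p><blockquote><pre>
 *     JobConf conf = new JobConf(false);          // skip default resources
 *     conf.addResource(new Path("my-job.xml"));   // load an explicit resource
 * </pre></blockquote>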
* * @param loadDefaults specifies whether to load from the default files */ public JobConf(boolean loadDefaults) { super(loadDefaults); checkAndWarnDeprecation(); } /** * Get credentials for the job. * @return credentials for the job */ public Credentials getCredentials() { return credentials; } @Private public void setCredentials(Credentials credentials) { this.credentials = credentials; } /** * Get the user jar for the map-reduce job. * * @return the user jar for the map-reduce job. */ public String getJar() { return get(JobContext.JAR); } /** * Set the user jar for the map-reduce job. * * @param jar the user jar for the map-reduce job. */ public void setJar(String jar) { set(JobContext.JAR, jar); } /** * Get the pattern for jar contents to unpack on the tasktracker */ public Pattern getJarUnpackPattern() { return getPattern(JobContext.JAR_UNPACK_PATTERN, UNPACK_JAR_PATTERN_DEFAULT); } /** * Set the job's jar file by finding an example class location. * * @param cls the example class. */ public void setJarByClass(Class cls) { String jar = ClassUtil.findContainingJar(cls); if (jar != null) { setJar(jar); } } public String[] getLocalDirs() throws IOException { return getTrimmedStrings(MRConfig.LOCAL_DIR); } /** * Use MRAsyncDiskService.moveAndDeleteAllVolumes instead. */ @Deprecated public void deleteLocalFiles() throws IOException { String[] localDirs = getLocalDirs(); for (int i = 0; i < localDirs.length; i++) { FileSystem.getLocal(this).delete(new Path(localDirs[i]), true); } } public void deleteLocalFiles(String subdir) throws IOException { String[] localDirs = getLocalDirs(); for (int i = 0; i < localDirs.length; i++) { FileSystem.getLocal(this).delete(new Path(localDirs[i], subdir), true); } } /** * Constructs a local file name. Files are distributed among configured * local directories. */ public Path getLocalPath(String pathString) throws IOException { return getLocalPath(MRConfig.LOCAL_DIR, pathString); } /** * Get the reported username for this job. * * @return the username */ public String getUser() { return get(JobContext.USER_NAME); } /** * Set the reported username for this job. * * @param user the username for this job. */ public void setUser(String user) { set(JobContext.USER_NAME, user); } /** * Set whether the framework should keep the intermediate files for * failed tasks. * * @param keep <code>true</code> if framework should keep the intermediate files * for failed tasks, <code>false</code> otherwise. * */ public void setKeepFailedTaskFiles(boolean keep) { setBoolean(JobContext.PRESERVE_FAILED_TASK_FILES, keep); } /** * Should the temporary files for failed tasks be kept? * * @return should the files be kept? */ public boolean getKeepFailedTaskFiles() { return getBoolean(JobContext.PRESERVE_FAILED_TASK_FILES, false); } /** * Set a regular expression for task names that should be kept. * The regular expression ".*_m_000123_0" would keep the files * for the first instance of map 123 that ran. * * @param pattern the java.util.regex.Pattern to match against the * task names. */ public void setKeepTaskFilesPattern(String pattern) { set(JobContext.PRESERVE_FILES_PATTERN, pattern); } /** * Get the regular expression that is matched against the task names * to see if we need to keep the files. * * @return the pattern as a string, if it was set, othewise null. */ public String getKeepTaskFilesPattern() { return get(JobContext.PRESERVE_FILES_PATTERN); } /** * Set the current working directory for the default file system. * * @param dir the new current working directory. 
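 * <p>As an illustrative sketch (the directory name is an assumption), a
 * relative path is resolved against the current working directory of the
 * default file system:</p>
 * <p><blockquote><pre>
 *     JobConf job = new JobConf(MyJob.class);
 *     job.setWorkingDirectory(new Path("staging/run1"));
 *     // relative input/output paths are now resolved under staging/run1
 * </pre></blockquote>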
*/ public void setWorkingDirectory(Path dir) { dir = new Path(getWorkingDirectory(), dir); set(JobContext.WORKING_DIR, dir.toString()); } /** * Get the current working directory for the default file system. * * @return the directory name. */ public Path getWorkingDirectory() { String name = get(JobContext.WORKING_DIR); if (name != null) { return new Path(name); } else { try { Path dir = FileSystem.get(this).getWorkingDirectory(); set(JobContext.WORKING_DIR, dir.toString()); return dir; } catch (IOException e) { throw new RuntimeException(e); } } } /** * Sets the number of tasks that a spawned task JVM should run * before it exits * @param numTasks the number of tasks to execute; defaults to 1; * -1 signifies no limit */ public void setNumTasksToExecutePerJvm(int numTasks) { setInt(JobContext.JVM_NUMTASKS_TORUN, numTasks); } /** * Get the number of tasks that a spawned JVM should execute */ public int getNumTasksToExecutePerJvm() { return getInt(JobContext.JVM_NUMTASKS_TORUN, 1); } /** * Get the {@link InputFormat} implementation for the map-reduce job, * defaults to {@link TextInputFormat} if not specified explicity. * * @return the {@link InputFormat} implementation for the map-reduce job. */ public InputFormat getInputFormat() { return ReflectionUtils .newInstance(getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class), this); } /** * Set the {@link InputFormat} implementation for the map-reduce job. * * @param theClass the {@link InputFormat} implementation for the map-reduce * job. */ public void setInputFormat(Class<? extends InputFormat> theClass) { setClass("mapred.input.format.class", theClass, InputFormat.class); } /** * Get the {@link OutputFormat} implementation for the map-reduce job, * defaults to {@link TextOutputFormat} if not specified explicity. * * @return the {@link OutputFormat} implementation for the map-reduce job. */ public OutputFormat getOutputFormat() { return ReflectionUtils.newInstance( getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class), this); } /** * Get the {@link OutputCommitter} implementation for the map-reduce job, * defaults to {@link FileOutputCommitter} if not specified explicitly. * * @return the {@link OutputCommitter} implementation for the map-reduce job. */ public OutputCommitter getOutputCommitter() { return (OutputCommitter) ReflectionUtils.newInstance( getClass("mapred.output.committer.class", FileOutputCommitter.class, OutputCommitter.class), this); } /** * Set the {@link OutputCommitter} implementation for the map-reduce job. * * @param theClass the {@link OutputCommitter} implementation for the map-reduce * job. */ public void setOutputCommitter(Class<? extends OutputCommitter> theClass) { setClass("mapred.output.committer.class", theClass, OutputCommitter.class); } /** * Set the {@link OutputFormat} implementation for the map-reduce job. * * @param theClass the {@link OutputFormat} implementation for the map-reduce * job. */ public void setOutputFormat(Class<? extends OutputFormat> theClass) { setClass("mapred.output.format.class", theClass, OutputFormat.class); } /** * Should the map outputs be compressed before transfer? * * @param compress should the map outputs be compressed? */ public void setCompressMapOutput(boolean compress) { setBoolean(JobContext.MAP_OUTPUT_COMPRESS, compress); } /** * Are the outputs of the maps be compressed? * * @return <code>true</code> if the outputs of the maps are to be compressed, * <code>false</code> otherwise. 
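 * <p>A minimal sketch of enabling intermediate compression (the Gzip codec is
 * one possible choice, not a requirement):</p>
 * <p><blockquote><pre>
 *     JobConf job = new JobConf(MyJob.class);
 *     job.setCompressMapOutput(true);
 *     job.setMapOutputCompressorClass(
 *         org.apache.hadoop.io.compress.GzipCodec.class);
 * </pre></blockquote>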
*/ public boolean getCompressMapOutput() { return getBoolean(JobContext.MAP_OUTPUT_COMPRESS, false); } /** * Set the given class as the {@link CompressionCodec} for the map outputs. * * @param codecClass the {@link CompressionCodec} class that will compress * the map outputs. */ public void setMapOutputCompressorClass(Class<? extends CompressionCodec> codecClass) { setCompressMapOutput(true); setClass(JobContext.MAP_OUTPUT_COMPRESS_CODEC, codecClass, CompressionCodec.class); } /** * Get the {@link CompressionCodec} for compressing the map outputs. * * @param defaultValue the {@link CompressionCodec} to return if not set * @return the {@link CompressionCodec} class that should be used to compress the * map outputs. * @throws IllegalArgumentException if the class was specified, but not found */ public Class<? extends CompressionCodec> getMapOutputCompressorClass( Class<? extends CompressionCodec> defaultValue) { Class<? extends CompressionCodec> codecClass = defaultValue; String name = get(JobContext.MAP_OUTPUT_COMPRESS_CODEC); if (name != null) { try { codecClass = getClassByName(name).asSubclass(CompressionCodec.class); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Compression codec " + name + " was not found.", e); } } return codecClass; } /** * Get the key class for the map output data. If it is not set, use the * (final) output key class. This allows the map output key class to be * different than the final output key class. * * @return the map output key class. */ public Class<?> getMapOutputKeyClass() { Class<?> retv = getClass(JobContext.MAP_OUTPUT_KEY_CLASS, null, Object.class); if (retv == null) { retv = getOutputKeyClass(); } return retv; } /** * Set the key class for the map output data. This allows the user to * specify the map output key class to be different than the final output * value class. * * @param theClass the map output key class. */ public void setMapOutputKeyClass(Class<?> theClass) { setClass(JobContext.MAP_OUTPUT_KEY_CLASS, theClass, Object.class); } /** * Get the value class for the map output data. If it is not set, use the * (final) output value class This allows the map output value class to be * different than the final output value class. * * @return the map output value class. */ public Class<?> getMapOutputValueClass() { Class<?> retv = getClass(JobContext.MAP_OUTPUT_VALUE_CLASS, null, Object.class); if (retv == null) { retv = getOutputValueClass(); } return retv; } /** * Set the value class for the map output data. This allows the user to * specify the map output value class to be different than the final output * value class. * * @param theClass the map output value class. */ public void setMapOutputValueClass(Class<?> theClass) { setClass(JobContext.MAP_OUTPUT_VALUE_CLASS, theClass, Object.class); } /** * Get the key class for the job output data. * * @return the key class for the job output data. */ public Class<?> getOutputKeyClass() { return getClass(JobContext.OUTPUT_KEY_CLASS, LongWritable.class, Object.class); } /** * Set the key class for the job output data. * * @param theClass the key class for the job output data. */ public void setOutputKeyClass(Class<?> theClass) { setClass(JobContext.OUTPUT_KEY_CLASS, theClass, Object.class); } /** * Get the {@link RawComparator} comparator used to compare keys. * * @return the {@link RawComparator} comparator used to compare keys. */ public RawComparator getOutputKeyComparator() { Class<? 
extends RawComparator> theClass = getClass(JobContext.KEY_COMPARATOR, null, RawComparator.class); if (theClass != null) return ReflectionUtils.newInstance(theClass, this); return WritableComparator.get(getMapOutputKeyClass().asSubclass(WritableComparable.class), this); } /** * Set the {@link RawComparator} comparator used to compare keys. * * @param theClass the {@link RawComparator} comparator used to * compare keys. * @see #setOutputValueGroupingComparator(Class) */ public void setOutputKeyComparatorClass(Class<? extends RawComparator> theClass) { setClass(JobContext.KEY_COMPARATOR, theClass, RawComparator.class); } /** * Set the {@link KeyFieldBasedComparator} options used to compare keys. * * @param keySpec the key specification of the form -k pos1[,pos2], where, * pos is of the form f[.c][opts], where f is the number * of the key field to use, and c is the number of the first character from * the beginning of the field. Fields and character posns are numbered * starting with 1; a character position of zero in pos2 indicates the * field's last character. If '.c' is omitted from pos1, it defaults to 1 * (the beginning of the field); if omitted from pos2, it defaults to 0 * (the end of the field). opts are ordering options. The supported options * are: * -n, (Sort numerically) * -r, (Reverse the result of comparison) */ public void setKeyFieldComparatorOptions(String keySpec) { setOutputKeyComparatorClass(KeyFieldBasedComparator.class); set(KeyFieldBasedComparator.COMPARATOR_OPTIONS, keySpec); } /** * Get the {@link KeyFieldBasedComparator} options */ public String getKeyFieldComparatorOption() { return get(KeyFieldBasedComparator.COMPARATOR_OPTIONS); } /** * Set the {@link KeyFieldBasedPartitioner} options used for * {@link Partitioner} * * @param keySpec the key specification of the form -k pos1[,pos2], where, * pos is of the form f[.c][opts], where f is the number * of the key field to use, and c is the number of the first character from * the beginning of the field. Fields and character posns are numbered * starting with 1; a character position of zero in pos2 indicates the * field's last character. If '.c' is omitted from pos1, it defaults to 1 * (the beginning of the field); if omitted from pos2, it defaults to 0 * (the end of the field). */ public void setKeyFieldPartitionerOptions(String keySpec) { setPartitionerClass(KeyFieldBasedPartitioner.class); set(KeyFieldBasedPartitioner.PARTITIONER_OPTIONS, keySpec); } /** * Get the {@link KeyFieldBasedPartitioner} options */ public String getKeyFieldPartitionerOption() { return get(KeyFieldBasedPartitioner.PARTITIONER_OPTIONS); } /** * Get the user defined {@link WritableComparable} comparator for * grouping keys of inputs to the combiner. * * @return comparator set by the user for grouping values. * @see #setCombinerKeyGroupingComparator(Class) for details. */ public RawComparator getCombinerKeyGroupingComparator() { Class<? extends RawComparator> theClass = getClass(JobContext.COMBINER_GROUP_COMPARATOR_CLASS, null, RawComparator.class); if (theClass == null) { return getOutputKeyComparator(); } return ReflectionUtils.newInstance(theClass, this); } /** * Get the user defined {@link WritableComparable} comparator for * grouping keys of inputs to the reduce. * * @return comparator set by the user for grouping values. * @see #setOutputValueGroupingComparator(Class) for details. */ public RawComparator getOutputValueGroupingComparator() { Class<? 
extends RawComparator> theClass = getClass(JobContext.GROUP_COMPARATOR_CLASS, null, RawComparator.class); if (theClass == null) { return getOutputKeyComparator(); } return ReflectionUtils.newInstance(theClass, this); } /** * Set the user defined {@link RawComparator} comparator for * grouping keys in the input to the combiner. * * <p>This comparator should be provided if the equivalence rules for keys * for sorting the intermediates are different from those for grouping keys * before each call to * {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.</p> * * <p>For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed * in a single call to the reduce function if K1 and K2 compare as equal.</p> * * <p>Since {@link #setOutputKeyComparatorClass(Class)} can be used to control * how keys are sorted, this can be used in conjunction to simulate * <i>secondary sort on values</i>.</p> * * <p><i>Note</i>: This is not a guarantee of the combiner sort being * <i>stable</i> in any sense. (In any case, with the order of available * map-outputs to the combiner being non-deterministic, it wouldn't make * that much sense.)</p> * * @param theClass the comparator class to be used for grouping keys for the * combiner. It should implement <code>RawComparator</code>. * @see #setOutputKeyComparatorClass(Class) */ public void setCombinerKeyGroupingComparator(Class<? extends RawComparator> theClass) { setClass(JobContext.COMBINER_GROUP_COMPARATOR_CLASS, theClass, RawComparator.class); } /** * Set the user defined {@link RawComparator} comparator for * grouping keys in the input to the reduce. * * <p>This comparator should be provided if the equivalence rules for keys * for sorting the intermediates are different from those for grouping keys * before each call to * {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.</p> * * <p>For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed * in a single call to the reduce function if K1 and K2 compare as equal.</p> * * <p>Since {@link #setOutputKeyComparatorClass(Class)} can be used to control * how keys are sorted, this can be used in conjunction to simulate * <i>secondary sort on values</i>.</p> * * <p><i>Note</i>: This is not a guarantee of the reduce sort being * <i>stable</i> in any sense. (In any case, with the order of available * map-outputs to the reduce being non-deterministic, it wouldn't make * that much sense.)</p> * * @param theClass the comparator class to be used for grouping keys. * It should implement <code>RawComparator</code>. * @see #setOutputKeyComparatorClass(Class) * @see #setCombinerKeyGroupingComparator(Class) */ public void setOutputValueGroupingComparator(Class<? extends RawComparator> theClass) { setClass(JobContext.GROUP_COMPARATOR_CLASS, theClass, RawComparator.class); } /** * Should the framework use the new context-object code for running * the mapper? * @return true, if the new api should be used */ public boolean getUseNewMapper() { return getBoolean("mapred.mapper.new-api", false); } /** * Set whether the framework should use the new api for the mapper. * This is the default for jobs submitted with the new Job api. * @param flag true, if the new api should be used */ public void setUseNewMapper(boolean flag) { setBoolean("mapred.mapper.new-api", flag); } /** * Should the framework use the new context-object code for running * the reducer? 
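 * <p>A minimal sketch, assuming the job's <code>Mapper</code> and
 * <code>Reducer</code> are written against the new
 * <code>org.apache.hadoop.mapreduce</code> API:</p>
 * <p><blockquote><pre>
 *     JobConf conf = new JobConf(MyJob.class);
 *     conf.setUseNewMapper(true);
 *     conf.setUseNewReducer(true);
 * </pre></blockquote>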
* @return true, if the new api should be used */ public boolean getUseNewReducer() { return getBoolean("mapred.reducer.new-api", false); } /** * Set whether the framework should use the new api for the reducer. * This is the default for jobs submitted with the new Job api. * @param flag true, if the new api should be used */ public void setUseNewReducer(boolean flag) { setBoolean("mapred.reducer.new-api", flag); } /** * Get the value class for job outputs. * * @return the value class for job outputs. */ public Class<?> getOutputValueClass() { return getClass(JobContext.OUTPUT_VALUE_CLASS, Text.class, Object.class); } /** * Set the value class for job outputs. * * @param theClass the value class for job outputs. */ public void setOutputValueClass(Class<?> theClass) { setClass(JobContext.OUTPUT_VALUE_CLASS, theClass, Object.class); } /** * Get the {@link Mapper} class for the job. * * @return the {@link Mapper} class for the job. */ public Class<? extends Mapper> getMapperClass() { return getClass("mapred.mapper.class", IdentityMapper.class, Mapper.class); } /** * Set the {@link Mapper} class for the job. * * @param theClass the {@link Mapper} class for the job. */ public void setMapperClass(Class<? extends Mapper> theClass) { setClass("mapred.mapper.class", theClass, Mapper.class); } /** * Get the {@link MapRunnable} class for the job. * * @return the {@link MapRunnable} class for the job. */ public Class<? extends MapRunnable> getMapRunnerClass() { return getClass("mapred.map.runner.class", MapRunner.class, MapRunnable.class); } /** * Expert: Set the {@link MapRunnable} class for the job. * * Typically used to exert greater control on {@link Mapper}s. * * @param theClass the {@link MapRunnable} class for the job. */ public void setMapRunnerClass(Class<? extends MapRunnable> theClass) { setClass("mapred.map.runner.class", theClass, MapRunnable.class); } /** * Get the {@link Partitioner} used to partition {@link Mapper}-outputs * to be sent to the {@link Reducer}s. * * @return the {@link Partitioner} used to partition map-outputs. */ public Class<? extends Partitioner> getPartitionerClass() { return getClass("mapred.partitioner.class", HashPartitioner.class, Partitioner.class); } /** * Set the {@link Partitioner} class used to partition * {@link Mapper}-outputs to be sent to the {@link Reducer}s. * * @param theClass the {@link Partitioner} used to partition map-outputs. */ public void setPartitionerClass(Class<? extends Partitioner> theClass) { setClass("mapred.partitioner.class", theClass, Partitioner.class); } /** * Get the {@link Reducer} class for the job. * * @return the {@link Reducer} class for the job. */ public Class<? extends Reducer> getReducerClass() { return getClass("mapred.reducer.class", IdentityReducer.class, Reducer.class); } /** * Set the {@link Reducer} class for the job. * * @param theClass the {@link Reducer} class for the job. */ public void setReducerClass(Class<? extends Reducer> theClass) { setClass("mapred.reducer.class", theClass, Reducer.class); } /** * Get the user-defined <i>combiner</i> class used to combine map-outputs * before being sent to the reducers. Typically the combiner is same as the * the {@link Reducer} for the job i.e. {@link #getReducerClass()}. * * @return the user-defined combiner class used to combine map-outputs. */ public Class<? 
extends Reducer> getCombinerClass() { return getClass("mapred.combiner.class", null, Reducer.class); } /** * Set the user-defined <i>combiner</i> class used to combine map-outputs * before being sent to the reducers. * * <p>The combiner is an application-specified aggregation operation, which * can help cut down the amount of data transferred between the * {@link Mapper} and the {@link Reducer}, leading to better performance.</p> * * <p>The framework may invoke the combiner 0, 1, or multiple times, in both * the mapper and reducer tasks. In general, the combiner is called as the * sort/merge result is written to disk. The combiner must: * <ul> * <li> be side-effect free</li> * <li> have the same input and output key types and the same input and * output value types</li> * </ul> * * <p>Typically the combiner is same as the <code>Reducer</code> for the * job i.e. {@link #setReducerClass(Class)}.</p> * * @param theClass the user-defined combiner class used to combine * map-outputs. */ public void setCombinerClass(Class<? extends Reducer> theClass) { setClass("mapred.combiner.class", theClass, Reducer.class); } /** * Should speculative execution be used for this job? * Defaults to <code>true</code>. * * @return <code>true</code> if speculative execution be used for this job, * <code>false</code> otherwise. */ public boolean getSpeculativeExecution() { return (getMapSpeculativeExecution() || getReduceSpeculativeExecution()); } /** * Turn speculative execution on or off for this job. * * @param speculativeExecution <code>true</code> if speculative execution * should be turned on, else <code>false</code>. */ public void setSpeculativeExecution(boolean speculativeExecution) { setMapSpeculativeExecution(speculativeExecution); setReduceSpeculativeExecution(speculativeExecution); } /** * Should speculative execution be used for this job for map tasks? * Defaults to <code>true</code>. * * @return <code>true</code> if speculative execution be * used for this job for map tasks, * <code>false</code> otherwise. */ public boolean getMapSpeculativeExecution() { return getBoolean(JobContext.MAP_SPECULATIVE, true); } /** * Turn speculative execution on or off for this job for map tasks. * * @param speculativeExecution <code>true</code> if speculative execution * should be turned on for map tasks, * else <code>false</code>. */ public void setMapSpeculativeExecution(boolean speculativeExecution) { setBoolean(JobContext.MAP_SPECULATIVE, speculativeExecution); } /** * Should speculative execution be used for this job for reduce tasks? * Defaults to <code>true</code>. * * @return <code>true</code> if speculative execution be used * for reduce tasks for this job, * <code>false</code> otherwise. */ public boolean getReduceSpeculativeExecution() { return getBoolean(JobContext.REDUCE_SPECULATIVE, true); } /** * Turn speculative execution on or off for this job for reduce tasks. * * @param speculativeExecution <code>true</code> if speculative execution * should be turned on for reduce tasks, * else <code>false</code>. */ public void setReduceSpeculativeExecution(boolean speculativeExecution) { setBoolean(JobContext.REDUCE_SPECULATIVE, speculativeExecution); } /** * Get the configured number of map tasks for this job. * Defaults to <code>1</code>. * * @return the number of map tasks for this job. */ public int getNumMapTasks() { return getInt(JobContext.NUM_MAPS, 1); } /** * Set the number of map tasks for this job. * * <p><i>Note</i>: This is only a <i>hint</i> to the framework. 
The actual * number of spawned map tasks depends on the number of {@link InputSplit}s * generated by the job's {@link InputFormat#getSplits(JobConf, int)}. * * A custom {@link InputFormat} is typically used to accurately control * the number of map tasks for the job.</p> * * <b id="NoOfMaps">How many maps?</b> * * <p>The number of maps is usually driven by the total size of the inputs * i.e. total number of blocks of the input files.</p> * * <p>The right level of parallelism for maps seems to be around 10-100 maps * per-node, although it has been set up to 300 or so for very cpu-light map * tasks. Task setup takes awhile, so it is best if the maps take at least a * minute to execute.</p> * * <p>The default behavior of file-based {@link InputFormat}s is to split the * input into <i>logical</i> {@link InputSplit}s based on the total size, in * bytes, of input files. However, the {@link FileSystem} blocksize of the * input files is treated as an upper bound for input splits. A lower bound * on the split size can be set via * <a href="{@docRoot}/../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml#mapreduce.input.fileinputformat.split.minsize"> * mapreduce.input.fileinputformat.split.minsize</a>.</p> * * <p>Thus, if you expect 10TB of input data and have a blocksize of 128MB, * you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is * used to set it even higher.</p> * * @param n the number of map tasks for this job. * @see InputFormat#getSplits(JobConf, int) * @see FileInputFormat * @see FileSystem#getDefaultBlockSize() * @see FileStatus#getBlockSize() */ public void setNumMapTasks(int n) { setInt(JobContext.NUM_MAPS, n); } /** * Get the configured number of reduce tasks for this job. Defaults to * <code>1</code>. * * @return the number of reduce tasks for this job. */ public int getNumReduceTasks() { return getInt(JobContext.NUM_REDUCES, 1); } /** * Set the requisite number of reduce tasks for this job. * * <b id="NoOfReduces">How many reduces?</b> * * <p>The right number of reduces seems to be <code>0.95</code> or * <code>1.75</code> multiplied by ( * <i>available memory for reduce tasks</i> * (The value of this should be smaller than * numNodes * yarn.nodemanager.resource.memory-mb * since the resource of memory is shared by map tasks and other * applications) / * <a href="{@docRoot}/../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml#mapreduce.reduce.memory.mb"> * mapreduce.reduce.memory.mb</a>). * </p> * * <p>With <code>0.95</code> all of the reduces can launch immediately and * start transfering map outputs as the maps finish. With <code>1.75</code> * the faster nodes will finish their first round of reduces and launch a * second wave of reduces doing a much better job of load balancing.</p> * * <p>Increasing the number of reduces increases the framework overhead, but * increases load balancing and lowers the cost of failures.</p> * * <p>The scaling factors above are slightly less than whole numbers to * reserve a few reduce slots in the framework for speculative-tasks, failures * etc.</p> * * <b id="ReducerNone">Reducer NONE</b> * * <p>It is legal to set the number of reduce-tasks to <code>zero</code>.</p> * * <p>In this case the output of the map-tasks directly go to distributed * file-system, to the path set by * {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the * framework doesn't sort the map-outputs before writing it out to HDFS.</p> * * @param n the number of reduce tasks for this job. 
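 * <p>As an illustrative worked example (the cluster figures are assumptions):
 * with 20 nodes, roughly 8192 MB per node available to reduce containers and
 * <code>mapreduce.reduce.memory.mb=4096</code>, the <code>0.95</code> factor
 * suggests about <code>0.95 * (20 * 8192 / 4096) = 38</code> reduces:</p>
 * <p><blockquote><pre>
 *     job.setNumReduceTasks(38);
 * </pre></blockquote>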
*/ public void setNumReduceTasks(int n) { setInt(JobContext.NUM_REDUCES, n); } /** * Get the configured number of maximum attempts that will be made to run a * map task, as specified by the <code>mapreduce.map.maxattempts</code> * property. If this property is not already set, the default is 4 attempts. * * @return the max number of attempts per map task. */ public int getMaxMapAttempts() { return getInt(JobContext.MAP_MAX_ATTEMPTS, 4); } /** * Expert: Set the number of maximum attempts that will be made to run a * map task. * * @param n the number of attempts per map task. */ public void setMaxMapAttempts(int n) { setInt(JobContext.MAP_MAX_ATTEMPTS, n); } /** * Get the configured number of maximum attempts that will be made to run a * reduce task, as specified by the <code>mapreduce.reduce.maxattempts</code> * property. If this property is not already set, the default is 4 attempts. * * @return the max number of attempts per reduce task. */ public int getMaxReduceAttempts() { return getInt(JobContext.REDUCE_MAX_ATTEMPTS, 4); } /** * Expert: Set the number of maximum attempts that will be made to run a * reduce task. * * @param n the number of attempts per reduce task. */ public void setMaxReduceAttempts(int n) { setInt(JobContext.REDUCE_MAX_ATTEMPTS, n); } /** * Get the user-specified job name. This is only used to identify the * job to the user. * * @return the job's name, defaulting to "". */ public String getJobName() { return get(JobContext.JOB_NAME, ""); } /** * Set the user-specified job name. * * @param name the job's new name. */ public void setJobName(String name) { set(JobContext.JOB_NAME, name); } /** * Get the user-specified session identifier. The default is the empty string. * * The session identifier is used to tag metric data that is reported to some * performance metrics system via the org.apache.hadoop.metrics API. The * session identifier is intended, in particular, for use by Hadoop-On-Demand * (HOD) which allocates a virtual Hadoop cluster dynamically and transiently. * HOD will set the session identifier by modifying the mapred-site.xml file * before starting the cluster. * * When not running under HOD, this identifer is expected to remain set to * the empty string. * * @return the session identifier, defaulting to "". */ @Deprecated public String getSessionId() { return get("session.id", ""); } /** * Set the user-specified session identifier. * * @param sessionId the new session id. */ @Deprecated public void setSessionId(String sessionId) { set("session.id", sessionId); } /** * Set the maximum no. of failures of a given job per tasktracker. * If the no. of task failures exceeds <code>noFailures</code>, the * tasktracker is <i>blacklisted</i> for this job. * * @param noFailures maximum no. of failures of a given job per tasktracker. */ public void setMaxTaskFailuresPerTracker(int noFailures) { setInt(JobContext.MAX_TASK_FAILURES_PER_TRACKER, noFailures); } /** * Expert: Get the maximum no. of failures of a given job per tasktracker. * If the no. of task failures exceeds this, the tasktracker is * <i>blacklisted</i> for this job. * * @return the maximum no. of failures of a given job per tasktracker. */ public int getMaxTaskFailuresPerTracker() { return getInt(JobContext.MAX_TASK_FAILURES_PER_TRACKER, 3); } /** * Get the maximum percentage of map tasks that can fail without * the job being aborted. * * Each map task is executed a minimum of {@link #getMaxMapAttempts()} * attempts before being declared as <i>failed</i>. * * Defaults to <code>zero</code>, i.e. 
<i>any</i> failed map-task results in * the job being declared as {@link JobStatus#FAILED}. * * @return the maximum percentage of map tasks that can fail without * the job being aborted. */ public int getMaxMapTaskFailuresPercent() { return getInt(JobContext.MAP_FAILURES_MAX_PERCENT, 0); } /** * Expert: Set the maximum percentage of map tasks that can fail without the * job being aborted. * * Each map task is executed a minimum of {@link #getMaxMapAttempts} attempts * before being declared as <i>failed</i>. * * @param percent the maximum percentage of map tasks that can fail without * the job being aborted. */ public void setMaxMapTaskFailuresPercent(int percent) { setInt(JobContext.MAP_FAILURES_MAX_PERCENT, percent); } /** * Get the maximum percentage of reduce tasks that can fail without * the job being aborted. * * Each reduce task is executed a minimum of {@link #getMaxReduceAttempts()} * attempts before being declared as <i>failed</i>. * * Defaults to <code>zero</code>, i.e. <i>any</i> failed reduce-task results * in the job being declared as {@link JobStatus#FAILED}. * * @return the maximum percentage of reduce tasks that can fail without * the job being aborted. */ public int getMaxReduceTaskFailuresPercent() { return getInt(JobContext.REDUCE_FAILURES_MAXPERCENT, 0); } /** * Set the maximum percentage of reduce tasks that can fail without the job * being aborted. * * Each reduce task is executed a minimum of {@link #getMaxReduceAttempts()} * attempts before being declared as <i>failed</i>. * * @param percent the maximum percentage of reduce tasks that can fail without * the job being aborted. */ public void setMaxReduceTaskFailuresPercent(int percent) { setInt(JobContext.REDUCE_FAILURES_MAXPERCENT, percent); } /** * Set {@link JobPriority} for this job. * * @param prio the {@link JobPriority} for this job. */ public void setJobPriority(JobPriority prio) { set(JobContext.PRIORITY, prio.toString()); } /** * Set {@link JobPriority} for this job. * * @param prio the {@link JobPriority} for this job. */ public void setJobPriorityAsInteger(int prio) { set(JobContext.PRIORITY, Integer.toString(prio)); } /** * Get the {@link JobPriority} for this job. * * @return the {@link JobPriority} for this job. */ public JobPriority getJobPriority() { String prio = get(JobContext.PRIORITY); if (prio == null) { return JobPriority.DEFAULT; } JobPriority priority = JobPriority.DEFAULT; try { priority = JobPriority.valueOf(prio); } catch (IllegalArgumentException e) { return convertToJobPriority(Integer.parseInt(prio)); } return priority; } /** * Get the priority for this job. * * @return the priority for this job. */ public int getJobPriorityAsInteger() { String priority = get(JobContext.PRIORITY); if (priority == null) { return 0; } int jobPriority = 0; try { jobPriority = convertPriorityToInteger(priority); } catch (IllegalArgumentException e) { return Integer.parseInt(priority); } return jobPriority; } private int convertPriorityToInteger(String priority) { JobPriority jobPriority = JobPriority.valueOf(priority); switch (jobPriority) { case VERY_HIGH: return 5; case HIGH: return 4; case NORMAL: return 3; case LOW: return 2; case VERY_LOW: return 1; case DEFAULT: return 0; default: break; } // If a user sets the priority as "UNDEFINED_PRIORITY", we can return // 0 which is also default value. 
return 0; } private JobPriority convertToJobPriority(int priority) { switch (priority) { case 5: return JobPriority.VERY_HIGH; case 4: return JobPriority.HIGH; case 3: return JobPriority.NORMAL; case 2: return JobPriority.LOW; case 1: return JobPriority.VERY_LOW; case 0: return JobPriority.DEFAULT; default: break; } return JobPriority.UNDEFINED_PRIORITY; } /** * Set JobSubmitHostName for this job. * * @param hostname the JobSubmitHostName for this job. */ void setJobSubmitHostName(String hostname) { set(MRJobConfig.JOB_SUBMITHOST, hostname); } /** * Get the JobSubmitHostName for this job. * * @return the JobSubmitHostName for this job. */ String getJobSubmitHostName() { String hostname = get(MRJobConfig.JOB_SUBMITHOST); return hostname; } /** * Set JobSubmitHostAddress for this job. * * @param hostadd the JobSubmitHostAddress for this job. */ void setJobSubmitHostAddress(String hostadd) { set(MRJobConfig.JOB_SUBMITHOSTADDR, hostadd); } /** * Get JobSubmitHostAddress for this job. * * @return JobSubmitHostAddress for this job. */ String getJobSubmitHostAddress() { String hostadd = get(MRJobConfig.JOB_SUBMITHOSTADDR); return hostadd; } /** * Get whether the task profiling is enabled. * @return true if some tasks will be profiled */ public boolean getProfileEnabled() { return getBoolean(JobContext.TASK_PROFILE, false); } /** * Set whether the system should collect profiler information for some of * the tasks in this job? The information is stored in the user log * directory. * @param newValue true means it should be gathered */ public void setProfileEnabled(boolean newValue) { setBoolean(JobContext.TASK_PROFILE, newValue); } /** * Get the profiler configuration arguments. * * The default value for this property is * "-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s" * * @return the parameters to pass to the task child to configure profiling */ public String getProfileParams() { return get(JobContext.TASK_PROFILE_PARAMS, MRJobConfig.DEFAULT_TASK_PROFILE_PARAMS); } /** * Set the profiler configuration arguments. If the string contains a '%s' it * will be replaced with the name of the profiling output file when the task * runs. * * This value is passed to the task child JVM on the command line. * * @param value the configuration string */ public void setProfileParams(String value) { set(JobContext.TASK_PROFILE_PARAMS, value); } /** * Get the range of maps or reduces to profile. * @param isMap is the task a map? * @return the task ranges */ public IntegerRanges getProfileTaskRange(boolean isMap) { return getRange((isMap ? JobContext.NUM_MAP_PROFILES : JobContext.NUM_REDUCE_PROFILES), "0-2"); } /** * Set the ranges of maps or reduces to profile. setProfileEnabled(true) * must also be called. * @param newValue a set of integer ranges of the map ids */ public void setProfileTaskRange(boolean isMap, String newValue) { // parse the value to make sure it is legal new Configuration.IntegerRanges(newValue); set((isMap ? JobContext.NUM_MAP_PROFILES : JobContext.NUM_REDUCE_PROFILES), newValue); } /** * Set the debug script to run when the map tasks fail. * * <p>The debug script can aid debugging of failed map tasks. The script is * given task's stdout, stderr, syslog, jobconf files as arguments.</p> * * <p>The debug command, run on the node where the map failed, is:</p> * <p><blockquote><pre> * $script $stdout $stderr $syslog $jobconf. * </pre></blockquote> * * <p> The script file is distributed through {@link DistributedCache} * APIs. The script needs to be symlinked. 
</p> * * <p>Here is an example on how to submit a script * <p><blockquote><pre> * job.setMapDebugScript("./myscript"); * DistributedCache.createSymlink(job); * DistributedCache.addCacheFile("/debug/scripts/myscript#myscript"); * </pre></blockquote> * * @param mDbgScript the script name */ public void setMapDebugScript(String mDbgScript) { set(JobContext.MAP_DEBUG_SCRIPT, mDbgScript); } /** * Get the map task's debug script. * * @return the debug Script for the mapred job for failed map tasks. * @see #setMapDebugScript(String) */ public String getMapDebugScript() { return get(JobContext.MAP_DEBUG_SCRIPT); } /** * Set the debug script to run when the reduce tasks fail. * * <p>The debug script can aid debugging of failed reduce tasks. The script * is given task's stdout, stderr, syslog, jobconf files as arguments.</p> * * <p>The debug command, run on the node where the map failed, is:</p> * <p><blockquote><pre> * $script $stdout $stderr $syslog $jobconf. * </pre></blockquote> * * <p> The script file is distributed through {@link DistributedCache} * APIs. The script file needs to be symlinked </p> * * <p>Here is an example on how to submit a script * <p><blockquote><pre> * job.setReduceDebugScript("./myscript"); * DistributedCache.createSymlink(job); * DistributedCache.addCacheFile("/debug/scripts/myscript#myscript"); * </pre></blockquote> * * @param rDbgScript the script name */ public void setReduceDebugScript(String rDbgScript) { set(JobContext.REDUCE_DEBUG_SCRIPT, rDbgScript); } /** * Get the reduce task's debug Script * * @return the debug script for the mapred job for failed reduce tasks. * @see #setReduceDebugScript(String) */ public String getReduceDebugScript() { return get(JobContext.REDUCE_DEBUG_SCRIPT); } /** * Get the uri to be invoked in-order to send a notification after the job * has completed (success/failure). * * @return the job end notification uri, <code>null</code> if it hasn't * been set. * @see #setJobEndNotificationURI(String) */ public String getJobEndNotificationURI() { return get(JobContext.MR_JOB_END_NOTIFICATION_URL); } /** * Set the uri to be invoked in-order to send a notification after the job * has completed (success/failure). * * <p>The uri can contain 2 special parameters: <tt>$jobId</tt> and * <tt>$jobStatus</tt>. Those, if present, are replaced by the job's * identifier and completion-status respectively.</p> * * <p>This is typically used by application-writers to implement chaining of * Map-Reduce jobs in an <i>asynchronous manner</i>.</p> * * @param uri the job end notification uri * @see JobStatus */ public void setJobEndNotificationURI(String uri) { set(JobContext.MR_JOB_END_NOTIFICATION_URL, uri); } /** * Get job-specific shared directory for use as scratch space * * <p> * When a job starts, a shared directory is created at location * <code> * ${mapreduce.cluster.local.dir}/taskTracker/$user/jobcache/$jobid/work/ </code>. * This directory is exposed to the users through * <code>mapreduce.job.local.dir </code>. * So, the tasks can use this space * as scratch space and share files among them. </p> * This value is available as System property also. * * @return The localized job specific shared directory */ public String getJobLocalDir() { return get(JobContext.JOB_LOCAL_DIR); } /** * Get memory required to run a map task of the job, in MB. * * If a value is specified in the configuration, it is returned. * Else, it returns {@link JobContext#DEFAULT_MAP_MEMORY_MB}. 
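 * <p>A minimal sketch (the 2048 MB figure is an arbitrary example): once
 * {@link #setMemoryForMapTask(long)} has been called, this getter reflects the
 * configured value, assuming the deprecated maxvmem key is unset:</p>
 * <p><blockquote><pre>
 *     JobConf job = new JobConf(MyJob.class);
 *     job.setMemoryForMapTask(2048);           // MB
 *     long mapMb = job.getMemoryForMapTask();  // 2048
 * </pre></blockquote>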
  /**
   * Get memory required to run a map task of the job, in MB.
   *
   * If a value is specified in the configuration, it is returned.
   * Else, it returns {@link JobContext#DEFAULT_MAP_MEMORY_MB}.
   * <p>
   * For backward compatibility, if the job configuration sets the
   * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to a value different
   * from {@link #DISABLED_MEMORY_LIMIT}, that value will be used
   * after converting it from bytes to MB.
   *
   * @return memory required to run a map task of the job, in MB.
   */
  public long getMemoryForMapTask() {
    long value = getDeprecatedMemoryValue();
    if (value < 0) {
      return getMemoryRequired(TaskType.MAP);
    }
    return value;
  }

  public void setMemoryForMapTask(long mem) {
    setLong(JobConf.MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY, mem);
    // In case that M/R 1.x applications use the old property name
    setLong(JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, mem);
  }

  /**
   * Get memory required to run a reduce task of the job, in MB.
   *
   * If a value is specified in the configuration, it is returned.
   * Else, it returns {@link JobContext#DEFAULT_REDUCE_MEMORY_MB}.
   * <p>
   * For backward compatibility, if the job configuration sets the
   * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to a value different
   * from {@link #DISABLED_MEMORY_LIMIT}, that value will be used
   * after converting it from bytes to MB.
   *
   * @return memory required to run a reduce task of the job, in MB.
   */
  public long getMemoryForReduceTask() {
    long value = getDeprecatedMemoryValue();
    if (value < 0) {
      return getMemoryRequired(TaskType.REDUCE);
    }
    return value;
  }

  // Return the value set to the key MAPRED_TASK_MAXVMEM_PROPERTY,
  // converted into MBs.
  // Returns DISABLED_MEMORY_LIMIT if unset, or set to a negative value.
  private long getDeprecatedMemoryValue() {
    long oldValue = getLong(MAPRED_TASK_MAXVMEM_PROPERTY,
        DISABLED_MEMORY_LIMIT);
    if (oldValue > 0) {
      oldValue /= (1024 * 1024);
    }
    return oldValue;
  }

  public void setMemoryForReduceTask(long mem) {
    setLong(JobConf.MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY, mem);
    // In case that M/R 1.x applications use the old property name
    setLong(JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY, mem);
  }

  /**
   * Return the name of the queue to which this job is submitted.
   * Defaults to 'default'.
   *
   * @return name of the queue
   */
  public String getQueueName() {
    return get(JobContext.QUEUE_NAME, DEFAULT_QUEUE_NAME);
  }

  /**
   * Set the name of the queue to which this job should be submitted.
   *
   * @param queueName Name of the queue
   */
  public void setQueueName(String queueName) {
    set(JobContext.QUEUE_NAME, queueName);
  }

  /**
   * Normalize negative values in the configuration.
   *
   * @param val the configured memory value
   * @return normalized value
   */
  public static long normalizeMemoryConfigValue(long val) {
    if (val < 0) {
      val = DISABLED_MEMORY_LIMIT;
    }
    return val;
  }

  /**
   * Find a jar that contains a class of the same name, if any.
   * It will return a jar file, even if that is not the first thing
   * on the class path that has a class with the same name.
   *
   * @param my_class the class to find.
   * @return a jar file that contains the class, or null.
   */
  public static String findContainingJar(Class my_class) {
    return ClassUtil.findContainingJar(my_class);
  }
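  // Illustrative sketch (not part of the original class): requesting task
  // memory and a submission queue. "MyJob", the sizes and the queue name are
  // example values, not defaults.
  //
  //   JobConf job = new JobConf(new Configuration(), MyJob.class);
  //   job.setMemoryForMapTask(2048);     // 2 GB containers for map tasks
  //   job.setMemoryForReduceTask(4096);  // 4 GB containers for reduce tasks
  //   job.setQueueName("analytics");     // instead of the 'default' queue
  //   assert job.getMemoryForMapTask() == 2048;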
  /**
   * Get the memory required to run a task of this job, in bytes. See
   * {@link #MAPRED_TASK_MAXVMEM_PROPERTY}.
   * <p>
   * This method is deprecated. Now, different memory limits can be
   * set for map and reduce tasks of a job, in MB.
   * <p>
   * For backward compatibility, if the job configuration sets the
   * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY}, that value is returned.
   * Otherwise, this method will return the larger of the values returned by
   * {@link #getMemoryForMapTask()} and {@link #getMemoryForReduceTask()}
   * after converting them into bytes.
   *
   * @return Memory required to run a task of this job, in bytes.
   * @see #setMaxVirtualMemoryForTask(long)
   * @deprecated Use {@link #getMemoryForMapTask()} and
   *             {@link #getMemoryForReduceTask()}
   */
  @Deprecated
  public long getMaxVirtualMemoryForTask() {
    LOG.warn("getMaxVirtualMemoryForTask() is deprecated. " +
        "Instead use getMemoryForMapTask() and getMemoryForReduceTask()");
    long value = getLong(MAPRED_TASK_MAXVMEM_PROPERTY,
        Math.max(getMemoryForMapTask(), getMemoryForReduceTask()) * 1024 * 1024);
    return value;
  }

  /**
   * Set the maximum amount of memory any task of this job can use. See
   * {@link #MAPRED_TASK_MAXVMEM_PROPERTY}.
   * <p>
   * For backward compatibility, mapred.task.maxvmem is split into
   * mapreduce.map.memory.mb and mapreduce.reduce.memory.mb; each of the new
   * keys is set to mapred.task.maxvmem / (1024 * 1024), since the new values
   * are in MB.
   *
   * @param vmem Maximum amount of virtual memory in bytes any task of this
   *             job can use.
   * @see #getMaxVirtualMemoryForTask()
   * @deprecated Use {@link #setMemoryForMapTask(long mem)} and
   *             {@link #setMemoryForReduceTask(long mem)}
   */
  @Deprecated
  public void setMaxVirtualMemoryForTask(long vmem) {
    LOG.warn("setMaxVirtualMemoryForTask() is deprecated. " +
        "Instead use setMemoryForMapTask() and setMemoryForReduceTask()");
    if (vmem < 0) {
      throw new IllegalArgumentException(
          "Task memory allocation may not be < 0");
    }

    if (get(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY) == null) {
      setMemoryForMapTask(vmem / (1024 * 1024));    // Changing bytes to MB
      setMemoryForReduceTask(vmem / (1024 * 1024)); // Changing bytes to MB
    } else {
      this.setLong(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY, vmem);
    }
  }

  /**
   * @deprecated this method is deprecated and no longer in use.
   */
  @Deprecated
  public long getMaxPhysicalMemoryForTask() {
    LOG.warn("The API getMaxPhysicalMemoryForTask() is deprecated."
        + " Refer to the APIs getMemoryForMapTask() and"
        + " getMemoryForReduceTask() for details.");
    return -1;
  }
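  // Illustrative sketch (not part of the original class): what the deprecated
  // byte-based setter translates to. On a fresh configuration, 2 GB expressed
  // in bytes becomes 2048 MB for both task types, because
  // setMaxVirtualMemoryForTask divides by (1024 * 1024) before delegating to
  // the MB-based setters. "MyJob" is a hypothetical driver class.
  //
  //   JobConf job = new JobConf(new Configuration(), MyJob.class);
  //   job.setMaxVirtualMemoryForTask(2L * 1024 * 1024 * 1024); // 2 GB in bytes
  //   // equivalent (and preferred) modern form:
  //   //   job.setMemoryForMapTask(2048);
  //   //   job.setMemoryForReduceTask(2048);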
  /**
   * @deprecated this method is deprecated; the value set is ignored.
   */
  @Deprecated
  public void setMaxPhysicalMemoryForTask(long mem) {
    LOG.warn("The API setMaxPhysicalMemoryForTask() is deprecated."
        + " The value set is ignored. Refer to"
        + " setMemoryForMapTask() and setMemoryForReduceTask() for details.");
  }

  static String deprecatedString(String key) {
    return "The variable " + key + " is no longer used.";
  }

  private void checkAndWarnDeprecation() {
    if (get(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY) != null) {
      LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY)
          + " Instead use " + JobConf.MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY
          + " and " + JobConf.MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY);
    }
    if (get(JobConf.MAPRED_TASK_ULIMIT) != null) {
      LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_TASK_ULIMIT));
    }
    if (get(JobConf.MAPRED_MAP_TASK_ULIMIT) != null) {
      LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_MAP_TASK_ULIMIT));
    }
    if (get(JobConf.MAPRED_REDUCE_TASK_ULIMIT) != null) {
      LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_REDUCE_TASK_ULIMIT));
    }
  }

  // Combine the admin-provided and user-provided java opts for the given
  // task type, with the admin opts placed first.
  private String getConfiguredTaskJavaOpts(TaskType taskType) {
    String userJavaOpts;
    String adminJavaOpts;
    if (taskType == TaskType.MAP) {
      userJavaOpts = get(MAPRED_MAP_TASK_JAVA_OPTS,
          get(MAPRED_TASK_JAVA_OPTS, DEFAULT_MAPRED_TASK_JAVA_OPTS));
      adminJavaOpts = get(MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,
          MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);
    } else {
      userJavaOpts = get(MAPRED_REDUCE_TASK_JAVA_OPTS,
          get(MAPRED_TASK_JAVA_OPTS, DEFAULT_MAPRED_TASK_JAVA_OPTS));
      adminJavaOpts = get(MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,
          MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);
    }
    return adminJavaOpts + " " + userJavaOpts;
  }
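  // Illustrative sketch (not part of the original class): how the heap-size
  // fallback behaves. If the configured opts carry no -Xmx flag, the opts
  // returned by getTaskJavaOpts() (a framework-internal, @Private method)
  // gain one derived from the container size and the
  // MRJobConfig.HEAP_MEMORY_MB_RATIO setting; with a 2048 MB container and an
  // example ratio of 0.8, ceil(2048 * 0.8) = 1639, i.e. "-Xmx1639m".
  // "MyJob" is a hypothetical driver class.
  //
  //   JobConf job = new JobConf(new Configuration(), MyJob.class);
  //   job.set(MRJobConfig.MAP_MEMORY_MB, "2048");
  //   job.setFloat(MRJobConfig.HEAP_MEMORY_MB_RATIO, 0.8f);
  //   job.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS,
  //       "-Djava.net.preferIPv4Stack=true");
  //   String opts = job.getTaskJavaOpts(TaskType.MAP); // ends with "-Xmx1639m"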
  @Private
  public String getTaskJavaOpts(TaskType taskType) {
    String javaOpts = getConfiguredTaskJavaOpts(taskType);

    if (!javaOpts.contains("-Xmx")) {
      float heapRatio = getFloat(MRJobConfig.HEAP_MEMORY_MB_RATIO,
          MRJobConfig.DEFAULT_HEAP_MEMORY_MB_RATIO);

      if (heapRatio > 1.0f || heapRatio < 0) {
        LOG.warn("Invalid value for " + MRJobConfig.HEAP_MEMORY_MB_RATIO
            + ", using the default.");
        heapRatio = MRJobConfig.DEFAULT_HEAP_MEMORY_MB_RATIO;
      }

      int taskContainerMb = getMemoryRequired(taskType);
      int taskHeapSize = (int) Math.ceil(taskContainerMb * heapRatio);
      String xmxArg = String.format("-Xmx%dm", taskHeapSize);
      LOG.info("Task java-opts do not specify heap size. Setting task attempt"
          + " jvm max heap size to " + xmxArg);
      javaOpts += " " + xmxArg;
    }
    return javaOpts;
  }

  /**
   * Parse the maximum heap size from the java opts as specified by the
   * -Xmx option. Format: -Xmx&lt;size&gt;[g|G|m|M|k|K]
   *
   * @param javaOpts String to parse to read maximum heap size
   * @return Maximum heap size in MB or -1 if not specified
   */
  @Private
  @VisibleForTesting
  public static int parseMaximumHeapSizeMB(String javaOpts) {
    // Find the last matching -Xmx following word boundaries
    Matcher m = JAVA_OPTS_XMX_PATTERN.matcher(javaOpts);
    if (m.matches()) {
      long size = Long.parseLong(m.group(1));
      if (size <= 0) {
        return -1;
      }
      if (m.group(2).isEmpty()) {
        // -Xmx specified in bytes
        return (int) (size / (1024 * 1024));
      }
      char unit = m.group(2).charAt(0);
      switch (unit) {
      case 'g':
      case 'G':
        // -Xmx specified in GB
        return (int) (size * 1024);
      case 'm':
      case 'M':
        // -Xmx specified in MB
        return (int) size;
      case 'k':
      case 'K':
        // -Xmx specified in KB
        return (int) (size / 1024);
      }
    }
    // -Xmx not specified
    return -1;
  }

  private int getMemoryRequiredHelper(
      String configName, int defaultValue, int heapSize, float heapRatio) {
    int memory = getInt(configName, -1);
    if (memory <= 0) {
      if (heapSize > 0) {
        memory = (int) Math.ceil(heapSize / heapRatio);
        LOG.info("Figured value for " + configName + " from javaOpts");
      } else {
        memory = defaultValue;
      }
    }
    return memory;
  }

  @Private
  public int getMemoryRequired(TaskType taskType) {
    int memory = 1024;
    int heapSize = parseMaximumHeapSizeMB(getConfiguredTaskJavaOpts(taskType));
    float heapRatio = getFloat(MRJobConfig.HEAP_MEMORY_MB_RATIO,
        MRJobConfig.DEFAULT_HEAP_MEMORY_MB_RATIO);
    if (taskType == TaskType.MAP) {
      return getMemoryRequiredHelper(MRJobConfig.MAP_MEMORY_MB,
          MRJobConfig.DEFAULT_MAP_MEMORY_MB, heapSize, heapRatio);
    } else if (taskType == TaskType.REDUCE) {
      return getMemoryRequiredHelper(MRJobConfig.REDUCE_MEMORY_MB,
          MRJobConfig.DEFAULT_REDUCE_MEMORY_MB, heapSize, heapRatio);
    } else {
      return memory;
    }
  }

  /* For debugging. Dump the configuration to standard output in XML format. */
  public static void main(String[] args) throws Exception {
    new JobConf(new Configuration()).writeXml(System.out);
  }
}
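// Illustrative sketch (not part of the original file): expected results of the
// -Xmx parsing helper above, useful when reasoning about heap-derived memory.
// The values follow the unit handling in parseMaximumHeapSizeMB:
//
//   JobConf.parseMaximumHeapSizeMB("-Xmx2g")            == 2048  // GB -> MB
//   JobConf.parseMaximumHeapSizeMB("-server -Xmx512m")  == 512   // MB as-is
//   JobConf.parseMaximumHeapSizeMB("-Xmx1048576k")      == 1024  // KB -> MB
//   JobConf.parseMaximumHeapSizeMB("-Xmx268435456")     == 256   // bytes -> MB
//   JobConf.parseMaximumHeapSizeMB("-verbose:gc")       == -1    // no -Xmx set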