org.apache.tez.mapreduce.hadoop.MRHelpers.java Source code

Introduction

Here is the source code for org.apache.tez.mapreduce.hadoop.MRHelpers.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.hadoop;

import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.TezYARNUtils;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.mapreduce.combine.MRCombiner;
import org.apache.tez.mapreduce.partition.MRPartitioner;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;

/**
 * Helper methods for frameworks that are moving from MapReduce to Tez but still have to operate
 * on existing MapReduce configurations.
 */
@Public
@Evolving
public class MRHelpers {

    private static final Log LOG = LogFactory.getLog(MRHelpers.class);

    /**
     * Rewrites MapReduce configuration keys into their Tez equivalents. The supplied
     * configuration object is modified in place.
     *
     * <p>Intended for frameworks which depend on an existing MapReduce configuration rather than
     * building a Tez configuration of their own.
     *
     * @param conf MapReduce based configuration to be translated to Tez
     */
    public static void translateMRConfToTez(Configuration conf) {
        convertVertexConfToTez(conf);
    }

    /**
     * Marks the configuration as using either the new (mapreduce) or the old (mapred) API and
     * verifies that keys belonging to the other mode are absent.
     *
     * <p>The flag {@code mapred.mapper.new-api} is only written when it is not already set; in
     * that case the new API is chosen exactly when {@code mapred.mapper.class} is absent.
     *
     * <p>Call this once the configuration has been fully set up.
     *
     * @param conf configuration to inspect and update
     * @throws TezUncheckedException wrapping the {@link IOException} raised when a key that is
     *         incompatible with the selected mode is present
     */
    public static void configureMRApiUsage(Configuration conf) {
        final String newApiFlag = "mapred.mapper.new-api";
        final String oldMapperClass = "mapred.mapper.class";

        conf.setBooleanIfUnset(newApiFlag, conf.get(oldMapperClass) == null);

        final boolean newApi = conf.getBoolean(newApiFlag, false);
        final String mode = newApi ? "new map API" : "map compatability";
        // Keys that must NOT be present for the selected mode, checked in this order.
        final String[] incompatibleKeys = newApi
                ? new String[] {"mapred.input.format.class", oldMapperClass}
                : new String[] {MRJobConfig.INPUT_FORMAT_CLASS_ATTR, MRJobConfig.MAP_CLASS_ATTR};

        try {
            for (String key : incompatibleKeys) {
                ensureNotSet(conf, key, mode);
            }
        } catch (IOException e) {
            throw new TezUncheckedException(e);
        }
    }

    /**
     * Runs the three translation steps on a single configuration: fill in the map output
     * key/value classes, move directly mapped keys, then choose partitioner and combiner.
     */
    private static void convertVertexConfToTez(Configuration vertexConf) {
        // The configuration acts as its own base; no stage name is known at this point.
        setStageKeysFromBaseConf(vertexConf, vertexConf, "unknown");
        processDirectConversion(vertexConf);
        setupMRComponents(vertexConf);
    }

    /**
     * Installs the MR partitioner and, when a MapReduce combiner is configured, the MR combiner
     * as the Tez runtime components. Tez values that are already present are left untouched.
     */
    private static void setupMRComponents(Configuration conf) {
        if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS) == null) {
            conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS, MRPartitioner.class.getName());
        }

        if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS) != null) {
            // An explicit Tez combiner wins.
            return;
        }

        // Which MR combiner key is relevant depends on the API flavour in use.
        final String mrCombinerKey = conf.getBoolean("mapred.mapper.new-api", false)
                ? MRJobConfig.COMBINE_CLASS_ATTR
                : "mapred.combiner.class";
        if (conf.get(mrCombinerKey) != null) {
            conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS, MRCombiner.class.getName());
        }
    }

    /**
     * Copies the map output key and value class from the base configuration into the stage
     * configuration when the stage defines neither the Tez key nor the MapReduce key for them.
     * Only this explicit pair of keys is handled, not the whole base configuration.
     */
    private static void setStageKeysFromBaseConf(Configuration conf, Configuration baseConf, String stage) {
        // Built at most once, and only if one of the two lookups below actually needs it.
        JobConf baseJobConf = null;

        // Explicit Tez settings are never overridden. Note that a key class which is set without
        // a matching comparator, while the types differ, will break the job.
        final boolean keyClassMissing = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS) == null
                && conf.get(MRJobConfig.MAP_OUTPUT_KEY_CLASS) == null;
        if (keyClassMissing) {
            baseJobConf = new JobConf(baseConf);
            conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, baseJobConf.getMapOutputKeyClass().getName());
            logStageKeyFromBase(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, stage);
        }

        final boolean valueClassMissing = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS) == null
                && conf.get(MRJobConfig.MAP_OUTPUT_VALUE_CLASS) == null;
        if (valueClassMissing) {
            if (baseJobConf == null) {
                baseJobConf = new JobConf(baseConf);
            }
            conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, baseJobConf.getMapOutputValueClass().getName());
            logStageKeyFromBase(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, stage);
        }
    }

    /** Debug-logs the value a stage level key received from the job level configuration. */
    private static void logStageKeyFromBase(Configuration conf, String key, String stage) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Setting " + key + " for stage: " + stage
                    + " based on job level configuration. Value: "
                    + conf.get(key));
        }
    }

    /**
     * Moves every MapReduce key listed in {@code DeprecatedKeys.getMRToTezRuntimeParamMap()} to
     * its Tez counterpart. The MapReduce key is always unset; its value is written to the Tez
     * key only when the Tez key had no value beforehand.
     */
    private static void processDirectConversion(Configuration conf) {
        for (Map.Entry<String, String> mapping : DeprecatedKeys.getMRToTezRuntimeParamMap().entrySet()) {
            final String mrKey = mapping.getKey();
            final String tezKey = mapping.getValue();

            final String mrValue = conf.get(mrKey);
            if (mrValue == null) {
                continue;
            }

            // TODO Deprecation reason does not seem to reflect in the config ?
            // Ordering matters for keys which are themselves deprecated: unset removes the
            // deprecated key together with all of its variants, so both values are read first.
            final String tezValue = conf.get(tezKey);
            conf.unset(mrKey);
            if (tezValue == null) {
                conf.set(tezKey, mrValue, "TRANSLATED_TO_TEZ");
            }

            if (LOG.isDebugEnabled()) {
                LOG.debug("Config: mr(unset):" + mrKey + ", mr initial value=" + mrValue + ", tez:"
                        + tezKey + "=" + conf.get(tezKey));
            }
        }
    }

    /** Returns the configured map or reduce log level, or the JobConf default when unset. */
    private static String getChildLogLevel(Configuration conf, boolean isMap) {
        final String levelKey = isMap ? MRJobConfig.MAP_LOG_LEVEL : MRJobConfig.REDUCE_LOG_LEVEL;
        return conf.get(levelKey, JobConf.DEFAULT_LOG_LEVEL.toString());
    }

    /**
     * Fails when {@code attr} has a value in {@code conf}.
     *
     * @throws IOException naming the offending attribute and the mode given in {@code msg}
     */
    private static void ensureNotSet(Configuration conf, String attr, String msg) throws IOException {
        if (conf.get(attr) == null) {
            return;
        }
        throw new IOException(attr + " is incompatible with " + msg + " mode.");
    }

    /**
     * Renders the log4j system properties for a map or reduce task as one string in which every
     * property is followed by a single space.
     */
    private static String getLog4jCmdLineProperties(Configuration conf, boolean isMap) {
        final String logLevel = getChildLogLevel(conf, isMap);
        final Vector<String> properties = new Vector<String>(4);
        TezUtils.addLog4jSystemProperties(logLevel, properties);

        final StringBuilder rendered = new StringBuilder();
        for (String property : properties) {
            rendered.append(property);
            rendered.append(" ");
        }
        return rendered.toString();
    }

    /**
     * Builds JVM options from the MapReduce AM java options: the trimmed admin options, a single
     * space, then the trimmed user options.
     *
     * <p>Only meant for frameworks which neither set up their own java options nor rely on the
     * defaults specified by Tez, and instead want to reuse options that may already have been
     * configured for an MR AppMaster.
     *
     * @param conf Configuration to be used to extract JVM opts specific info
     * @return JAVA_OPTS string to be used in launching the JVM
     */
    public static String getJavaOptsForMRAM(Configuration conf) {
        final String adminOpts = conf.get(MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS,
                MRJobConfig.DEFAULT_MR_AM_ADMIN_COMMAND_OPTS);
        final String userOpts = conf.get(MRJobConfig.MR_AM_COMMAND_OPTS,
                MRJobConfig.DEFAULT_MR_AM_COMMAND_OPTS);

        final StringBuilder opts = new StringBuilder();
        opts.append(adminOpts.trim()).append(" ").append(userOpts.trim());
        return opts.toString();
    }

    /**
     * Builds JVM options from the MapReduce mapper java options.
     *
     * <p>Only meant for frameworks which do not set up their own java options and would like to
     * fall back to options that may already be configured for Hadoop MapReduce mappers.
     *
     * <p>Uses mapreduce.admin.map.child.java.opts, mapreduce.map.java.opts and
     * mapreduce.map.log.level from config to generate the opts.
     *
     * @param conf Configuration to be used to extract JVM opts specific info
     * @return JAVA_OPTS string to be used in launching the JVM
     */
    @SuppressWarnings("deprecation")
    public static String getJavaOptsForMRMapper(Configuration conf) {
        return buildTaskJavaOpts(conf, MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS, MRJobConfig.MAP_JAVA_OPTS, true);
    }

    /**
     * Builds JVM options from the MapReduce reducer java options.
     *
     * <p>Only meant for frameworks which do not set up their own java options and would like to
     * fall back to options that may already be configured for Hadoop MapReduce reducers.
     *
     * <p>Uses mapreduce.admin.reduce.child.java.opts, mapreduce.reduce.java.opts
     * and mapreduce.reduce.log.level from config to generate the opts.
     *
     * @param conf Configuration to be used to extract JVM opts specific info
     * @return JAVA_OPTS string to be used in launching the JVM
     */
    @SuppressWarnings("deprecation")
    public static String getJavaOptsForMRReducer(Configuration conf) {
        return buildTaskJavaOpts(conf, MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS, MRJobConfig.REDUCE_JAVA_OPTS,
                false);
    }

    /**
     * Shared assembly for mapper and reducer JVM options: trimmed admin options, trimmed user
     * options and the log4j properties, separated by single spaces. When the task specific user
     * key is unset, the generic task java opts (or their default) are used instead.
     */
    @SuppressWarnings("deprecation")
    private static String buildTaskJavaOpts(Configuration conf, String adminOptsKey, String userOptsKey,
            boolean isMap) {
        final String adminOpts = conf.get(adminOptsKey, MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);

        final String genericTaskOpts = conf.get(JobConf.MAPRED_TASK_JAVA_OPTS,
                JobConf.DEFAULT_MAPRED_TASK_JAVA_OPTS);
        final String userOpts = conf.get(userOptsKey, genericTaskOpts);

        return adminOpts.trim() + " " + userOpts.trim() + " " + getLog4jCmdLineProperties(conf, isMap);
    }

    /**
     * Reads the container resource requirements that would otherwise have been used when running
     * a Hadoop MapReduce mapper.
     *
     * <p>Only meant for frameworks which do not set up their own
     * {@link org.apache.hadoop.yarn.api.records.Resource} and would like to fall back to
     * resources that may already be configured for Hadoop MapReduce mappers.
     *
     * <p>Memory and vcores are read from {@code MRJobConfig.MAP_MEMORY_MB} and
     * {@code MRJobConfig.MAP_CPU_VCORES}, with the matching MRJobConfig defaults.
     *
     * @param conf Configuration with MR specific settings used to extract information from
     * @return Resource object used to define requirements for containers running Map tasks
     */
    public static Resource getResourceForMRMapper(Configuration conf) {
        final int memoryMb = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
        final int vcores = conf.getInt(MRJobConfig.MAP_CPU_VCORES, MRJobConfig.DEFAULT_MAP_CPU_VCORES);
        return Resource.newInstance(memoryMb, vcores);
    }

    /**
     * Reads the container resource requirements that would otherwise have been used when running
     * a Hadoop MapReduce reducer.
     *
     * <p>Only meant for frameworks which do not set up their own
     * {@link org.apache.hadoop.yarn.api.records.Resource} and would like to fall back to
     * resources that may already be configured for Hadoop MapReduce reducers.
     *
     * <p>Uses mapreduce.reduce.memory.mb and mapreduce.reduce.cpu.vcores from the
     * provided configuration.
     *
     * @param conf Configuration with MR specific settings used to extract information from
     * @return Resource object used to define requirements for containers running Reduce tasks
     */
    public static Resource getResourceForMRReducer(Configuration conf) {
        final int memoryMb = conf.getInt(MRJobConfig.REDUCE_MEMORY_MB, MRJobConfig.DEFAULT_REDUCE_MEMORY_MB);
        final int vcores = conf.getInt(MRJobConfig.REDUCE_CPU_VCORES, MRJobConfig.DEFAULT_REDUCE_CPU_VCORES);
        return Resource.newInstance(memoryMb, vcores);
    }

    /**
     * Sets up the classpath and other environment variables from the values configured for MR
     * mappers or reducers. The steps are applied to {@code environment} in a fixed order: shell,
     * working directory on LD_LIBRARY_PATH, admin supplied variables, user supplied variables and
     * finally the root logger level.
     *
     * @param conf        Configuration to retrieve settings from
     * @param environment Environment to update
     * @param isMap       Whether task is a map or reduce task
     */
    public static void updateEnvBasedOnMRTaskEnv(Configuration conf, Map<String, String> environment,
            boolean isMap) {
        // Shell
        final String shell = conf.get(MRJobConfig.MAPRED_ADMIN_USER_SHELL, MRJobConfig.DEFAULT_SHELL);
        environment.put(Environment.SHELL.name(), shell);

        // The working directory goes onto LD_LIBRARY_PATH before anything else is added.
        TezYARNUtils.addToEnvironment(environment, Environment.LD_LIBRARY_PATH.name(), Environment.PWD.$(),
                File.pathSeparator);

        // Variables passed by the admin
        final String adminEnv = conf.get(MRJobConfig.MAPRED_ADMIN_USER_ENV,
                MRJobConfig.DEFAULT_MAPRED_ADMIN_USER_ENV);
        TezYARNUtils.appendToEnvFromInputString(environment, adminEnv, File.pathSeparator);

        // Variables passed by the user, taken from the map or the reduce specific key
        final String userEnvKey = isMap ? MRJobConfig.MAP_ENV : MRJobConfig.REDUCE_ENV;
        final String userEnv = conf.get(userEnvKey, "");
        TezYARNUtils.appendToEnvFromInputString(environment, userEnv, File.pathSeparator);

        // Logging level for the task
        final String rootLogger = getChildLogLevel(conf, isMap) + ",CLA";
        environment.put("HADOOP_ROOT_LOGGER", rootLogger);
    }

    /**
     * Sets up environment variables from the values configured for the MR AM. The admin supplied
     * value is applied first, then the user supplied one. Both are read without a default and
     * handed to {@code TezYARNUtils.appendToEnvFromInputString} as returned by the configuration.
     *
     * @param conf Configuration from which to extract information
     * @param environment Environment map to update
     */
    public static void updateEnvBasedOnMRAMEnv(Configuration conf, Map<String, String> environment) {
        final String[] envKeys = {MRJobConfig.MR_AM_ADMIN_USER_ENV, MRJobConfig.MR_AM_ENV};
        for (String envKey : envKeys) {
            TezYARNUtils.appendToEnvFromInputString(environment, conf.get(envKey), File.pathSeparator);
        }
    }

}