org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask.java. MapredLocalTask is the Hive task that executes local (client-side) work, such as building the hash tables for a map join before the join itself runs on the cluster; a short usage sketch follows the listing.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec.mr;

import static org.apache.hadoop.hive.ql.exec.mr.MapRedTask.HADOOP_CLIENT_OPTS;
import static org.apache.hadoop.hive.ql.exec.mr.MapRedTask.HADOOP_MEM_KEY;
import static org.apache.hadoop.hive.ql.exec.mr.MapRedTask.HADOOP_OPTS_KEY;
import static org.apache.hadoop.hive.ql.exec.mr.MapRedTask.HIVE_SYS_PROP;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.LogUtils;
import org.apache.hadoop.hive.common.io.CachingPrintStream;
import org.apache.hadoop.hive.common.metrics.common.Metrics;
import org.apache.hadoop.hive.common.metrics.common.MetricsConstant;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.BucketMatcher;
import org.apache.hadoop.hive.ql.exec.FetchOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.SecureCmdDoAs;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mapjoin.MapJoinMemoryExhaustionError;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.shims.Utils;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hive.common.util.StreamPrinter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MapredLocalTask represents any local work (i.e. client-side work) that Hive needs to
 * execute, e.g. building the hash tables for a map join on the client before the join
 * itself is executed on the cluster.
 *
 * Depending on the configuration, the work is either run directly in this process or
 * handed off to a child JVM: in that case this task only generates a command line, and
 * ExecDriver in the child process is what finally drives processing of the records.
 */
public class MapredLocalTask extends Task<MapredLocalWork> implements Serializable {

    private static final long serialVersionUID = 1L;

    private final Map<String, FetchOperator> fetchOperators = new HashMap<String, FetchOperator>();
    protected HadoopJobExecHelper jobExecHelper;
    private JobConf job;
    public static transient final Logger l4j = LoggerFactory.getLogger(MapredLocalTask.class);
    static final String HIVE_LOCAL_TASK_CHILD_OPTS_KEY = "HIVE_LOCAL_TASK_CHILD_OPTS";
    public static MemoryMXBean memoryMXBean;
    private static final Logger LOG = LoggerFactory.getLogger(MapredLocalTask.class);

    // not sure we need this exec context; but all the operators in the work
    // will pass this context through
    private ExecMapperContext execContext = null;

    private Process executor;
    private SecureCmdDoAs secureDoAs;

    public MapredLocalTask() {
        super();
    }

    public MapredLocalTask(MapredLocalWork plan, JobConf job, boolean isSilent) throws HiveException {
        setWork(plan);
        this.job = job;
        console = new LogHelper(LOG, isSilent);
    }

    public void setExecContext(ExecMapperContext execContext) {
        this.execContext = execContext;
    }

    @Override
    public void updateTaskMetrics(Metrics metrics) {
        metrics.incrementCounter(MetricsConstant.HIVE_MR_TASKS);
    }

    @Override
    public void initialize(QueryState queryState, QueryPlan queryPlan, DriverContext driverContext,
            CompilationOpContext opContext) {
        super.initialize(queryState, queryPlan, driverContext, opContext);
        job = new JobConf(conf, ExecDriver.class);
        execContext = new ExecMapperContext(job);
        //we don't use the HadoopJobExecHooks for local tasks
        this.jobExecHelper = new HadoopJobExecHelper(job, console, this, null);
    }

    public static String now() {
        Calendar cal = Calendar.getInstance();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return sdf.format(cal.getTime());
    }

    @Override
    public boolean requireLock() {
        return true;
    }

    @Override
    public int execute(DriverContext driverContext) {
        if (conf.getBoolVar(HiveConf.ConfVars.SUBMITLOCALTASKVIACHILD)) {
            // send task off to another jvm
            return executeInChildVM(driverContext);
        } else {
            // execute in process
            return executeInProcess(driverContext);
        }
    }

    /**
     * Runs the local work in a separate child JVM: serializes the plan to a local
     * file, builds a "hadoop jar ... ExecDriver -localtask" command line, sets up
     * the child's environment (memory, HADOOP_OPTS, user name, classpath, debug and
     * kerberos settings) and returns the child process's exit code.
     */
    public int executeInChildVM(DriverContext driverContext) {
        // execute in child jvm
        try {
            // generate the cmd line to run in the child jvm
            Context ctx = driverContext.getCtx();
            String hiveJar = conf.getJar();

            String hadoopExec = conf.getVar(HiveConf.ConfVars.HADOOPBIN);
            conf.setVar(ConfVars.HIVEADDEDJARS, Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR));
            // write out the plan to a local file
            Path planPath = new Path(ctx.getLocalTmpPath(), "plan.xml");
            MapredLocalWork plan = getWork();
            LOG.info("Generating plan file " + planPath.toString());

            OutputStream out = null;
            try {
                out = FileSystem.getLocal(conf).create(planPath);
                SerializationUtilities.serializePlan(plan, out);
                out.close();
                out = null;
            } finally {
                IOUtils.closeQuietly(out);
            }

            String isSilent = "true".equalsIgnoreCase(System.getProperty("test.silent")) ? "-nolog" : "";

            String jarCmd;

            jarCmd = hiveJar + " " + ExecDriver.class.getName();
            String hiveConfArgs = ExecDriver.generateCmdLine(conf, ctx);
            String cmdLine = hadoopExec + " jar " + jarCmd + " -localtask -plan " + planPath.toString() + " "
                    + isSilent + " " + hiveConfArgs;

            String workDir = (new File(".")).getCanonicalPath();
            String files = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);

            if (!files.isEmpty()) {
                cmdLine = cmdLine + " -files " + files;

                workDir = ctx.getLocalTmpPath().toUri().getPath();

                if (!(new File(workDir)).mkdir()) {
                    throw new IOException("Cannot create tmp working dir: " + workDir);
                }

                for (String f : StringUtils.split(files, ',')) {
                    Path p = new Path(f);
                    String target = p.toUri().getPath();
                    String link = workDir + Path.SEPARATOR + p.getName();
                    if (FileUtil.symLink(target, link) != 0) {
                        throw new IOException("Cannot link to added file: " + target + " from: " + link);
                    }
                }
            }

            // Inherit Java system variables
            String hadoopOpts;
            StringBuilder sb = new StringBuilder();
            Properties p = System.getProperties();
            for (String element : HIVE_SYS_PROP) {
                if (p.containsKey(element)) {
                    sb.append(" -D" + element + "=" + p.getProperty(element));
                }
            }
            hadoopOpts = sb.toString();
            // Inherit the environment variables
            String[] env;
            Map<String, String> variables = new HashMap<String, String>(System.getenv());
            // The user can specify the hadoop memory

            // if ("local".equals(conf.getVar(HiveConf.ConfVars.HADOOPJT))) {
            // if we are running in local mode - then the amount of memory used
            // by the child jvm can no longer default to the memory used by the
            // parent jvm
            // int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
            int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
            if (hadoopMem == 0) {
                // remove env var that would default child jvm to use parent's memory
                // as default. child jvm would use default memory for a hadoop client
                variables.remove(HADOOP_MEM_KEY);
            } else {
                // user specified the memory for local mode hadoop run
                console.printInfo(" set heap size\t" + hadoopMem + "MB");
                variables.put(HADOOP_MEM_KEY, String.valueOf(hadoopMem));
            }
            // } else {
            // nothing to do - we are not running in local mode - only submitting
            // the job via a child process. in this case it's appropriate that the
            // child jvm use the same memory as the parent jvm

            // }
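            // Illustrative note: with HIVEHADOOPMAXMEM set to e.g. 1024, the child JVM
            // is launched with HADOOP_MEM_KEY=1024 (MB); with a value of 0 the variable
            // is removed so the hadoop client default heap size applies.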

            //Set HADOOP_USER_NAME env variable for child process, so that
            // it also runs with hadoop permissions for the user the job is running as
            // This will be used by hadoop only in unsecure(/non kerberos) mode
            String endUserName = Utils.getUGI().getShortUserName();
            LOG.debug("setting HADOOP_USER_NAME\t" + endUserName);
            variables.put("HADOOP_USER_NAME", endUserName);

            if (variables.containsKey(HADOOP_OPTS_KEY)) {
                variables.put(HADOOP_OPTS_KEY, variables.get(HADOOP_OPTS_KEY) + hadoopOpts);
            } else {
                variables.put(HADOOP_OPTS_KEY, hadoopOpts);
            }

            //For Windows OS, we need to pass HIVE_HADOOP_CLASSPATH Java parameter while starting
            //Hiveserver2 using "-hiveconf hive.hadoop.classpath=%HIVE_LIB%". This is to combine path(s).
            if (HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH) != null) {
                if (variables.containsKey("HADOOP_CLASSPATH")) {
                    variables.put("HADOOP_CLASSPATH", variables.get("HADOOP_CLASSPATH") + ";"
                            + HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
                } else {
                    variables.put("HADOOP_CLASSPATH",
                            HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
                }
            }

            if (variables.containsKey(MapRedTask.HIVE_DEBUG_RECURSIVE)) {
                MapRedTask.configureDebugVariablesForChildJVM(variables);
            }

            if (UserGroupInformation.isSecurityEnabled() && UserGroupInformation.isLoginKeytabBased()) {
                //If kerberos security is enabled, and HS2 doAs is enabled,
                // then additional params need to be set so that the command is run as
                // intended user
                secureDoAs = new SecureCmdDoAs(conf);
                secureDoAs.addEnv(variables);
            }

            // If HIVE_LOCAL_TASK_CHILD_OPTS is set, child VM environment setting
            // HADOOP_CLIENT_OPTS will be replaced with HIVE_LOCAL_TASK_CHILD_OPTS.
            // HADOOP_OPTS is updated too since HADOOP_CLIENT_OPTS is appended
            // to HADOOP_OPTS in most cases. This way, the local task JVM can
            // have different settings from those of HiveServer2.
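            // Illustrative example (the value is an assumption, not from this file):
            // exporting HIVE_LOCAL_TASK_CHILD_OPTS="-Xmx2g" in the HiveServer2
            // environment makes the child hadoop command see "-Xmx2g" as its
            // HADOOP_CLIENT_OPTS, typically capping the local task heap at 2 GB
            // without touching HiveServer2's own client options.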
            if (variables.containsKey(HIVE_LOCAL_TASK_CHILD_OPTS_KEY)) {
                String childOpts = variables.get(HIVE_LOCAL_TASK_CHILD_OPTS_KEY);
                if (childOpts == null) {
                    childOpts = "";
                }
                String clientOpts = variables.put(HADOOP_CLIENT_OPTS, childOpts);
                String tmp = variables.get(HADOOP_OPTS_KEY);
                if (tmp != null && !StringUtils.isBlank(clientOpts)) {
                    tmp = tmp.replace(clientOpts, childOpts);
                    variables.put(HADOOP_OPTS_KEY, tmp);
                }
            }

            env = new String[variables.size()];
            int pos = 0;
            for (Map.Entry<String, String> entry : variables.entrySet()) {
                String name = entry.getKey();
                String value = entry.getValue();
                env[pos++] = name + "=" + value;
                LOG.debug("Setting env: " + name + "=" + LogUtils.maskIfPassword(name, value));
            }

            LOG.info("Executing: " + cmdLine);

            // Run ExecDriver in another JVM
            executor = Runtime.getRuntime().exec(cmdLine, env, new File(workDir));

            CachingPrintStream errPrintStream = new CachingPrintStream(System.err);

            StreamPrinter outPrinter = new StreamPrinter(executor.getInputStream(), null, System.out);
            StreamPrinter errPrinter = new StreamPrinter(executor.getErrorStream(), null, errPrintStream);

            outPrinter.start();
            errPrinter.start();

            int exitVal = jobExecHelper.progressLocal(executor, getId());

            // wait for stream threads to finish
            outPrinter.join();
            errPrinter.join();

            if (exitVal != 0) {
                LOG.error("Execution failed with exit status: " + exitVal);
                if (SessionState.get() != null) {
                    SessionState.get().addLocalMapRedErrors(getId(), errPrintStream.getOutput());
                }
            } else {
                LOG.info("Execution completed successfully");
            }

            return exitVal;
        } catch (Exception e) {
            LOG.error("Exception: ", e);
            return (1);
        } finally {
            if (secureDoAs != null) {
                secureDoAs.close();
            }
        }
    }

    /**
     * Runs the local work inside the current JVM by initializing the fetch and
     * forward operators and pushing rows through them. Returns 0 on success, -1 if
     * there is no local work, 3 when memory is exhausted and 2 on any other error.
     */
    public int executeInProcess(DriverContext driverContext) {
        // check the local work
        if (work == null) {
            return -1;
        }

        if (execContext == null) {
            execContext = new ExecMapperContext(job);
        }

        memoryMXBean = ManagementFactory.getMemoryMXBean();
        long startTime = System.currentTimeMillis();
        console.printInfo(
                Utilities.now() + "\tStarting to launch local task to process map join;\tmaximum memory = "
                        + memoryMXBean.getHeapMemoryUsage().getMax());
        execContext.setJc(job);
        // set the local work, so all the operators can get this context
        execContext.setLocalWork(work);
        try {
            startForward(null);
            long currentTime = System.currentTimeMillis();
            long elapsed = currentTime - startTime;
            console.printInfo(
                    Utilities.now() + "\tEnd of local task; Time Taken: " + Utilities.showTime(elapsed) + " sec.");
        } catch (Throwable throwable) {
            if (throwable instanceof OutOfMemoryError || (throwable instanceof MapJoinMemoryExhaustionError)) {
                l4j.error("Hive Runtime Error: Map local work exhausted memory", throwable);
                return 3;
            } else {
                l4j.error("Hive Runtime Error: Map local work failed", throwable);
                return 2;
            }
        }
        return 0;
    }

    /**
     * Initializes the operators and forwards all rows. If the work is input-file
     * change sensitive (bucket map join), this is done once per matching bucket
     * file of the big table.
     */
    public void startForward(String bigTableBucket) throws Exception {
        boolean inputFileChangeSenstive = work.getInputFileChangeSensitive();
        initializeOperators(new HashMap<FetchOperator, JobConf>());
        // for each of the big table's buckets, call startForward
        if (inputFileChangeSenstive) {
            for (Map<String, List<String>> bigTableBucketFiles : work.getBucketMapjoinContext()
                    .getAliasBucketFileNameMapping().values()) {
                if (bigTableBucket == null) {
                    for (String bigTableBucketFile : bigTableBucketFiles.keySet()) {
                        startForward(inputFileChangeSenstive, bigTableBucketFile);
                    }
                } else if (bigTableBucketFiles.keySet().contains(bigTableBucket)) {
                    startForward(inputFileChangeSenstive, bigTableBucket);
                }
            }
        } else {
            startForward(inputFileChangeSenstive, null);
        }
    }

    private void startForward(boolean inputFileChangeSenstive, String bigTableBucket) throws Exception {
        for (Operator<?> source : work.getAliasToWork().values()) {
            source.reset();
        }
        if (inputFileChangeSenstive) {
            execContext.setCurrentBigBucketFile(bigTableBucket);
        }
        for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
            String alias = entry.getKey();
            FetchOperator fetchOp = entry.getValue();

            if (inputFileChangeSenstive) {
                fetchOp.clearFetchContext();
                setUpFetchOpContext(fetchOp, alias, bigTableBucket);
            }

            // get the root operator
            Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
            // walk through the operator tree
            while (!forwardOp.getDone()) {
                InspectableObject row = fetchOp.getNextRow();
                if (row == null) {
                    break;
                }
                forwardOp.process(row.o, 0);
            }
            forwardOp.flush();
        }
        for (Operator<?> source : work.getAliasToWork().values()) {
            source.close(false);
        }
    }

    /**
     * Creates a FetchOperator for every alias in the local work, pushing down column
     * projections and filters, and initializes the corresponding forward operator
     * trees with the fetch operators' output object inspectors.
     */
    private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap) throws HiveException {
        for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : work.getAliasToWork().entrySet()) {
            LOG.debug("initializeOperators: " + entry.getKey() + ", children = "
                    + entry.getValue().getChildOperators());
        }
        // create a fetch operator for every alias that has fetch work
        for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
            if (entry.getValue() == null) {
                continue;
            }
            JobConf jobClone = new JobConf(job);

            TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
            // push down projections
            ColumnProjectionUtils.appendReadColumns(jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(),
                    ts.getNeededNestedColumnPaths());
            // push down filters
            HiveInputFormat.pushFilters(jobClone, ts);

            AcidUtils.setTransactionalTableScan(jobClone, ts.getConf().isAcidTable());
            AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().getAcidOperationalProperties());

            // create a fetch operator
            FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
            fetchOpJobConfMap.put(fetchOp, jobClone);
            fetchOperators.put(entry.getKey(), fetchOp);
            l4j.info("fetchoperator for " + entry.getKey() + " created");
        }
        // initialize all forward operators
        for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
            // get the forward op
            String alias = entry.getKey();
            Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);

            // pass the exec context into all the operators
            forwardOp.passExecContext(execContext);
            // all the operators need to be initialized before processing
            FetchOperator fetchOp = entry.getValue();
            JobConf jobConf = fetchOpJobConfMap.get(fetchOp);

            if (jobConf == null) {
                jobConf = job;
            }
            // initialize the forward operator
            ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
            forwardOp.initialize(jobConf, new ObjectInspector[] { objectInspector });
            l4j.info("fetchoperator for " + entry.getKey() + " initialized");
        }
    }

    /**
     * Uses the configured BucketMatcher to resolve the bucket files of the given
     * alias that correspond to the current big-table input file and points the
     * fetch operator at them.
     */
    private void setUpFetchOpContext(FetchOperator fetchOp, String alias, String currentInputFile)
            throws Exception {

        BucketMapJoinContext bucketMatcherCxt = this.work.getBucketMapjoinContext();

        Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
        BucketMatcher bucketMatcher = ReflectionUtils.newInstance(bucketMatcherCls, null);
        bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt.getAliasBucketFileNameMapping());

        List<Path> aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputFile,
                bucketMatcherCxt.getMapJoinBigTableAlias(), alias);
        fetchOp.setupContext(aliasFiles);
    }

    @Override
    public boolean isMapRedLocalTask() {
        return true;
    }

    @Override
    public Collection<Operator<? extends OperatorDesc>> getTopOperators() {
        return getWork().getAliasToWork().values();
    }

    @Override
    public String getName() {
        return "MAPREDLOCAL";
    }

    @Override
    public StageType getType() {
        //assert false;
        return StageType.MAPREDLOCAL;
    }

    @Override
    public void shutdown() {
        super.shutdown();
        if (executor != null) {
            executor.destroy();
            executor = null;
        }
    }
}
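
Usage sketch

The constructor near the top of the class takes a MapredLocalWork plan, a JobConf and a silence flag. The minimal sketch below is not part of the Hive source; it only illustrates that wiring, assumes the Hive and Hadoop client jars are on the classpath, and uses an empty MapredLocalWork purely as a placeholder for the plan the query compiler would normally build.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.mapred.JobConf;

public class MapredLocalTaskSketch {
    public static void main(String[] args) throws Exception {
        // HiveConf.ConfVars.SUBMITLOCALTASKVIACHILD (checked in execute() above) decides
        // whether the task later runs its work in-process or in a child JVM.
        HiveConf conf = new HiveConf();
        JobConf job = new JobConf(conf, ExecDriver.class);

        // Placeholder plan; for a real query the compiler produces this MapredLocalWork.
        MapredLocalWork plan = new MapredLocalWork();

        // isSilent = true keeps the console LogHelper quiet, as in the constructor above.
        MapredLocalTask task = new MapredLocalTask(plan, job, true);

        System.out.println("Stage type: " + task.getType() + ", name: " + task.getName());
    }
}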