org.apache.hadoop.hive.ql.exec.MapredLocalTask.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.exec.MapredLocalTask.java. This task runs the local, small-table side of a map join: execute() spawns a child JVM to do the work, while executeFromChildJVM() is the entry point inside that child, where fetch operators stream rows into hash-table sinks.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.CachingPrintStream;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.exec.Utilities.StreamPrinter;
import org.apache.hadoop.hive.ql.exec.persistence.AbstractMapJoinKey;
import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectValue;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public class MapredLocalTask extends Task<MapredLocalWork> implements Serializable {

    private Map<String, FetchOperator> fetchOperators;
    protected HadoopJobExecHelper jobExecHelper;
    private JobConf job;
    public static final Log l4j = LogFactory.getLog(MapredLocalTask.class);
    static final String HADOOP_MEM_KEY = "HADOOP_HEAPSIZE";
    static final String HADOOP_OPTS_KEY = "HADOOP_OPTS";
    static final String[] HIVE_SYS_PROP = { "build.dir", "build.dir.hive" };
    public static MemoryMXBean memoryMXBean;
    private static final Log LOG = LogFactory.getLog(MapredLocalTask.class);

    // not sure we need this exec context, but all the operators in the work
    // will pass this context through
    private final ExecMapperContext execContext = new ExecMapperContext();

    public MapredLocalTask() {
        super();
    }

    public MapredLocalTask(MapredLocalWork plan, JobConf job, boolean isSilent) throws HiveException {
        setWork(plan);
        this.job = job;
        console = new LogHelper(LOG, isSilent);
    }

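    /**
     * Stores the session configuration and builds the JobConf and the
     * HadoopJobExecHelper used to track the child process spawned by execute().
     */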
    @Override
    public void initialize(HiveConf conf, QueryPlan queryPlan, DriverContext driverContext) {
        super.initialize(conf, queryPlan, driverContext);
        job = new JobConf(conf, ExecDriver.class);
        // we don't use the HadoopJobExecHooks for local tasks
        this.jobExecHelper = new HadoopJobExecHelper(job, console, this, null);
    }

    public static String now() {
        Calendar cal = Calendar.getInstance();
        // yyyy-MM-dd HH:mm:ss: upper-case MM and HH are the month and 24-hour
        // fields; the original lower-case pattern put minutes in the month slot
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return sdf.format(cal.getTime());
    }

    @Override
    public boolean requireLock() {
        return true;
    }

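    /**
     * Runs the local work in a separate JVM: serializes the plan to a local
     * plan file, builds a "hadoop jar ... -localtask -plan <file>" command
     * line, propagates environment variables and heap settings to the child,
     * and waits for the spawned ExecDriver process to finish. Returns the
     * child's exit status (0 on success).
     */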
    @Override
    public int execute(DriverContext driverContext) {
        try {
            // generate the cmd line to run in the child jvm
            Context ctx = driverContext.getCtx();
            String hiveJar = conf.getJar();

            String hadoopExec = conf.getVar(HiveConf.ConfVars.HADOOPBIN);

            // write out the plan to a local file
            Path planPath = new Path(ctx.getLocalTmpFileURI(), "plan.xml");
            OutputStream out = FileSystem.getLocal(conf).create(planPath);
            MapredLocalWork plan = getWork();
            LOG.info("Generating plan file " + planPath.toString());
            Utilities.serializeMapRedLocalWork(plan, out);

            String isSilent = "true".equalsIgnoreCase(System.getProperty("test.silent")) ? "-nolog" : "";

            String jarCmd = hiveJar + " " + ExecDriver.class.getName();

            String hiveConfArgs = ExecDriver.generateCmdLine(conf);
            String cmdLine = hadoopExec + " jar " + jarCmd + " -localtask -plan " + planPath.toString() + " "
                    + isSilent + " " + hiveConfArgs;

            String workDir = (new File(".")).getCanonicalPath();
            String files = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);

            if (!files.isEmpty()) {
                cmdLine = cmdLine + " -files " + files;

                workDir = (new Path(ctx.getLocalTmpFileURI())).toUri().getPath();

                if (!(new File(workDir)).mkdir()) {
                    throw new IOException("Cannot create tmp working dir: " + workDir);
                }

                for (String f : StringUtils.split(files, ',')) {
                    Path p = new Path(f);
                    String target = p.toUri().getPath();
                    String link = workDir + Path.SEPARATOR + p.getName();
                    if (FileUtil.symLink(target, link) != 0) {
                        throw new IOException("Cannot link to added file: " + target + " from: " + link);
                    }
                }
            }

            LOG.info("Executing: " + cmdLine);
            Process executor = null;

            // Inherit Java system variables
            String hadoopOpts;
            StringBuilder sb = new StringBuilder();
            Properties p = System.getProperties();
            for (String element : HIVE_SYS_PROP) {
                if (p.containsKey(element)) {
                    sb.append(" -D" + element + "=" + p.getProperty(element));
                }
            }
            hadoopOpts = sb.toString();
            // Inherit the environment variables
            String[] env;
            Map<String, String> variables = new HashMap<String, String>(System.getenv());
            // The user can specify the hadoop memory. This matters when the child
            // JVM runs the work itself (local mode): its heap can no longer simply
            // default to the parent JVM's memory. When the child merely submits the
            // job, the parent's memory settings would be fine to inherit.
            int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
            if (hadoopMem == 0) {
                // remove env var that would default child jvm to use parent's memory
                // as default. child jvm would use default memory for a hadoop client
                variables.remove(HADOOP_MEM_KEY);
            } else {
                // user specified the memory for local mode hadoop run
                console.printInfo(" set heap size\t" + hadoopMem + "MB");
                variables.put(HADOOP_MEM_KEY, String.valueOf(hadoopMem));
            }

            if (variables.containsKey(HADOOP_OPTS_KEY)) {
                variables.put(HADOOP_OPTS_KEY, variables.get(HADOOP_OPTS_KEY) + hadoopOpts);
            } else {
                variables.put(HADOOP_OPTS_KEY, hadoopOpts);
            }

            if (variables.containsKey(MapRedTask.HIVE_DEBUG_RECURSIVE)) {
                MapRedTask.configureDebugVariablesForChildJVM(variables);
            }

            env = new String[variables.size()];
            int pos = 0;
            for (Map.Entry<String, String> entry : variables.entrySet()) {
                String name = entry.getKey();
                String value = entry.getValue();
                env[pos++] = name + "=" + value;
            }

            // Run ExecDriver in another JVM
            executor = Runtime.getRuntime().exec(cmdLine, env, new File(workDir));

            CachingPrintStream errPrintStream = new CachingPrintStream(System.err);

            StreamPrinter outPrinter = new StreamPrinter(executor.getInputStream(), null, System.out);
            StreamPrinter errPrinter = new StreamPrinter(executor.getErrorStream(), null, errPrintStream);

            outPrinter.start();
            errPrinter.start();

            int exitVal = jobExecHelper.progressLocal(executor, getId());

            if (exitVal != 0) {
                LOG.error("Execution failed with exit status: " + exitVal);
                if (SessionState.get() != null) {
                    SessionState.get().addLocalMapRedErrors(getId(), errPrintStream.getOutput());
                }
            } else {
                LOG.info("Execution completed successfully");
                console.printInfo("Mapred Local Task Succeeded . Convert the Join into MapJoin");
            }

            return exitVal;
        } catch (Exception e) {
            LOG.error("Exception: " + e.getMessage(), e);
            return 1;
        }
    }

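    /**
     * Entry point used when this task runs inside the child JVM: initializes
     * the fetch operators, streams each small table's rows through its
     * operator tree to build the map-join hash tables, and logs elapsed time.
     * Returns 0 on success, 3 when memory is exhausted, and 2 on other errors.
     */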
    public int executeFromChildJVM(DriverContext driverContext) {
        // check the local work
        if (work == null) {
            return -1;
        }
        memoryMXBean = ManagementFactory.getMemoryMXBean();
        long startTime = System.currentTimeMillis();
        console.printInfo(
                Utilities.now() + "\tStarting to launch local task to process map join;\tmaximum memory = "
                        + memoryMXBean.getHeapMemoryUsage().getMax());
        fetchOperators = new HashMap<String, FetchOperator>();
        Map<FetchOperator, JobConf> fetchOpJobConfMap = new HashMap<FetchOperator, JobConf>();
        execContext.setJc(job);
        // set the local work so all the operators can get this context
        execContext.setLocalWork(work);
        boolean inputFileChangeSensitive = work.getInputFileChangeSensitive();
        try {

            initializeOperators(fetchOpJobConfMap);
            // for each of the big table's buckets, start forwarding rows
            if (inputFileChangeSensitive) {
                for (LinkedHashMap<String, ArrayList<String>> bigTableBucketFiles : work.getBucketMapjoinContext()
                        .getAliasBucketFileNameMapping().values()) {
                    for (String bigTableBucket : bigTableBucketFiles.keySet()) {
                        startForward(inputFileChangeSensitive, bigTableBucket);
                    }
                }
            } else {
                startForward(inputFileChangeSensitive, null);
            }
            long currentTime = System.currentTimeMillis();
            long elapsed = currentTime - startTime;
            console.printInfo(
                    Utilities.now() + "\tEnd of local task; Time Taken: " + Utilities.showTime(elapsed) + " sec.");
        } catch (Throwable e) {
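            // the misspelled "RunOutOfMeomoryUsage" literal is kept verbatim:
            // it presumably must match the message of a HiveException thrown
            // by Hive's memory-usage check elsewhere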
            if (e instanceof OutOfMemoryError
                    || (e instanceof HiveException && e.getMessage().equals("RunOutOfMeomoryUsage"))) {
                // Don't create a new object if we are already out of memory
                return 3;
            } else {
                l4j.error("Hive Runtime Error: Map local work failed", e);
                return 2;
            }
        }
        return 0;
    }

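    /**
     * Fetches rows from each small-table alias and forwards them into the
     * corresponding operator tree. For bucketed map joins the fetch context
     * is first rebound to the bucket files matching the current big-table
     * bucket; empty tables get a dummy (empty) hash table instead.
     */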
    private void startForward(boolean inputFileChangeSensitive, String bigTableBucket) throws Exception {
        for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
            int fetchOpRows = 0;
            String alias = entry.getKey();
            FetchOperator fetchOp = entry.getValue();

            if (inputFileChangeSensitive) {
                fetchOp.clearFetchContext();
                setUpFetchOpContext(fetchOp, alias, bigTableBucket);
            }

            if (fetchOp.isEmptyTable()) {
                // generate an empty hashtable for the empty table
                this.generateDummyHashTable(alias, bigTableBucket);
                continue;
            }

            // get the root operator
            Operator<? extends Serializable> forwardOp = work.getAliasToWork().get(alias);
            // walk through the operator tree
            while (true) {
                InspectableObject row = fetchOp.getNextRow();
                if (row == null) {
                    if (inputFileChangeSensitive) {
                        String fileName = this.getFileName(bigTableBucket);
                        execContext.setCurrentBigBucketFile(fileName);
                        forwardOp.reset();
                    }
                    forwardOp.close(false);
                    break;
                }
                fetchOpRows++;
                forwardOp.process(row.o, 0);
                // check if any operator had a fatal error or early exit during
                // execution
                if (forwardOp.getDone()) {
                    // ExecMapper.setDone(true);
                    break;
                }
            }
        }
    }

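    /**
     * Creates one FetchOperator per small-table alias, pruning columns when
     * the alias is read through a TableScanOperator, then initializes each
     * downstream operator tree with the fetch operator's object inspector.
     * Aliases with no output object inspector are marked as empty tables.
     */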
    private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap) throws HiveException {
        // create a fetch operator (with column pruning where applicable) for each small-table alias
        for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
            JobConf jobClone = new JobConf(job);

            Operator<? extends Serializable> tableScan = work.getAliasToWork().get(entry.getKey());
            boolean setColumnsNeeded = false;
            if (tableScan instanceof TableScanOperator) {
                ArrayList<Integer> list = ((TableScanOperator) tableScan).getNeededColumnIDs();
                if (list != null) {
                    ColumnProjectionUtils.appendReadColumnIDs(jobClone, list);
                    setColumnsNeeded = true;
                }
            }

            if (!setColumnsNeeded) {
                ColumnProjectionUtils.setFullyReadColumns(jobClone);
            }

            // create a fetch operator
            FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
            fetchOpJobConfMap.put(fetchOp, jobClone);
            fetchOperators.put(entry.getKey(), fetchOp);
            l4j.info("fetchoperator for " + entry.getKey() + " created");
        }
        // initialize all the forward operators
        for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
            // get the forward op
            String alias = entry.getKey();
            Operator<? extends Serializable> forwardOp = work.getAliasToWork().get(alias);

            // put the exec context into all the operators
            forwardOp.setExecContext(execContext);
            // all the operators need to be initialized before processing starts
            FetchOperator fetchOp = entry.getValue();
            JobConf jobConf = fetchOpJobConfMap.get(fetchOp);

            if (jobConf == null) {
                jobConf = job;
            }
            // initialize the forward operator
            ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
            if (objectInspector != null) {
                forwardOp.initialize(jobConf, new ObjectInspector[] { objectInspector });
                l4j.info("fetchoperator for " + entry.getKey() + " initialized");
            } else {
                fetchOp.setEmptyTable(true);
            }
        }
    }

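    /**
     * Writes an empty hash table dump file for an empty small table so that
     * the downstream map join still finds a file for the alias's (byte) tag.
     */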
    private void generateDummyHashTable(String alias, String bigBucketFileName) throws HiveException, IOException {
        // find the (byte) tag for the map join by walking down to the HashTableSinkOperator
        Operator<? extends Serializable> parentOp = work.getAliasToWork().get(alias);
        Operator<? extends Serializable> childOp = parentOp.getChildOperators().get(0);
        while ((childOp != null) && (!(childOp instanceof HashTableSinkOperator))) {
            parentOp = childOp;
            assert parentOp.getChildOperators().size() == 1;
            childOp = parentOp.getChildOperators().get(0);
        }
        if (childOp == null) {
            throw new HiveException("Cannot find HashTableSink op by tracing down the table scan operator tree");
        }
        byte tag = (byte) childOp.getParentOperators().indexOf(parentOp);

        // generate empty hashtable for this (byte)tag
        String tmpURI = this.getWork().getTmpFileURI();
        HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue> hashTable = new HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>();

        if (bigBucketFileName == null || bigBucketFileName.length() == 0) {
            bigBucketFileName = "-";
        }
        HashTableSinkOperator htso = (HashTableSinkOperator) childOp;
        String tmpURIPath = Utilities.generatePath(tmpURI, htso.getConf().getDumpFilePrefix(), tag,
                bigBucketFileName);
        console.printInfo(Utilities.now() + "\tDump the hashtable into file: " + tmpURIPath);
        Path path = new Path(tmpURIPath);
        FileSystem fs = path.getFileSystem(job);
        File file = new File(path.toUri().getPath());
        // close the created stream so the handle is not leaked; the hashtable
        // contents are flushed to the local File below
        fs.create(path).close();
        long fileLength = hashTable.flushMemoryCacheToPersistent(file);
        console.printInfo(Utilities.now() + "\tUpload 1 File to: " + tmpURIPath + " File size: " + fileLength);
        hashTable.close();
    }

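    /**
     * Points the fetch operator at the small-table bucket files that pair
     * with the big table's current input file, as resolved by the configured
     * BucketMatcher.
     */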
    private void setUpFetchOpContext(FetchOperator fetchOp, String alias, String currentInputFile)
            throws Exception {

        BucketMapJoinContext bucketMatcherCxt = this.work.getBucketMapjoinContext();

        Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
        BucketMatcher bucketMatcher = (BucketMatcher) ReflectionUtils.newInstance(bucketMatcherCls, null);
        bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt.getAliasBucketFileNameMapping());

        List<Path> aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputFile,
                bucketMatcherCxt.getMapJoinBigTableAlias(), alias);
        Iterator<Path> iter = aliasFiles.iterator();
        fetchOp.setupContext(iter, null);
    }

    private String getFileName(String path) {
        if (path == null || path.length() == 0) {
            return null;
        }
        int lastSeparator = path.lastIndexOf(Path.SEPARATOR) + 1;
        return path.substring(lastSeparator);
    }

    @Override
    public void localizeMRTmpFilesImpl(Context ctx) {
        // no-op: a local task has no intermediate MapReduce files to localize
    }

    @Override
    public boolean isMapRedLocalTask() {
        return true;
    }

    @Override
    public Collection<Operator<? extends Serializable>> getTopOperators() {
        return getWork().getAliasToWork().values();
    }

    @Override
    public String getName() {
        return "MAPREDLOCAL";
    }

    @Override
    public StageType getType() {
        return StageType.MAPREDLOCAL;
    }

}
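
Usage sketch

The driver normally wires this task up by calling initialize(...) followed by
execute(...). The sketch below is not part of the original file: it assumes the
caller already holds the compiled MapredLocalWork, the QueryPlan, and the
DriverContext from the surrounding Hive driver (their construction is elided),
and the helper name runLocalWork and class MapredLocalTaskExample are
hypothetical. Only methods visible in the listing above are used.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.exec.MapredLocalTask;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.mapred.JobConf;

public class MapredLocalTaskExample {

    // Hypothetical helper: runs the local (small-table) side of a map join
    // for an already-compiled plan supplied by the driver.
    static int runLocalWork(MapredLocalWork plan, HiveConf conf, QueryPlan queryPlan,
            DriverContext driverContext) throws HiveException {
        // initialize() replaces this JobConf with one derived from the HiveConf,
        // so the instance passed to the constructor only seeds the task
        JobConf job = new JobConf(conf, MapredLocalTask.class);
        MapredLocalTask task = new MapredLocalTask(plan, job, /* isSilent */ false);
        task.initialize(conf, queryPlan, driverContext);
        // execute() serializes the plan, spawns a child JVM running
        // "hadoop jar ... -localtask -plan <file>", and returns its exit status
        return task.execute(driverContext);
    }
}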