org.apache.pig.backend.hadoop.executionengine.MRExecutionEngine.java Source code

Introduction

Here is the source code for org.apache.pig.backend.hadoop.executionengine.MRExecutionEngine.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.backend.hadoop.executionengine;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.net.Socket;
import java.net.SocketException;
import java.net.SocketImplFactory;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.PigConstants;
import org.apache.pig.PigException;
import org.apache.pig.backend.BackendException;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.executionengine.ExecutionEngine;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.datastorage.HDataStorage;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil;
import org.apache.pig.backend.hadoop.streaming.HadoopExecutableManager;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.PigImplConstants;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.streaming.ExecutableManager;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.newplan.logical.optimizer.SchemaResetter;
import org.apache.pig.newplan.logical.optimizer.UidResetter;
import org.apache.pig.newplan.logical.relational.LOForEach;
import org.apache.pig.newplan.logical.relational.LogToPhyTranslationVisitor;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.relational.LogicalRelationalOperator;
import org.apache.pig.newplan.logical.rules.InputOutputFileValidator;
import org.apache.pig.newplan.logical.visitor.SortInfoSetter;
import org.apache.pig.newplan.logical.visitor.StoreAliasSetter;
import org.apache.pig.pen.POOptimizeDisabler;
import org.apache.pig.tools.pigstats.PigStats;
import org.apache.pig.tools.pigstats.ScriptState;
import org.apache.pig.tools.pigstats.mapreduce.MRScriptState;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;

public class MRExecutionEngine implements ExecutionEngine {

    public static final String JOB_TRACKER_LOCATION = "mapred.job.tracker";
    private static final String FILE_SYSTEM_LOCATION = "fs.default.name";
    private static final String ALTERNATIVE_FILE_SYSTEM_LOCATION = "fs.defaultFS";

    private static final String HADOOP_SITE = "hadoop-site.xml";
    private static final String CORE_SITE = "core-site.xml";
    private static final String YARN_SITE = "yarn-site.xml";
    private final Log log = LogFactory.getLog(getClass());
    public static final String LOCAL = "local";

    protected PigContext pigContext;

    protected DataStorage ds;

    @SuppressWarnings("deprecation")
    protected JobConf jobConf;

    // key: the operator key from the logical plan that originated the physical
    // plan
    // val: the operator key for the root of the phyisical plan
    protected Map<OperatorKey, OperatorKey> logicalToPhysicalKeys;

    protected Map<Operator, PhysicalOperator> newLogToPhyMap;
    private LogicalPlan newPreoptimizedPlan;

    protected Launcher launcher;

    public MRExecutionEngine(PigContext pigContext) {
        this.pigContext = pigContext;
        this.logicalToPhysicalKeys = new HashMap<OperatorKey, OperatorKey>();

        this.ds = null;

        // to be set in the init method
        this.jobConf = null;

        this.launcher = new MapReduceLauncher();
    }

    @SuppressWarnings("deprecation")
    public JobConf getJobConf() {
        return this.jobConf;
    }

    @Override
    public DataStorage getDataStorage() {
        return this.ds;
    }

    @Override
    public void init() throws ExecException {
        init(this.pigContext.getProperties());
    }

    @SuppressWarnings({ "deprecation", "resource" })
    private void init(Properties properties) throws ExecException {
        // First set the ssh socket factory
        setSSHFactory();

        String cluster = null;
        String nameNode = null;

        // We need to build a configuration object first in the manner described
        // below
        // and then get back a properties object to inspect the
        // JOB_TRACKER_LOCATION
        // and FILE_SYSTEM_LOCATION. The reason to do this is if we looked only
        // at
        // the existing properties object, we may not get the right settings. So
        // we want
        // to read the configurations in the order specified below and only then
        // look
        // for JOB_TRACKER_LOCATION and FILE_SYSTEM_LOCATION.

        // Hadoop by default specifies two resources, loaded in-order from the
        // classpath:
        // 1. hadoop-default.xml : Read-only defaults for hadoop.
        // 2. hadoop-site.xml: Site-specific configuration for a given hadoop
        // installation.
        // Now add the settings from "properties" object to override any
        // existing properties
        // All of the above is accomplished in the method call below

        JobConf jc = null;
        if (!this.pigContext.getExecType().isLocal()) {
            // Check existence of user provided configs
            String isHadoopConfigsOverriden = properties.getProperty("pig.use.overriden.hadoop.configs");
            if (isHadoopConfigsOverriden != null && isHadoopConfigsOverriden.equals("true")) {
                jc = new JobConf(ConfigurationUtil.toConfiguration(properties));
            } else {
                // Check existence of hadoop-site.xml or core-site.xml in
                // classpath
                // if user provided confs are not being used
                Configuration testConf = new Configuration();
                ClassLoader cl = testConf.getClassLoader();
                URL hadoop_site = cl.getResource(HADOOP_SITE);
                URL core_site = cl.getResource(CORE_SITE);

                if (hadoop_site == null && core_site == null) {
                    throw new ExecException(
                            "Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath)."
                                    + " If you plan to use local mode, please put -x local option in command line",
                            4010);
                }
                jc = new JobConf();
            }
            jc.addResource("pig-cluster-hadoop-site.xml");
            jc.addResource(YARN_SITE);

            // Trick to invoke static initializer of DistributedFileSystem to
            // add hdfs-default.xml
            // into configuration
            new DistributedFileSystem();

            // the method below alters the properties object by overriding the
            // hadoop properties with the values from properties and recomputing
            // the properties
            recomputeProperties(jc, properties);
        } else {
            // If we are running in local mode we dont read the hadoop conf file
            if (properties.getProperty("mapreduce.framework.name") == null) {
                properties.setProperty("mapreduce.framework.name", "local");
            }
            properties.setProperty(JOB_TRACKER_LOCATION, LOCAL);
            properties.setProperty(FILE_SYSTEM_LOCATION, "file:///");
            properties.setProperty(ALTERNATIVE_FILE_SYSTEM_LOCATION, "file:///");

            jc = new JobConf(false);
            jc.addResource("core-default.xml");
            jc.addResource("mapred-default.xml");
            jc.addResource("yarn-default.xml");
            recomputeProperties(jc, properties);
        }

        cluster = jc.get(JOB_TRACKER_LOCATION);
        nameNode = jc.get(FILE_SYSTEM_LOCATION);
        if (nameNode == null)
            nameNode = (String) pigContext.getProperties().get(ALTERNATIVE_FILE_SYSTEM_LOCATION);

        if (cluster != null && cluster.length() > 0) {
            if (!cluster.contains(":") && !cluster.equalsIgnoreCase(LOCAL)) {
                cluster = cluster + ":50020";
            }
            properties.setProperty(JOB_TRACKER_LOCATION, cluster);
        }

        if (nameNode != null && nameNode.length() > 0) {
            if (!nameNode.contains(":") && !nameNode.equalsIgnoreCase(LOCAL)) {
                nameNode = nameNode + ":8020";
            }
            properties.setProperty(FILE_SYSTEM_LOCATION, nameNode);
        }

        log.info("Connecting to hadoop file system at: " + (nameNode == null ? LOCAL : nameNode));
        // constructor sets DEFAULT_REPLICATION_FACTOR_KEY
        ds = new HDataStorage(properties);

        if (cluster != null && !cluster.equalsIgnoreCase(LOCAL)) {
            log.info("Connecting to map-reduce job tracker at: " + jc.get(JOB_TRACKER_LOCATION));
        }

        // Set job-specific configuration knobs
        jobConf = jc;
    }

    @SuppressWarnings("unchecked")
    public PhysicalPlan compile(LogicalPlan plan, Properties properties) throws FrontendException {
        if (plan == null) {
            int errCode = 2041;
            String msg = "No Plan to compile";
            throw new FrontendException(msg, errCode, PigException.BUG);
        }

        newPreoptimizedPlan = new LogicalPlan(plan);

        if (pigContext.inIllustrator) {
            // disable all PO-specific optimizations
            POOptimizeDisabler pod = new POOptimizeDisabler(plan);
            pod.visit();
        }

        UidResetter uidResetter = new UidResetter(plan);
        uidResetter.visit();

        SchemaResetter schemaResetter = new SchemaResetter(plan, true /*
                                                                      * skip
                                                                      * duplicate
                                                                      * uid check
                                                                      */);
        schemaResetter.visit();

        HashSet<String> disabledOptimizerRules;
        try {
            disabledOptimizerRules = (HashSet<String>) ObjectSerializer
                    .deserialize(pigContext.getProperties().getProperty(PigImplConstants.PIG_OPTIMIZER_RULES_KEY));
        } catch (IOException ioe) {
            int errCode = 2110;
            String msg = "Unable to deserialize optimizer rules.";
            throw new FrontendException(msg, errCode, PigException.BUG, ioe);
        }
        if (disabledOptimizerRules == null) {
            disabledOptimizerRules = new HashSet<String>();
        }

        String pigOptimizerRulesDisabled = this.pigContext.getProperties()
                .getProperty(PigConstants.PIG_OPTIMIZER_RULES_DISABLED_KEY);
        if (pigOptimizerRulesDisabled != null) {
            disabledOptimizerRules.addAll(Lists.newArrayList((Splitter.on(",").split(pigOptimizerRulesDisabled))));
        }

        if (pigContext.inIllustrator) {
            disabledOptimizerRules.add("MergeForEach");
            disabledOptimizerRules.add("PartitionFilterOptimizer");
            disabledOptimizerRules.add("LimitOptimizer");
            disabledOptimizerRules.add("SplitFilter");
            disabledOptimizerRules.add("PushUpFilter");
            disabledOptimizerRules.add("MergeFilter");
            disabledOptimizerRules.add("PushDownForEachFlatten");
            disabledOptimizerRules.add("ColumnMapKeyPrune");
            disabledOptimizerRules.add("AddForEach");
            disabledOptimizerRules.add("GroupByConstParallelSetter");
        }

        StoreAliasSetter storeAliasSetter = new StoreAliasSetter(plan);
        storeAliasSetter.visit();

        // run optimizer
        LogicalPlanOptimizer optimizer = new LogicalPlanOptimizer(plan, 100, disabledOptimizerRules);
        optimizer.optimize();

        // compute whether output data is sorted or not
        SortInfoSetter sortInfoSetter = new SortInfoSetter(plan);
        sortInfoSetter.visit();

        if (!pigContext.inExplain) {
            // Validate input/output file. Currently no validation framework in
            // new logical plan, put this validator here first.
            // We might decide to move it out to a validator framework in future
            InputOutputFileValidator validator = new InputOutputFileValidator(plan, pigContext);
            validator.validate();
        }

        // translate new logical plan to physical plan
        LogToPhyTranslationVisitor translator = new LogToPhyTranslationVisitor(plan);

        translator.setPigContext(pigContext);
        translator.visit();
        newLogToPhyMap = translator.getLogToPhyMap();
        return translator.getPhysicalPlan();
    }

    public Map<Operator, PhysicalOperator> getLogToPhyMap() {
        return newLogToPhyMap;
    }

    public Map<LOForEach, Map<LogicalRelationalOperator, PhysicalOperator>> getForEachInnerLogToPhyMap(
            LogicalPlan plan) {
        Map<LOForEach, Map<LogicalRelationalOperator, PhysicalOperator>> result = new HashMap<LOForEach, Map<LogicalRelationalOperator, PhysicalOperator>>();
        Iterator<Operator> outerIter = plan.getOperators();
        while (outerIter.hasNext()) {
            Operator oper = outerIter.next();
            if (oper instanceof LOForEach) {
                LogicalPlan innerPlan = ((LOForEach) oper).getInnerPlan();
                Map<LogicalRelationalOperator, PhysicalOperator> innerOpMap = new HashMap<LogicalRelationalOperator, PhysicalOperator>();
                Iterator<Operator> innerIter = innerPlan.getOperators();
                while (innerIter.hasNext()) {
                    Operator innerOper = innerIter.next();
                    innerOpMap.put(((LogicalRelationalOperator) innerOper), newLogToPhyMap.get(innerOper));
                }
                result.put((LOForEach) oper, innerOpMap);
            }
        }
        return result;
    }

    public LogicalPlan getNewPlan() {
        return newPreoptimizedPlan;
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    protected void setSSHFactory() {
        Properties properties = this.pigContext.getProperties();
        String g = properties.getProperty("ssh.gateway");
        if (g == null || g.length() == 0)
            return;
        try {
            Class clazz = Class.forName("org.apache.pig.shock.SSHSocketImplFactory");
            SocketImplFactory f = (SocketImplFactory) clazz.getMethod("getFactory", new Class[0]).invoke(0,
                    new Object[0]);
            Socket.setSocketImplFactory(f);
        } catch (SocketException e) {
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Method to apply pig properties to JobConf (replaces properties with
     * resulting jobConf values)
     * 
     * @param conf
     *            JobConf with appropriate hadoop resource files
     * @param properties
     *            Pig properties that will override hadoop properties;
     *            properties might be modified
     */
    @SuppressWarnings("deprecation")
    protected void recomputeProperties(JobConf jobConf, Properties properties) {
        // We need to load the properties from the hadoop configuration
        // We want to override these with any existing properties we have.
        if (jobConf != null && properties != null) {
            // set user properties on the jobConf to ensure that defaults
            // and deprecation is applied correctly
            Enumeration<Object> propertiesIter = properties.keys();
            while (propertiesIter.hasMoreElements()) {
                String key = (String) propertiesIter.nextElement();
                String val = properties.getProperty(key);
                // We do not put user.name, See PIG-1419
                if (!key.equals("user.name"))
                    jobConf.set(key, val);
            }
            // clear user defined properties and re-populate
            properties.clear();
            Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
            while (iter.hasNext()) {
                Map.Entry<String, String> entry = iter.next();
                properties.put(entry.getKey(), entry.getValue());
            }
        }
    }

    @Override
    public ScriptState instantiateScriptState() {
        return new MRScriptState(UUID.randomUUID().toString());
    }

    @Override
    public PigStats launchPig(LogicalPlan lp, String grpName, PigContext pc)
            throws FrontendException, ExecException {

        try {
            PhysicalPlan pp = compile(lp, pc.getProperties());
            return launcher.launchPig(pp, grpName, pigContext);
        } catch (ExecException e) {
            throw (ExecException) e;
        } catch (FrontendException e) {
            throw (FrontendException) e;
        } catch (Exception e) {
            throw new ExecException(e);
        } finally {
            launcher.reset();
        }
    }

    @Override
    public void explain(LogicalPlan lp, PigContext pc, PrintStream ps, String format, boolean verbose, File file,
            String suffix) throws PlanException, VisitorException, IOException, FrontendException {

        PrintStream pps = ps;
        PrintStream eps = ps;

        try {

            if (file != null) {
                pps = new PrintStream(new File(file, "physical_plan-" + suffix));
                eps = new PrintStream(new File(file, "exec_plan-" + suffix));
            }

            PhysicalPlan pp = compile(lp, pc.getProperties());
            pp.explain(pps, format, verbose);

            MapRedUtil.checkLeafIsStore(pp, pigContext);
            launcher.explain(pp, pigContext, eps, format, verbose);
        } finally {
            launcher.reset();
            //Only close the stream if we opened it.
            if (file != null) {
                pps.close();
                eps.close();
            }
        }
    }

    @Override
    public Properties getConfiguration() {
        if (jobConf == null)
            return null;
        return ConfigurationUtil.toProperties(jobConf);
    }

    @Override
    public void setConfiguration(Properties newConfiguration) throws ExecException {
        init(newConfiguration);
    }

    @Override
    public void setProperty(String property, String value) {
        // mPigServer.getPigContext().getProperties().setProperty(key, value);
        // PIG-2508 properties need to be managed through JobConf
        // since all other code depends on access to properties,
        // we need to re-populate from updated JobConf
        // java.util.HashSet<?> keysBefore = new
        // java.util.HashSet<Object>(mPigServer.getPigContext().getProperties().keySet());
        // set current properties on jobConf
        Properties properties = pigContext.getProperties();
        Enumeration<Object> propertiesIter = pigContext.getProperties().keys();
        while (propertiesIter.hasMoreElements()) {
            String pkey = (String) propertiesIter.nextElement();
            String val = properties.getProperty(pkey);
            // We do not put user.name, See PIG-1419
            if (!pkey.equals("user.name"))
                jobConf.set(pkey, val);
        }
        // set new value, JobConf will handle deprecation etc.
        jobConf.set(property, value);
        // re-initialize to reflect updated JobConf
        properties.clear();
        Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
        while (iter.hasNext()) {
            Map.Entry<String, String> entry = iter.next();
            properties.put(entry.getKey(), entry.getValue());
        }
        // keysBefore.removeAll(mPigServer.getPigContext().getProperties().keySet());
        // log.info("PIG-2508: keys dropped from properties: " + keysBefore);
    }

    @Override
    public ExecutableManager getExecutableManager() {
        return new HadoopExecutableManager();
    }

    @Override
    public void kill() throws BackendException {
        if (launcher != null) {
            launcher.kill();
        }
    }

    @Override
    public void killJob(String jobID) throws BackendException {
        if (launcher != null) {
            launcher.killJob(jobID, jobConf);
        }
    }

}