Java tutorial
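The listing below is a single source file: a Tez-based execution engine for Apache Pig from a fork under the com.twitter.pig namespace. The class wires a PigContext to a Hadoop JobConf, compiles optimized logical plans into physical plans, and hands job submission off to a TezLauncher; a short usage sketch follows the listing.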
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.twitter.pig.backend.hadoop.executionengine.tez;

import java.io.IOException;
import java.net.Socket;
import java.net.SocketException;
import java.net.SocketImplFactory;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.twitter.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.ExecType;
import org.apache.pig.PigConstants;
import org.apache.pig.PigException;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.executionengine.ExecutionEngine;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.datastorage.HDataStorage;
import org.apache.pig.backend.hadoop.executionengine.Launcher;
import org.apache.pig.backend.hadoop.executionengine.MRExecutionEngine;
import org.apache.pig.backend.hadoop.executionengine.MapRedResult;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.PigImplConstants;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.logical.optimizer.DanglingNestedNodeRemover;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.newplan.logical.optimizer.SchemaResetter;
import org.apache.pig.newplan.logical.optimizer.UidResetter;
import org.apache.pig.newplan.logical.relational.LOForEach;
import org.apache.pig.newplan.logical.relational.LogToPhyTranslationVisitor;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.relational.LogicalRelationalOperator;
import org.apache.pig.newplan.logical.rules.InputOutputFileValidator;
import org.apache.pig.newplan.logical.visitor.SortInfoSetter;
import org.apache.pig.newplan.logical.visitor.StoreAliasSetter;
import org.apache.pig.pen.POOptimizeDisabler;

public class TezExecutionEngine extends MRExecutionEngine implements ExecutionEngine {

    public static final String JOB_TRACKER_LOCATION = "mapred.job.tracker";
    private static final String FILE_SYSTEM_LOCATION = "fs.default.name";
"fs.default.name"; private static final String ALTERNATIVE_FILE_SYSTEM_LOCATION = "fs.defaultFS"; private static final String HADOOP_SITE = "hadoop-site.xml"; private static final String CORE_SITE = "core-site.xml"; private static final String YARN_SITE = "yarn-site.xml"; private final Log log = LogFactory.getLog(getClass()); public static final String LOCAL = "local"; protected PigContext pigContext; protected DataStorage ds; @SuppressWarnings("deprecation") protected JobConf jobConf; // key: the operator key from the logical plan that originated the physical plan // val: the operator key for the root of the phyisical plan protected Map<OperatorKey, OperatorKey> logicalToPhysicalKeys; // map from LOGICAL key to into about the execution protected Map<OperatorKey, MapRedResult> materializedResults; protected Map<Operator, PhysicalOperator> newLogToPhyMap; private LogicalPlan newPreoptimizedPlan; public TezExecutionEngine(PigContext pigContext) { super(pigContext); this.pigContext = pigContext; this.logicalToPhysicalKeys = new HashMap<OperatorKey, OperatorKey>(); this.materializedResults = new HashMap<OperatorKey, MapRedResult>(); this.ds = null; // to be set in the init method this.jobConf = null; } @SuppressWarnings("deprecation") public JobConf getJobConf() { return this.jobConf; } public Map<OperatorKey, MapRedResult> getMaterializedResults() { return this.materializedResults; } public DataStorage getDataStorage() { return this.ds; } public void init() throws ExecException { init(this.pigContext.getProperties()); } @SuppressWarnings("deprecation") private void init(Properties properties) throws ExecException { //First set the ssh socket factory setSSHFactory(); String cluster = null; String nameNode = null; // We need to build a configuration object first in the manner described below // and then get back a properties object to inspect the JOB_TRACKER_LOCATION // and FILE_SYSTEM_LOCATION. The reason to do this is if we looked only at // the existing properties object, we may not get the right settings. So we want // to read the configurations in the order specified below and only then look // for JOB_TRACKER_LOCATION and FILE_SYSTEM_LOCATION. // Hadoop by default specifies two resources, loaded in-order from the classpath: // 1. hadoop-default.xml : Read-only defaults for hadoop. // 2. hadoop-site.xml: Site-specific configuration for a given hadoop installation. // Now add the settings from "properties" object to override any existing properties // All of the above is accomplished in the method call below JobConf jc = null; if (this.pigContext.getExecType() == ExecType.TEZ) { // Check existence of user provided configs String isHadoopConfigsOverriden = properties.getProperty("pig.use.overriden.hadoop.configs"); if (isHadoopConfigsOverriden != null && isHadoopConfigsOverriden.equals("true")) { jc = new JobConf(ConfigurationUtil.toConfiguration(properties)); } else { // Check existence of hadoop-site.xml or core-site.xml in classpath // if user provided confs are not being used Configuration testConf = new Configuration(); ClassLoader cl = testConf.getClassLoader(); URL hadoop_site = cl.getResource(HADOOP_SITE); URL core_site = cl.getResource(CORE_SITE); if (hadoop_site == null && core_site == null) { throw new ExecException( "Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath)." 
+ " If you plan to use local mode, please put -x local option in command line", 4010); } jc = new JobConf(); } jc.addResource("pig-cluster-hadoop-site.xml"); jc.addResource(YARN_SITE); /* // Trick to invoke static initializer of DistributedFileSystem to add hdfs-default.xml // into configuration new DistributedFileSystem(); */ //the method below alters the properties object by overriding the //hadoop properties with the values from properties and recomputing //the properties recomputeProperties(jc, properties); } else { // If we are running in local mode we dont read the hadoop conf file if (properties.getProperty("mapreduce.framework.name") == null) { properties.setProperty("mapreduce.framework.name", "local"); } properties.setProperty(JOB_TRACKER_LOCATION, LOCAL); properties.setProperty(FILE_SYSTEM_LOCATION, "file:///"); properties.setProperty(ALTERNATIVE_FILE_SYSTEM_LOCATION, "file:///"); jc = new JobConf(false); jc.addResource("core-default.xml"); jc.addResource("mapred-default.xml"); jc.addResource("yarn-default.xml"); recomputeProperties(jc, properties); } cluster = jc.get(JOB_TRACKER_LOCATION); nameNode = jc.get(FILE_SYSTEM_LOCATION); if (nameNode == null) nameNode = (String) pigContext.getProperties().get(ALTERNATIVE_FILE_SYSTEM_LOCATION); if (cluster != null && cluster.length() > 0) { if (!cluster.contains(":") && !cluster.equalsIgnoreCase(LOCAL)) { cluster = cluster + ":50020"; } properties.setProperty(JOB_TRACKER_LOCATION, cluster); } if (nameNode != null && nameNode.length() > 0) { if (!nameNode.contains(":") && !nameNode.equalsIgnoreCase(LOCAL)) { nameNode = nameNode + ":8020"; } properties.setProperty(FILE_SYSTEM_LOCATION, nameNode); } log.info("Connecting to hadoop file system at: " + (nameNode == null ? LOCAL : nameNode)); // constructor sets DEFAULT_REPLICATION_FACTOR_KEY ds = new HDataStorage(properties); if (cluster != null && !cluster.equalsIgnoreCase(LOCAL)) { log.info("Connecting to map-reduce job tracker at: " + jc.get(JOB_TRACKER_LOCATION)); } // Set job-specific configuration knobs jobConf = jc; } public void updateConfiguration(Properties newConfiguration) throws ExecException { init(newConfiguration); } @SuppressWarnings("unchecked") public PhysicalPlan compile(LogicalPlan plan, Properties properties) throws FrontendException { if (plan == null) { int errCode = 2041; String msg = "No Plan to compile"; throw new FrontendException(msg, errCode, PigException.BUG); } newPreoptimizedPlan = new LogicalPlan(plan); if (pigContext.inIllustrator) { // disable all PO-specific optimizations POOptimizeDisabler pod = new POOptimizeDisabler(plan); pod.visit(); } UidResetter uidResetter = new UidResetter(plan); uidResetter.visit(); SchemaResetter schemaResetter = new SchemaResetter(plan, true /*skip duplicate uid check*/ ); schemaResetter.visit(); HashSet<String> disabledOptimizerRules; try { disabledOptimizerRules = (HashSet<String>) ObjectSerializer .deserialize(pigContext.getProperties().getProperty(PigImplConstants.PIG_OPTIMIZER_RULES_KEY)); } catch (IOException ioe) { int errCode = 2110; String msg = "Unable to deserialize optimizer rules."; throw new FrontendException(msg, errCode, PigException.BUG, ioe); } if (disabledOptimizerRules == null) { disabledOptimizerRules = new HashSet<String>(); } String pigOptimizerRulesDisabled = this.pigContext.getProperties() .getProperty(PigConstants.PIG_OPTIMIZER_RULES_DISABLED_KEY); if (pigOptimizerRulesDisabled != null) { disabledOptimizerRules.addAll(Lists.newArrayList((Splitter.on(",").split(pigOptimizerRulesDisabled)))); } if 
        if (pigContext.inIllustrator) {
            disabledOptimizerRules.add("MergeForEach");
            disabledOptimizerRules.add("PartitionFilterOptimizer");
            disabledOptimizerRules.add("LimitOptimizer");
            disabledOptimizerRules.add("SplitFilter");
            disabledOptimizerRules.add("PushUpFilter");
            disabledOptimizerRules.add("MergeFilter");
            disabledOptimizerRules.add("PushDownForEachFlatten");
            disabledOptimizerRules.add("ColumnMapKeyPrune");
            disabledOptimizerRules.add("AddForEach");
            disabledOptimizerRules.add("GroupByConstParallelSetter");
        }

        StoreAliasSetter storeAliasSetter = new StoreAliasSetter(plan);
        storeAliasSetter.visit();

        // run the optimizer
        LogicalPlanOptimizer optimizer = new LogicalPlanOptimizer(plan, 100, disabledOptimizerRules);
        optimizer.optimize();

        // compute whether the output data is sorted or not
        SortInfoSetter sortInfoSetter = new SortInfoSetter(plan);
        sortInfoSetter.visit();

        if (!pigContext.inExplain) {
            // Validate input/output files. There is currently no validation framework
            // in the new logical plan, so this validator lives here for now.
            // We might decide to move it out to a validator framework in the future.
            InputOutputFileValidator validator = new InputOutputFileValidator(plan, pigContext);
            validator.validate();
        }

        // translate the new logical plan to a physical plan
        LogToPhyTranslationVisitor translator = new LogToPhyTranslationVisitor(plan);
        translator.setPigContext(pigContext);
        translator.visit();
        newLogToPhyMap = translator.getLogToPhyMap();
        return translator.getPhysicalPlan();
    }

    public Map<Operator, PhysicalOperator> getLogToPhyMap() {
        return newLogToPhyMap;
    }

    public Map<LOForEach, Map<LogicalRelationalOperator, PhysicalOperator>> getForEachInnerLogToPhyMap(
            LogicalPlan plan) {
        Map<LOForEach, Map<LogicalRelationalOperator, PhysicalOperator>> result =
                new HashMap<LOForEach, Map<LogicalRelationalOperator, PhysicalOperator>>();
        Iterator<Operator> outerIter = plan.getOperators();
        while (outerIter.hasNext()) {
            Operator oper = outerIter.next();
            if (oper instanceof LOForEach) {
                LogicalPlan innerPlan = ((LOForEach) oper).getInnerPlan();
                Map<LogicalRelationalOperator, PhysicalOperator> innerOpMap =
                        new HashMap<LogicalRelationalOperator, PhysicalOperator>();
                Iterator<Operator> innerIter = innerPlan.getOperators();
                while (innerIter.hasNext()) {
                    Operator innerOper = innerIter.next();
                    innerOpMap.put((LogicalRelationalOperator) innerOper, newLogToPhyMap.get(innerOper));
                }
                result.put((LOForEach) oper, innerOpMap);
            }
        }
        return result;
    }

    public LogicalPlan getNewPlan() {
        return newPreoptimizedPlan;
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void setSSHFactory() {
        Properties properties = this.pigContext.getProperties();
        String g = properties.getProperty("ssh.gateway");
        if (g == null || g.length() == 0) {
            return;
        }
        try {
            Class clazz = Class.forName("org.apache.pig.shock.SSHSocketImplFactory");
            // getFactory is static, so no receiver instance is passed to invoke
            SocketImplFactory f = (SocketImplFactory) clazz
                    .getMethod("getFactory", new Class[0]).invoke(null, new Object[0]);
            Socket.setSocketImplFactory(f);
        } catch (SocketException e) {
            // Socket.setSocketImplFactory throws SocketException if a factory is
            // already set; keep the existing one
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Applies pig properties to a JobConf and replaces the properties with the
     * resulting jobConf values.
     * @param jobConf JobConf with the appropriate hadoop resource files
     * @param properties Pig properties that will override hadoop properties;
     *                   this object may be modified
     */
    @SuppressWarnings("deprecation")
    private void recomputeProperties(JobConf jobConf, Properties properties) {
        // We need to load the properties from the hadoop configuration,
        // then override them with any existing properties we have.
        if (jobConf != null && properties != null) {
            // Set user properties on the jobConf to ensure that defaults
            // and deprecation mappings are applied correctly
            Enumeration<Object> propertiesIter = properties.keys();
            while (propertiesIter.hasMoreElements()) {
                String key = (String) propertiesIter.nextElement();
                String val = properties.getProperty(key);
                // We do not put user.name, see PIG-1419
                if (!key.equals("user.name")) {
                    jobConf.set(key, val);
                }
            }
            // Clear the user-defined properties and re-populate them from the jobConf
            properties.clear();
            Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
            while (iter.hasNext()) {
                Map.Entry<String, String> entry = iter.next();
                properties.put(entry.getKey(), entry.getValue());
            }
        }
    }

    @Override
    public Launcher getLauncher() {
        return new TezLauncher();
    }
}
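To show how the pieces above fit together, here is a minimal, hypothetical usage sketch. It leans only on what the listing itself references (the ExecType.TEZ constant checked in init(), the pig.use.overriden.hadoop.configs flag, and getJobConf()); the PigContext(ExecType, Properties) constructor, the class name TezEngineInitSketch, and the namenode host are assumptions for illustration, not part of this codebase.

    import java.util.Properties;

    import org.apache.hadoop.mapred.JobConf;
    import org.apache.pig.ExecType;
    import org.apache.pig.impl.PigContext;

    import com.twitter.pig.backend.hadoop.executionengine.tez.TezExecutionEngine;

    public class TezEngineInitSketch {
        public static void main(String[] args) throws Exception {
            // User-supplied overrides. With this flag set to "true", init() builds the
            // JobConf directly from these properties instead of scanning the classpath
            // for hadoop-site.xml / core-site.xml.
            Properties props = new Properties();
            props.setProperty("pig.use.overriden.hadoop.configs", "true");
            props.setProperty("fs.defaultFS", "hdfs://namenode:8020"); // placeholder host

            PigContext pigContext = new PigContext(ExecType.TEZ, props);
            TezExecutionEngine engine = new TezExecutionEngine(pigContext);

            // init() connects to the file system and round-trips the properties
            // through the JobConf, so Hadoop defaults and deprecation mappings are
            // reflected back into them. A reachable cluster is required at this point.
            engine.init();

            JobConf conf = engine.getJobConf();
            System.out.println("File system: " + conf.get("fs.defaultFS"));
        }
    }

Setting the override flag takes the first branch of init(), which is the easiest path to exercise without hadoop-site.xml or core-site.xml on the classpath; dropping the flag (or constructing the context with a non-TEZ ExecType) exercises the classpath-lookup and local-mode branches instead.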