Java tutorial: building a Tez DAG from a Cascading flow step

The listing below is the complete source of cascading.flow.tez.Hadoop2TezFlowStep, the Cascading planner step that translates a flow node graph into an Apache Tez DAG of vertices and edges, stages local resources and classpath entries, and hands the result to a Hadoop2TezFlowStepJob for submission.
/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.tez;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import cascading.CascadingException;
import cascading.flow.FlowElement;
import cascading.flow.FlowElements;
import cascading.flow.FlowException;
import cascading.flow.FlowNode;
import cascading.flow.FlowProcess;
import cascading.flow.FlowRuntimeProps;
import cascading.flow.hadoop.ConfigurationSetter;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.FlowStepJob;
import cascading.flow.planner.graph.ElementGraph;
import cascading.flow.planner.process.FlowNodeGraph;
import cascading.flow.planner.process.ProcessEdge;
import cascading.flow.stream.annotations.StreamMode;
import cascading.flow.tez.planner.Hadoop2TezFlowStepJob;
import cascading.flow.tez.util.TezUtil;
import cascading.management.state.ClientState;
import cascading.pipe.Boundary;
import cascading.pipe.CoGroup;
import cascading.pipe.Group;
import cascading.pipe.GroupBy;
import cascading.pipe.Merge;
import cascading.property.AppProps;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.PartitionTap;
import cascading.tuple.Tuple;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.tuple.hadoop.util.GroupingSortingComparator;
import cascading.tuple.hadoop.util.ReverseGroupingSortingComparator;
import cascading.tuple.hadoop.util.ReverseTupleComparator;
import cascading.tuple.hadoop.util.TupleComparator;
import cascading.tuple.io.TuplePair;
import cascading.tuple.tez.util.GroupingSortingPartitioner;
import cascading.tuple.tez.util.TuplePartitioner;
import cascading.util.Util;
import cascading.util.Version;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.EdgeProperty;
import org.apache.tez.dag.api.GroupInputEdge;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.VertexGroup;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;
import org.apache.tez.runtime.library.input.ConcatenatedMergedKeyValueInput;
import org.apache.tez.runtime.library.input.OrderedGroupedKVInput;
import org.apache.tez.runtime.library.input.OrderedGroupedMergedKVInput;
import org.apache.tez.runtime.library.input.UnorderedKVInput;
import org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput;
import org.apache.tez.runtime.library.output.UnorderedKVOutput;
import org.apache.tez.runtime.library.output.UnorderedPartitionedKVOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopUtil.addComparators;
import static cascading.flow.hadoop.util.HadoopUtil.serializeBase64;
import static cascading.flow.tez.util.TezUtil.addToClassPath;
import static cascading.tap.hadoop.DistCacheTap.CASCADING_LOCAL_RESOURCES;
import static cascading.tap.hadoop.DistCacheTap.CASCADING_REMOTE_RESOURCES;
import static java.util.Collections.singletonList;
import static org.apache.hadoop.yarn.api.records.LocalResourceType.ARCHIVE;
import static org.apache.hadoop.yarn.api.records.LocalResourceType.FILE;

/**
 *
 */
public class Hadoop2TezFlowStep extends BaseFlowStep<TezConfiguration> {
  private static final Logger LOG = LoggerFactory.getLogger(Hadoop2TezFlowStep.class);

  private Map<String, LocalResource> allLocalResources = new HashMap<>();
  private Map<Path, Path> syncPaths = new HashMap<>();
  private Map<String, String> environment = new HashMap<>();

  public Hadoop2TezFlowStep(ElementGraph elementGraph, FlowNodeGraph flowNodeGraph) {
    super(elementGraph, flowNodeGraph);
  }

  @Override
  public Map<Object, Object> getConfigAsProperties() {
    return HadoopUtil.createProperties(getConfig());
  }

  @Override
  public TezConfiguration createInitializedConfig(FlowProcess<TezConfiguration> flowProcess, TezConfiguration parentConfig) {
    TezConfiguration stepConf = parentConfig == null ? new TezConfiguration() : new TezConfiguration(parentConfig);

    TupleSerialization.setSerializations(stepConf);

    String versionString = Version.getRelease();

    if (versionString != null)
      stepConf.set("cascading.version", versionString);

    stepConf.set(CASCADING_FLOW_STEP_ID, getID());
    stepConf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(stepConf);

    String flowStagingPath = ((Hadoop2TezFlow) getFlow()).getFlowStagingPath();
    List<String> classPath = ((Hadoop2TezFlow) getFlow()).getClassPath();

    // is updated in addToClassPath method
    Map<String, LocalResource> dagResources = new HashMap<>();

    if (!classPath.isEmpty()) {
      // jars in the root will be in the remote CLASSPATH, no need to add to the environment
      Map<Path, Path> dagClassPath = addToClassPath(stepConf, flowStagingPath, null, classPath, FILE, dagResources, null);

      syncPaths.putAll(dagClassPath);
    }

    String appJarPath = stepConf.get(AppProps.APP_JAR_PATH);

    if (appJarPath != null) {
      // the PATTERN represents the insides of the app jar, those elements must be added to the remote CLASSPATH
      List<String> classpath = singletonList(appJarPath);
      Map<Path, Path> pathMap = addToClassPath(stepConf, flowStagingPath, null, classpath, ARCHIVE, dagResources, environment);

      syncPaths.putAll(pathMap);

      // AM does not support environments like containers do, so the classpath has to be passed via configuration.
      String fileName = new File(appJarPath).getName();
      stepConf.set(TezConfiguration.TEZ_CLUSTER_ADDITIONAL_CLASSPATH_PREFIX,
        "$PWD/" + fileName + "/:$PWD/" + fileName + "/classes/:$PWD/" + fileName + "/lib/*:");
    }

    allLocalResources.putAll(dagResources);

    initFromStepConfigDef(stepConf);

    return stepConf;
  }

  @Override
  protected FlowStepJob createFlowStepJob(ClientState clientState, FlowProcess<TezConfiguration> flowProcess, TezConfiguration initializedStepConfig) {
    DAG dag = createDAG(flowProcess, initializedStepConfig);

    return new Hadoop2TezFlowStepJob(clientState, this, initializedStepConfig, dag);
  }

  private DAG createDAG(FlowProcess<TezConfiguration> flowProcess, TezConfiguration initializedConfig) {
    FlowNodeGraph nodeGraph = getFlowNodeGraph();
    Map<FlowNode, Vertex> vertexMap = new HashMap<>();
    DAG dag = DAG.create(getStepDisplayName(initializedConfig.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    dag.addTaskLocalFiles(allLocalResources);

    Iterator<FlowNode> iterator = nodeGraph.getOrderedTopologicalIterator(); // ordering of nodes for consistent remote debugging

    while (iterator.hasNext()) {
      FlowNode flowNode = iterator.next();
      Vertex vertex = createVertex(flowProcess, initializedConfig, flowNode);
      dag.addVertex(vertex);

      vertexMap.put(flowNode, vertex);
    }

    LinkedList<ProcessEdge> processedEdges = new LinkedList<>();

    for (ProcessEdge processEdge : nodeGraph.edgeSet()) {
      if (processedEdges.contains(processEdge))
        continue;

      FlowNode edgeTargetFlowNode = nodeGraph.getEdgeTarget(processEdge);

      FlowElement flowElement = processEdge.getFlowElement();
      List<FlowNode> sourceNodes = nodeGraph.getElementSourceProcesses(flowElement);

      EdgeProperty edgeProperty = createEdgeProperty(initializedConfig, processEdge);

      Vertex targetVertex = vertexMap.get(edgeTargetFlowNode);

      if (sourceNodes.size() == 1 || flowElement instanceof CoGroup || flowElement instanceof Boundary) { // todo: create group vertices around incoming ordinal
        FlowNode edgeSourceFlowNode = nodeGraph.getEdgeSource(processEdge);
        Vertex sourceVertex = vertexMap.get(edgeSourceFlowNode);

        LOG.debug("adding edge between: {} and {}", sourceVertex, targetVertex);

        dag.addEdge(Edge.create(sourceVertex, targetVertex, edgeProperty));
      } else if (flowElement instanceof GroupBy || flowElement instanceof Merge) { // merge - source nodes > 1
        List<String> sourceVerticesIDs = new ArrayList<>();
        List<Vertex> sourceVertices = new ArrayList<>();

        for (FlowNode edgeSourceFlowNode : sourceNodes) {
          sourceVerticesIDs.add(edgeSourceFlowNode.getID());
          sourceVertices.add(vertexMap.get(edgeSourceFlowNode));
          processedEdges.add(nodeGraph.getEdge(edgeSourceFlowNode, edgeTargetFlowNode));
        }

        VertexGroup vertexGroup = dag.createVertexGroup(edgeTargetFlowNode.getID(), sourceVertices.toArray(new Vertex[sourceVertices.size()]));

        String inputClassName = flowElement instanceof Group ?
          OrderedGroupedMergedKVInput.class.getName() :
          ConcatenatedMergedKeyValueInput.class.getName();

        InputDescriptor inputDescriptor = InputDescriptor.create(inputClassName).setUserPayload(edgeProperty.getEdgeDestination().getUserPayload());

        LOG.info("adding grouped edge between: {} and {}", Util.join(sourceVerticesIDs, ","), targetVertex.getName());
        dag.addEdge(GroupInputEdge.create(vertexGroup, targetVertex, edgeProperty, inputDescriptor));
      } else {
        throw new UnsupportedOperationException("can't make edge for: " + flowElement);
      }
    }

    return dag;
  }

  private EdgeProperty createEdgeProperty(TezConfiguration config, ProcessEdge processEdge) {
    FlowElement flowElement = processEdge.getFlowElement();

    EdgeValues edgeValues = new EdgeValues(new TezConfiguration(config), processEdge);

    edgeValues.keyClassName = Tuple.class.getName(); // TEZ_RUNTIME_INTERMEDIATE_OUTPUT_KEY_CLASS
    edgeValues.valueClassName = Tuple.class.getName(); // TEZ_RUNTIME_INTERMEDIATE_OUTPUT_VALUE_CLASS
    edgeValues.keyComparatorClassName = TupleComparator.class.getName();
    edgeValues.keyPartitionerClassName = TuplePartitioner.class.getName();
    edgeValues.outputClassName = null;
    edgeValues.inputClassName = null;
    edgeValues.movementType = null;
    edgeValues.sourceType = null;
    edgeValues.schedulingType = null;

    if (flowElement instanceof Group)
      applyGroup(edgeValues);
    else if ((flowElement instanceof Boundary || flowElement instanceof Merge) && processEdge.getSourceAnnotations().contains(StreamMode.Accumulated))
      applyBoundaryMergeAccumulated(edgeValues);
    else if (flowElement instanceof Boundary || flowElement instanceof Merge)
      applyBoundaryMerge(edgeValues);
    else
      throw new IllegalStateException("unsupported flow element: " + flowElement.getClass().getCanonicalName());

    return createEdgeProperty(edgeValues);
  }

  private EdgeValues applyBoundaryMerge(EdgeValues edgeValues) {
    // todo: support for one to one
    edgeValues.outputClassName = UnorderedPartitionedKVOutput.class.getName();
    edgeValues.inputClassName = UnorderedKVInput.class.getName();

    edgeValues.movementType = EdgeProperty.DataMovementType.SCATTER_GATHER;
    edgeValues.sourceType = EdgeProperty.DataSourceType.PERSISTED;
    edgeValues.schedulingType = EdgeProperty.SchedulingType.SEQUENTIAL;

    return edgeValues;
  }

  private EdgeValues applyBoundaryMergeAccumulated(EdgeValues edgeValues) {
    edgeValues.outputClassName = UnorderedKVOutput.class.getName();
    edgeValues.inputClassName = UnorderedKVInput.class.getName();

    edgeValues.movementType = EdgeProperty.DataMovementType.BROADCAST;
    edgeValues.sourceType = EdgeProperty.DataSourceType.PERSISTED;
    edgeValues.schedulingType = EdgeProperty.SchedulingType.SEQUENTIAL;

    return edgeValues;
  }

  private EdgeValues applyGroup(EdgeValues edgeValues) {
    Group group = (Group) edgeValues.flowElement;

    if (group.isSortReversed())
      edgeValues.keyComparatorClassName = ReverseTupleComparator.class.getName();

    addComparators(edgeValues.config, "cascading.group.comparator", group.getKeySelectors(), this, group);

    if (!group.isGroupBy()) {
      edgeValues.outputClassName = OrderedPartitionedKVOutput.class.getName();
      edgeValues.inputClassName = OrderedGroupedKVInput.class.getName();

      edgeValues.movementType = EdgeProperty.DataMovementType.SCATTER_GATHER;
      edgeValues.sourceType = EdgeProperty.DataSourceType.PERSISTED;
      edgeValues.schedulingType = EdgeProperty.SchedulingType.SEQUENTIAL;
    } else {
      addComparators(edgeValues.config, "cascading.sort.comparator", group.getSortingSelectors(), this, group);

      edgeValues.outputClassName = OrderedPartitionedKVOutput.class.getName();
      edgeValues.inputClassName = OrderedGroupedKVInput.class.getName();

      edgeValues.movementType = EdgeProperty.DataMovementType.SCATTER_GATHER;
      edgeValues.sourceType = EdgeProperty.DataSourceType.PERSISTED;
      edgeValues.schedulingType = EdgeProperty.SchedulingType.SEQUENTIAL;
    }

    if (group.isSorted()) {
      edgeValues.keyClassName = TuplePair.class.getName();
      edgeValues.keyPartitionerClassName = GroupingSortingPartitioner.class.getName();

      if (group.isSortReversed())
        edgeValues.keyComparatorClassName = ReverseGroupingSortingComparator.class.getName();
      else
        edgeValues.keyComparatorClassName = GroupingSortingComparator.class.getName();
    }

    return edgeValues;
  }

  private EdgeProperty createEdgeProperty(EdgeValues edgeValues) {
    TezConfiguration outputConfig = new TezConfiguration(edgeValues.getConfig());
    outputConfig.set("cascading.node.sink", FlowElements.id(edgeValues.getFlowElement()));
    UserPayload outputPayload = createIntermediatePayloadOutput(outputConfig, edgeValues);

    TezConfiguration inputConfig = new TezConfiguration(edgeValues.getConfig());
    inputConfig.set("cascading.node.source", FlowElements.id(edgeValues.getFlowElement()));
    inputConfig.set("cascading.node.source.ordinals", Util.join(edgeValues.getOrdinals(), ","));
    UserPayload inputPayload = createIntermediatePayloadInput(inputConfig, edgeValues);

    return EdgeProperty.create(
      edgeValues.getMovementType(),
      edgeValues.getSourceType(),
      edgeValues.getSchedulingType(),
      OutputDescriptor.create(edgeValues.getOutputClassName()).setUserPayload(outputPayload),
      InputDescriptor.create(edgeValues.getInputClassName()).setUserPayload(inputPayload));
  }

  private UserPayload createIntermediatePayloadOutput(TezConfiguration config, EdgeValues edgeValues) {
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, edgeValues.keyClassName);
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, edgeValues.valueClassName);
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_COMPARATOR_CLASS, edgeValues.keyComparatorClassName);
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS, edgeValues.keyPartitionerClassName);

    setWorkingDirectory(config);

    return getPayload(config);
  }

  private UserPayload createIntermediatePayloadInput(TezConfiguration config, EdgeValues edgeValues) {
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, edgeValues.keyClassName);
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, edgeValues.valueClassName);
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_COMPARATOR_CLASS, edgeValues.keyComparatorClassName);
    config.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS, edgeValues.keyPartitionerClassName);

    setWorkingDirectory(config);

    return getPayload(config);
  }

  private static void setWorkingDirectory(Configuration conf) {
    String name = conf.get(JobContext.WORKING_DIR);

    if (name != null)
      return;

    try {
      Path dir = FileSystem.get(conf).getWorkingDirectory();
      conf.set(JobContext.WORKING_DIR, dir.toString());
    } catch (IOException exception) {
      throw new RuntimeException(exception);
    }
  }

  public Vertex createVertex(FlowProcess<TezConfiguration> flowProcess, TezConfiguration initializedConfig, FlowNode flowNode) {
    JobConf conf = new JobConf(initializedConfig);

    addInputOutputMapping(conf, flowNode);

    conf.setBoolean("mapred.used.genericoptionsparser", true);

    Map<String, LocalResource> taskLocalResources = new HashMap<>();

    Map<FlowElement, Configuration> sourceConfigs = initFromSources(flowNode, flowProcess, conf, taskLocalResources);
    Map<FlowElement, Configuration> sinkConfigs = initFromSinks(flowNode, flowProcess, conf);
    initFromTraps(flowNode, flowProcess, conf);

    initFromNodeConfigDef(flowNode, conf);

    // force step to local mode if any tap is local
    setLocalMode(initializedConfig, conf, null);

    conf.set("cascading.flow.node.num", Integer.toString(flowNode.getOrdinal()));

    int parallelism = getParallelism(flowNode, conf);

    if (parallelism == 0)
      throw new FlowException(getName(), "the default number of gather partitions must be set, see cascading.flow.FlowRuntimeProps");

    Vertex vertex = newVertex(flowNode, conf, parallelism);

    if (!taskLocalResources.isEmpty())
      vertex.addTaskLocalFiles(taskLocalResources);

    for (FlowElement flowElement : sourceConfigs.keySet()) {
      if (!(flowElement instanceof Tap))
        continue;

      Configuration sourceConf = sourceConfigs.get(flowElement);

      // not setting the new-api value could result in failures if not set by the Scheme
      if (sourceConf.get("mapred.mapper.new-api") == null)
        HadoopUtil.setNewApi(sourceConf, sourceConf.get("mapred.input.format.class", sourceConf.get("mapreduce.job.inputformat.class")));

      // unfortunately we cannot just load the input format and set it on the builder without also pulling all other
      // values out of the configuration.
      MRInput.MRInputConfigBuilder configBuilder = MRInput.createConfigBuilder(sourceConf, null);

      // grouping splits loses file name info, breaking partition tap default impl
      if (flowElement instanceof PartitionTap) // todo: generify
        configBuilder.groupSplits(false);

      DataSourceDescriptor dataSourceDescriptor = configBuilder.build();

      vertex.addDataSource(FlowElements.id(flowElement), dataSourceDescriptor);
    }

    for (FlowElement flowElement : sinkConfigs.keySet()) {
      if (!(flowElement instanceof Tap))
        continue;

      Configuration sinkConf = sinkConfigs.get(flowElement);

      Class outputFormatClass;
      String outputPath;

      // we have to set sane defaults if not set by the tap
      // typically the case of MultiSinkTap
      String formatClassName = sinkConf.get("mapred.output.format.class", sinkConf.get("mapreduce.job.outputformat.class"));

      if (formatClassName == null) {
        outputFormatClass = TextOutputFormat.class; // unused, use "new" api, it's the default
        outputPath = Hfs.getTempPath(sinkConf).toString(); // unused
      } else {
        outputFormatClass = Util.loadClass(formatClassName);
        outputPath = getOutputPath(sinkConf);
      }

      if (outputPath == null && getOutputPath(sinkConf) == null && isFileOutputFormat(outputFormatClass))
        outputPath = Hfs.getTempPath(sinkConf).toString(); // unused

      MROutput.MROutputConfigBuilder configBuilder = MROutput.createConfigBuilder(sinkConf, outputFormatClass, outputPath);

      DataSinkDescriptor dataSinkDescriptor = configBuilder.build();

      vertex.addDataSink(FlowElements.id(flowElement), dataSinkDescriptor);
    }

    addRemoteDebug(flowNode, vertex);
    addRemoteProfiling(flowNode, vertex);

    return vertex;
  }

  protected String getOutputPath(Configuration sinkConf) {
    return sinkConf.get("mapred.output.dir", sinkConf.get("mapreduce.output.fileoutputformat.outputdir"));
  }

  protected boolean isFileOutputFormat(Class outputFormatClass) {
    return org.apache.hadoop.mapred.FileOutputFormat.class.isAssignableFrom(outputFormatClass) ||
      org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.class.isAssignableFrom(outputFormatClass);
  }

  protected int getParallelism(FlowNode flowNode, JobConf conf) {
    // only count streamed taps, accumulated taps are always annotated
    HashSet<Tap> sourceStreamedTaps = new HashSet<>(flowNode.getSourceTaps());

    sourceStreamedTaps.removeAll(flowNode.getSourceElements(StreamMode.Accumulated));

    if (sourceStreamedTaps.size() != 0)
      return -1;

    int parallelism = Integer.MAX_VALUE;

    for (Tap tap : flowNode.getSinkTaps()) {
      int numSinkParts = tap.getScheme().getNumSinkParts();

      if (numSinkParts == 0)
        continue;

      if (parallelism != Integer.MAX_VALUE)
        LOG.info("multiple sink taps in flow node declaring numSinkParts, choosing lowest value. see cascading.flow.FlowRuntimeProps for broader control.");

      parallelism = Math.min(parallelism, numSinkParts);
    }

    if (parallelism != Integer.MAX_VALUE)
      return parallelism;

    return conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);
  }

  private void addInputOutputMapping(JobConf conf, FlowNode flowNode) {
    FlowNodeGraph flowNodeGraph = getFlowNodeGraph();
    Set<ProcessEdge> incomingEdges = flowNodeGraph.incomingEdgesOf(flowNode);

    for (ProcessEdge processEdge : incomingEdges)
      conf.set("cascading.node.source." + processEdge.getID(), flowNodeGraph.getEdgeSource(processEdge).getID());

    Set<ProcessEdge> outgoingEdges = flowNodeGraph.outgoingEdgesOf(flowNode);

    for (ProcessEdge processEdge : outgoingEdges)
      conf.set("cascading.node.sink." + processEdge.getID(), flowNodeGraph.getEdgeTarget(processEdge).getID());
  }

  protected Map<FlowElement, Configuration> initFromSources(FlowNode flowNode, FlowProcess<TezConfiguration> flowProcess, Configuration conf, Map<String, LocalResource> taskLocalResources) {
    Set<? extends FlowElement> accumulatedSources = flowNode.getSourceElements(StreamMode.Accumulated);

    for (FlowElement element : accumulatedSources) {
      if (element instanceof Tap) {
        JobConf current = new JobConf(conf);
        Tap tap = (Tap) element;

        if (tap.getIdentifier() == null)
          throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

        tap.sourceConfInit(flowProcess, current);

        Collection<String> paths = current.getStringCollection(CASCADING_LOCAL_RESOURCES + Tap.id(tap));

        if (!paths.isEmpty()) {
          String flowStagingPath = ((Hadoop2TezFlow) getFlow()).getFlowStagingPath();
          String resourceSubPath = Tap.id(tap);
          Map<Path, Path> pathMap = TezUtil.addToClassPath(current, flowStagingPath, resourceSubPath, paths, LocalResourceType.FILE, taskLocalResources, null);

          current.setStrings(CASCADING_REMOTE_RESOURCES + Tap.id(tap), taskLocalResources.keySet().toArray(new String[taskLocalResources.size()]));

          allLocalResources.putAll(taskLocalResources);
          syncPaths.putAll(pathMap);
        }

        Map<String, String> map = flowProcess.diffConfigIntoMap(new TezConfiguration(conf), new TezConfiguration(current));
        conf.set("cascading.node.accumulated.source.conf." + Tap.id(tap), pack(map, conf));

        setLocalMode(conf, current, tap);
      }
    }

    Set<FlowElement> sources = new HashSet<>(flowNode.getSourceElements());

    sources.removeAll(accumulatedSources);

    if (sources.isEmpty())
      throw new IllegalStateException("all sources marked as accumulated");

    Map<FlowElement, Configuration> configs = new HashMap<>();

    for (FlowElement element : sources) {
      JobConf current = new JobConf(conf);

      String id = FlowElements.id(element);

      current.set("cascading.node.source", id);

      if (element instanceof Tap) {
        Tap tap = (Tap) element;

        if (tap.getIdentifier() == null)
          throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

        tap.sourceConfInit(flowProcess, current);

        setLocalMode(conf, current, tap);
      }

      configs.put(element, current);
    }

    return configs;
  }

  protected Map<FlowElement, Configuration> initFromSinks(FlowNode flowNode, FlowProcess<? extends Configuration> flowProcess, Configuration conf) {
    Set<FlowElement> sinks = flowNode.getSinkElements();
    Map<FlowElement, Configuration> configs = new HashMap<>();

    for (FlowElement element : sinks) {
      JobConf current = new JobConf(conf);

      if (element instanceof Tap) {
        Tap tap = (Tap) element;

        if (tap.getIdentifier() == null)
          throw new IllegalStateException("tap may not have null identifier: " + element.toString());

        tap.sinkConfInit(flowProcess, current);

        setLocalMode(conf, current, tap);
      }

      String id = FlowElements.id(element);

      current.set("cascading.node.sink", id);

      configs.put(element, current);
    }

    return configs;
  }

  private void initFromNodeConfigDef(FlowNode flowNode, Configuration conf) {
    initConfFromNodeConfigDef(flowNode.getElementGraph(), new ConfigurationSetter(conf));
  }

  private void initFromStepConfigDef(Configuration conf) {
    initConfFromStepConfigDef(new ConfigurationSetter(conf));
  }

  protected void initFromTraps(FlowNode flowNode, FlowProcess<? extends Configuration> flowProcess, Configuration conf) {
    Map<String, Tap> traps = flowNode.getTrapMap();

    if (!traps.isEmpty()) {
      JobConf trapConf = new JobConf(conf);

      for (Tap tap : traps.values()) {
        tap.sinkConfInit(flowProcess, trapConf);
        setLocalMode(conf, trapConf, tap);
      }
    }
  }

  private Vertex newVertex(FlowNode flowNode, Configuration conf, int parallelism) {
    conf.set(FlowNode.CASCADING_FLOW_NODE, pack(flowNode, conf)); // todo: pack into payload directly

    ProcessorDescriptor descriptor = ProcessorDescriptor.create(FlowProcessor.class.getName());

    descriptor.setUserPayload(getPayload(conf));

    Vertex vertex = Vertex.create(flowNode.getID(), descriptor, parallelism);

    if (environment != null)
      vertex.setTaskEnvironment(environment);

    return vertex;
  }

  private UserPayload getPayload(Configuration conf) {
    try {
      return TezUtils.createUserPayloadFromConf(conf);
    } catch (IOException exception) {
      throw new CascadingException(exception);
    }
  }

  private String pack(Object object, Configuration conf) {
    try {
      return serializeBase64(object, conf, true);
    } catch (IOException exception) {
      throw new FlowException("unable to pack object: " + object.getClass().getCanonicalName(), exception);
    }
  }

  @Override
  public void clean(TezConfiguration entries) {
  }

  public void syncArtifacts() {
    // this may not be strictly necessary, but there is a condition where setting the access time
    // fails, so there may be one where setting the modification time fails. if so, we can compensate.
    Map<String, Long> timestamps = HadoopUtil.syncPaths(getConfig(), syncPaths, true);

    for (Map.Entry<String, Long> entry : timestamps.entrySet()) {
      LocalResource localResource = allLocalResources.get(entry.getKey());

      if (localResource != null)
        localResource.setTimestamp(entry.getValue());
    }
  }

  private void setLocalMode(Configuration parent, JobConf current, Tap tap) {
    // force step to local mode
    if (!HadoopUtil.isLocal(current))
      return;

    if (tap != null)
      logInfo("tap forcing step to tez local mode: " + tap.getIdentifier());

    HadoopUtil.setLocal(parent);
  }

  private void addRemoteDebug(FlowNode flowNode, Vertex vertex) {
    String value = System.getProperty("test.debug.node", null);

    if (Util.isEmpty(value))
      return;

    if (!flowNode.getSourceElementNames().contains(value) && asInt(value) != flowNode.getOrdinal())
      return;

    LOG.warn("remote debugging enabled with property: {}, on node: {}, with node id: {}", "test.debug.node", value, flowNode.getID());

    String opts = vertex.getTaskLaunchCmdOpts();

    if (opts == null)
      opts = "";

    String address = System.getProperty("test.debug.address", "localhost:5005").trim();
    opts += " -agentlib:jdwp=transport=dt_socket,server=n,address=" + address + ",suspend=y";

    vertex.setTaskLaunchCmdOpts(opts);
  }

  private void addRemoteProfiling(FlowNode flowNode, Vertex vertex) {
    String value = System.getProperty("test.profile.node", null);

    if (Util.isEmpty(value))
      return;

    if (!flowNode.getSourceElementNames().contains(value) && asInt(value) != flowNode.getOrdinal())
      return;

    LOG.warn("remote profiling enabled with property: {}, on node: {}, with node id: {}", "test.profile.node", value, flowNode.getID());

    String opts = vertex.getTaskLaunchCmdOpts();

    if (opts == null)
      opts = "";

    String path = System.getProperty("test.profile.path", "/tmp/jfr/");

    if (!path.endsWith("/"))
      path += "/";

    LOG.warn("remote profiling property: {}, logging to path: {}", "test.profile.path", path);

    opts += String.format(" -XX:+UnlockCommercialFeatures -XX:+FlightRecorder -XX:FlightRecorderOptions=defaultrecording=true,dumponexit=true,dumponexitpath=%1$s%2$s,disk=true,repository=%1$s%2$s", path, flowNode.getID());

    vertex.setTaskLaunchCmdOpts(opts);
  }

  private int asInt(String value) {
    try {
      return Integer.parseInt(value);
    } catch (NumberFormatException exception) {
      return -1;
    }
  }

  public Map<String, LocalResource> getAllLocalResources() {
    return allLocalResources;
  }

  private static class EdgeValues {
    FlowElement flowElement;
    TezConfiguration config;
    Set ordinals;

    String keyClassName;
    String valueClassName;
    String keyComparatorClassName;
    String keyPartitionerClassName;
    String outputClassName;
    String inputClassName;

    EdgeProperty.DataMovementType movementType;
    EdgeProperty.DataSourceType sourceType;
    EdgeProperty.SchedulingType schedulingType;

    private EdgeValues(TezConfiguration config, ProcessEdge processEdge) {
      this.config = config;
      this.flowElement = processEdge.getFlowElement();
      this.ordinals = processEdge.getIncomingOrdinals();
    }

    public FlowElement getFlowElement() { return flowElement; }

    public TezConfiguration getConfig() { return config; }

    public Set getOrdinals() { return ordinals; }

    public String getKeyClassName() { return keyClassName; }

    public String getValueClassName() { return valueClassName; }

    public String getKeyComparatorClassName() { return keyComparatorClassName; }

    public String getKeyPartitionerClassName() { return keyPartitionerClassName; }

    public String getOutputClassName() { return outputClassName; }

    public String getInputClassName() { return inputClassName; }

    public EdgeProperty.DataMovementType getMovementType() { return movementType; }

    public EdgeProperty.DataSourceType getSourceType() { return sourceType; }

    public EdgeProperty.SchedulingType getSchedulingType() { return schedulingType; }
  }
}
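For context, applications never construct Hadoop2TezFlowStep directly; the Tez planner creates it when a flow is built through Hadoop2TezFlowConnector. The following is a minimal sketch of a flow that would exercise the class above: the input/output paths, pipe name, and field names are illustrative assumptions, not part of the listing.

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.tez.Hadoop2TezFlowConnector;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

public class GroupLinesExample {
  public static void main(String[] args) {
    String inPath = args[0];   // illustrative: any HDFS text input
    String outPath = args[1];  // illustrative: HDFS output directory

    Properties properties = new Properties();
    AppProps.setApplicationJarClass(properties, GroupLinesExample.class);

    Tap source = new Hfs(new TextLine(new Fields("offset", "line")), inPath);
    Tap sink = new Hfs(new TextLine(), outPath);

    // A GroupBy forces an intermediate Tez edge; during planning, applyGroup above
    // selects the OrderedPartitionedKVOutput/OrderedGroupedKVInput pair for it.
    Pipe pipe = new Pipe("lines");
    pipe = new GroupBy(pipe, new Fields("line"));

    FlowDef flowDef = FlowDef.flowDef()
      .setName("group-lines")
      .addSource(pipe, source)
      .addTailSink(pipe, sink);

    // connect() runs the planner, which builds Hadoop2TezFlowStep instances;
    // complete() submits the resulting DAG(s) to the cluster.
    Flow flow = new Hadoop2TezFlowConnector(properties).connect(flowDef);
    flow.complete();
  }
}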