/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.samza.execution;

import java.util.ArrayList;
import java.util.Base64;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.samza.config.Config;
import org.apache.samza.config.JavaTableConfig;
import org.apache.samza.config.JobConfig;
import org.apache.samza.config.MapConfig;
import org.apache.samza.config.SerializerConfig;
import org.apache.samza.config.StorageConfig;
import org.apache.samza.config.StreamConfig;
import org.apache.samza.config.TaskConfig;
import org.apache.samza.config.TaskConfigJava;
import org.apache.samza.operators.StreamGraphImpl;
import org.apache.samza.operators.spec.InputOperatorSpec;
import org.apache.samza.operators.spec.JoinOperatorSpec;
import org.apache.samza.operators.spec.OperatorSpec;
import org.apache.samza.operators.spec.OutputStreamImpl;
import org.apache.samza.operators.spec.StatefulOperatorSpec;
import org.apache.samza.operators.spec.WindowOperatorSpec;
import org.apache.samza.serializers.Serde;
import org.apache.samza.serializers.SerializableSerde;
import org.apache.samza.system.StreamSpec;
import org.apache.samza.table.TableProvider;
import org.apache.samza.table.TableProviderFactory;
import org.apache.samza.table.TableSpec;
import org.apache.samza.util.MathUtil;
import org.apache.samza.util.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;


/**
 * A JobNode is a physical execution unit. In RemoteExecutionEnvironment, it's a job that will be submitted
 * to a remote cluster. In LocalExecutionEnvironment, it's a set of StreamProcessors for local execution.
 * A JobNode contains the input/output streams and the configs for physical execution.
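 * <p>
 * A minimal usage sketch (hypothetical stream edges; in practice the planner creates JobNodes and
 * wires up their StreamEdges before generating configs):
 * <pre>{@code
 *   JobNode node = new JobNode("page-view-counter", "1", streamGraph, config);
 *   node.addInEdge(pageViewsEdge);   // hypothetical input StreamEdge
 *   node.addOutEdge(countsEdge);     // hypothetical output StreamEdge
 *   JobConfig jobConfig = node.generateConfig(executionPlanJson);
 * }</pre>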
 */
public class JobNode {
  private static final Logger log = LoggerFactory.getLogger(JobNode.class);
  private static final String CONFIG_JOB_PREFIX = "jobs.%s.";
  private static final String CONFIG_INTERNAL_EXECUTION_PLAN = "samza.internal.execution.plan";

  private final String jobName;
  private final String jobId;
  private final String id;
  private final StreamGraphImpl streamGraph;
  private final List<StreamEdge> inEdges = new ArrayList<>();
  private final List<StreamEdge> outEdges = new ArrayList<>();
  private final List<TableSpec> tables = new ArrayList<>();
  private final Config config;

  JobNode(String jobName, String jobId, StreamGraphImpl streamGraph, Config config) {
    this.jobName = jobName;
    this.jobId = jobId;
    this.id = createId(jobName, jobId);
    this.streamGraph = streamGraph;
    this.config = config;
  }

  public StreamGraphImpl getStreamGraph() {
    return streamGraph;
  }

  public String getId() {
    return id;
  }

  public String getJobName() {
    return jobName;
  }

  public String getJobId() {
    return jobId;
  }

  void addInEdge(StreamEdge in) {
    inEdges.add(in);
  }

  void addOutEdge(StreamEdge out) {
    outEdges.add(out);
  }

  List<StreamEdge> getInEdges() {
    return inEdges;
  }

  List<StreamEdge> getOutEdges() {
    return outEdges;
  }

  void addTable(TableSpec tableSpec) {
    tables.add(tableSpec);
  }

  /**
   * Generates the configs for a job.
   * @param executionPlanJson JSON representation of the execution plan
   * @return config of the job
   */
  public JobConfig generateConfig(String executionPlanJson) {
    Map<String, String> configs = new HashMap<>();
    configs.put(JobConfig.JOB_NAME(), jobName);

    final List<String> inputs = new ArrayList<>();
    final List<String> broadcasts = new ArrayList<>();
    for (StreamEdge inEdge : inEdges) {
      String formattedSystemStream = inEdge.getFormattedSystemStream();
      if (inEdge.getStreamSpec().isBroadcast()) {
        broadcasts.add(formattedSystemStream + "#0");
      } else {
        inputs.add(formattedSystemStream);
      }
    }
    configs.put(TaskConfig.INPUT_STREAMS(), Joiner.on(',').join(inputs));

    if (!broadcasts.isEmpty()) {
      // TODO: remove this once we support defining broadcast input streams in the high-level API;
      // task.broadcast.input should be generated by the planner in the future.
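      // Note: the "#0" suffix added above follows the task.broadcast.inputs value format
      // (system.stream#partition), i.e. every task consumes partition 0 of the broadcast stream.
      // Any broadcast inputs already present in the user config are merged in below.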
      final String taskBroadcasts = config.get(TaskConfigJava.BROADCAST_INPUT_STREAMS);
      if (StringUtils.isNotEmpty(taskBroadcasts)) {
        broadcasts.add(taskBroadcasts);
      }
      configs.put(TaskConfigJava.BROADCAST_INPUT_STREAMS, Joiner.on(',').join(broadcasts));
    }

    // set the triggering interval if a window or join is defined
    if (streamGraph.hasWindowOrJoins()) {
      if ("-1".equals(config.get(TaskConfig.WINDOW_MS(), "-1"))) {
        long triggerInterval = computeTriggerInterval();
        log.info("Using triggering interval: {} for jobName: {}", triggerInterval, jobName);
        configs.put(TaskConfig.WINDOW_MS(), String.valueOf(triggerInterval));
      }
    }

    streamGraph.getAllOperatorSpecs().forEach(opSpec -> {
      if (opSpec instanceof StatefulOperatorSpec) {
        ((StatefulOperatorSpec) opSpec).getStoreDescriptors()
            .forEach(sd -> configs.putAll(sd.getStorageConfigs()));
        // store key and message serdes are configured separately in #addSerdeConfigs
      }
    });

    configs.put(CONFIG_INTERNAL_EXECUTION_PLAN, executionPlanJson);

    // write input/output streams to configs
    inEdges.stream().filter(StreamEdge::isIntermediate).forEach(edge -> configs.putAll(edge.generateConfig()));

    // write serialized serde instances and stream serde configs to configs
    addSerdeConfigs(configs);

    tables.forEach(tableSpec -> {
      // Table provider factory
      configs.put(String.format(JavaTableConfig.TABLE_PROVIDER_FACTORY, tableSpec.getId()),
          tableSpec.getTableProviderFactoryClassName());

      // Note: no need to generate configs for Serdes, as they are already produced by addSerdeConfigs()

      // Generate additional configuration
      TableProviderFactory tableProviderFactory =
          Util.getObj(tableSpec.getTableProviderFactoryClassName(), TableProviderFactory.class);
      TableProvider tableProvider = tableProviderFactory.getTableProvider(tableSpec);
      configs.putAll(tableProvider.generateConfig(configs));
    });

    log.info("Job {} has generated configs {}", jobName, configs);

    String configPrefix = String.format(CONFIG_JOB_PREFIX, jobName);

    // Disallow user-specified job inputs/outputs. This info comes strictly from the user application.
    Map<String, String> allowedConfigs = new HashMap<>(config);
    if (allowedConfigs.containsKey(TaskConfig.INPUT_STREAMS())) {
      log.warn("Specifying task inputs in configuration is not allowed with the Fluent API. "
          + "Ignoring configured value for " + TaskConfig.INPUT_STREAMS());
      allowedConfigs.remove(TaskConfig.INPUT_STREAMS());
    }

    log.debug("Job {} has allowed configs {}", jobName, allowedConfigs);
    return new JobConfig(Util.rewriteConfig(
        extractScopedConfig(new MapConfig(allowedConfigs), new MapConfig(configs), configPrefix)));
  }

  /**
   * Serializes the {@link Serde} instances for operators, adds them to the provided config, and
   * sets the serde configuration for the input/output/intermediate streams appropriately.
   *
   * We try to preserve the number of Serde instances before and after serialization. However, we don't
   * guarantee that references shared between these serde instances (e.g. a Jackson ObjectMapper shared
   * between two JSON serdes) are shared after deserialization too.
   *
   * Ideally all the user-defined objects in the application should be serialized and deserialized in one
   * pass from the same output/input stream so that we can maintain reference sharing relationships.
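   *
   * For illustration only (the exact key shapes come from SerializerConfig and StreamConfig, and the
   * generated serde names embed a random UUID): an input stream with id "pageViews" whose value serde
   * is a JsonSerde might end up with entries along the lines of
   * <pre>{@code
   *   serializers.registry.JsonSerde-<uuid>.<serialized-instance-suffix> = <base64-encoded serde bytes>
   *   streams.pageViews.samza.msg.serde = JsonSerde-<uuid>
   * }</pre>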
   *
   * @param configs the configs to add serialized serde instances and stream serde configs to
   */
  void addSerdeConfigs(Map<String, String> configs) {
    // collect all key and msg serde instances for streams
    Map<String, Serde> streamKeySerdes = new HashMap<>();
    Map<String, Serde> streamMsgSerdes = new HashMap<>();
    Map<StreamSpec, InputOperatorSpec> inputOperators = streamGraph.getInputOperators();
    inEdges.forEach(edge -> {
      String streamId = edge.getStreamSpec().getId();
      InputOperatorSpec inputOperatorSpec = inputOperators.get(edge.getStreamSpec());
      streamKeySerdes.put(streamId, inputOperatorSpec.getKeySerde());
      streamMsgSerdes.put(streamId, inputOperatorSpec.getValueSerde());
    });
    Map<StreamSpec, OutputStreamImpl> outputStreams = streamGraph.getOutputStreams();
    outEdges.forEach(edge -> {
      String streamId = edge.getStreamSpec().getId();
      OutputStreamImpl outputStream = outputStreams.get(edge.getStreamSpec());
      streamKeySerdes.put(streamId, outputStream.getKeySerde());
      streamMsgSerdes.put(streamId, outputStream.getValueSerde());
    });

    // collect all key and msg serde instances for stores
    Map<String, Serde> storeKeySerdes = new HashMap<>();
    Map<String, Serde> storeMsgSerdes = new HashMap<>();
    streamGraph.getAllOperatorSpecs().forEach(opSpec -> {
      if (opSpec instanceof StatefulOperatorSpec) {
        ((StatefulOperatorSpec) opSpec).getStoreDescriptors().forEach(storeDescriptor -> {
          storeKeySerdes.put(storeDescriptor.getStoreName(), storeDescriptor.getKeySerde());
          storeMsgSerdes.put(storeDescriptor.getStoreName(), storeDescriptor.getMsgSerde());
        });
      }
    });

    // collect all key and msg serde instances for tables
    Map<String, Serde> tableKeySerdes = new HashMap<>();
    Map<String, Serde> tableValueSerdes = new HashMap<>();
    tables.forEach(tableSpec -> {
      tableKeySerdes.put(tableSpec.getId(), tableSpec.getSerde().getKeySerde());
      tableValueSerdes.put(tableSpec.getId(), tableSpec.getSerde().getValueSerde());
    });

    // for each unique stream, store, or table serde instance, generate a unique name and serialize to config
    HashSet<Serde> serdes = new HashSet<>(streamKeySerdes.values());
    serdes.addAll(streamMsgSerdes.values());
    serdes.addAll(storeKeySerdes.values());
    serdes.addAll(storeMsgSerdes.values());
    serdes.addAll(tableKeySerdes.values());
    serdes.addAll(tableValueSerdes.values());
    SerializableSerde<Serde> serializableSerde = new SerializableSerde<>();
    Base64.Encoder base64Encoder = Base64.getEncoder();
    Map<Serde, String> serdeUUIDs = new HashMap<>();
    serdes.forEach(serde -> {
      String serdeName = serdeUUIDs.computeIfAbsent(serde,
          s -> serde.getClass().getSimpleName() + "-" + UUID.randomUUID().toString());
      configs.putIfAbsent(String.format(SerializerConfig.SERDE_SERIALIZED_INSTANCE(), serdeName),
          base64Encoder.encodeToString(serializableSerde.toBytes(serde)));
    });

    // set key and msg serdes for streams to the serde names generated above
    streamKeySerdes.forEach((streamId, serde) -> {
      String streamIdPrefix = String.format(StreamConfig.STREAM_ID_PREFIX(), streamId);
      String keySerdeConfigKey = streamIdPrefix + StreamConfig.KEY_SERDE();
      configs.put(keySerdeConfigKey, serdeUUIDs.get(serde));
    });

    streamMsgSerdes.forEach((streamId, serde) -> {
      String streamIdPrefix = String.format(StreamConfig.STREAM_ID_PREFIX(), streamId);
      String valueSerdeConfigKey = streamIdPrefix + StreamConfig.MSG_SERDE();
      configs.put(valueSerdeConfigKey, serdeUUIDs.get(serde));
    });

    // set key and msg serdes for stores to the serde names generated above
    storeKeySerdes.forEach((storeName, serde) -> {
      String keySerdeConfigKey = String.format(StorageConfig.KEY_SERDE(), storeName);
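      // e.g. maps "stores.<storeName>.key.serde" (shape per StorageConfig.KEY_SERDE()) to the generated serde name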
      configs.put(keySerdeConfigKey, serdeUUIDs.get(serde));
    });

    storeMsgSerdes.forEach((storeName, serde) -> {
      String msgSerdeConfigKey = String.format(StorageConfig.MSG_SERDE(), storeName);
      configs.put(msgSerdeConfigKey, serdeUUIDs.get(serde));
    });

    // set key and msg serdes for tables to the serde names generated above
    tableKeySerdes.forEach((tableId, serde) -> {
      String keySerdeConfigKey = String.format(JavaTableConfig.TABLE_KEY_SERDE, tableId);
      configs.put(keySerdeConfigKey, serdeUUIDs.get(serde));
    });
    tableValueSerdes.forEach((tableId, serde) -> {
      String valueSerdeConfigKey = String.format(JavaTableConfig.TABLE_VALUE_SERDE, tableId);
      configs.put(valueSerdeConfigKey, serdeUUIDs.get(serde));
    });
  }

  /**
   * Computes the triggering interval to use during the execution of this {@link JobNode}.
   * The interval is the GCD of all window trigger intervals and join TTLs in the graph, e.g. a job
   * with a 10000 ms default window trigger and a 15000 ms join TTL gets gcd(10000, 15000) = 5000 ms.
   */
  private long computeTriggerInterval() {
    // Obtain the operator specs from the streamGraph
    Collection<OperatorSpec> operatorSpecs = streamGraph.getAllOperatorSpecs();

    // Select the window operators, and obtain a list of their triggering interval values
    List<Long> windowTimerIntervals = operatorSpecs.stream()
        .filter(spec -> spec.getOpCode() == OperatorSpec.OpCode.WINDOW)
        .map(spec -> ((WindowOperatorSpec) spec).getDefaultTriggerMs())
        .collect(Collectors.toList());

    // Select the join operators, and obtain a list of their TTL values
    List<Long> joinTtlIntervals = operatorSpecs.stream()
        .filter(spec -> spec instanceof JoinOperatorSpec)
        .map(spec -> ((JoinOperatorSpec) spec).getTtlMs())
        .collect(Collectors.toList());

    // Combine both the above lists
    List<Long> candidateTimerIntervals = new ArrayList<>(joinTtlIntervals);
    candidateTimerIntervals.addAll(windowTimerIntervals);

    if (candidateTimerIntervals.isEmpty()) {
      return -1;
    }

    // Compute the gcd of the resultant list
    return MathUtil.gcd(candidateTimerIntervals);
  }

  /**
   * Extracts the subset of configs scoped under the given prefix from the full config, and uses it to
   * override the configs generated for the job.
   * @param fullConfig full config
   * @param generatedConfig config generated for the job
   * @param configPrefix prefix to extract the subset of the config overrides
   * @return config that merges the generated configs and overrides
   */
  private static Config extractScopedConfig(Config fullConfig, Config generatedConfig, String configPrefix) {
    Config scopedConfig = fullConfig.subset(configPrefix);

    // Later entries take precedence: scoped overrides win over generated configs, which win over the full config.
    Config[] configPrecedence = new Config[] {fullConfig, generatedConfig, scopedConfig};
    // Strip empty configs so they don't override the configs before them.
    Map<String, String> mergedConfig = new HashMap<>();
    for (Map<String, String> config : configPrecedence) {
      for (Map.Entry<String, String> property : config.entrySet()) {
        String value = property.getValue();
        if (!(value == null || value.isEmpty())) {
          mergedConfig.put(property.getKey(), property.getValue());
        }
      }
    }
    scopedConfig = new MapConfig(mergedConfig);
    log.debug("Prefix '{}' has merged config {}", configPrefix, scopedConfig);

    return scopedConfig;
  }

  static String createId(String jobName, String jobId) {
    return String.format("%s-%s", jobName, jobId);
  }
}