com.google.cloud.dataflow.sdk.runners.inprocess.InProcessEvaluationContext.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.dataflow.sdk.runners.inprocess.InProcessEvaluationContext.java

Source

/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.dataflow.sdk.runners.inprocess;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.runners.inprocess.CommittedResult.OutputType;
import com.google.cloud.dataflow.sdk.runners.inprocess.GroupByKeyEvaluatorFactory.InProcessGroupByKeyOnly;
import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.FiredTimers;
import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TransformWatermarks;
import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.PCollectionViewWriter;
import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger;
import com.google.cloud.dataflow.sdk.util.ExecutionContext;
import com.google.cloud.dataflow.sdk.util.ReadyCheckingSideInputReader;
import com.google.cloud.dataflow.sdk.util.SideInputReader;
import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
import com.google.cloud.dataflow.sdk.util.common.CounterSet;
import com.google.cloud.dataflow.sdk.util.state.CopyOnAccessInMemoryStateInternals;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.util.concurrent.MoreExecutors;
import org.joda.time.Instant;
import java.util.Collection;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import javax.annotation.Nullable;

/**
 * The evaluation context for a specific pipeline being executed by the
 * {@link InProcessPipelineRunner}. Contains state shared within the execution across all
 * transforms.
 *
 * <p>{@link InProcessEvaluationContext} contains shared state for an execution of the
 * {@link InProcessPipelineRunner} that can be used while evaluating a {@link PTransform}. This
 * consists of views into underlying state and watermark implementations, access to read and write
 * {@link PCollectionView PCollectionViews}, and constructing {@link CounterSet CounterSets} and
 * {@link ExecutionContext ExecutionContexts}. This includes executing callbacks asynchronously when
 * state changes to the appropriate point (e.g. when a {@link PCollectionView} is requested and
 * known to be empty).
 *
 * <p>{@link InProcessEvaluationContext} also handles results by committing and finalizing bundles
 * based on the current global state and updating the global state appropriately. This includes
 * updating the per-{@link StepAndKey} state, updating global watermarks, and executing any
 * callbacks that can be executed.
 */
class InProcessEvaluationContext {
    /** The step name for each {@link AppliedPTransform} in the {@link Pipeline}. */
    private final Map<AppliedPTransform<?, ?, ?>, String> stepNames;

    /** The options that were used to create this {@link Pipeline}. */
    private final InProcessPipelineOptions options;

    /** The source of the current time reported by {@link #now()}. */
    private final Clock clock;

    /** Creates every {@link UncommittedBundle} handed out by this context. */
    private final BundleFactory bundleFactory;

    /** The current processing time and event time watermarks and timers. */
    private final InMemoryWatermarkManager watermarkManager;

    /** Executes callbacks based on the progression of the watermark. */
    private final WatermarkCallbackExecutor callbackExecutor;

    /** The stateInternals of the world, by applied PTransform and key. */
    private final ConcurrentMap<StepAndKey, CopyOnAccessInMemoryStateInternals<?>> applicationStateInternals;

    /**
     * Backs {@link #createPCollectionViewWriter(PCollection, PCollectionView)} and
     * {@link #createSideInputReader(List)}.
     */
    private final InProcessSideInputContainer sideInputContainer;

    /** Accumulates the counters of every result passed to {@code handleResult}. */
    private final CounterSet mergedCounters;

    /**
     * Creates a new {@link InProcessEvaluationContext}.
     *
     * <p>Every argument other than {@code clock} is rejected with a {@link NullPointerException} if
     * it is {@code null}.
     *
     * @param options the options of the pipeline being executed
     * @param clock the clock used for {@link #now()} and handed to the watermark manager
     * @param bundleFactory the factory used to create all bundles
     * @param rootTransforms the root transforms, handed to the watermark manager
     * @param valueToConsumers the consumers of each {@link PValue}, handed to the watermark manager
     * @param stepNames the step name of each transform in the pipeline
     * @param views the views handed to the side input container
     */
    public static InProcessEvaluationContext create(InProcessPipelineOptions options, Clock clock,
            BundleFactory bundleFactory, Collection<AppliedPTransform<?, ?, ?>> rootTransforms,
            Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers,
            Map<AppliedPTransform<?, ?, ?>, String> stepNames, Collection<PCollectionView<?>> views) {
        return new InProcessEvaluationContext(options, clock, bundleFactory, rootTransforms, valueToConsumers,
                stepNames, views);
    }

    private InProcessEvaluationContext(InProcessPipelineOptions options, Clock clock, BundleFactory bundleFactory,
            Collection<AppliedPTransform<?, ?, ?>> rootTransforms,
            Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers,
            Map<AppliedPTransform<?, ?, ?>, String> stepNames, Collection<PCollectionView<?>> views) {
        this.options = checkNotNull(options);
        // Unlike the other arguments, the clock is stored without a null check.
        this.clock = clock;
        this.bundleFactory = checkNotNull(bundleFactory);
        checkNotNull(rootTransforms);
        checkNotNull(valueToConsumers);
        checkNotNull(stepNames);
        checkNotNull(views);
        // The caller's map is kept by reference, not copied.
        this.stepNames = stepNames;

        this.watermarkManager = InMemoryWatermarkManager.create(clock, rootTransforms, valueToConsumers);
        // NOTE(review): `this` is handed out before the fields below are assigned; presumably
        // the container only stores the reference during create() - confirm.
        this.sideInputContainer = InProcessSideInputContainer.create(this, views);

        this.applicationStateInternals = new ConcurrentHashMap<>();
        this.mergedCounters = new CounterSet();

        this.callbackExecutor = WatermarkCallbackExecutor.create(MoreExecutors.directExecutor());
    }

    /**
     * Handle the provided {@link InProcessTransformResult}, produced after evaluating the provided
     * {@link CommittedBundle} (potentially null, if the result of a root {@link PTransform}).
     *
     * <p>The result is the output of running the transform contained in the
     * {@link InProcessTransformResult} on the contents of the provided bundle.
     *
     * <p>In order, this commits the output bundles of the result, updates the watermarks and
     * timers, merges the counters of the result (if any) and replaces or removes the committed
     * state stored for the transform and key (if the result carries state).
     *
     * @param completedBundle the bundle that was processed to produce the result. Potentially
     *                        {@code null} if the transform that produced the result is a root
     *                        transform
     * @param completedTimers the timers that were delivered to produce the {@code completedBundle},
     *                        or an empty iterable if no timers were delivered
     * @param result the result of evaluating the input bundle
     * @return the committed bundles contained within the handled {@code result}
     */
    // The unprocessed elements are re-attached to a wildcard-typed bundle, which needs a raw cast.
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public CommittedResult handleResult(@Nullable CommittedBundle<?> completedBundle,
            Iterable<TimerData> completedTimers, InProcessTransformResult result) {
        Iterable<? extends CommittedBundle<?>> committedBundles = commitBundles(result.getOutputBundles());
        // EnumSet.copyOf(Collection) rejects an empty collection that is not itself an EnumSet,
        // so start from an empty set and add to it; this accepts any (possibly empty) collection.
        EnumSet<OutputType> outputTypes = EnumSet.noneOf(OutputType.class);
        outputTypes.addAll(result.getOutputTypes());
        // BUNDLE is reported if and only if at least one non-empty bundle was committed,
        // regardless of what the result itself claimed.
        if (Iterables.isEmpty(committedBundles)) {
            outputTypes.remove(OutputType.BUNDLE);
        } else {
            outputTypes.add(OutputType.BUNDLE);
        }
        CommittedResult committedResult = CommittedResult.create(result,
                completedBundle == null ? null
                        : completedBundle.withElements((Iterable) result.getUnprocessedElements()),
                committedBundles, outputTypes);
        // Update watermarks and timers
        watermarkManager.updateWatermarks(completedBundle,
                result.getTimerUpdate().withCompletedTimers(completedTimers), committedResult,
                result.getWatermarkHold());
        // Update counters
        if (result.getCounters() != null) {
            mergedCounters.merge(result.getCounters());
        }
        // Update state internals
        CopyOnAccessInMemoryStateInternals<?> theirState = result.getState();
        if (theirState != null) {
            CopyOnAccessInMemoryStateInternals<?> committedState = theirState.commit();
            StepAndKey stepAndKey = StepAndKey.of(result.getTransform(),
                    completedBundle == null ? null : completedBundle.getKey());
            // Empty state is dropped rather than stored.
            if (!committedState.isEmpty()) {
                applicationStateInternals.put(stepAndKey, committedState);
            } else {
                applicationStateInternals.remove(stepAndKey);
            }
        }
        return committedResult;
    }

    /**
     * Commits each of the provided bundles at the synchronized processing output time of the
     * transform that produced its {@link PCollection}, and returns the committed bundles that
     * contain at least one element.
     */
    private Iterable<? extends CommittedBundle<?>> commitBundles(Iterable<? extends UncommittedBundle<?>> bundles) {
        ImmutableList.Builder<CommittedBundle<?>> completed = ImmutableList.builder();
        for (UncommittedBundle<?> inProgress : bundles) {
            AppliedPTransform<?, ?, ?> producing = inProgress.getPCollection().getProducingTransformInternal();
            TransformWatermarks watermarks = watermarkManager.getWatermarks(producing);
            CommittedBundle<?> committed = inProgress.commit(watermarks.getSynchronizedProcessingOutputTime());
            // Empty bundles don't impact watermarks and shouldn't trigger downstream execution, so
            // filter them out
            if (!Iterables.isEmpty(committed.getElements())) {
                completed.add(committed);
            }
        }
        return completed.build();
    }

    /** Fires the available callbacks of every step in the pipeline. */
    private void fireAllAvailableCallbacks() {
        for (AppliedPTransform<?, ?, ?> transform : stepNames.keySet()) {
            fireAvailableCallbacks(transform);
        }
    }

    /**
     * Asks the callback executor to fire for the current output watermark of the provided
     * transform.
     */
    private void fireAvailableCallbacks(AppliedPTransform<?, ?, ?> producingTransform) {
        TransformWatermarks watermarks = watermarkManager.getWatermarks(producingTransform);
        callbackExecutor.fireForWatermark(producingTransform, watermarks.getOutputWatermark());
    }

    /**
     * Create a {@link UncommittedBundle} for use by a source.
     */
    public <T> UncommittedBundle<T> createRootBundle(PCollection<T> output) {
        return bundleFactory.createRootBundle(output);
    }

    /**
     * Create a {@link UncommittedBundle} whose elements belong to the specified {@link
     * PCollection}.
     */
    public <T> UncommittedBundle<T> createBundle(CommittedBundle<?> input, PCollection<T> output) {
        return bundleFactory.createBundle(input, output);
    }

    /**
     * Create a {@link UncommittedBundle} with the specified keys at the specified step. For use by
     * {@link InProcessGroupByKeyOnly} {@link PTransform PTransforms}.
     */
    public <K, T> UncommittedBundle<T> createKeyedBundle(CommittedBundle<?> input, StructuralKey<K> key,
            PCollection<T> output) {
        return bundleFactory.createKeyedBundle(input, key, output);
    }

    /**
     * Create a {@link PCollectionViewWriter}, whose elements will be used in the provided
     * {@link PCollectionView}.
     *
     * <p>The {@code input} collection is not consulted; every write goes to the side input
     * container under {@code output}.
     */
    public <ElemT, ViewT> PCollectionViewWriter<ElemT, ViewT> createPCollectionViewWriter(
            PCollection<Iterable<ElemT>> input, final PCollectionView<ViewT> output) {
        return new PCollectionViewWriter<ElemT, ViewT>() {
            @Override
            public void add(Iterable<WindowedValue<ElemT>> values) {
                sideInputContainer.write(output, values);
            }
        };
    }

    /**
     * Schedule a callback to be executed after output would be produced for the given window
     * if there had been input.
     *
     * <p>Output would be produced when the watermark for a {@link PValue} passes the point at
     * which the trigger for the specified window (with the specified windowing strategy) must have
     * fired from the perspective of that {@link PValue}, as specified by the value of
     * {@link Trigger#getWatermarkThatGuaranteesFiring(BoundedWindow)} for the trigger of the
     * {@link WindowingStrategy}. When the callback has fired, either values will have been produced
     * for a key in that window, the window is empty, or all elements in the window are late. The
     * callback will be executed regardless of whether values have been produced.
     */
    public void scheduleAfterOutputWouldBeProduced(PValue value, BoundedWindow window,
            WindowingStrategy<?, ?> windowingStrategy, Runnable runnable) {
        AppliedPTransform<?, ?, ?> producing = getProducing(value);
        callbackExecutor.callOnGuaranteedFiring(producing, window, windowingStrategy, runnable);

        // Fire for the same transform the callback was just registered against, so that a
        // callback whose watermark has already been reached is not left waiting, and so that the
        // steps do not have to be scanned a second time.
        fireAvailableCallbacks(producing);
    }

    /**
     * Returns the transform recorded on the value as its producer, falling back to a search of
     * the known steps when the value has none recorded. May return {@code null} if neither
     * yields a transform.
     */
    @Nullable
    private AppliedPTransform<?, ?, ?> getProducing(PValue value) {
        AppliedPTransform<?, ?, ?> recorded = value.getProducingTransformInternal();
        return recorded != null ? recorded : lookupProducing(value);
    }

    /**
     * Returns the first known step whose output is, or expands to contain, the provided value, or
     * {@code null} if there is no such step.
     */
    @Nullable
    private AppliedPTransform<?, ?, ?> lookupProducing(PValue value) {
        for (AppliedPTransform<?, ?, ?> transform : stepNames.keySet()) {
            if (transform.getOutput().equals(value) || transform.getOutput().expand().contains(value)) {
                return transform;
            }
        }
        return null;
    }

    /**
     * Get the options used by this {@link Pipeline}.
     */
    public InProcessPipelineOptions getPipelineOptions() {
        return options;
    }

    /**
     * Get an {@link ExecutionContext} for the provided {@link AppliedPTransform} and key.
     *
     * <p>The context is given the state last committed for that transform and key, which is
     * {@code null} if none is stored, and the watermarks of the transform.
     */
    public InProcessExecutionContext getExecutionContext(AppliedPTransform<?, ?, ?> application,
            StructuralKey<?> key) {
        StepAndKey stepAndKey = StepAndKey.of(application, key);
        // The map stores state with a wildcard type argument, so the element type has to be
        // asserted here.
        @SuppressWarnings("unchecked")
        CopyOnAccessInMemoryStateInternals<Object> existingState =
                (CopyOnAccessInMemoryStateInternals<Object>) applicationStateInternals.get(stepAndKey);
        return new InProcessExecutionContext(clock, key, existingState,
                watermarkManager.getWatermarks(application));
    }

    /**
     * Get all of the steps used in this {@link Pipeline}.
     *
     * <p>The returned collection is the key set of the step name map supplied at construction,
     * not a copy.
     */
    public Collection<AppliedPTransform<?, ?, ?>> getSteps() {
        return stepNames.keySet();
    }

    /**
     * Get the Step Name for the provided application.
     */
    public String getStepName(AppliedPTransform<?, ?, ?> application) {
        return stepNames.get(application);
    }

    /**
     * Returns a {@link ReadyCheckingSideInputReader} capable of reading the provided
     * {@link PCollectionView PCollectionViews}.
     *
     * @param sideInputs the {@link PCollectionView PCollectionViews} the result should be able to
     * read
     * @return a {@link SideInputReader} that can read all of the provided {@link PCollectionView
     * PCollectionViews}
     */
    public ReadyCheckingSideInputReader createSideInputReader(final List<PCollectionView<?>> sideInputs) {
        return sideInputContainer.createReaderForViews(sideInputs);
    }

    /**
     * Create a {@link CounterSet} for this {@link Pipeline}. The {@link CounterSet} is independent
     * of all other {@link CounterSet CounterSets} created by this call.
     *
     * <p>The {@link InProcessEvaluationContext} is responsible for unifying the counters present in
     * all created {@link CounterSet CounterSets} when the transforms that call this method
     * complete.
     */
    public CounterSet createCounterSet() {
        return new CounterSet();
    }

    /**
     * Returns all of the counters that have been merged into this context via calls to
     * {@link CounterSet#merge(CounterSet)}.
     */
    public CounterSet getCounters() {
        return mergedCounters;
    }

    /**
     * Refreshes all watermarks and then fires the available callbacks of every step.
     */
    @VisibleForTesting
    void forceRefresh() {
        watermarkManager.refreshAll();
        fireAllAvailableCallbacks();
    }

    /**
     * Extracts all timers that have been fired and have not already been extracted.
     *
     * <p>This is a destructive operation. Timers will only appear in the result of this method once
     * for each time they are set.
     *
     * <p>Watermarks are refreshed and available callbacks are fired before the timers are
     * extracted.
     */
    public Map<AppliedPTransform<?, ?, ?>, Map<StructuralKey<?>, FiredTimers>> extractFiredTimers() {
        forceRefresh();
        return watermarkManager.extractFiredTimers();
    }

    /**
     * Returns true if the step will not produce additional output.
     *
     * <p>If the provided transform produces only {@link IsBounded#BOUNDED}
     * {@link PCollection PCollections}, returns true if the watermark is at
     * {@link BoundedWindow#TIMESTAMP_MAX_VALUE positive infinity}.
     *
     * <p>If the provided transform produces any {@link IsBounded#UNBOUNDED}
     * {@link PCollection PCollections}, returns the value of
     * {@link InProcessPipelineOptions#isShutdownUnboundedProducersWithMaxWatermark()}.
     */
    public boolean isDone(AppliedPTransform<?, ?, ?> transform) {
        // if the PTransform's watermark isn't at the max value, it isn't done
        if (watermarkManager.getWatermarks(transform).getOutputWatermark()
                .isBefore(BoundedWindow.TIMESTAMP_MAX_VALUE)) {
            return false;
        }
        // If the PTransform has any unbounded outputs, and unbounded producers should not be shut down,
        // the PTransform may produce additional output. It is not done.
        for (PValue output : transform.getOutput().expand()) {
            if (output instanceof PCollection) {
                IsBounded bounded = ((PCollection<?>) output).isBounded();
                if (bounded.equals(IsBounded.UNBOUNDED)
                        && !options.isShutdownUnboundedProducersWithMaxWatermark()) {
                    return false;
                }
            }
        }
        // The PTransform's watermark was at positive infinity and all of its outputs are known to be
        // done. It is done.
        return true;
    }

    /**
     * Returns true if all steps are done.
     */
    public boolean isDone() {
        for (AppliedPTransform<?, ?, ?> transform : stepNames.keySet()) {
            if (!isDone(transform)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the current time according to the clock of this context.
     */
    public Instant now() {
        return clock.now();
    }

    /**
     * Returns the clock this context was created with.
     */
    Clock getClock() {
        return clock;
    }
}