Java tutorial
/* * Copyright (C) 2016 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.google.cloud.dataflow.sdk.io; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import com.google.cloud.dataflow.sdk.io.CountingSource.NowTimestampFn; import com.google.cloud.dataflow.sdk.io.Read.Unbounded; import com.google.cloud.dataflow.sdk.transforms.PTransform; import com.google.cloud.dataflow.sdk.transforms.SerializableFunction; import com.google.cloud.dataflow.sdk.transforms.display.DisplayData; import com.google.cloud.dataflow.sdk.values.PBegin; import com.google.cloud.dataflow.sdk.values.PCollection; import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded; import com.google.common.base.Optional; import org.joda.time.Duration; import org.joda.time.Instant; /** * A {@link PTransform} that produces longs. When used to produce a * {@link IsBounded#BOUNDED bounded} {@link PCollection}, {@link CountingInput} starts at {@code 0} * and counts up to a specified maximum. When used to produce an * {@link IsBounded#UNBOUNDED unbounded} {@link PCollection}, it counts up to {@link Long#MAX_VALUE} * and then never produces more output. (In practice, this limit should never be reached.) * * <p>The bounded {@link CountingInput} is implemented based on {@link OffsetBasedSource} and * {@link OffsetBasedSource.OffsetBasedReader}, so it performs efficient initial splitting and it * supports dynamic work rebalancing. * * <p>To produce a bounded {@code PCollection<Long>}, use {@link CountingInput#upTo(long)}: * * <pre>{@code * Pipeline p = ... * PTransform<PBegin, PCollection<Long>> producer = CountingInput.upTo(1000); * PCollection<Long> bounded = p.apply(producer); * }</pre> * * <p>To produce an unbounded {@code PCollection<Long>}, use {@link CountingInput#unbounded()}, * calling {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} to provide values * with timestamps other than {@link Instant#now}. * * <pre>{@code * Pipeline p = ... * * // To create an unbounded producer that uses processing time as the element timestamp. * PCollection<Long> unbounded = p.apply(CountingInput.unbounded()); * // Or, to create an unbounded source that uses a provided function to set the element timestamp. * PCollection<Long> unboundedWithTimestamps = * p.apply(CountingInput.unbounded().withTimestampFn(someFn)); * }</pre> */ public class CountingInput { /** * Creates a {@link BoundedCountingInput} that will produce the specified number of elements, * from {@code 0} to {@code numElements - 1}. */ public static BoundedCountingInput upTo(long numElements) { checkArgument(numElements > 0, "numElements (%s) must be greater than 0", numElements); return new BoundedCountingInput(numElements); } /** * Creates an {@link UnboundedCountingInput} that will produce numbers starting from {@code 0} up * to {@link Long#MAX_VALUE}. * * <p>After {@link Long#MAX_VALUE}, the transform never produces more output. (In practice, this * limit should never be reached.) * * <p>Elements in the resulting {@link PCollection PCollection<Long>} will by default have * timestamps corresponding to processing time at element generation, provided by * {@link Instant#now}. Use the transform returned by * {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} to control the output * timestamps. */ public static UnboundedCountingInput unbounded() { return new UnboundedCountingInput(new NowTimestampFn(), 1L /* Elements per period */, Duration.ZERO /* Period length */, Optional.<Long>absent() /* Maximum number of records */, Optional.<Duration>absent() /* Maximum read duration */); } /** * A {@link PTransform} that will produce a specified number of {@link Long Longs} starting from * 0. */ public static class BoundedCountingInput extends PTransform<PBegin, PCollection<Long>> { private final long numElements; private BoundedCountingInput(long numElements) { this.numElements = numElements; } @SuppressWarnings("deprecation") @Override public PCollection<Long> apply(PBegin begin) { return begin.apply(Read.from(CountingSource.upTo(numElements))); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("upTo", numElements).withLabel("Count Up To")); } } /** * A {@link PTransform} that will produce numbers starting from {@code 0} up to * {@link Long#MAX_VALUE}. * * <p>After {@link Long#MAX_VALUE}, the transform never produces more output. (In practice, this * limit should never be reached.) * * <p>Elements in the resulting {@link PCollection PCollection<Long>} will by default have * timestamps corresponding to processing time at element generation, provided by * {@link Instant#now}. Use the transform returned by * {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} to control the output * timestamps. */ public static class UnboundedCountingInput extends PTransform<PBegin, PCollection<Long>> { private final SerializableFunction<Long, Instant> timestampFn; private final long elementsPerPeriod; private final Duration period; private final Optional<Long> maxNumRecords; private final Optional<Duration> maxReadTime; private UnboundedCountingInput(SerializableFunction<Long, Instant> timestampFn, long elementsPerPeriod, Duration period, Optional<Long> maxNumRecords, Optional<Duration> maxReadTime) { this.timestampFn = timestampFn; this.elementsPerPeriod = elementsPerPeriod; this.period = period; this.maxNumRecords = maxNumRecords; this.maxReadTime = maxReadTime; } /** * Returns an {@link UnboundedCountingInput} like this one, but where output elements have the * timestamp specified by the timestampFn. * * <p>Note that the timestamps produced by {@code timestampFn} may not decrease. */ public UnboundedCountingInput withTimestampFn(SerializableFunction<Long, Instant> timestampFn) { return new UnboundedCountingInput(timestampFn, elementsPerPeriod, period, maxNumRecords, maxReadTime); } /** * Returns an {@link UnboundedCountingInput} like this one, but that will read at most the * specified number of elements. * * <p>A bounded amount of elements will be produced by the result transform, and the result * {@link PCollection} will be {@link IsBounded#BOUNDED bounded}. */ public UnboundedCountingInput withMaxNumRecords(long maxRecords) { checkArgument(maxRecords > 0, "MaxRecords must be a positive (nonzero) value. Got %s", maxRecords); return new UnboundedCountingInput(timestampFn, elementsPerPeriod, period, Optional.of(maxRecords), maxReadTime); } /** * Returns an {@link UnboundedCountingInput} like this one, but with output production limited * to an aggregate rate of no more than the number of elements per the period length. * * <p>Note that when there are multiple splits, each split outputs independently. This may lead * to elements not being produced evenly across time, though the aggregate rate will still * approach the specified rate. * * <p>A duration of {@link Duration#ZERO} will produce output as fast as possible. */ public UnboundedCountingInput withRate(long numElements, Duration periodLength) { return new UnboundedCountingInput(timestampFn, numElements, periodLength, maxNumRecords, maxReadTime); } /** * Returns an {@link UnboundedCountingInput} like this one, but that will read for at most the * specified amount of time. * * <p>A bounded amount of elements will be produced by the result transform, and the result * {@link PCollection} will be {@link IsBounded#BOUNDED bounded}. */ public UnboundedCountingInput withMaxReadTime(Duration readTime) { checkNotNull(readTime, "ReadTime cannot be null"); return new UnboundedCountingInput(timestampFn, elementsPerPeriod, period, maxNumRecords, Optional.of(readTime)); } @SuppressWarnings("deprecation") @Override public PCollection<Long> apply(PBegin begin) { Unbounded<Long> read = Read.from(CountingSource.createUnbounded().withTimestampFn(timestampFn) .withRate(elementsPerPeriod, period)); if (!maxNumRecords.isPresent() && !maxReadTime.isPresent()) { return begin.apply(read); } else if (maxNumRecords.isPresent() && !maxReadTime.isPresent()) { return begin.apply(read.withMaxNumRecords(maxNumRecords.get())); } else if (!maxNumRecords.isPresent() && maxReadTime.isPresent()) { return begin.apply(read.withMaxReadTime(maxReadTime.get())); } else { return begin.apply(read.withMaxReadTime(maxReadTime.get()).withMaxNumRecords(maxNumRecords.get())); } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("timestampFn", timestampFn.getClass()).withLabel("Timestamp Function")); if (maxReadTime.isPresent()) { builder.add(DisplayData.item("maxReadTime", maxReadTime.get()).withLabel("Maximum Read Time")); } if (maxNumRecords.isPresent()) { builder.add(DisplayData.item("maxRecords", maxNumRecords.get()).withLabel("Maximum Read Records")); } } } }