// Java tutorial
/**
 * Copyright (c) 2016 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.lib.dedup;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import javax.validation.constraints.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.commons.lang.mutable.MutableLong;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import com.datatorrent.api.AutoMetric;
import com.datatorrent.api.Context;
import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.Context.PortContext;
import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.Operator;
import com.datatorrent.api.Operator.ActivationListener;
import com.datatorrent.api.Stats;
import com.datatorrent.api.StatsListener;
import com.datatorrent.api.StreamCodec;
import com.datatorrent.api.annotation.InputPortFieldAnnotation;
import com.datatorrent.api.annotation.OperatorAnnotation;
import com.datatorrent.lib.bucket.AbstractBucket;
import com.datatorrent.lib.bucket.AbstractBucketManager;
import com.datatorrent.lib.bucket.BucketManager;
import com.datatorrent.lib.counters.BasicCounters;
import com.datatorrent.netlet.util.DTThrowable;

/**
 * This is the base implementation of an de-duplication operator.<br/>
 * Subclasses must implement the {@link #getEventKey(Object)} method which extracts the key from an event. This key
 * is used for deciding whether an event is a duplicate or not.
 * <p/>
 * Processing of an event involves:
 * <ol>
 * <li>Finding the bucket key of an event by calling {@link BucketManager#getBucketKeyFor(Object)}.</li>
 * <li>Getting the bucket from {@link BucketManager} by calling {@link BucketManager#getBucket(long)}.</li>
 * <li>
 * If the bucket is not loaded:
 * <ol>
 * <li>it requests the {@link BucketManager} to load the bucket which is a non-blocking call.</li>
 * <li>Adds the event to {@link #waitingEvents} which is a collection of events that are waiting for buckets to be
 * loaded.</li>
 * <li>{@link BucketManager} loads the bucket and informs deduper by calling {@link #bucketLoaded(AbstractBucket)}</li>
 * <li>The deduper then processes the waiting events in {@link #handleIdleTime()}</li>
 * </ol>
 * <li>
 * If the bucket is loaded, the operator drops the event if it is already present in the bucket; emits it otherwise.
 * </li>
 * </ol>
 * </p>
 *
 * <p>
 * Based on the assumption that duplicate events fall in the same bucket.
 * </p>
 *
 * Additionally it also has the following features:
 * {@link #orderedOutput}: Whether or not the order of input tuples is preserved
 *
 * @displayName Deduper
 * @category Deduplication
 * @tags dedupe
 *
 * @param <INPUT> type of input tuple
 * @param <OUTPUT> type of output tuple
 * @since 0.9.4
 */
@OperatorAnnotation(checkpointableWithinAppWindow = false)
public abstract class AbstractDeduper<INPUT, OUTPUT>
    implements Operator, BucketManager.Listener<INPUT>, Operator.IdleTimeHandler, ActivationListener<Context>
{
  /**
   * The input port on which events are received.
   */
  @InputPortFieldAnnotation(optional = true)
  public final transient DefaultInputPort<INPUT> input = new DefaultInputPort<INPUT>()
  {
    @Override
    public void setup(Context.PortContext context)
    {
      // Capture the concrete tuple class so the bucket manager can work with POJO events.
      pojoClass = context.getAttributes().get(PortContext.TUPLE_CLASS);
    }

    @Override
    public final void process(INPUT tuple)
    {
      processTuple(tuple);
    }

    @Override
    public StreamCodec<INPUT> getStreamCodec()
    {
      return getDeduperStreamCodec();
    }
  };

  /**
   * The output port on which deduped events are emitted.
   */
  public final transient DefaultOutputPort<OUTPUT> output = new DefaultOutputPort<OUTPUT>();

  /**
   * The output port on which duplicate events are emitted.
   */
  public final transient DefaultOutputPort<INPUT> duplicates = new DefaultOutputPort<INPUT>();

  /**
   * The output port on which expired events are emitted.
   */
  public final transient DefaultOutputPort<OUTPUT> expired = new DefaultOutputPort<OUTPUT>();

  /**
   * The output port on which error events are emitted.
   */
  public final transient DefaultOutputPort<OUTPUT> error = new DefaultOutputPort<OUTPUT>();

  //Check-pointed state
  @NotNull
  protected BucketManager<INPUT> bucketManager;
  //bucketKey -> list of bucketData which belong to that bucket and are waiting for the bucket to be loaded.
  @NotNull
  protected final Map<Long, List<INPUT>> waitingEvents;
  protected Set<Integer> partitionKeys;
  protected int partitionMask;
  protected boolean orderedOutput = false;
  /**
   * Map to hold the result of a tuple processing (unique, duplicate, expired or error) until previous
   * tuples get processed. This is used only when {@link #orderedOutput} is true.
   */
  protected transient Map<INPUT, Decision> decisions;

  //Non check-pointed state
  protected final transient BlockingQueue<AbstractBucket<INPUT>> fetchedBuckets;
  private transient long sleepTimeMillis;
  private transient OperatorContext context;
  protected BasicCounters<MutableLong> counters;
  private transient long currentWindow;
  private Class<?> pojoClass;

  // Deduper Auto Metrics
  @AutoMetric
  protected long uniqueEvents;
  @AutoMetric
  protected long duplicateEvents;
  @AutoMetric
  protected long expiredEvents;
  @AutoMetric
  protected long errorEvents;

  public AbstractDeduper()
  {
    waitingEvents = Maps.newHashMap();
    partitionKeys = Sets.newHashSet(0);
    partitionMask = 0;
    fetchedBuckets = new LinkedBlockingQueue<AbstractBucket<INPUT>>();
    counters = new BasicCounters<MutableLong>(MutableLong.class);
  }

  @Override
  public void setup(OperatorContext context)
  {
    this.context = context;
    this.currentWindow = context.getValue(Context.OperatorContext.ACTIVATION_WINDOW_ID);
    sleepTimeMillis = context.getValue(OperatorContext.SPIN_MILLIS);

    bucketManager.setBucketCounters(counters);
    counters.setCounter(CounterKeys.DUPLICATE_EVENTS, new MutableLong());

    bucketManager.startService(this);
    logger.debug("bucket keys at startup {}", waitingEvents.keySet());
    // After recovery, re-request the buckets for all events that were still waiting at checkpoint time.
    for (long bucketKey : waitingEvents.keySet()) {
      bucketManager.loadBucketData(bucketKey);
    }
    if (orderedOutput) {
      // Linked map preserves arrival order of tuples whose decisions are pending.
      decisions = Maps.newLinkedHashMap();
    }
  }

  @Override
  public void activate(Context context)
  {
    ((AbstractBucketManager<INPUT>)bucketManager).setPojoClass(pojoClass);
    bucketManager.activate(context);
  }

  @Override
  public void deactivate()
  {
  }

  @Override
  public void teardown()
  {
    bucketManager.shutdownService();
  }

  @Override
  public void beginWindow(long windowId)
  {
    currentWindow = windowId;
    // Reset Dedup Metrics
    uniqueEvents = 0;
    duplicateEvents = 0;
    expiredEvents = 0;
    errorEvents = 0;
    // Reset Bucket Metrics
    ((AbstractBucketManager<INPUT>)bucketManager).setBucketsInMemory(0);
    ((AbstractBucketManager<INPUT>)bucketManager).setDeletedBuckets(0);
    ((AbstractBucketManager<INPUT>)bucketManager).setEventsInMemory(0);
    ((AbstractBucketManager<INPUT>)bucketManager).setEvictedBuckets(0);
    ((AbstractBucketManager<INPUT>)bucketManager).setEventsCommittedLastWindow(0);
  }

  /**
   * Processes an incoming tuple
   *
   * @param tuple
   */
  protected void processTuple(INPUT tuple)
  {
    long bucketKey = bucketManager.getBucketKeyFor(tuple);
    if (bucketKey < 0) {
      processInvalid(tuple, bucketKey);
      return;
    } //ignore event

    AbstractBucket<INPUT> bucket = bucketManager.getBucket(bucketKey);
    processValid(tuple, bucket, bucketKey);
  }

  /**
   * Processes invalid tuples.
   *
   * @param tuple
   * @param bucketKey
   */
  protected void processInvalid(INPUT tuple, long bucketKey)
  {
    // Bucket key -1 indicates an expired tuple, -2 an error tuple. When ordered output is on and
    // earlier tuples are still pending, record the decision instead of emitting out of order.
    if (bucketKey == -1) {
      if (orderedOutput && !decisions.isEmpty()) {
        recordDecision(tuple, Decision.EXPIRED);
      } else {
        processExpired(tuple);
      }
    } else if (bucketKey == -2) {
      if (orderedOutput && !decisions.isEmpty()) {
        recordDecision(tuple, Decision.ERROR);
      } else {
        processError(tuple);
      }
    }
  }

  /**
   * Processes an expired tuple
   *
   * @param tuple
   */
  protected void processExpired(INPUT tuple)
  {
    expiredEvents++;
    emitExpired(convert(tuple));
  }

  /**
   * Processes an error tuple
   *
   * @param tuple
   */
  protected void processError(INPUT tuple)
  {
    errorEvents++;
    emitError(convert(tuple));
  }

  /**
   * Processes a valid (non-expired) tuple. This tuple may be a unique or a duplicate. In case a decision cannot be made
   * due to unavailability of buckets in memory, the tuple waits until the bucket is loaded.
   *
   * @param tuple
   *          The tuple to be processed
   * @param bucket
   *          The in-memory bucket in which the tuple belongs
   * @param bucketKey
   *          The bucket key of the bucket
   */
  protected void processValid(INPUT tuple, AbstractBucket<INPUT> bucket, long bucketKey)
  {
    if (bucket != null && !waitingEvents.containsKey(bucketKey) && bucket.containsEvent(tuple)
        && bucket.isDataOnDiskLoaded()) {
      processDuplicate(tuple, bucket);
    } else if (bucket != null && !waitingEvents.containsKey(bucketKey) && bucket.isDataOnDiskLoaded()) {
      bucketManager.newEvent(bucketKey, tuple);
      processUnique(tuple, bucket);
    } else {
      processWaitingEvent(tuple, bucket, bucketKey);
    }
  }

  /**
   * Processes the duplicate tuple.
   *
   * @param tuple
   *          The tuple which is a duplicate
   * @param bucket
   *          The bucket to which the tuple belongs
   */
  protected void processDuplicate(INPUT tuple, AbstractBucket<INPUT> bucket)
  {
    counters.getCounter(CounterKeys.DUPLICATE_EVENTS).increment();
    if (orderedOutput && !decisions.isEmpty()) {
      recordDecision(tuple, Decision.DUPLICATE);
    } else {
      duplicateEvents++;
      emitDuplicate(tuple);
    }
  }

  /**
   * Processes the unique tuple.
   *
   * @param tuple
   *          The tuple which is a unique
   * @param bucket
   *          The bucket to which the tuple belongs
   */
  protected void processUnique(INPUT tuple, AbstractBucket<INPUT> bucket)
  {
    if (orderedOutput && !decisions.isEmpty()) {
      recordDecision(tuple, Decision.UNIQUE);
    } else {
      uniqueEvents++;
      emitOutput(convert(tuple));
    }
  }

  /**
   * Processes a tuple which needs to wait for its bucket to be loaded. Triggers load of the corresponding bucket and
   * adds the tuple in a waiting queue {@link #waitingEvents} The bucket is loaded from the disk. Before that we check
   * if there is a pending request to load the bucket and in that case we put the event in a waiting list.
   *
   * @param tuple
   *          The tuple which needs to wait
   * @param bucket
   *          The bucket to which the tuple belongs
   * @param bucketKey
   *          The key of the bucket
   */
  protected void processWaitingEvent(INPUT tuple, AbstractBucket<INPUT> bucket, long bucketKey)
  {
    boolean doLoadFromDisk = false;
    List<INPUT> waitingList = waitingEvents.get(bucketKey);
    if (waitingList == null) {
      // First waiting event for this bucket: create the list and trigger a single load request.
      waitingList = Lists.newArrayList();
      waitingEvents.put(bucketKey, waitingList);
      doLoadFromDisk = true;
    }
    waitingList.add(tuple);

    if (doLoadFromDisk) {
      //Trigger the storage manager to load bucketData for this bucket key. This is a non-blocking call.
      bucketManager.loadBucketData(bucketKey);
    }
    if (orderedOutput) {
      recordDecision(tuple, Decision.UNKNOWN);
    }
  }

  @Override
  public void endWindow()
  {
    try {
      bucketManager.blockUntilAllRequestsServiced();
      handleIdleTime();
      Preconditions.checkArgument(waitingEvents.isEmpty(), waitingEvents.keySet());
      if (orderedOutput) {
        emitProcessedTuples();
        Preconditions.checkArgument(decisions.isEmpty(), "events pending " + decisions.size());
      }
      bucketManager.endWindow(currentWindow);
    } catch (Throwable cause) {
      DTThrowable.rethrow(cause);
    }
    context.setCounters(counters);
  }

  @Override
  public void handleIdleTime()
  {
    if (orderedOutput) {
      emitProcessedTuples();
    }
    if (!fetchedBuckets.isEmpty()) {
      processAuxiliary();
    } else {
      /* nothing to do here, so sleep for a while to avoid busy loop */
      try {
        Thread.sleep(sleepTimeMillis);
      } catch (InterruptedException ie) {
        throw new RuntimeException(ie);
      }
    }
  }

  /**
   * Does any auxiliary processing in the idle time of the operator. This processes waiting tuples which have their
   * buckets loaded in memory and can be processed.
   */
  protected void processAuxiliary()
  {
    if (!fetchedBuckets.isEmpty()) {
      /*
       * Remove all the events from waiting list whose buckets are loaded.
       * Process these events again.
       */
      AbstractBucket<INPUT> bucket;
      while ((bucket = fetchedBuckets.poll()) != null) {
        List<INPUT> waitingList = waitingEvents.remove(bucket.bucketKey);
        if (waitingList != null) {
          for (INPUT event : waitingList) {
            if (!bucket.containsEvent(event)) {
              if (bucketManager.getBucketKeyFor(event) < 0) {
                // This event will be expired after all tuples in this window are finished processing.
                // Temporarily add the event to this bucket, so as to deduplicate within this window.
                bucketManager.addEventToBucket(bucket, event);
              } else {
                bucketManager.newEvent(bucket.bucketKey, event);
              }
              processUnique(event, bucket);
            } else {
              processDuplicate(event, bucket);
            }
          }
        }
      }
    }
  }

  /**
   * Records a decision for use later. This is needed to ensure that the order of incoming tuples is maintained.
   *
   * @param tuple
   * @param d
   */
  protected void recordDecision(INPUT tuple, Decision d)
  {
    decisions.put(tuple, d);
  }

  /**
   * Processes tuples for which the decision (unique / duplicate / expired / error) has been made.
   */
  protected void emitProcessedTuples()
  {
    Iterator<Entry<INPUT, Decision>> entries = decisions.entrySet().iterator();
    while (entries.hasNext()) {
      Entry<INPUT, Decision> td = entries.next();
      switch (td.getValue()) {
        case UNIQUE:
          uniqueEvents++;
          emitOutput(convert(td.getKey()));
          entries.remove();
          break;
        case DUPLICATE:
          duplicateEvents++;
          emitDuplicate(td.getKey());
          entries.remove();
          break;
        case EXPIRED:
          expiredEvents++;
          emitExpired(convert(td.getKey()));
          entries.remove();
          break;
        case ERROR:
          errorEvents++;
          emitError(convert(td.getKey()));
          entries.remove();
          break;
        default:
          /*
           * Decision for this is still UNKNOWN. Tuple is still waiting for bucket to be loaded. Break and come back
           * later in endWindow.
           */
          break;
      }
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void bucketLoaded(AbstractBucket<INPUT> loadedBucket)
  {
    fetchedBuckets.add(loadedBucket);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void bucketOffLoaded(long bucketKey)
  {
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void bucketDeleted(long bucketKey)
  {
  }

  /**
   * Sets the bucket manager.
   *
   * @param bucketManager {@link BucketManager} to be used by deduper.
   */
  public void setBucketManager(@NotNull BucketManager<INPUT> bucketManager)
  {
    this.bucketManager = Preconditions.checkNotNull(bucketManager, "storage manager");
  }

  public BucketManager<INPUT> getBucketManager()
  {
    return this.bucketManager;
  }

  /**
   * Converts the input tuple to output tuple.
   *
   * @param input input event.
   * @return output tuple derived from input.
   */
  protected abstract OUTPUT convert(INPUT input);

  protected abstract Object getEventKey(INPUT event);

  protected void emitOutput(OUTPUT event)
  {
    output.emit(event);
  }

  protected void emitDuplicate(INPUT event)
  {
    duplicates.emit(event);
  }

  protected void emitExpired(OUTPUT event)
  {
    expired.emit(event);
  }

  protected void emitError(OUTPUT event)
  {
    error.emit(event);
  }

  protected StreamCodec<INPUT> getDeduperStreamCodec()
  {
    return null;
  }

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (!(o instanceof AbstractDeduper)) {
      return false;
    }

    AbstractDeduper<?, ?> deduper = (AbstractDeduper<?, ?>)o;

    if (partitionMask != deduper.partitionMask) {
      return false;
    }
    if (!bucketManager.equals(deduper.bucketManager)) {
      return false;
    }
    if (partitionKeys != null ? !partitionKeys.equals(deduper.partitionKeys) : deduper.partitionKeys != null) {
      return false;
    }
    return waitingEvents.equals(deduper.waitingEvents);
  }

  @Override
  public int hashCode()
  {
    int result = bucketManager.hashCode();
    result = 31 * result + (waitingEvents.hashCode());
    result = 31 * result + (partitionKeys != null ? partitionKeys.hashCode() : 0);
    result = 31 * result + partitionMask;
    return result;
  }

  @Override
  public String toString()
  {
    return "Deduper{" + "partitionKeys=" + partitionKeys + ", partitionMask=" + partitionMask + '}';
  }

  public enum CounterKeys
  {
    DUPLICATE_EVENTS
  }

  public static class CountersListener implements StatsListener, Serializable
  {
    @Override
    public Response processStats(BatchedOperatorStats batchedOperatorStats)
    {
      List<Stats.OperatorStats> lastWindowedStats = batchedOperatorStats.getLastWindowedStats();
      if (lastWindowedStats != null) {
        for (Stats.OperatorStats os : lastWindowedStats) {
          if (os.counters != null) {
            if (os.counters instanceof BasicCounters) {
              @SuppressWarnings("unchecked")
              BasicCounters<MutableLong> cs = (BasicCounters<MutableLong>)os.counters;
              logger.debug(
                  "operatorId:{} buckets:[in-memory:{} deleted:{} evicted:{}] events:[in-memory:{} "
                  + "committed-last-window:{} duplicates:{}] low:{} high:{}",
                  batchedOperatorStats.getOperatorId(),
                  cs.getCounter(BucketManager.CounterKeys.BUCKETS_IN_MEMORY),
                  cs.getCounter(BucketManager.CounterKeys.DELETED_BUCKETS),
                  cs.getCounter(BucketManager.CounterKeys.EVICTED_BUCKETS),
                  cs.getCounter(BucketManager.CounterKeys.EVENTS_IN_MEMORY),
                  cs.getCounter(BucketManager.CounterKeys.EVENTS_COMMITTED_LAST_WINDOW),
                  cs.getCounter(CounterKeys.DUPLICATE_EVENTS));
            }
          }
        }
      }
      return null;
    }

    private static final long serialVersionUID = 201404082336L;
    // NOTE: 'transient' removed — static fields are never serialized, so the modifier was meaningless.
    protected static final Logger logger = LoggerFactory.getLogger(CountersListener.class);
  }

  /**
   * Checks whether output of deduper should preserve the input order
   */
  public boolean isOrderedOutput()
  {
    return orderedOutput;
  }

  /**
   * If set to true, the deduper will emit tuples in the order in which they were received. Tuples which arrived later
   * will wait for previous tuples to get processed and be emitted. If not set, the order of tuples may change as tuples
   * may be emitted out of order as and when they get processed.
   *
   * @param orderedOutput
   */
  public void setOrderedOutput(boolean orderedOutput)
  {
    this.orderedOutput = orderedOutput;
  }

  // Bucket Manager Metrics
  @AutoMetric
  public long getDeletedBuckets()
  {
    return ((AbstractBucketManager<INPUT>)bucketManager).getDeletedBuckets();
  }

  @AutoMetric
  public long getEvictedBuckets()
  {
    return ((AbstractBucketManager<INPUT>)bucketManager).getEvictedBuckets();
  }

  @AutoMetric
  public long getEventsInMemory()
  {
    return ((AbstractBucketManager<INPUT>)bucketManager).getEventsInMemory();
  }

  @AutoMetric
  public long getEventsCommittedLastWindow()
  {
    return ((AbstractBucketManager<INPUT>)bucketManager).getEventsCommittedLastWindow();
  }

  @AutoMetric
  public long getEndOfBuckets()
  {
    return ((AbstractBucketManager<INPUT>)bucketManager).getEndOfBuckets();
  }

  @AutoMetric
  public long getStartOfBuckets()
  {
    return ((AbstractBucketManager<INPUT>)bucketManager).getStartOfBuckets();
  }

  public Class<?> getPojoClass()
  {
    return pojoClass;
  }

  /**
   * Enum for holding all possible values for a decision for a tuple
   */
  protected enum Decision
  {
    UNIQUE, DUPLICATE, EXPIRED, ERROR, UNKNOWN
  }

  private static final Logger logger = LoggerFactory.getLogger(AbstractDeduper.class);
}