/*
 * Copyright (c) 2014 DataTorrent, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datatorrent.lib.dedup;

import java.io.Serializable;
import java.util.*;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.commons.lang.mutable.MutableLong;

import com.datatorrent.lib.bucket.Bucket;
import com.datatorrent.lib.bucket.BucketManager;
import com.datatorrent.lib.bucket.Bucketable;
import com.datatorrent.lib.bucket.TimeBasedBucketManagerImpl;
import com.datatorrent.lib.counters.BasicCounters;

import com.datatorrent.api.*;
import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.annotation.InputPortFieldAnnotation;

import com.datatorrent.common.util.DTThrowable;

/**
 * This is the base implementation of a deduper, which drops duplicate events.
 * Subclasses must implement the {@link #convert} method, which turns input tuples into output tuples.
 * <p>
 * Processing of an event involves:
 * <ol>
 * <li>Finding the bucket key of the event by calling {@link BucketManager#getBucketKeyFor(Bucketable)}.</li>
 * <li>Getting the bucket from the {@link BucketManager} by calling {@link BucketManager#getBucket(long)}.</li>
 * <li>
 * If the bucket is not loaded:
 * <ol>
 * <li>The deduper requests the {@link BucketManager} to load the bucket, which is a non-blocking call.</li>
 * <li>It adds the event to {@link #waitingEvents}, a collection of events that are waiting for their buckets to be loaded.</li>
 * <li>The {@link BucketManager} loads the bucket and informs the deduper by calling {@link #bucketLoaded(Bucket)}.</li>
 * <li>The deduper then processes the waiting events in {@link #handleIdleTime()}.</li>
 * </ol>
 * </li>
 * <li>
 * If the bucket is loaded, the operator drops the event if it is already present in the bucket and emits it otherwise.
 * </li>
 * </ol>
 * </p>
 * <p>
 * This implementation relies on the assumption that duplicate events fall in the same bucket.
 * </p>
 *
 * @displayName Deduper
 * @category Deduplication
 * @tags dedupe
 *
 * @param <INPUT> type of input tuple
 * @param <OUTPUT> type of output tuple
 * @since 0.9.4
 */
public abstract class Deduper<INPUT extends Bucketable, OUTPUT>
    implements Operator, BucketManager.Listener<INPUT>, Operator.IdleTimeHandler, Partitioner<Deduper<INPUT, OUTPUT>>
{
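  /*
   * A minimal sketch of a concrete subclass, for illustration only. The event
   * type DummyEvent is hypothetical; any type implementing Bucketable works.
   * This convert implementation simply passes events through:
   *
   * public class PassThroughDeduper extends Deduper<DummyEvent, DummyEvent>
   * {
   *   @Override
   *   protected DummyEvent convert(DummyEvent tuple)
   *   {
   *     return tuple; // emit deduplicated events unchanged
   *   }
   * }
   */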
  /**
   * The input port on which events are received.
   */
  @InputPortFieldAnnotation(optional = true)
  public final transient DefaultInputPort<INPUT> input = new DefaultInputPort<INPUT>()
  {
    @Override
    public final void process(INPUT tuple)
    {
      long bucketKey = bucketManager.getBucketKeyFor(tuple);
      if (bucketKey < 0) {
        return; //ignore event
      }
      Bucket<INPUT> bucket = bucketManager.getBucket(bucketKey);
      if (bucket != null && bucket.containsEvent(tuple)) {
        counters.getCounter(CounterKeys.DUPLICATE_EVENTS).increment();
        return; //ignore event
      }
      if (bucket != null && bucket.isDataOnDiskLoaded()) {
        bucketManager.newEvent(bucketKey, tuple);
        output.emit(convert(tuple));
      } else {
        /*
         * The bucket is not yet loaded from disk, so we ask the bucket manager to load it.
         * If there is already a pending request to load this bucket, we only add the event
         * to the waiting list.
         */
        boolean doLoadFromDisk = false;
        List<INPUT> waitingList = waitingEvents.get(bucketKey);
        if (waitingList == null) {
          waitingList = Lists.newArrayList();
          waitingEvents.put(bucketKey, waitingList);
          doLoadFromDisk = true;
        }
        waitingList.add(tuple);

        if (doLoadFromDisk) {
          //Trigger the storage manager to load bucketData for this bucket key. This is a non-blocking call.
          bucketManager.loadBucketData(bucketKey);
        }
      }
    }
  };

  /**
   * The output port on which deduped events are emitted.
   */
  public final transient DefaultOutputPort<OUTPUT> output = new DefaultOutputPort<OUTPUT>();

  //Check-pointed state
  @NotNull
  protected BucketManager<INPUT> bucketManager;
  //bucketKey -> list of bucketData which belong to that bucket and are waiting for the bucket to be loaded.
  @NotNull
  protected final Map<Long, List<INPUT>> waitingEvents;
  protected Set<Integer> partitionKeys;
  protected int partitionMask;

  //Non check-pointed state
  protected final transient BlockingQueue<Bucket<INPUT>> fetchedBuckets;
  private transient long sleepTimeMillis;
  private transient OperatorContext context;
  protected BasicCounters<MutableLong> counters;
  private transient long currentWindow;
  @Min(1)
  private int partitionCount = 1;

  public Deduper()
  {
    waitingEvents = Maps.newHashMap();
    partitionKeys = Sets.newHashSet(0);
    partitionMask = 0;
    fetchedBuckets = new LinkedBlockingQueue<Bucket<INPUT>>();
    counters = new BasicCounters<MutableLong>(MutableLong.class);
  }

  public void setPartitionCount(int partitionCount)
  {
    this.partitionCount = partitionCount;
  }

  public int getPartitionCount()
  {
    return partitionCount;
  }

  @Override
  public void setup(OperatorContext context)
  {
    this.context = context;
    this.currentWindow = context.getValue(Context.OperatorContext.ACTIVATION_WINDOW_ID);
    sleepTimeMillis = context.getValue(OperatorContext.SPIN_MILLIS);
    bucketManager.setBucketCounters(counters);
    counters.setCounter(CounterKeys.DUPLICATE_EVENTS, new MutableLong());
    bucketManager.startService(this);
    logger.debug("bucket keys at startup {}", waitingEvents.keySet());
    for (long bucketKey : waitingEvents.keySet()) {
      bucketManager.loadBucketData(bucketKey);
    }
  }

  @Override
  public void teardown()
  {
    bucketManager.shutdownService();
  }

  @Override
  public void beginWindow(long windowId)
  {
    currentWindow = windowId;
  }

  @Override
  public void endWindow()
  {
    try {
      bucketManager.blockUntilAllRequestsServiced();
      handleIdleTime();
      Preconditions.checkArgument(waitingEvents.isEmpty(), waitingEvents.keySet());
      bucketManager.endWindow(currentWindow);
    } catch (Throwable cause) {
      DTThrowable.rethrow(cause);
    }
    context.setCounters(counters);
  }
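  /*
   * Events queued in waitingEvents by the input port are re-examined here once
   * their buckets have been loaded and handed over through bucketLoaded(Bucket).
   */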
  @Override
  public void handleIdleTime()
  {
    if (fetchedBuckets.isEmpty()) {
      /* nothing to do here, so sleep for a while to avoid busy loop */
      try {
        Thread.sleep(sleepTimeMillis);
      } catch (InterruptedException ie) {
        throw new RuntimeException(ie);
      }
    } else {
      /*
       * Remove all the events from the waiting list whose buckets are loaded
       * and process these events again.
       */
      Bucket<INPUT> bucket;
      while ((bucket = fetchedBuckets.poll()) != null) {
        List<INPUT> waitingList = waitingEvents.remove(bucket.bucketKey);
        if (waitingList != null) {
          for (INPUT event : waitingList) {
            if (!bucket.containsEvent(event)) {
              bucketManager.newEvent(bucket.bucketKey, event);
              output.emit(convert(event));
            } else {
              counters.getCounter(CounterKeys.DUPLICATE_EVENTS).increment();
            }
          }
        }
      }
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void bucketLoaded(Bucket<INPUT> loadedBucket)
  {
    fetchedBuckets.add(loadedBucket);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void bucketOffLoaded(long bucketKey)
  {
  }

  @Override
  public void partitioned(Map<Integer, Partition<Deduper<INPUT, OUTPUT>>> partitions)
  {
  }

  @Override
  @SuppressWarnings({"BroadCatchBlock", "TooBroadCatch", "UseSpecificCatch"})
  public Collection<Partition<Deduper<INPUT, OUTPUT>>> definePartitions(
      Collection<Partition<Deduper<INPUT, OUTPUT>>> partitions, PartitioningContext context)
  {
    final int finalCapacity = DefaultPartition.getRequiredPartitionCount(context, this.partitionCount);

    //Collect the state here
    List<BucketManager<INPUT>> oldStorageManagers = Lists.newArrayList();
    Map<Long, List<INPUT>> allWaitingEvents = Maps.newHashMap();

    for (Partition<Deduper<INPUT, OUTPUT>> partition : partitions) {
      //collect all bucketStorageManagers
      oldStorageManagers.add(partition.getPartitionedInstance().bucketManager);

      //collect all waiting events
      for (Map.Entry<Long, List<INPUT>> awaitingList : partition.getPartitionedInstance().waitingEvents.entrySet()) {
        if (awaitingList.getValue().size() > 0) {
          List<INPUT> existingList = allWaitingEvents.get(awaitingList.getKey());
          if (existingList == null) {
            existingList = Lists.newArrayList();
            allWaitingEvents.put(awaitingList.getKey(), existingList);
          }
          existingList.addAll(awaitingList.getValue());
        }
      }
      partition.getPartitionedInstance().waitingEvents.clear();
    }
    partitions.clear();

    Collection<Partition<Deduper<INPUT, OUTPUT>>> newPartitions = Lists.newArrayListWithCapacity(finalCapacity);
    Map<Integer, BucketManager<INPUT>> partitionKeyToStorageManagers = Maps.newHashMap();

    for (int i = 0; i < finalCapacity; i++) {
      try {
        @SuppressWarnings("unchecked")
        Deduper<INPUT, OUTPUT> deduper = this.getClass().newInstance();
        DefaultPartition<Deduper<INPUT, OUTPUT>> partition = new DefaultPartition<Deduper<INPUT, OUTPUT>>(deduper);
        newPartitions.add(partition);
      } catch (Throwable cause) {
        DTThrowable.rethrow(cause);
      }
    }

    DefaultPartition.assignPartitionKeys(Collections.unmodifiableCollection(newPartitions), input);
    int lPartitionMask = newPartitions.iterator().next().getPartitionKeys().get(input).mask;

    //transfer the state here
    for (Partition<Deduper<INPUT, OUTPUT>> deduperPartition : newPartitions) {
      Deduper<INPUT, OUTPUT> deduperInstance = deduperPartition.getPartitionedInstance();

      deduperInstance.partitionKeys = deduperPartition.getPartitionKeys().get(input).partitions;
      deduperInstance.partitionMask = lPartitionMask;
      logger.debug("partitions {},{}", deduperInstance.partitionKeys, deduperInstance.partitionMask);
      deduperInstance.bucketManager = bucketManager.cloneWithProperties();

      for (int partitionKey : deduperInstance.partitionKeys) {
        partitionKeyToStorageManagers.put(partitionKey, deduperInstance.bucketManager);
      }
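      /*
       * Events are routed by masking the hash code of the event key with the
       * port's partition mask. For example (illustrative values only), with a
       * mask of 0x03 an event whose key hashes to 0b1011 maps to partition key
       * 0b0011 = 3.
       */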
      //distribute waiting events
      for (long bucketKey : allWaitingEvents.keySet()) {
        for (Iterator<INPUT> iterator = allWaitingEvents.get(bucketKey).iterator(); iterator.hasNext();) {
          INPUT event = iterator.next();
          int partitionKey = event.getEventKey().hashCode() & lPartitionMask;

          if (deduperInstance.partitionKeys.contains(partitionKey)) {
            List<INPUT> existingList = deduperInstance.waitingEvents.get(bucketKey);
            if (existingList == null) {
              existingList = Lists.newArrayList();
              deduperInstance.waitingEvents.put(bucketKey, existingList);
            }
            existingList.add(event);
            iterator.remove();
          }
        }
      }
    }
    //let the storage manager and subclasses distribute state as well
    bucketManager.definePartitions(oldStorageManagers, partitionKeyToStorageManagers, lPartitionMask);
    return newPartitions;
  }

  /**
   * Sets the bucket manager.
   *
   * @param bucketManager {@link BucketManager} to be used by the deduper.
   */
  public void setBucketManager(@NotNull BucketManager<INPUT> bucketManager)
  {
    this.bucketManager = Preconditions.checkNotNull(bucketManager, "storage manager");
  }

  public BucketManager<INPUT> getBucketManager()
  {
    return this.bucketManager;
  }

  /**
   * Converts an input tuple to an output tuple.
   *
   * @param input input event.
   * @return output tuple derived from the input.
   */
  protected abstract OUTPUT convert(INPUT input);

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (!(o instanceof Deduper)) {
      return false;
    }

    Deduper<?, ?> deduper = (Deduper<?, ?>) o;

    if (partitionMask != deduper.partitionMask) {
      return false;
    }
    if (!bucketManager.equals(deduper.bucketManager)) {
      return false;
    }
    if (partitionKeys != null ? !partitionKeys.equals(deduper.partitionKeys) : deduper.partitionKeys != null) {
      return false;
    }
    return waitingEvents.equals(deduper.waitingEvents);
  }

  @Override
  public int hashCode()
  {
    int result = bucketManager.hashCode();
    result = 31 * result + (waitingEvents.hashCode());
    result = 31 * result + (partitionKeys != null ? partitionKeys.hashCode() : 0);
    result = 31 * result + partitionMask;
    return result;
  }

  @Override
  public String toString()
  {
    return "Deduper{" + "partitionKeys=" + partitionKeys + ", partitionMask=" + partitionMask + '}';
  }

  public enum CounterKeys
  {
    DUPLICATE_EVENTS
  }
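  /*
   * A sketch of how the listener below might be attached to the deduper in an
   * application DAG. This assumes the OperatorContext.STATS_LISTENERS attribute
   * of the platform; MyEvent and MyDeduper are hypothetical names:
   *
   * Deduper<MyEvent, MyEvent> dedup = dag.addOperator("Dedup", new MyDeduper());
   * dag.setAttribute(dedup, Context.OperatorContext.STATS_LISTENERS,
   *     Arrays.asList((StatsListener) new Deduper.CountersListener()));
   */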
  public static class CountersListener implements StatsListener, Serializable
  {
    @Override
    public Response processStats(BatchedOperatorStats batchedOperatorStats)
    {
      List<Stats.OperatorStats> lastWindowedStats = batchedOperatorStats.getLastWindowedStats();
      if (lastWindowedStats != null) {
        for (Stats.OperatorStats os : lastWindowedStats) {
          if (os.counters instanceof BasicCounters) {
            @SuppressWarnings("unchecked")
            BasicCounters<MutableLong> cs = (BasicCounters<MutableLong>) os.counters;
            logger.debug(
                "operatorId:{} buckets:[in-memory:{} deleted:{} evicted:{}] events:[in-memory:{} committed-last-window:{} "
                    + "ignored:{} duplicates:{}] low:{} high:{}",
                batchedOperatorStats.getOperatorId(),
                cs.getCounter(BucketManager.CounterKeys.BUCKETS_IN_MEMORY),
                cs.getCounter(BucketManager.CounterKeys.DELETED_BUCKETS),
                cs.getCounter(BucketManager.CounterKeys.EVICTED_BUCKETS),
                cs.getCounter(BucketManager.CounterKeys.EVENTS_IN_MEMORY),
                cs.getCounter(BucketManager.CounterKeys.EVENTS_COMMITTED_LAST_WINDOW),
                cs.getCounter(TimeBasedBucketManagerImpl.CounterKeys.IGNORED_EVENTS),
                cs.getCounter(CounterKeys.DUPLICATE_EVENTS),
                cs.getCounter(TimeBasedBucketManagerImpl.CounterKeys.LOW),
                cs.getCounter(TimeBasedBucketManagerImpl.CounterKeys.HIGH));
          }
        }
      }
      return null;
    }

    private static final long serialVersionUID = 201404082336L;
    protected static final transient Logger logger = LoggerFactory.getLogger(CountersListener.class);
  }

  private static final Logger logger = LoggerFactory.getLogger(Deduper.class);
}