/**
 * Copyright (c) 2016 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.contrib.dimensions;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import javax.validation.constraints.Min;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.apex.malhar.lib.dimensions.DimensionsConversionContext;
import org.apache.apex.malhar.lib.dimensions.DimensionsDescriptor;
import org.apache.apex.malhar.lib.dimensions.DimensionsEvent.Aggregate;
import org.apache.apex.malhar.lib.dimensions.DimensionsEvent.EventKey;
import org.apache.apex.malhar.lib.dimensions.aggregator.AbstractTopBottomAggregator;
import org.apache.apex.malhar.lib.dimensions.aggregator.CompositeAggregator;
import org.apache.apex.malhar.lib.dimensions.aggregator.IncrementalAggregator;
import org.apache.apex.malhar.lib.dimensions.aggregator.OTFAggregator;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.hadoop.classification.InterfaceStability.Unstable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;

import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.annotation.OperatorAnnotation;
import com.datatorrent.api.annotation.OutputPortFieldAnnotation;
import com.datatorrent.contrib.hdht.AbstractSinglePortHDHTWriter;
import com.datatorrent.lib.appdata.gpo.GPOByteArrayList;
import com.datatorrent.lib.appdata.gpo.GPOMutable;
import com.datatorrent.lib.appdata.gpo.GPOUtils;
import com.datatorrent.lib.appdata.schemas.Fields;
import com.datatorrent.lib.appdata.schemas.FieldsDescriptor;
import com.datatorrent.lib.appdata.schemas.Type;
import com.datatorrent.lib.codec.KryoSerializableStreamCodec;
import com.datatorrent.lib.dimensions.AggregationIdentifier;
import com.datatorrent.netlet.util.Slice;

/**
 * This operator is a base class for dimension store operators. This operator assumes that an upstream
 * {@link com.datatorrent.lib.dimensions.AbstractDimensionsComputationFlexibleSingleSchema} operator is producing
 * {@link Aggregate} objects which are provided to it as input.
 *
 * @since 3.1.0
 */
@OperatorAnnotation(checkpointableWithinAppWindow = false)
public abstract class DimensionsStoreHDHT extends AbstractSinglePortHDHTWriter<Aggregate>
{
  /**
   * The default number of windows for which data is held in the cache.
   */
  public static final int DEFAULT_CACHE_WINDOW_DURATION = 120;
  /**
   * This is the key value to use for storing the committed windowID in a bucket.
   */
  public static final int META_DATA_ID_WINDOW_ID = 0;
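  //Note on bucket metadata: each HDHT bucket reserves two small metadata entries, keyed by the 4-byte
  //serializations of META_DATA_ID_WINDOW_ID (the last committed window ID) and META_DATA_ID_STORE_FORMAT
  //(the store format version, declared below). Application data keys produced by getEventKeyBytesGAE()
  //always begin with an 8-byte timestamp and are therefore longer than these 4-byte metadata keys, so the
  //two key spaces cannot collide.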
  /**
   * This is the key for storing committed window IDs as a {@link Slice}.
   */
  public static final Slice WINDOW_ID_KEY = new Slice(GPOUtils.serializeInt(META_DATA_ID_WINDOW_ID));
  /**
   * This is the key used to store the format version of the store.
   */
  public static final int META_DATA_ID_STORE_FORMAT = 1;
  /**
   * This is the key used to store the format version of the store, as a {@link Slice}.
   */
  public static final Slice STORE_FORMAT_KEY = new Slice(GPOUtils.serializeInt(META_DATA_ID_STORE_FORMAT));
  /**
   * This is the store format version to store in each HDHT bucket. The store format version represents
   * the way in which data is serialized to HDHT. If this format changes in the future, existing data
   * could be read out of HDHT and reformatted, to support seamless upgrades.
   */
  public static final int STORE_FORMAT_VERSION = 0;
  /**
   * This is the byte representation of the current Store Format Version.
   */
  public static final byte[] STORE_FORMAT_VERSION_BYTES = GPOUtils.serializeInt(STORE_FORMAT_VERSION);

  /**
   * The number of windows for which the operator's {@link Aggregate} cache is preserved.
   */
  @Min(1)
  private int cacheWindowDuration = DEFAULT_CACHE_WINDOW_DURATION;
  /**
   * This keeps track of the number of windows seen since the last time the operator's cache
   * was cleared.
   */
  private int cacheWindowCount = 0;
  /**
   * The ID of the current window that this operator is on.
   */
  @VisibleForTesting
  protected transient long currentWindowID;
  /**
   * This is the operator's {@link Aggregate} cache, which is used to store aggregations.
   * The keys of this map are {@link EventKey}s for this aggregate. The values in this
   * map are the corresponding {@link Aggregate}s.
   */
  protected transient Map<EventKey, Aggregate> cache = new ConcurrentHashMap<EventKey, Aggregate>();
  /**
   * The computation for composite aggregators needs the aggregates of the embedded incremental aggregators.
   * Each embedded aggregator can be identified by an {@link AggregationIdentifier}.
   */
  protected transient Map<AggregationIdentifier, Set<EventKey>> embedIdentifierToEventKeys;
  /**
   * A map from composite aggregator ID to aggregate value. A ConcurrentHashMap is not needed
   * because all operations happen on the same thread.
   */
  protected transient Map<Integer, GPOMutable> compositeAggregteCache = Maps.newHashMap();
  /**
   * The IDs of the HDHT buckets that this operator writes to.
   */
  protected Set<Long> buckets = Sets.newHashSet();
  /**
   * This flag indicates whether or not meta information like the committed window ID and
   * storage format version has been read from each bucket.
   */
  @VisibleForTesting
  protected transient boolean readMetaData = false;
  /**
   * This {@link Map} holds the committed window ID for each HDHT bucket. This map is used
   * to determine when it is acceptable to start writing data to an HDHT bucket for fault
   * tolerance purposes.
   */
  @VisibleForTesting
  protected final transient Map<Long, Long> futureBuckets = Maps.newHashMap();

  @OutputPortFieldAnnotation(optional = true)
  public final transient DefaultOutputPort<Aggregate> updates = new DefaultOutputPort<>();

  private boolean useSystemTimeForLatestTimeBuckets = false;
  private Long minTimestamp = null;
  private Long maxTimestamp = null;

  private final transient GPOByteArrayList bal = new GPOByteArrayList();
  private final transient GPOByteArrayList tempBal = new GPOByteArrayList();

  protected Aggregate tmpCompositeDestAggregate = new Aggregate();

  /**
   * Constructor to create operator.
   */
  public DimensionsStoreHDHT()
  {
    //Do nothing
  }
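  //The abstract methods below are the lookup hooks a concrete subclass provides: mapping aggregator names
  //and IDs to aggregator instances, mapping (schemaID, dimensionsDescriptorID[, aggregatorID]) to the
  //key and value FieldsDescriptors, and mapping a schemaID to the HDHT bucket it is stored in.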
  /**
   * This is a helper method that is used to retrieve the aggregator ID corresponding to an aggregatorName.
   *
   * @param aggregatorName Name of the aggregator to look up an ID for.
   * @return The ID of the aggregator corresponding to the given aggregatorName.
   */
  protected abstract int getIncrementalAggregatorID(String aggregatorName);

  /**
   * This is a helper method which gets the {@link IncrementalAggregator} corresponding to the given aggregatorID.
   *
   * @param aggregatorID The aggregatorID of the {@link IncrementalAggregator} to retrieve.
   * @return The {@link IncrementalAggregator} with the given ID.
   */
  protected abstract IncrementalAggregator getAggregator(int aggregatorID);

  protected abstract CompositeAggregator getCompositeAggregator(int aggregatorID);

  /**
   * This is a helper method which gets the {@link FieldsDescriptor} object for the key corresponding to the
   * given schema and {@link DimensionsDescriptor} in that schema.
   *
   * @param schemaID The schemaID corresponding to the schema for which to retrieve a key descriptor.
   * @param dimensionsDescriptorID The dimensionsDescriptorID corresponding to the {@link DimensionsDescriptor}
   * for which to retrieve a key descriptor.
   * @return The key descriptor for the given schemaID and dimensionsDescriptorID.
   */
  protected abstract FieldsDescriptor getKeyDescriptor(int schemaID, int dimensionsDescriptorID);

  /**
   * This is a helper method which gets the {@link FieldsDescriptor} object for the aggregates corresponding to
   * the given schema, {@link DimensionsDescriptor} in that schema, and aggregatorID.
   *
   * @param schemaID The schemaID corresponding to the schema for which to retrieve a value descriptor.
   * @param dimensionsDescriptorID The dimensionsDescriptorID corresponding to the {@link DimensionsDescriptor}
   * for which to retrieve a value descriptor.
   * @param aggregatorID The id of the aggregator used to aggregate the values.
   * @return The value descriptor for the given schemaID, dimensionsDescriptorID, and aggregatorID.
   */
  protected abstract FieldsDescriptor getValueDescriptor(int schemaID, int dimensionsDescriptorID, int aggregatorID);

  /**
   * This is a helper method which retrieves the bucketID that a schema corresponds to.
   *
   * @param schemaID The schema ID for which to find a bucketID.
   * @return The bucketID corresponding to the given schemaID.
   */
  protected abstract long getBucketForSchema(int schemaID);

  /**
   * This is another helper method which gets the bucket that the given {@link EventKey} belongs to.
   *
   * @param eventKey The event key.
   * @return The bucketID of the bucket that the given {@link EventKey} belongs to.
   */
  protected long getBucketForSchema(EventKey eventKey)
  {
    return getBucketForSchema(eventKey.getSchemaID());
  }

  /**
   * This is a convenience helper method which serializes the key of the given {@link Aggregate}.
   *
   * @param gae The {@link Aggregate} to serialize.
   * @return The serialized {@link Aggregate}.
   */
  protected byte[] getKeyBytesGAE(Aggregate gae)
  {
    return getEventKeyBytesGAE(gae.getEventKey());
  }

  /**
   * This method serializes the given {@link EventKey}.
   *
   * @param eventKey The {@link EventKey} to serialize.
   * @return The serialized {@link EventKey}.
   */
  public synchronized byte[] getEventKeyBytesGAE(EventKey eventKey)
  {
    long timestamp = 0;

    if (eventKey.getKey().getFieldDescriptor().getFieldList().contains(DimensionsDescriptor.DIMENSION_TIME)) {
      //If the key includes a timestamp, retrieve it.
      timestamp = eventKey.getKey().getFieldLong(DimensionsDescriptor.DIMENSION_TIME);
    }

    //Time is a special case; for HDHT all keys should be prefixed by a timestamp.
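    //The serialized key produced below is laid out, in order, as:
    //  8-byte timestamp | 4-byte schemaID | 4-byte dimensionDescriptorID | 4-byte aggregatorID | GPO-serialized key fields
    //fromKeyValueGAE() relies on this layout: it skips the leading Type.LONG.getByteSize() bytes and then
    //reads the three int IDs before deserializing the key fields.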
    byte[] timeBytes = Longs.toByteArray(timestamp);
    byte[] schemaIDBytes = Ints.toByteArray(eventKey.getSchemaID());
    byte[] dimensionDescriptorIDBytes = Ints.toByteArray(eventKey.getDimensionDescriptorID());
    byte[] aggregatorIDBytes = Ints.toByteArray(eventKey.getAggregatorID());
    byte[] gpoBytes = GPOUtils.serialize(eventKey.getKey(), tempBal);

    bal.add(timeBytes);
    bal.add(schemaIDBytes);
    bal.add(dimensionDescriptorIDBytes);
    bal.add(aggregatorIDBytes);
    bal.add(gpoBytes);

    byte[] serializedBytes = bal.toByteArray();
    bal.clear();

    return serializedBytes;
  }

  /**
   * This method serializes the aggregate payload ({@link GPOMutable}) in the given {@link Aggregate}.
   *
   * @param event The {@link Aggregate} whose aggregate payload needs to be serialized.
   * @return The serialized aggregate payload of the given {@link Aggregate}.
   */
  public synchronized byte[] getValueBytesGAE(Aggregate event)
  {
    FieldsDescriptor metaDataDescriptor = null;

    final int aggregatorID = event.getEventKey().getAggregatorID();
    if (getAggregator(aggregatorID) != null) {
      metaDataDescriptor = getAggregator(aggregatorID).getMetaDataDescriptor();
    } else {
      metaDataDescriptor = getCompositeAggregator(aggregatorID).getMetaDataDescriptor();
    }

    if (metaDataDescriptor != null) {
      bal.add(GPOUtils.serialize(event.getMetaData(), tempBal));
    }

    bal.add(GPOUtils.serialize(event.getAggregates(), tempBal));
    byte[] serializedBytes = bal.toByteArray();
    bal.clear();

    return serializedBytes;
  }

  /**
   * Creates an {@link Aggregate} from a serialized {@link EventKey} and a
   * serialized {@link GPOMutable} object.
   *
   * @param key A serialized {@link EventKey}.
   * @param aggregate A serialized {@link GPOMutable} containing all the values of the aggregates.
   * @return An {@link Aggregate} object with the given {@link EventKey} and aggregate payload.
   */
  public Aggregate fromKeyValueGAE(Slice key, byte[] aggregate)
  {
    MutableInt offset = new MutableInt(Type.LONG.getByteSize());
    int schemaID = GPOUtils.deserializeInt(key.buffer, offset);
    int dimensionDescriptorID = GPOUtils.deserializeInt(key.buffer, offset);
    int aggregatorID = GPOUtils.deserializeInt(key.buffer, offset);

    FieldsDescriptor keysDescriptor = getKeyDescriptor(schemaID, dimensionDescriptorID);
    FieldsDescriptor aggDescriptor = getValueDescriptor(schemaID, dimensionDescriptorID, aggregatorID);
    FieldsDescriptor metaDataDescriptor = null;

    if (getAggregator(aggregatorID) != null) {
      metaDataDescriptor = getAggregator(aggregatorID).getMetaDataDescriptor();
    } else {
      metaDataDescriptor = getCompositeAggregator(aggregatorID).getMetaDataDescriptor();
    }

    GPOMutable keys = GPOUtils.deserialize(keysDescriptor, key.buffer, offset);
    offset.setValue(0);

    GPOMutable metaData = null;

    if (metaDataDescriptor != null) {
      metaData = GPOUtils.deserialize(metaDataDescriptor, aggregate, offset);
      metaData.applyObjectPayloadFix();
    }

    GPOMutable aggs = GPOUtils.deserialize(aggDescriptor, aggregate, offset);

    Aggregate gae = new Aggregate(keys, aggs, metaData, schemaID, dimensionDescriptorID, aggregatorID);
    return gae;
  }
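  //The value bytes written by getValueBytesGAE() and read back by fromKeyValueGAE() are laid out as:
  //  [GPO-serialized aggregator metadata, present only if the aggregator declares a metadata descriptor] |
  //  GPO-serialized aggregates
  //Both methods consult the same aggregator lookup, so the optional metadata prefix is interpreted
  //consistently on the read and write paths.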
  /**
   * This is a helper method which synchronously loads the data for the given key from the given
   * bucketID. This method performs the same operation as the other
   * {@link #load(long, com.datatorrent.netlet.util.Slice)} method, except that it deserializes the value
   * byte array into an {@link Aggregate}.
   *
   * @param eventKey The {@link EventKey} whose corresponding {@link Aggregate} needs to be loaded.
   * @return The {@link Aggregate} corresponding to the given {@link EventKey}.
   */
  public Aggregate load(EventKey eventKey)
  {
    long bucket = getBucketForSchema(eventKey);
    byte[] key = getEventKeyBytesGAE(eventKey);

    Slice keySlice = new Slice(key, 0, key.length);
    byte[] val = load(bucket, keySlice);

    if (val == null) {
      return null;
    }

    return fromKeyValueGAE(keySlice, val);
  }

  /**
   * This is a helper method which synchronously loads the data with the given key from
   * the given bucketID. This method first checks the uncommitted cache of the operator
   * before attempting to load the data from HDFS.
   *
   * @param bucketID The bucketID from which to load data.
   * @param keySlice The key for which to load data.
   * @return The value of the data with the given key.
   */
  public byte[] load(long bucketID, Slice keySlice)
  {
    byte[] val = getUncommitted(bucketID, keySlice);

    if (val == null) {
      try {
        val = get(bucketID, keySlice);
      } catch (IOException ex) {
        throw new RuntimeException(ex);
      }
    }

    return val;
  }

  /**
   * This method determines the partitionID that the given {@link Aggregate} belongs to. This method
   * is called by the operator's stream codec.
   *
   * @param inputEvent The {@link Aggregate} whose partitionID needs to be determined.
   * @return The id of the partition that the given {@link Aggregate} belongs to.
   */
  public int getPartitionGAE(Aggregate inputEvent)
  {
    return inputEvent.getBucketID();
  }

  /**
   * This method stores the given {@link Aggregate} into HDHT.
   *
   * @param gae The {@link Aggregate} to store into HDHT.
   */
  public void putGAE(Aggregate gae)
  {
    try {
      put(getBucketForSchema(gae.getSchemaID()), new Slice(codec.getKeyBytes(gae)), codec.getValueBytes(gae));
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * This is a helper method which writes out the store's format version.
   *
   * @param bucket The bucketID to write out the store's format version to.
   * @throws IOException
   */
  public void putStoreFormatVersion(long bucket) throws IOException
  {
    put(bucket, STORE_FORMAT_KEY, STORE_FORMAT_VERSION_BYTES);
  }

  @Override
  public void beginWindow(long windowId)
  {
    currentWindowID = windowId;

    super.beginWindow(windowId);

    if (!readMetaData) {
      //Note: reading only seems to work between begin and end window in the unit tests.
      //This code should only be executed once, when the operator starts.
      //Read meta data, such as the committed windowID, when the operator starts.
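      //Recovery note: after an operator restart the platform may replay windows that were already written
      //to HDHT. The committed window ID recovered here is kept in futureBuckets and consulted in
      //processEvent(), so tuples from those already-persisted windows are skipped rather than double-counted.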
      for (Long bucket : buckets) {
        byte[] windowIDValueBytes;
        //Load the committed windowID from the bucket.
        windowIDValueBytes = load(bucket, WINDOW_ID_KEY);

        if (windowIDValueBytes == null) {
          continue;
        }

        long committedWindowID = GPOUtils.deserializeLong(windowIDValueBytes, new MutableInt(0));
        futureBuckets.put(bucket, committedWindowID);
      }

      //Write the store format version out to each bucket.
      for (Long bucket : buckets) {
        try {
          LOG.debug("Writing out store format version to bucket {}", bucket);
          putStoreFormatVersion(bucket);
        } catch (IOException ex) {
          throw new RuntimeException(ex);
        }
      }

      readMetaData = true;
    }

    initializeEmbedIdentifierToEventKeys();
  }

  @Override
  protected void processEvent(Aggregate gae)
  {
    LOG.debug("Before event key {}", gae.getEventKey());
    int schemaID = gae.getSchemaID();
    int ddID = gae.getDimensionDescriptorID();
    int aggregatorID = gae.getAggregatorID();

    AggregationIdentifier aggregationIdentifier = new AggregationIdentifier(schemaID, ddID, aggregatorID);
    Set<EventKey> embedEventKeys = embedIdentifierToEventKeys.get(aggregationIdentifier);

    FieldsDescriptor keyFieldsDescriptor = getKeyDescriptor(schemaID, ddID);
    FieldsDescriptor valueFieldsDescriptor = getValueDescriptor(schemaID, ddID, aggregatorID);

    gae.getKeys().setFieldDescriptor(keyFieldsDescriptor);

    if (valueFieldsDescriptor == null) {
      LOG.info("ids for failure {} {} {}", schemaID, ddID, aggregatorID);
    }

    gae.getAggregates().setFieldDescriptor(valueFieldsDescriptor);

    //Skip data for buckets with greater committed window IDs.
    if (!futureBuckets.isEmpty()) {
      long bucket = getBucketForSchema(schemaID);
      Long committedWindowID = futureBuckets.get(bucket);

      if (committedWindowID != null && currentWindowID <= committedWindowID) {
        LOG.debug("Skipping");
        return;
      }
    }

    GPOMutable metaData = gae.getMetaData();

    IncrementalAggregator aggregator = getAggregator(gae.getAggregatorID());

    if (metaData != null) {
      metaData.setFieldDescriptor(aggregator.getMetaDataDescriptor());
      metaData.applyObjectPayloadFix();
    }

    LOG.debug("Event key {}", gae.getEventKey());
    Aggregate aggregate = cache.get(gae.getEventKey());

    if (aggregate == null) {
      aggregate = load(gae.getEventKey());

      if (aggregate != null) {
        cache.put(aggregate.getEventKey(), aggregate);
        if (embedEventKeys != null) {
          embedEventKeys.add(aggregate.getEventKey());
        }
      }
    }

    if (aggregate == null) {
      cache.put(gae.getEventKey(), gae);
      if (embedEventKeys != null) {
        embedEventKeys.add(gae.getEventKey());
      }
    } else {
      LOG.debug("Aggregating input");
      aggregator.aggregate(aggregate, gae);
      if (embedEventKeys != null) {
        embedEventKeys.add(gae.getEventKey());
      }
    }
  }

  @Override
  public void endWindow()
  {
    //Write out the last committed window ID for each bucket.
    byte[] currentWindowIDBytes = GPOUtils.serializeLong(currentWindowID);

    for (Long bucket : buckets) {
      Long committedWindowID = futureBuckets.get(bucket);

      if (committedWindowID == null || committedWindowID <= currentWindowID) {
        futureBuckets.remove(bucket);

        try {
          put(bucket, WINDOW_ID_KEY, currentWindowIDBytes);
        } catch (IOException ex) {
          throw new RuntimeException(ex);
        }
      }
    }

    cacheWindowCount++;

    handleTopBottomAggregators();

    //Write out the contents of the cache.
    for (Map.Entry<EventKey, Aggregate> entry : cache.entrySet()) {
      putGAE(entry.getValue());
    }

    emitUpdates();

    if (cacheWindowCount == cacheWindowDuration) {
      //Clear the cache if the cache window duration is reached.
      cache.clear();
      cacheWindowCount = 0;
    }

    cleanupEmbedIdentifierToEventKeys();

    super.endWindow();
  }
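  //Per-window lifecycle, as implemented above: beginWindow() reads per-bucket metadata once and prepares
  //the embedded-identifier bookkeeping; processEvent() merges each incoming Aggregate into the in-memory
  //cache (loading the previously stored value from HDHT on a cache miss); endWindow() records the committed
  //window ID per bucket, runs the composite (top/bottom) aggregators, writes the cache contents to HDHT,
  //emits updates, and clears the cache every cacheWindowDuration windows.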
  /**
   * This method initializes embedIdentifierToEventKeys.
   * It computes the {@link AggregationIdentifier}s for all composite aggregators and uses them as
   * the keys of embedIdentifierToEventKeys.
   */
  protected void initializeEmbedIdentifierToEventKeys()
  {
    if (embedIdentifierToEventKeys != null) {
      return;
    }

    embedIdentifierToEventKeys = Maps.newHashMap();
    Map<Integer, AbstractTopBottomAggregator> topBottomAggregatorIdToInstance = getTopBottomAggregatorIdToInstance();
    if (topBottomAggregatorIdToInstance != null) {
      Set<AggregationIdentifier> allIdentifiers = Sets.newHashSet();
      for (Map.Entry<Integer, AbstractTopBottomAggregator> entry : topBottomAggregatorIdToInstance.entrySet()) {
        allIdentifiers.addAll(getDependedIncrementalAggregationIdentifiers(entry.getValue()));
      }

      for (AggregationIdentifier identifier : allIdentifiers) {
        embedIdentifierToEventKeys.put(identifier, Sets.<EventKey>newHashSet());
      }
    }
  }

  /**
   * Only clean up the values; keep the entries for next time.
   */
  protected void cleanupEmbedIdentifierToEventKeys()
  {
    if (embedIdentifierToEventKeys == null) {
      return;
    }

    for (Set<EventKey> eventKeys : embedIdentifierToEventKeys.values()) {
      eventKeys.clear();
    }
  }

  /**
   * In case the embedded aggregator is an OTF aggregator, get the identifiers of the incremental
   * aggregators it depends on.
   *
   * @param topBottomAggregator
   * @return
   */
  protected abstract Set<AggregationIdentifier> getDependedIncrementalAggregationIdentifiers(
      AbstractTopBottomAggregator topBottomAggregator);

  /**
   * Input: the aggregates of the incremental aggregators from the cache.
   * Output: a new cache is created instead of sharing the cache of the incremental aggregators.
   */
  protected void handleTopBottomAggregators()
  {
    Map<Integer, AbstractTopBottomAggregator> topBottomAggregatorIdToInstance = getTopBottomAggregatorIdToInstance();
    if (topBottomAggregatorIdToInstance == null) {
      return;
    }

    for (AbstractTopBottomAggregator aggregator : topBottomAggregatorIdToInstance.values()) {
      Set<AggregationIdentifier> embedAggregatorIdentifiers =
          getDependedIncrementalAggregationIdentifiers(aggregator);
      String embedAggregatorName = aggregator.getEmbedAggregatorName();

      if (isIncrementalAggregator(embedAggregatorName)) {
        //The embedded aggregator is an incremental aggregator.
        for (AggregationIdentifier embedAggregatorIdentifier : embedAggregatorIdentifiers) {
          Set<EventKey> eventKeysForIdentifier = embedIdentifierToEventKeys.get(embedAggregatorIdentifier);
          if (eventKeysForIdentifier.isEmpty()) {
            continue;
          }

          //Group the event keys.
          Map<EventKey, Set<EventKey>> compositeEventKeyToEmbedEventKeys = groupEventKeysByCompositeEventKey(
              aggregator, eventKeysForIdentifier);

          for (Map.Entry<EventKey, Set<EventKey>> compositeEventKeyEntry : compositeEventKeyToEmbedEventKeys
              .entrySet()) {
            aggregateComposite(aggregator, compositeEventKeyEntry.getKey(), compositeEventKeyEntry.getValue(), cache);
          }
        }
      } else {
        //The embedded aggregator is an OTF aggregator.
        Map<EventKey, Aggregate> oftEventKeyToAggregate = computeOTFAggregates(aggregator, embedAggregatorName,
            getOTFAggregatorByName(embedAggregatorName));

        //Group the events by composite event key.
        Map<EventKey, Set<EventKey>> compositeEventKeyToEmbedEventKeys = groupEventKeysByCompositeEventKey(
            aggregator, oftEventKeyToAggregate.keySet());

        for (Map.Entry<EventKey, Set<EventKey>> compositeEventKeyEntry : compositeEventKeyToEmbedEventKeys
            .entrySet()) {
          aggregateComposite(aggregator, compositeEventKeyEntry.getKey(), compositeEventKeyEntry.getValue(),
              oftEventKeyToAggregate);
        }
      }
    }
  }

  protected abstract boolean isIncrementalAggregator(String aggregatorName);

  protected abstract OTFAggregator getOTFAggregatorByName(String otfAggregatorName);
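  //Illustration of the composite flow above, assuming a hypothetical schema with key fields
  //{location, product} and a TOPN composite whose embedded aggregator is SUM over the "product"
  //sub-combination: the embedded SUM aggregates are keyed by (location, product); they are grouped by the
  //composite key (location) via groupEventKeysByCompositeEventKey(), and aggregateComposite() then keeps
  //the top N products per location in the composite aggregate.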
  /**
   * Group the event keys of the embedded aggregator by the event key of the composite aggregator.
   * The composite event keys should have fewer fields than the embedded aggregator event keys.
   *
   * @param compositeAggregator
   * @param embedAggregatorEventKeys
   * @return
   */
  protected Map<EventKey, Set<EventKey>> groupEventKeysByCompositeEventKey(
      AbstractTopBottomAggregator compositeAggregator, Set<EventKey> embedAggregatorEventKeys)
  {
    Map<EventKey, Set<EventKey>> groupedEventKeys = Maps.newHashMap();
    for (EventKey embedEventKey : embedAggregatorEventKeys) {
      EventKey compositeEventKey = createCompositeEventKey(compositeAggregator, embedEventKey);
      Set<EventKey> embedEventKeys = groupedEventKeys.get(compositeEventKey);
      if (embedEventKeys == null) {
        embedEventKeys = Sets.newHashSet();
        groupedEventKeys.put(compositeEventKey, embedEventKeys);
      }
      embedEventKeys.add(embedEventKey);
    }

    return groupedEventKeys;
  }

  /**
   * Create the event key for the composite aggregate based on the event key of the embedded aggregator.
   *
   * @param compositeAggregator
   * @param embedEventKey
   * @return
   */
  protected EventKey createCompositeEventKey(AbstractTopBottomAggregator compositeAggregator, EventKey embedEventKey)
  {
    return createCompositeEventKey(embedEventKey.getBucketID(), embedEventKey.getSchemaID(),
        compositeAggregator.getDimensionDescriptorID(), compositeAggregator.getAggregatorID(),
        compositeAggregator.getFields(), embedEventKey);
  }

  /**
   * The composite event keys should have fewer fields than the embedded aggregator event keys.
   *
   * @param bucketId
   * @param schemaId
   * @param dimensionDescriptorId
   * @param aggregatorId
   * @param compositeFieldNames
   * @param embedEventKey
   * @return
   */
  protected EventKey createCompositeEventKey(int bucketId, int schemaId, int dimensionDescriptorId, int aggregatorId,
      Set<String> compositeFieldNames, EventKey embedEventKey)
  {
    final GPOMutable embedKey = embedEventKey.getKey();
    GPOMutable key = cloneGPOMutableLimitToFields(embedKey, compositeFieldNames);

    return new EventKey(bucketId, schemaId, dimensionDescriptorId, aggregatorId, key);
  }

  /**
   * Clone a GPOMutable limited to the selected field names.
   * TODO: this method could be moved to GPOUtils.
   *
   * @param orgGpo
   * @param fieldNames
   * @return
   */
  public static GPOMutable cloneGPOMutableLimitToFields(GPOMutable orgGpo, Set<String> fieldNames)
  {
    Set<String> fieldsIncludeTime = Sets.newHashSet();
    fieldsIncludeTime.addAll(fieldNames);
    fieldsIncludeTime.add("timeBucket");
    fieldsIncludeTime.add("time");
    return new GPOMutable(orgGpo, new Fields(fieldsIncludeTime));
  }

  protected Aggregate fetchOrLoadAggregate(EventKey eventKey)
  {
    Aggregate aggregate = cache.get(eventKey);
    if (aggregate != null) {
      return aggregate;
    }

    aggregate = load(eventKey);
    if (aggregate != null) {
      cache.put(eventKey, aggregate);
    }
    return aggregate;
  }

  //This is a temporary map; a member variable is used only so the instance can be reused across calls.
  protected transient Map<EventKey, Aggregate> eventKeyToAggregate = Maps.newHashMap();

  protected Map<EventKey, Aggregate> getAggregates(Set<EventKey> eventKeys)
  {
    eventKeyToAggregate.clear();
    for (EventKey eventKey : eventKeys) {
      eventKeyToAggregate.put(eventKey, cache.get(eventKey));
    }
    return eventKeyToAggregate;
  }
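  //Composite key derivation, illustrated with hypothetical field names: if an embedded event key has the
  //fields {time, timeBucket, location, product} and the composite aggregator's fields are {location}, then
  //createCompositeEventKey()/cloneGPOMutableLimitToFields() above produce a composite key containing
  //{time, timeBucket, location}; "time" and "timeBucket" are always retained.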
  /**
   * @param aggregator The composite aggregator for the aggregation.
   * @param compositeEventKey The composite event key, used to locate the target/destination aggregate.
   * @param inputEventKeys The input (incremental) event keys, used to locate the input aggregates.
   * @param inputEventKeyToAggregate The repository of input event key to aggregate;
   * inputEventKeyToAggregate.keySet() should be a superset of inputEventKeys.
   */
  protected void aggregateComposite(AbstractTopBottomAggregator aggregator, EventKey compositeEventKey,
      Set<EventKey> inputEventKeys, Map<EventKey, Aggregate> inputEventKeyToAggregate)
  {
    Aggregate resultAggregate = fetchOrLoadAggregate(compositeEventKey);

    if (resultAggregate == null) {
      resultAggregate = new Aggregate(compositeEventKey, new GPOMutable(aggregator.getAggregateDescriptor()));
      cache.put(compositeEventKey, resultAggregate);
    }

    aggregator.aggregate(resultAggregate, inputEventKeys, inputEventKeyToAggregate);
  }

  public Aggregate createAggregate(EventKey eventKey, DimensionsConversionContext context, int aggregatorIndex)
  {
    GPOMutable aggregates = new GPOMutable(context.aggregateDescriptor);
    Aggregate aggregate = new Aggregate(eventKey, aggregates);
    aggregate.setAggregatorIndex(aggregatorIndex);

    return aggregate;
  }

  protected abstract List<String> getOTFChildrenAggregatorNames(String oftAggregatorName);

  /**
   * Compute the result of the OTF aggregates. The input aggregate values come from the cache.
   *
   * @param aggregator the composite aggregator of this OTF aggregator
   * @param oftAggregatorName
   * @param oftAggregator
   * @return
   */
  protected Map<EventKey, Aggregate> computeOTFAggregates(AbstractTopBottomAggregator aggregator,
      String oftAggregatorName, OTFAggregator oftAggregator)
  {
    Set<AggregationIdentifier> dependedIncrementalAggregatorIdentifiers =
        getDependedIncrementalAggregationIdentifiers(aggregator);

    List<String> childrenAggregator = getOTFChildrenAggregatorNames(oftAggregatorName);
    Map<Integer, Integer> childAggregatorIdToIndex = Maps.newHashMap();
    int index = 0;
    for (String childAggregatorName : childrenAggregator) {
      childAggregatorIdToIndex.put(this.getIncrementalAggregatorID(childAggregatorName), index++);
    }

    //Get the event keys for the children of the OTF aggregator.
    List<Set<EventKey>> childrenEventKeysByAggregator = Lists.newArrayList();
    for (AggregationIdentifier identifier : dependedIncrementalAggregatorIdentifiers) {
      Set<EventKey> eventKeys = embedIdentifierToEventKeys.get(identifier);
      if (eventKeys != null && !eventKeys.isEmpty()) {
        childrenEventKeysByAggregator.add(eventKeys);
      }
    }

    if (childrenEventKeysByAggregator.isEmpty()) {
      return Collections.emptyMap();
    }

    //Arrange the EventKeys by key value; the OTF aggregator requires the aggregates of the same event key
    //for its computation.
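    //For example, an on-the-fly average would typically be derived from SUM and COUNT children (the
    //concrete children come from getOTFChildrenAggregatorNames()). For each key value, one EventKey per
    //child aggregator is collected below, and the children's aggregates are passed to
    //oftAggregator.aggregate() in the order given by childAggregatorIdToIndex.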
    List<List<EventKey>> childrenEventKeysByKeyValue = Lists.newArrayList();
    Set<EventKey> eventKeys = childrenEventKeysByAggregator.get(0);
    for (EventKey eventKey : eventKeys) {
      List<EventKey> eventKeysOfOneKeyValue = Lists.newArrayList();
      eventKeysOfOneKeyValue.add(eventKey);

      //The eventKey is from list(0); get one event key with the same key from each other entry of the list.
      final GPOMutable key = eventKey.getKey();
      for (int i = 1; i < childrenEventKeysByAggregator.size(); ++i) {
        Set<EventKey> eventKeysOfOneAggregator = childrenEventKeysByAggregator.get(i);
        if (eventKeysOfOneAggregator == null) {
          continue;
        }

        EventKey matchedEventKey = null;
        for (EventKey ek : eventKeysOfOneAggregator) {
          if (key.equals(ek.getKey())) {
            matchedEventKey = ek;
            break;
          }
        }
        if (matchedEventKey == null) {
          //Can't find a matched event key; ignore.
          continue;
        }
        eventKeysOfOneKeyValue.add(matchedEventKey);
      }

      if (eventKeysOfOneKeyValue.size() != childrenAggregator.size()) {
        LOG.warn("The fetched size is " + eventKeysOfOneKeyValue.size() + ", not same as expected size "
            + childrenAggregator.size());
      } else {
        childrenEventKeysByKeyValue.add(eventKeysOfOneKeyValue);
      }
    }

    Map<EventKey, Aggregate> compositeInputAggregates = Maps.newHashMap();
    GPOMutable[] srcValues = new GPOMutable[childrenEventKeysByKeyValue.get(0).size()];
    for (List<EventKey> sameKeyEvents : childrenEventKeysByKeyValue) {
      for (EventKey ek : sameKeyEvents) {
        //The values passed to the OTF aggregator should be ordered by the depended aggregators.
        srcValues[childAggregatorIdToIndex.get(ek.getAggregatorID())] = cache.get(ek).getAggregates();
      }
      GPOMutable result = oftAggregator.aggregate(srcValues);
      compositeInputAggregates.put(sameKeyEvents.get(0), new Aggregate(sameKeyEvents.get(0), result));
    }
    return compositeInputAggregates;
  }

  /**
   * Get all composite aggregators.
   *
   * @return Map of aggregator id to top bottom aggregator
   */
  protected abstract Map<Integer, AbstractTopBottomAggregator> getTopBottomAggregatorIdToInstance();

  /**
   * This method is called in {@link #endWindow} and emits updated aggregates. Override
   * this method if you want to control whether or not updates are emitted.
   */
  protected void emitUpdates()
  {
    if (updates.isConnected()) {
      for (Map.Entry<EventKey, Aggregate> entry : cache.entrySet()) {
        updates.emit(entry.getValue());
      }
    }
  }

  @Override
  public HDHTCodec<Aggregate> getCodec()
  {
    return new GenericAggregateEventCodec();
  }

  /**
   * Returns the cacheWindowDuration.
   *
   * @return The cacheWindowDuration.
   */
  public int getCacheWindowDuration()
  {
    return cacheWindowDuration;
  }
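  //Sizing note: the aggregate cache is only cleared every cacheWindowDuration windows (see endWindow()),
  //so a larger value trades memory for fewer load() round trips to HDHT, while a smaller value keeps both
  //the cache and the set of aggregates re-emitted on the updates port smaller.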
  /**
   * Sets the cacheWindowDuration which determines the number of windows for which
   * data is held in this operator's cache.
   *
   * @param cacheWindowDuration The number of windows for which data is held in this operator's cache.
   */
  public void setCacheWindowDuration(int cacheWindowDuration)
  {
    this.cacheWindowDuration = cacheWindowDuration;
  }

  /**
   * @return the minTimestamp
   */
  @Unstable
  public Long getMinTimestamp()
  {
    return minTimestamp;
  }

  /**
   * @param minTimestamp the minTimestamp to set
   */
  @Unstable
  public void setMinTimestamp(Long minTimestamp)
  {
    this.minTimestamp = minTimestamp;
  }

  /**
   * @return the maxTimestamp
   */
  @Unstable
  public Long getMaxTimestamp()
  {
    return maxTimestamp;
  }

  /**
   * @param maxTimestamp the maxTimestamp to set
   */
  @Unstable
  public void setMaxTimestamp(Long maxTimestamp)
  {
    this.maxTimestamp = maxTimestamp;
  }

  /**
   * @return the useSystemTimeForLatestTimeBuckets
   */
  @Unstable
  public boolean isUseSystemTimeForLatestTimeBuckets()
  {
    return useSystemTimeForLatestTimeBuckets;
  }

  /**
   * @param useSystemTimeForLatestTimeBuckets the useSystemTimeForLatestTimeBuckets to set
   */
  @Unstable
  public void setUseSystemTimeForLatestTimeBuckets(boolean useSystemTimeForLatestTimeBuckets)
  {
    this.useSystemTimeForLatestTimeBuckets = useSystemTimeForLatestTimeBuckets;
  }

  /**
   * This is a codec which defines how data is serialized to HDHT. This codec is effectively
   * a proxy which calls the operator's overridable {@link #getKeyBytesGAE}, {@link #getValueBytesGAE},
   * {@link #fromKeyValueGAE}, and {@link #getPartitionGAE} methods.
   */
  class GenericAggregateEventCodec extends KryoSerializableStreamCodec<Aggregate>
      implements HDHTCodec<Aggregate>
  {
    private static final long serialVersionUID = 201503170256L;

    /**
     * Creates the codec.
     */
    public GenericAggregateEventCodec()
    {
      //Do nothing
    }

    @Override
    public byte[] getKeyBytes(Aggregate gae)
    {
      return getKeyBytesGAE(gae);
    }

    @Override
    public byte[] getValueBytes(Aggregate gae)
    {
      return getValueBytesGAE(gae);
    }

    @Override
    public Aggregate fromKeyValue(Slice key, byte[] value)
    {
      return fromKeyValueGAE(key, value);
    }

    @Override
    public int getPartition(Aggregate gae)
    {
      return getPartitionGAE(gae);
    }
  }

  @Override
  public void addQuery(HDSQuery query)
  {
    super.addQuery(query);
  }

  /**
   * Gets the currently issued {@link HDSQuery}s.
   *
   * @return The currently issued {@link HDSQuery}s.
   */
  public Map<Slice, HDSQuery> getQueries()
  {
    return this.queries;
  }

  private static final Logger LOG = LoggerFactory.getLogger(DimensionsStoreHDHT.class);
}
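//A typical wiring, sketched with hypothetical names (MyDimensionsStore for a concrete subclass of this
//class, and dimensionsComputation for the upstream dimensions computation operator mentioned in the class
//javadoc); this assumes the single input port inherited from AbstractSinglePortHDHTWriter is named "input"
//and the upstream operator exposes an output port named "output":
//
//  MyDimensionsStore store = dag.addOperator("Store", new MyDimensionsStore());
//  dag.addStream("Aggregates", dimensionsComputation.output, store.input);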