// Java tutorial
/** * Licensed to Odiago, Inc. under one or more contributor license * agreements. See the NOTICE.txt file distributed with this work for * additional information regarding copyright ownership. Odiago, Inc. * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the * License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package com.odiago.flumebase.exec; import java.io.IOException; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.cloudera.flume.core.Event; import com.odiago.flumebase.lang.TimeSpan; import com.odiago.flumebase.parser.TypedField; import com.odiago.flumebase.parser.WindowSpec; import com.odiago.flumebase.plan.HashJoinNode; import com.odiago.flumebase.util.WindowedHashMap; /** * FlowElement that performs a hash join between two input streams * based on equality of a specific input key. */ public class HashJoinElement extends FlowElementImpl { private static final Logger LOG = LoggerFactory.getLogger(HashJoinElement.class.getName()); /** * HashMap containing enqueued elements of the left stream within the * current window. */ private WindowedHashMap<Object, EventWrapper, Long> mLeftMap; /** * HashMap containing enqueued elements of the right stream within the * current window. */ private WindowedHashMap<Object, EventWrapper, Long> mRightMap; /** Name of the left-side stream. */ private String mLeftName; /** Name of the right-side stream. 
*/ private String mRightName; /** Name of the key field from the left stream. */ private TypedField mLeftKey; /** Name of the key field from the right stream. */ private TypedField mRightKey; /** Window specification in which we are joining. */ private WindowSpec mWindowWidth; /** * The actual time interval over which we're doing the join; derived * from mWindowWidth. */ private TimeSpan mTimeSpan; /** Name of the output stream. */ private String mOutName; /** * Mapping from field names to indices in CompositeEventWrapper arrays * describing the output events from this join operation. */ private Map<String, Integer> mFieldMap; /** * The amount of slack time we provide before we evict old elements. */ private int mSlackTime; public HashJoinElement(FlowElementContext ctxt, String leftName, String rightName, TypedField leftKey, TypedField rightKey, WindowSpec windowWidth, String outName, List<TypedField> leftFieldNames, List<TypedField> rightFieldNames, Configuration conf) { super(ctxt); mSlackTime = conf.getInt(BucketedAggregationElement.SLACK_INTERVAL_KEY, BucketedAggregationElement.DEFAULT_SLACK_INTERVAL); if (mSlackTime < 0) { mSlackTime = BucketedAggregationElement.DEFAULT_SLACK_INTERVAL; } mLeftMap = new WindowedHashMap<Object, EventWrapper, Long>(); mRightMap = new WindowedHashMap<Object, EventWrapper, Long>(); mLeftName = leftName; mRightName = rightName; mLeftKey = leftKey; mRightKey = rightKey; mWindowWidth = windowWidth; try { assert mWindowWidth.getRangeSpec().isConstant(); mTimeSpan = (TimeSpan) mWindowWidth.getRangeSpec().eval(new EmptyEventWrapper()); } catch (IOException ioe) { // This should be a constant expression, so this would be quite surprising. 
LOG.error("Unexpected IOE during timespan eval() in HashJoin: " + ioe); } mOutName = outName; initFieldMap(leftFieldNames, rightFieldNames); } public HashJoinElement(FlowElementContext ctxt, HashJoinNode joinNode) { this(ctxt, joinNode.getLeftName(), joinNode.getRightName(), joinNode.getLeftKey(), joinNode.getRightKey(), joinNode.getWindowWidth(), joinNode.getOutputName(), joinNode.getLeftFields(), joinNode.getRightFields(), joinNode.getConf()); } /** * Initialize the map we install in every output CompositeEventWrapper. * This describes which of the nested EventWrappers contains each field of * the joined record. We compute this once and then reuse it in each output * event; we always use the ordered list [leftStream, rightStream] in the wrapped * list. */ private void initFieldMap(List<TypedField> leftFields, List<TypedField> rightFields) { mFieldMap = new HashMap<String, Integer>(); // Left EventWrapper has index 0... for (TypedField f : leftFields) { mFieldMap.put(f.getAvroName(), 0); } // Right EventWrapper has index 1. for (TypedField f : rightFields) { mFieldMap.put(f.getAvroName(), 1); } mFieldMap = Collections.unmodifiableMap(mFieldMap); } @Override public void takeEvent(EventWrapper e) throws IOException, InterruptedException { Event event = e.getEvent(); // Determine which stream the event is from; this determines which map we // place the event in, and which map we check for candidate join matches. String streamName = e.getAttr(STREAM_NAME_ATTR); if (null == streamName) { // We don't know which stream this came from. Don't process it. LOG.warn("Got event with no " + STREAM_NAME_ATTR + " attribute!"); return; } WindowedHashMap<Object, EventWrapper, Long> insertMap; // Map where we insert this event. WindowedHashMap<Object, EventWrapper, Long> joinMap; // Map we pull join candidates from. TypedField keyField; // The field to grab from the event wrapper. 
boolean isLeft; if (streamName.equals(mLeftName)) { insertMap = mLeftMap; joinMap = mRightMap; keyField = mLeftKey; isLeft = true; } else if (streamName.equals(mRightName)) { insertMap = mRightMap; joinMap = mLeftMap; keyField = mRightKey; isLeft = false; } else { // Not from either stream? LOG.warn("Got event with unexpected " + STREAM_NAME_ATTR + "=" + streamName); return; // Don't know what to do with this. } // Look up elements from the opposite map to determine what joins we can perform. Object key = e.getField(keyField); if (null == key) { // The key field is null; this will not match to anything in an inner join. return; } assert mTimeSpan.isRelative; long curTime = event.getTimestamp(); Long lo; Long hi; if (isLeft) { // If this event is from the left stream, calculate the relative time interval normally. lo = curTime + mTimeSpan.lo; hi = curTime + mTimeSpan.hi; } else { // If this event is from the right stream, use the "mirror image" of the timespan. // "RANGE INTERVAL 10 MINUTES PRECEDING" actually means, join with the /next/ 10 // minutes of data from this perspective. lo = curTime - mTimeSpan.hi; hi = curTime - mTimeSpan.lo; } LOG.debug("Working on key: " + key + ", isLeft=" + isLeft); LOG.debug("Timestamp=" + curTime + ", interval=" + lo + ", " + hi); // Join with all the events in the window. List<EventWrapper> joinEvents = joinMap.getRange(key, lo, hi, isLeft, !isLeft); for (EventWrapper joinWrapper : joinEvents) { CompositeEvent outEvent = new CompositeEvent(mFieldMap, event.getPriority(), event.getTimestamp(), event.getNanos(), event.getHost()); CompositeEventWrapper outWrapper = new CompositeEventWrapper(); if (isLeft) { outEvent.add(e); outEvent.add(joinWrapper); } else { // Add the left event to the composite first. // Order matters due to the fixed mFieldMap. outEvent.add(joinWrapper); outEvent.add(e); } outEvent.setAttr(STREAM_NAME_ATTR, mOutName); // set the output stream name. 
outWrapper.reset(outEvent); emit(outWrapper); } // Save the event for joining with other events that arrive in the future. insertMap.put(key, e, curTime); // Remove entries from the join target map that are behind the current // window, to keep the window maps from overfilling. // Anything behind the 'lo' value can be removed. joinMap.removeOlderThan(lo - mSlackTime); // If we get lots of records on one side of the join but no records // on the other side for an extended period of time, we won't be culling the // correct map. Given 'lo' calculated from the perspective of oldest entry in // the other map, remove obsolete values from insertMap. Calculating based // on the oldest entry in the other map ensures that we are not discarding // values that we cannot process yet because one stream is delayed. Long oldestInOtherMap = joinMap.oldestTimestamp(); if (null != oldestInOtherMap) { Long otherMapLo; if (isLeft) { otherMapLo = oldestInOtherMap - mTimeSpan.hi; } else { otherMapLo = oldestInOtherMap + mTimeSpan.lo; } LOG.debug("otherMapLo=" + otherMapLo); insertMap.removeOlderThan(otherMapLo - mSlackTime); } } }