Java tutorial
/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.execution.steps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.atomic.AtomicBoolean;

import org.diqube.data.column.ColumnShard;
import org.diqube.data.types.dbl.DoubleColumnShard;
import org.diqube.data.types.lng.LongColumnShard;
import org.diqube.data.types.str.StringColumnShard;
import org.diqube.execution.ColumnVersionManager;
import org.diqube.execution.consumers.AbstractThreadedGroupIntermediaryAggregationConsumer;
import org.diqube.execution.consumers.ColumnBuiltConsumer;
import org.diqube.execution.consumers.ColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.DoneConsumer;
import org.diqube.execution.consumers.GenericConsumer;
import org.diqube.execution.consumers.GroupFinalAggregationConsumer;
import org.diqube.execution.consumers.GroupIntermediaryAggregationConsumer;
import org.diqube.execution.exception.ExecutablePlanExecutionException;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.VersionedExecutionEnvironment;
import org.diqube.function.AggregationFunction;
import org.diqube.function.FunctionException;
import org.diqube.function.FunctionFactory;
import org.diqube.function.IntermediaryResult;
import org.diqube.loader.columnshard.ColumnShardBuilderFactory;
import org.diqube.loader.columnshard.SparseColumnShardBuilder;
import org.diqube.queries.QueryRegistry;
import org.diqube.util.Triple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;

/**
 * Receives {@link IntermediaryResult}s provided by {@link GroupIntermediaryAggregationStep}s and combines them to
 * actual values. The resulting column that is built (of which the outputs are informed by a {@link ColumnBuiltConsumer}
 * ) will contain just enough rows to contain all group results (groupId = rowId, finding the max of it). If a
 * {@link ColumnVersionBuiltConsumer} is wired, intermediary columns will be built.
 *
 * <p>
 * Input: 1 {@link GroupIntermediaryAggregationConsumer} <br>
 * Output: {@link GroupFinalAggregationConsumer}, {@link ColumnBuiltConsumer}, {@link ColumnVersionBuiltConsumer}
 *
 * @author Bastian Gloeckle
 */
public class GroupFinalAggregationStep extends AbstractThreadedExecutablePlanStep {
  private static final Logger logger = LoggerFactory.getLogger(GroupFinalAggregationStep.class);

  /**
   * Fraction of all known groups that must have changed since the last intermediary column version before a new
   * version is built on a run that is not the last one.
   */
  private static final double MIN_CHANGED_GROUP_RATIO_FOR_NEW_VERSION = 0.05;

  /** Set to true by the input consumer as soon as all sources reported that they are done. */
  private final AtomicBoolean sourceIsDone = new AtomicBoolean(false);

  /**
   * Updates collected by the input consumer that were not yet processed by {@link #execute()}: group ID, old
   * intermediary result (may be null), new intermediary result.
   */
  private final ConcurrentLinkedDeque<Triple<Long, IntermediaryResult, IntermediaryResult>> groupIntermediaryUpdates =
      new ConcurrentLinkedDeque<>();

  /** Input consumer: queues every update that targets {@link #outputColName} and records when the sources are done. */
  private final AbstractThreadedGroupIntermediaryAggregationConsumer groupIntermediaryConsumer =
      new AbstractThreadedGroupIntermediaryAggregationConsumer(this) {
        @Override
        protected void allSourcesAreDone() {
          GroupFinalAggregationStep.this.sourceIsDone.set(true);
        }

        @Override
        protected void doConsumeIntermediaryAggregationResult(long groupId, String colName,
            IntermediaryResult oldIntermediaryResult, IntermediaryResult newIntermediaryResult) {
          // Only updates for the column this step builds are of interest.
          if (newIntermediaryResult.getOutputColName().equals(outputColName)) {
            groupIntermediaryUpdates.add(new Triple<>(groupId, oldIntermediaryResult, newIntermediaryResult));
          }
        }
      };

  private ExecutionEnvironment defaultEnv;

  private FunctionFactory functionFactory;

  private String functionNameLowerCase;

  private String outputColName;

  /** One aggregation function instance per group ID, holding the intermediaries received for that group so far. */
  private final Map<Long, AggregationFunction<Object, Object>> aggregationFunctions = new HashMap<>();

  private ColumnShardBuilderFactory columnShardBuilderFactory;

  private ColumnVersionManager columnVersionManager;

  private List<Object> constantFunctionParameters;

  /** Group IDs whose value changed but which were not yet reported to any {@link ColumnVersionBuiltConsumer}. */
  private Set<Long> groupIdsChangedSinceLastOutputVersionBuilt = new HashSet<>();

  public GroupFinalAggregationStep(int stepId, QueryRegistry queryRegistry, ExecutionEnvironment defaultEnv,
      FunctionFactory functionFactory, ColumnShardBuilderFactory columnShardBuilderFactory,
      String functionNameLowerCase, String outputColName, ColumnVersionManager columnVersionManager,
      List<Object> constantFunctionParameters) {
    super(stepId, queryRegistry);
    this.defaultEnv = defaultEnv;
    this.functionFactory = functionFactory;
    this.columnShardBuilderFactory = columnShardBuilderFactory;
    this.functionNameLowerCase = functionNameLowerCase;
    this.outputColName = outputColName;
    this.columnVersionManager = columnVersionManager;
    this.constantFunctionParameters = constantFunctionParameters;
  }

  /**
   * Accepts {@link DoneConsumer}, {@link GroupFinalAggregationConsumer}, {@link ColumnBuiltConsumer} and
   * {@link ColumnVersionBuiltConsumer} as outputs.
   *
   * @throws IllegalArgumentException
   *           if the consumer is of none of these types.
   */
  @Override
  protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException {
    boolean supported = consumer instanceof DoneConsumer //
        || consumer instanceof GroupFinalAggregationConsumer //
        || consumer instanceof ColumnBuiltConsumer //
        || consumer instanceof ColumnVersionBuiltConsumer;

    if (!supported) {
      throw new IllegalArgumentException("Only GroupFinalAggregationConsumer, ColumnBuiltConsumer "
          + "and ColumnVersionBuiltConsumer supported.");
    }
  }

  /**
   * Processes all currently queued intermediary updates, reports the recalculated group values, optionally builds an
   * intermediary column version and, once the sources are done and the queue is empty, builds and stores the final
   * column and reports "done".
   */
  @Override
  protected void execute() {
    // Take only as many elements as are queued right now; updates added meanwhile are handled by a later run.
    int numberOfUpdates = groupIntermediaryUpdates.size();
    List<Triple<Long, IntermediaryResult, IntermediaryResult>> activeUpdates = new ArrayList<>(numberOfUpdates);
    for (int i = 0; i < numberOfUpdates; i++) {
      activeUpdates.add(groupIntermediaryUpdates.poll());
    }

    if (!activeUpdates.isEmpty()) {
      Set<Long> groupIdsChanged = new HashSet<>();

      for (Triple<Long, IntermediaryResult, IntermediaryResult> update : activeUpdates) {
        Long groupId = update.getLeft();
        groupIdsChanged.add(groupId);
        IntermediaryResult oldIntermediary = update.getMiddle();
        IntermediaryResult newIntermediary = update.getRight();

        logger.trace("Processing update of group {} on col {}: new {}, old {}", groupId, outputColName,
            newIntermediary, oldIntermediary);

        AggregationFunction<Object, Object> fn = aggregationFunctions.get(groupId);
        if (fn == null) {
          // First update for this group: nothing was added before, so there is no old intermediary to remove.
          fn = createAggregationFunction(newIntermediary);
          fn.addIntermediary(newIntermediary.createValueIterator());
          aggregationFunctions.put(groupId, fn);
        } else {
          // Replace the previously added intermediary (if any) by the new one.
          if (oldIntermediary != null) {
            fn.removeIntermediary(oldIntermediary.createValueIterator());
          }
          fn.addIntermediary(newIntermediary.createValueIterator());
        }
      }

      for (Long groupId : groupIdsChanged) {
        Object result = aggregationFunctions.get(groupId).calculate();
        logger.trace("New value for group {} on col {}: {}", groupId, outputColName, result);
        forEachOutputConsumerOfType(GroupFinalAggregationConsumer.class,
            c -> c.consumeAggregationResult(groupId, outputColName, result));
      }

      groupIdsChangedSinceLastOutputVersionBuilt.addAll(groupIdsChanged);
    }

    // Note: get "last run" value before building the col!
    boolean isLastRun = sourceIsDone.get() && groupIntermediaryUpdates.isEmpty();

    ColumnShard newCol = null;

    // inform ColumnVersionBuiltConsumers (and build new version of column) if there are "enough" updates - do not do
    // this too often as it is pretty time consuming.
    boolean hasNewUpdates = !activeUpdates.isEmpty();
    // The last run may not have received any new update while group IDs of earlier runs are still unreported (they
    // stayed below the threshold). Those need to be sent out, too, otherwise they would never be published.
    boolean lastRunWithPendingChanges = isLastRun && !groupIdsChangedSinceLastOutputVersionBuilt.isEmpty();

    if ((hasNewUpdates || lastRunWithPendingChanges) && existsOutputConsumerOfType(ColumnVersionBuiltConsumer.class)) {
      // TODO #2 (stats): base this on stats.
      boolean enoughChanges = groupIdsChangedSinceLastOutputVersionBuilt
          .size() >= MIN_CHANGED_GROUP_RATIO_FOR_NEW_VERSION * aggregationFunctions.size();

      // execute definitely if this is the last run: We need to send the changed group ids!
      if (enoughChanges || isLastRun) {
        logger.trace("Creating new column version of {}, changed group IDs {}", outputColName,
            groupIdsChangedSinceLastOutputVersionBuilt);

        newCol = createNewColumn();
        VersionedExecutionEnvironment newEnv = columnVersionManager.createNewVersion(newCol);

        // Hand out the current set and start a fresh one, so the consumers' set is not changed by later runs.
        Set<Long> finalGroupIdsChanged = groupIdsChangedSinceLastOutputVersionBuilt;
        forEachOutputConsumerOfType(ColumnVersionBuiltConsumer.class,
            c -> c.columnVersionBuilt(newEnv, outputColName, finalGroupIdsChanged));

        groupIdsChangedSinceLastOutputVersionBuilt = new HashSet<>();
      }
    }

    // if done, inform other consumers.
    if (isLastRun) {
      // check if there is any result at all, if not, just report "done" (below).
      if (!aggregationFunctions.isEmpty()) {
        logger.trace("Creating final grouped column {}", outputColName);
        if (newCol == null) {
          newCol = createNewColumn();
        }

        storeFinalColumn(newCol);

        forEachOutputConsumerOfType(ColumnBuiltConsumer.class, c -> c.columnBuilt(outputColName));
      }

      forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
      doneProcessing();
    }
  }

  /**
   * Creates the aggregation function for a group and provides the constant parameters to it.
   *
   * @param firstIntermediary
   *          The first intermediary received for the group; its input column type selects the function.
   * @throws ExecutablePlanExecutionException
   *           if the function factory returns no function for the name and input column type.
   */
  private AggregationFunction<Object, Object> createAggregationFunction(IntermediaryResult firstIntermediary) {
    AggregationFunction<Object, Object> fn =
        functionFactory.createAggregationFunction(functionNameLowerCase, firstIntermediary.getInputColumnType());

    if (fn == null) {
      throw new ExecutablePlanExecutionException("Cannot find function '" + functionNameLowerCase
          + "' with input data type " + firstIntermediary.getInputColumnType());
    }

    for (int i = 0; i < constantFunctionParameters.size(); i++) {
      fn.provideConstantParameter(i, constantFunctionParameters.get(i));
    }

    return fn;
  }

  /**
   * Stores the given column as temporary column in the default environment, using the store method matching its type.
   *
   * @throws ExecutablePlanExecutionException
   *           if the column has a type other than STRING, LONG or DOUBLE.
   */
  private void storeFinalColumn(ColumnShard column) {
    switch (column.getColumnType()) {
    case STRING:
      defaultEnv.storeTemporaryStringColumnShard((StringColumnShard) column);
      break;
    case LONG:
      defaultEnv.storeTemporaryLongColumnShard((LongColumnShard) column);
      break;
    case DOUBLE:
      defaultEnv.storeTemporaryDoubleColumnShard((DoubleColumnShard) column);
      break;
    default:
      // Do not announce a column as built that was never stored.
      throw new ExecutablePlanExecutionException(
          "Unsupported column type " + column.getColumnType() + " of column '" + outputColName + "'");
    }
  }

  /**
   * Builds a sparse column holding the currently calculated value of each group at the row whose ID equals the group
   * ID. The number of rows is the highest group ID plus one.
   */
  private ColumnShard createNewColumn() throws FunctionException {
    SparseColumnShardBuilder<Object> columnBuildManager =
        columnShardBuilderFactory.createSparseColumnShardBuilder(outputColName);

    Map<Long, Object> rowIdToValue = new HashMap<>();
    long maxRowId = -1;
    for (Map.Entry<Long, AggregationFunction<Object, Object>> groupEntry : aggregationFunctions.entrySet()) {
      long rowId = groupEntry.getKey();
      rowIdToValue.put(groupEntry.getKey(), groupEntry.getValue().calculate());
      if (rowId > maxRowId) {
        maxRowId = rowId;
      }
    }

    logger.trace("Values of new col (limit): {}", Iterables.limit(rowIdToValue.entrySet(), 100));

    columnBuildManager.withNumberOfRows(maxRowId + 1).withValues(rowIdToValue);
    return columnBuildManager.build();
  }

  /** @return a new mutable list containing the single input consumer of this step. */
  @Override
  protected List<GenericConsumer> inputConsumers() {
    return new ArrayList<>(Arrays.<GenericConsumer> asList(groupIntermediaryConsumer));
  }

  @Override
  protected String getAdditionalToStringDetails() {
    return "funcName=" + functionNameLowerCase + ", outputCol=" + outputColName;
  }
}