Java tutorial
/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.execution.steps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

import org.diqube.data.column.ColumnPage;
import org.diqube.data.column.ColumnShard;
import org.diqube.data.column.ColumnShardFactory;
import org.diqube.data.column.ColumnType;
import org.diqube.data.column.ConstantColumnShard;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.types.dbl.DoubleColumnShard;
import org.diqube.data.types.lng.LongColumnShard;
import org.diqube.data.types.str.StringColumnShard;
import org.diqube.execution.ColumnVersionManager;
import org.diqube.execution.consumers.AbstractThreadedColumnBuiltConsumer;
import org.diqube.execution.consumers.AbstractThreadedColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.ColumnBuiltConsumer;
import org.diqube.execution.consumers.ColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.DoneConsumer;
import org.diqube.execution.consumers.GenericConsumer;
import org.diqube.execution.exception.ExecutablePlanBuildException;
import org.diqube.execution.exception.ExecutablePlanExecutionException;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.VersionedExecutionEnvironment;
import org.diqube.executionenv.querystats.QueryableColumnShard;
import org.diqube.function.FunctionFactory;
import org.diqube.function.ProjectionFunction;
import org.diqube.loader.LoaderColumnInfo;
import org.diqube.loader.columnshard.ColumnShardBuilderFactory;
import org.diqube.loader.columnshard.ColumnShardBuilderManager;
import org.diqube.queries.QueryRegistry;
import org.diqube.util.ColumnOrValue;
import org.diqube.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;

/**
 * A step that projects values e.g. of another column.
 *
 * <p>
 * A {@link ProjectStep} basically executes a {@link ProjectionFunction} on a specific set of input parameters and
 * creates a new column out of the results.
 *
 * <p>
 * The resulting column is either a {@link StandardColumnShard} or a {@link ConstantColumnShard}, based on the input
 * parameters to the function: If they are only constants or constants and other {@link ConstantColumnShard}s, a
 * {@link ConstantColumnShard} will be built, otherwise a {@link StandardColumnShard} will be built.
 *
 * <p>
 * Input: multiple optional {@link ColumnBuiltConsumer}, multiple optional {@link ColumnVersionBuiltConsumer}. <br>
 * Output: {@link ColumnBuiltConsumer}, {@link ColumnVersionBuiltConsumer}.
 *
 * @author Bastian Gloeckle
 */
public class ProjectStep extends AbstractThreadedExecutablePlanStep {
  private static final Logger logger = LoggerFactory.getLogger(ProjectStep.class);

  /** true as soon as input ColumnBuiltConsumer has reported "done" */
  private final AtomicBoolean inputSourcesDone = new AtomicBoolean(false);

  /** only important if a ColumnBuiltConsumer is wired, contains those columns that have not yet been built fully. */
  private Set<String> columnsThatStillNeedToBeBuilt;

  /** True as soon as all columns that this projectstep relies on are built. */
  private final AtomicBoolean allColumnsBuilt = new AtomicBoolean(false);

  private AbstractThreadedColumnBuiltConsumer columnBuiltConsumer = new AbstractThreadedColumnBuiltConsumer(this) {
    @Override
    protected void doColumnBuilt(String colName) {
      columnsThatStillNeedToBeBuilt.remove(colName);

      if (columnsThatStillNeedToBeBuilt.isEmpty())
        allColumnsBuilt.set(true);
    }

    @Override
    protected void allSourcesAreDone() {
      inputSourcesDone.set(true);
    }
  };

  /** Monitor guarding {@link #newestTemporaryEnv} and {@link #newestAdjustedRowIds}. */
  private final Object newestSync = new Object();

  /**
   * Newest version of {@link VersionedExecutionEnvironment} that should be used to resolve any values while being based
   * on intermediary columns (= happens only on query master)!. Sync access using {@link #newestSync}.
   */
  private VersionedExecutionEnvironment newestTemporaryEnv = null;

  /**
   * The rowIds that have been reported as being "adjusted" since the last run of #execute(). "Adjusted" means that the
   * values of these rowIds might have changed. Sync access using {@link #newestSync}.
   */
  private Set<Long> newestAdjustedRowIds = new HashSet<>();

  private AbstractThreadedColumnVersionBuiltConsumer columnVersionBuiltConsumer =
      new AbstractThreadedColumnVersionBuiltConsumer(this) {
        @Override
        protected void allSourcesAreDone() {
          // we rely on ColumnBuiltConsumer to report the final build.
        }

        @Override
        protected void doColumnBuilt(VersionedExecutionEnvironment env, String colName, Set<Long> adjustedRowIds) {
          synchronized (newestSync) {
            // only ever move forward to a newer version of the environment.
            if (newestTemporaryEnv == null)
              newestTemporaryEnv = env;
            else if (newestTemporaryEnv.getVersion() < env.getVersion())
              newestTemporaryEnv = env;

            newestAdjustedRowIds.addAll(adjustedRowIds);
          }
        }
      };

  private ExecutionEnvironment defaultEnv;

  /** Output projected values to this column */
  private String outputColName;

  private FunctionFactory functionFactory;

  /** parameters to pass to the {@link ProjectionFunction}. */
  private ColumnOrValue[] functionParameters;

  /** function name of the function to execute */
  private String functionNameLowerCase;

  /**
   * Prepared set containing the names of the columns that show up in the input parameters of the function. Having a
   * column name in here means that the execution of the {@link ProjectionFunction} depends on this column being
   * available.
   */
  private Set<String> inputColNames;

  private Function<ColumnType, ColumnShardBuilderManager> columnShardBuilderManagerSupplier;

  private ColumnVersionManager columnVersionManager;

  private ColumnShardFactory columnShardFactory;

  private ColumnShardBuilderFactory columnShardBuilderFactory;

  /**
   * @param functionNameLowerCase
   *          name of the function to be executed
   * @param functionParameters
   *          The parameters
   * @param outputColName
   *          column to be created.
   * @param columnShardBuilderFactory
   *          factory for creating a new col.
   * @param columnVersionManager
   *          Needed in case {@link ColumnVersionBuiltConsumer} are wired and intermediate columns should be created.
   *          This is needed on query master only.
   */
  public ProjectStep(int stepId, QueryRegistry queryRegistry, ExecutionEnvironment defaultEnv,
      FunctionFactory functionFactory, String functionNameLowerCase, ColumnOrValue[] functionParameters,
      String outputColName, ColumnShardBuilderFactory columnShardBuilderFactory,
      ColumnShardFactory columnShardFactory, ColumnVersionManager columnVersionManager) {
    super(stepId, queryRegistry);
    this.defaultEnv = defaultEnv;
    this.functionFactory = functionFactory;
    this.functionNameLowerCase = functionNameLowerCase;
    this.functionParameters = functionParameters;
    this.outputColName = outputColName;
    this.columnShardBuilderFactory = columnShardBuilderFactory;
    this.columnShardFactory = columnShardFactory;
    this.columnVersionManager = columnVersionManager;
  }

  /**
   * Collects the names of all input columns, determines which of them are not yet available in the default
   * environment and prepares the supplier used to create the builder for the output column.
   */
  @Override
  public void initialize() {
    inputColNames = new HashSet<>();
    for (ColumnOrValue param : functionParameters)
      if (param.getType().equals(ColumnOrValue.Type.COLUMN))
        inputColNames.add(param.getColumnName());

    // columns that are already available in the default env do not need to be waited for.
    columnsThatStillNeedToBeBuilt = new ConcurrentSkipListSet<>(inputColNames);
    for (Iterator<String> it = columnsThatStillNeedToBeBuilt.iterator(); it.hasNext();)
      if (defaultEnv.getColumnShard(it.next()) != null)
        it.remove();

    columnShardBuilderManagerSupplier = (outputColType) -> {
      LoaderColumnInfo columnInfo = new LoaderColumnInfo(outputColType);
      return columnShardBuilderFactory.createColumnShardBuilderManager(columnInfo,
          defaultEnv.getFirstRowIdInShard());
    };
  }

  /**
   * Accepts only {@link DoneConsumer}, {@link ColumnBuiltConsumer} and {@link ColumnVersionBuiltConsumer} as output.
   *
   * @throws IllegalArgumentException
   *           if the consumer is of none of these types.
   */
  @Override
  protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException {
    if (!(consumer instanceof DoneConsumer) && !(consumer instanceof ColumnBuiltConsumer)
        && !(consumer instanceof ColumnVersionBuiltConsumer))
      throw new IllegalArgumentException("Only ColumnBuiltConsumer and ColumnVersionBuiltConsumer supported.");
  }

  /**
   * Builds the output column if enough input is available: a constant column if there are only literal inputs, the
   * final column once all input columns are built, or an intermediary version based on the newest temporary
   * environment. Informs the output consumers accordingly.
   */
  @Override
  protected void execute() {
    // Did we fill the output column completely and are we done?
    boolean columnFullyBuilt = false;

    ColumnShard column = null;

    // take a snapshot of what the ColumnVersionBuiltConsumer reported since the last run.
    VersionedExecutionEnvironment temporaryEnv;
    Set<Long> curAdjustedRowIds;
    synchronized (newestSync) {
      temporaryEnv = newestTemporaryEnv;
      curAdjustedRowIds = newestAdjustedRowIds;
      if (curAdjustedRowIds != null && !curAdjustedRowIds.isEmpty())
        newestAdjustedRowIds = new HashSet<>();
    }

    if (inputColNames.size() == 0) {
      // we do not have input columns, just literals. The resulting column will likely end up being a column with only
      // one row, a 'constant' row. This is handled accordingly in ResolveColumnDictIdsStep.
      ColumnType inputColType = null;
      if (functionParameters[0].getValue() instanceof Long)
        inputColType = ColumnType.LONG;
      else if (functionParameters[0].getValue() instanceof String)
        inputColType = ColumnType.STRING;
      else if (functionParameters[0].getValue() instanceof Double)
        inputColType = ColumnType.DOUBLE;

      ProjectionFunction<Object, Object> fn =
          functionFactory.createProjectionFunction(functionNameLowerCase, inputColType);

      if (fn == null)
        throw new ExecutablePlanExecutionException(
            "Cannot find function '" + functionNameLowerCase + "' with input data type " + inputColType);

      for (int paramIdx = 0; paramIdx < functionParameters.length; paramIdx++)
        fn.provideConstantParameter(paramIdx, functionParameters[paramIdx].getValue());

      Object[] fnResult = fn.execute();

      switch (fn.getOutputType()) {
      case LONG:
        column = columnShardFactory.createConstantLongColumnShard(outputColName, (Long) fnResult[0],
            defaultEnv.getFirstRowIdInShard());
        break;
      case STRING:
        column = columnShardFactory.createConstantStringColumnShard(outputColName, (String) fnResult[0],
            defaultEnv.getFirstRowIdInShard());
        break;
      case DOUBLE:
        column = columnShardFactory.createConstantDoubleColumnShard(outputColName, (Double) fnResult[0],
            defaultEnv.getFirstRowIdInShard());
        break;
      }
      columnFullyBuilt = true;
      logger.trace("Build constant column {} as there are no column inputs. Value: {}", outputColName, fnResult[0]);
    } else if (columnBuiltConsumer.getNumberOfTimesWired() == 0
        || (columnBuiltConsumer.getNumberOfTimesWired() > 0 && allColumnsBuilt.get())) {
      // We waited enough, all our source columns are built fully and are available in the defaultEnv.
      logger.trace("Build standard column {} based on default environment (= last run).", outputColName);
      column = buildColumnBasedProjection(defaultEnv);
      columnFullyBuilt = true;
    } else if (columnBuiltConsumer.getNumberOfTimesWired() > 0 && inputSourcesDone.get()
        && !allColumnsBuilt.get()) {
      // we need to wait for columns to be built, but the columnBuiltConsumer reported to be done, but not all columns
      // have been built. Therefore we cannot execute the projection, but just report "done".
      logger.debug("Projection waited for columns to be built, but some won't be built. Skipping.");
      forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
      doneProcessing();
      return;
    } else {
      // not all columns are yet fully available. Let's see if we have enough information to at least project some parts
      // for the time being.
      if (temporaryEnv != null && existsOutputConsumerOfType(ColumnVersionBuiltConsumer.class)) {
        boolean allInputColsAvailable =
            inputColNames.stream().allMatch(colName -> temporaryEnv.getColumnShard(colName) != null);

        if (allInputColsAvailable) {
          // we have data for all input columns available, which means that we can start projection at least
          // /something/.
          logger.trace("Build intermediary column {} after following rowIds were adjusted (limit) {}", outputColName,
              Iterables.limit(curAdjustedRowIds, 100));

          // execute full projection, although we have specific row IDs that have been altered.
          // TODO #8 cache intermediary results and use that to not again apply the projection function to all elements
          // again.
          column = buildColumnBasedProjection(temporaryEnv);
        }
      }
    }

    if (column != null) {
      if (temporaryEnv != null && columnVersionManager != null
          && existsOutputConsumerOfType(ColumnVersionBuiltConsumer.class)) {
        logger.trace("Will store new version of {}", outputColName);
        // inform ColumnVersionBuiltConsumer
        VersionedExecutionEnvironment newEnv = columnVersionManager.createNewVersion(column);
        forEachOutputConsumerOfType(ColumnVersionBuiltConsumer.class,
            c -> c.columnVersionBuilt(newEnv, outputColName, curAdjustedRowIds));
      }

      // if done, inform other consumers.
      if (columnFullyBuilt) {
        logger.trace("Will store final column {}", outputColName);
        switch (column.getColumnType()) {
        case STRING:
          defaultEnv.storeTemporaryStringColumnShard((StringColumnShard) column);
          break;
        case LONG:
          defaultEnv.storeTemporaryLongColumnShard((LongColumnShard) column);
          break;
        case DOUBLE:
          defaultEnv.storeTemporaryDoubleColumnShard((DoubleColumnShard) column);
          break;
        }

        forEachOutputConsumerOfType(ColumnBuiltConsumer.class, c -> c.columnBuilt(outputColName));
        forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
        doneProcessing();
      }
    }
  }

  /**
   * Executes a projection that is based on at least one {@link ColumnShard}, which is/are available in the given
   * {@link ExecutionEnvironment}.
   *
   * @param env
   *          The environment to resolve the input columns from.
   * @return The created column, which might either be a {@link StandardColumnShard} or a {@link ConstantColumnShard}
   *         (in case none of the input columns is a pure standard column shard, i.e. all column inputs are
   *         {@link ConstantColumnShard}s).
   * @throws ExecutablePlanExecutionException
   *           if the function cannot be found for the input type or if an input column does not provide as many rows
   *           as requested.
   */
  private ColumnShard buildColumnBasedProjection(ExecutionEnvironment env) {
    // buckets of row IDs we want to process together. Left of pair: first row ID of bucket, right: length.
    Set<Pair<Long, Integer>> rowIdBucketsToProcess;
    if (inputColNames.stream().anyMatch(colName -> env.getPureStandardColumnShard(colName) != null)) {
      // Find column shard that contains the least rows, in order to calculate rowID buckets below.
      // On the query master each column might have different number of rows, therefore we find the least common number
      // of rows that we can process.
      String referenceColName = inputColNames.stream()
          .filter(colName -> env.getPureStandardColumnShard(colName) != null)
          .map(name -> new Pair<String, Long>(name,
              env.getPureStandardColumnShard(name).getNumberOfRowsInColumnShard()))
          .min((p1, p2) -> p1.getRight().compareTo(p2.getRight())).get().getLeft();

      rowIdBucketsToProcess = env.getColumnShard(referenceColName).getGoodResolutionPairs();
    } else {
      // only ConstantColumnShard objects.
      rowIdBucketsToProcess = new HashSet<Pair<Long, Integer>>();
      rowIdBucketsToProcess.add(new Pair<Long, Integer>(defaultEnv.getFirstRowIdInShard(), 1));
    }

    // choose an arbitrary input column to identify input colType. All input columns and constants need to be of equal
    // type anyway.
    ColumnType inputColumnType = env.getColumnType(inputColNames.stream().findAny().get());

    // one-element array so the anonymous Consumer below can hand out a result.
    ConstantColumnShard[] resultConstantColumn = new ConstantColumnShard[1];
    resultConstantColumn[0] = null;

    ProjectionFunction<?, ?> tmpProjectionFunction =
        functionFactory.createProjectionFunction(functionNameLowerCase, inputColumnType);

    if (tmpProjectionFunction == null)
      throw new ExecutablePlanExecutionException(
          "Cannot find function '" + functionNameLowerCase + "' with input data type " + inputColumnType);

    ColumnShardBuilderManager columnShardBuilderManager =
        columnShardBuilderManagerSupplier.apply(tmpProjectionFunction.getOutputType());

    // execute ProjectionFunctions based on buckets of rowIds.
    rowIdBucketsToProcess.forEach(new Consumer<Pair<Long, Integer>>() {
      @Override
      public void accept(Pair<Long, Integer> pair) {
        long firstRowId = pair.getLeft();
        int length = pair.getRight();

        ProjectionFunction<Object, Object> fn =
            functionFactory.createProjectionFunction(functionNameLowerCase, inputColumnType);

        boolean hadStandardColumnInput = false;

        for (int paramIdx = 0; paramIdx < functionParameters.length; paramIdx++) {
          ColumnOrValue param = functionParameters[paramIdx];
          if (param.getType() == ColumnOrValue.Type.LITERAL) {
            fn.provideConstantParameter(paramIdx, param.getValue());
          } else {
            ConstantColumnShard constantShard = env.getPureConstantColumnShard(param.getColumnName());
            if (constantShard != null) {
              fn.provideConstantParameter(paramIdx, constantShard.getValue());
            } else {
              hadStandardColumnInput = true;
              Object[] colValues = fn.createEmptyInputArray(length);
              int rowsResolved =
                  resolveValuesFromColumn(env.getColumnShard(param.getColumnName()), firstRowId, length, colValues);
              if (rowsResolved != length)
                throw new ExecutablePlanExecutionException("Column " + param.getColumnName()
                    + " does not contain the same number of rows as other columns; cannot execute function "
                    + functionNameLowerCase + " to produce output column " + outputColName);
              fn.provideParameter(paramIdx, colValues);
            }
          }
        }

        Object[] fnResult = fn.execute();

        if (hadStandardColumnInput) {
          columnShardBuilderManager.addValues(outputColName, fnResult, firstRowId);
        } else {
          // we did not have input from a standardColumnShard. We would not execute this method if there were no
          // column input at all, therefore all column inputs were constantColumnShards. Because of this we should
          // again build a constantColumnShard.
          // It is no problem to directly create the result column within the forEach(..) call, as in case all inputs
          // are constants, there is only one Pair<Long, Integer> the forEach is iterating over.
          switch (fn.getOutputType()) {
          case LONG:
            resultConstantColumn[0] = columnShardFactory.createConstantLongColumnShard(outputColName,
                (Long) fnResult[0], defaultEnv.getFirstRowIdInShard());
            break;
          case STRING:
            resultConstantColumn[0] = columnShardFactory.createConstantStringColumnShard(outputColName,
                (String) fnResult[0], defaultEnv.getFirstRowIdInShard());
            break;
          case DOUBLE:
            resultConstantColumn[0] = columnShardFactory.createConstantDoubleColumnShard(outputColName,
                (Double) fnResult[0], defaultEnv.getFirstRowIdInShard());
            break;
          }
        }
      }
    });

    if (resultConstantColumn[0] == null)
      return columnShardBuilderManager.buildAndFree(outputColName);

    return resultConstantColumn[0];
  }

  /**
   * Resolves values of a specific row ID range from the given column and takes care of fetching those values from the
   * {@link ColumnPage}s that contain them.
   *
   * <p>
   * If the requested range starts before the first row of the column, the leading rows that are not part of the column
   * are skipped; the resolved values are then written to the beginning of <code>result</code> and the returned count
   * is smaller than the requested length.
   *
   * @param column
   *          The column to resolve the values from.
   * @param firstRowId
   *          First row ID to resolve.
   * @param length
   *          Number of consecutive rows to resolve.
   * @param result
   *          Array the resolved values are copied to, starting at index 0.
   * @return number of elements resolved - this might be smaller than 'length' in case source column did not provide
   *         enough data.
   */
  private int resolveValuesFromColumn(QueryableColumnShard column, long firstRowId, int length, Object[] result) {
    if (column.getFirstRowId() > firstRowId) {
      // make sure firstRowId is inside the column shard: skip the leading rows the column does not contain. The delta
      // has to be positive here, otherwise the range would grow and move even further out of the shard.
      long delta = column.getFirstRowId() - firstRowId;
      // compute in long to not overflow/truncate when narrowing back to int.
      long remainingLength = length - delta;
      if (remainingLength <= 0)
        return 0;
      length = (int) remainingLength;
      firstRowId += delta;
    }

    Long[] columnValueIds = column.resolveColumnValueIdsForRowsFlat(
        LongStream.range(firstRowId, firstRowId + length).mapToObj(Long::valueOf).collect(Collectors.toList()));

    if (columnValueIds.length == 0)
      // nothing was requested/resolved, do not index into the empty array below.
      return 0;

    // highest index in columnValueIds where the value is != -1
    int maxIdx = columnValueIds.length - 1;
    if (columnValueIds[maxIdx] == -1L) {
      // resolveColumnValueIdsForRowsFlat returns -1 for rowIds not contained in the column shard. This can happen if
      // the length parameter of this method is too high. As we though provided a sorted input to
      // resolveColumnValueIdsForRowsFlat, this can happen only at the end of the columnValueIds array.
      // We therefore do a binary search for the first -1 in a consecutive batch of -1s.
      int lo = 0;
      int hi = columnValueIds.length - 1;
      boolean found = false;
      while (!found && hi >= 0 && lo < columnValueIds.length && hi >= lo) {
        if (columnValueIds[lo] == -1L) {
          maxIdx = lo - 1;
          found = true;
        } else if (columnValueIds[hi] != -1L) {
          maxIdx = hi;
          found = true;
        } else {
          // midpoint has to be relative to lo, otherwise the search jumps back below lo and may never terminate.
          int mid = lo + (hi - lo) / 2;
          if (columnValueIds[mid] == -1L)
            hi = mid - 1;
          else
            lo = mid + 1;
        }
      }
      if (maxIdx == -1)
        // all columnValueIds == -1
        return 0;
    }

    if (maxIdx < columnValueIds.length - 1) {
      // cut off the trailing -1 entries before decompressing.
      Long[] newColumnValueIds = new Long[maxIdx + 1];
      System.arraycopy(columnValueIds, 0, newColumnValueIds, 0, maxIdx + 1);
      columnValueIds = newColumnValueIds;
    }

    Object[] values = column.getColumnShardDictionary().decompressValues(columnValueIds);
    System.arraycopy(values, 0, result, 0, values.length);
    return values.length;
  }

  @Override
  protected List<GenericConsumer> inputConsumers() {
    return new ArrayList<>(
        Arrays.asList(new GenericConsumer[] { columnBuiltConsumer, columnVersionBuiltConsumer }));
  }

  @Override
  protected void validateWiredStatus() throws ExecutablePlanBuildException {
    // noop. Both is fine, having an input and not having an input.
  }

  @Override
  protected String getAdditionalToStringDetails() {
    return "funcName=" + functionNameLowerCase + ", outputCol=" + outputColName;
  }
}