org.diqube.execution.steps.ProjectStep.java Source code

Introduction

Here is the source code for org.diqube.execution.steps.ProjectStep.java
Source

/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.execution.steps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

import org.diqube.data.column.ColumnPage;
import org.diqube.data.column.ColumnShard;
import org.diqube.data.column.ColumnShardFactory;
import org.diqube.data.column.ColumnType;
import org.diqube.data.column.ConstantColumnShard;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.types.dbl.DoubleColumnShard;
import org.diqube.data.types.lng.LongColumnShard;
import org.diqube.data.types.str.StringColumnShard;
import org.diqube.execution.ColumnVersionManager;
import org.diqube.execution.consumers.AbstractThreadedColumnBuiltConsumer;
import org.diqube.execution.consumers.AbstractThreadedColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.ColumnBuiltConsumer;
import org.diqube.execution.consumers.ColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.DoneConsumer;
import org.diqube.execution.consumers.GenericConsumer;
import org.diqube.execution.exception.ExecutablePlanBuildException;
import org.diqube.execution.exception.ExecutablePlanExecutionException;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.VersionedExecutionEnvironment;
import org.diqube.executionenv.querystats.QueryableColumnShard;
import org.diqube.function.FunctionFactory;
import org.diqube.function.ProjectionFunction;
import org.diqube.loader.LoaderColumnInfo;
import org.diqube.loader.columnshard.ColumnShardBuilderFactory;
import org.diqube.loader.columnshard.ColumnShardBuilderManager;
import org.diqube.queries.QueryRegistry;
import org.diqube.util.ColumnOrValue;
import org.diqube.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;

/**
 * A step that projects values e.g. of another column.
 * 
 * <p>
 * A {@link ProjectStep} basically executes a {@link ProjectionFunction} on a specific set of input parameters and
 * creates a new column out of the results.
 * 
 * <p>
 * The resulting column is either a {@link StandardColumnShard} or a {@link ConstantColumnShard}, based on the input
 * parameters to the function: If they are only constants or constants and other {@link ConstantColumnShard}s, a
 * {@link ConstantColumnShard} will be built, otherwise a {@link StandardColumnShard} will be built.
 * 
 * 
 * <p>
 * Input: multiple optional {@link ColumnBuiltConsumer}, multiple optional {@link ColumnVersionBuiltConsumer}. <br>
 * Output: {@link ColumnBuiltConsumer}, {@link ColumnVersionBuiltConsumer}.
 *
 * @author Bastian Gloeckle
 */
public class ProjectStep extends AbstractThreadedExecutablePlanStep {

    /** Logger of this step. */
    private static final Logger logger = LoggerFactory.getLogger(ProjectStep.class);

    /** true as soon as the input ColumnBuiltConsumer has reported that all its sources are done. */
    private AtomicBoolean inputSourcesDone = new AtomicBoolean(false);

    /**
     * Only important if a ColumnBuiltConsumer is wired: contains those input columns that have not yet been built
     * fully. Filled in {@link #initialize()}; entries are removed as soon as the input consumer reports the
     * corresponding column as built.
     */
    private Set<String> columnsThatStillNeedToBeBuilt;
    /** True as soon as all columns that this projectstep relies on are built. */
    private AtomicBoolean allColumnsBuilt = new AtomicBoolean(false);

    /**
     * Input consumer that tracks which of the awaited input columns have been built and whether the sources feeding
     * this step have finished.
     */
    private AbstractThreadedColumnBuiltConsumer columnBuiltConsumer = new AbstractThreadedColumnBuiltConsumer(
            this) {
        @Override
        protected void allSourcesAreDone() {
            inputSourcesDone.set(true);
        }

        @Override
        protected void doColumnBuilt(String colName) {
            // One awaited column less; once nothing is left to wait for, flag that all inputs are available.
            columnsThatStillNeedToBeBuilt.remove(colName);
            if (columnsThatStillNeedToBeBuilt.isEmpty()) {
                allColumnsBuilt.set(true);
            }
        }
    };

    /** Monitor object guarding {@link #newestTemporaryEnv} and {@link #newestAdjustedRowIds}. */
    private Object newestSync = new Object();
    /**
     * Newest version of {@link VersionedExecutionEnvironment} that should be used to resolve any values while being based
     * on intermediary columns (= happens only on query master). Sync access using {@link #newestSync}.
     */
    private VersionedExecutionEnvironment newestTemporaryEnv = null;
    /**
     * The rowIds that have been reported as being "adjusted" since the last run of #execute(). "Adjusted" means that the
     * values of these rowIds might have changed. Sync access using {@link #newestSync}.
     */
    private Set<Long> newestAdjustedRowIds = new HashSet<>();

    /**
     * Input consumer that remembers the newest intermediary environment and collects all row IDs that were reported as
     * adjusted, until the next run of {@link #execute()} picks them up.
     */
    private AbstractThreadedColumnVersionBuiltConsumer columnVersionBuiltConsumer = new AbstractThreadedColumnVersionBuiltConsumer(
            this) {
        @Override
        protected void doColumnBuilt(VersionedExecutionEnvironment env, String colName, Set<Long> adjustedRowIds) {
            synchronized (newestSync) {
                // Keep the environment with the highest version seen so far.
                boolean envIsNewer = newestTemporaryEnv == null
                        || newestTemporaryEnv.getVersion() < env.getVersion();
                if (envIsNewer) {
                    newestTemporaryEnv = env;
                }
                newestAdjustedRowIds.addAll(adjustedRowIds);
            }
        }

        @Override
        protected void allSourcesAreDone() {
            // we rely on ColumnBuiltConsumer to report the final build.
        }
    };

    /**
     * Environment that is asked for already available input columns and that receives the final output column once it
     * is built.
     */
    private ExecutionEnvironment defaultEnv;
    /** Output projected values to this column */
    private String outputColName;
    /** Creates the {@link ProjectionFunction} instances that are executed by this step. */
    private FunctionFactory functionFactory;
    /** parameters to pass to the {@link ProjectionFunction}. */
    private ColumnOrValue[] functionParameters;
    /** function name of the function to execute */
    private String functionNameLowerCase;
    /**
     * Prepared set containing the names of the columns that show up in the input parameters of the function. Having a
     * column name in here means that the execution of the {@link ProjectionFunction} depends on this column being
     * available.
     */
    private Set<String> inputColNames;

    /** Creates a {@link ColumnShardBuilderManager} for a given output column type. Set up in {@link #initialize()}. */
    private Function<ColumnType, ColumnShardBuilderManager> columnShardBuilderManagerSupplier;
    /**
     * Creates new {@link VersionedExecutionEnvironment}s for intermediary versions of the output column. #execute()
     * checks this for null before using it.
     */
    private ColumnVersionManager columnVersionManager;
    /** Creates the constant column shards this step produces. */
    private ColumnShardFactory columnShardFactory;
    /** Creates the {@link ColumnShardBuilderManager}s handed out by {@link #columnShardBuilderManagerSupplier}. */
    private ColumnShardBuilderFactory columnShardBuilderFactory;

    /**
     * Creates a new step; all arguments are only stored, the derived state is prepared in {@link #initialize()}.
     *
     * @param stepId
     *          passed on to the super class.
     * @param queryRegistry
     *          passed on to the super class.
     * @param defaultEnv
     *          environment that provides the fully built input columns and that will receive the final output column.
     * @param functionFactory
     *          factory used to create the {@link ProjectionFunction} to execute.
     * @param functionNameLowerCase
     *          name of the function to be executed
     * @param functionParameters
     *          the parameters of the function: literals and/or column names.
     * @param outputColName
     *          column to be created.
     * @param columnShardBuilderFactory
     *          factory for creating a new col.
     * @param columnShardFactory
     *          factory for creating constant column shards.
     * @param columnVersionManager
     *          Needed in case {@link ColumnVersionBuiltConsumer} are wired and intermediate columns should be created.
     *          This is needed on query master only.
     */
    public ProjectStep(int stepId, QueryRegistry queryRegistry, ExecutionEnvironment defaultEnv,
            FunctionFactory functionFactory, String functionNameLowerCase, ColumnOrValue[] functionParameters,
            String outputColName, ColumnShardBuilderFactory columnShardBuilderFactory,
            ColumnShardFactory columnShardFactory, ColumnVersionManager columnVersionManager) {
        super(stepId, queryRegistry);

        // what to execute
        this.functionNameLowerCase = functionNameLowerCase;
        this.functionParameters = functionParameters;
        this.outputColName = outputColName;

        // collaborators
        this.defaultEnv = defaultEnv;
        this.functionFactory = functionFactory;
        this.columnShardFactory = columnShardFactory;
        this.columnShardBuilderFactory = columnShardBuilderFactory;
        this.columnVersionManager = columnVersionManager;
    }

    /**
     * Prepares the state derived from the function parameters: the names of all input columns, the set of input
     * columns that are not yet available in the default environment and the supplier for
     * {@link ColumnShardBuilderManager}s.
     */
    @Override
    public void initialize() {
        inputColNames = new HashSet<>();
        for (ColumnOrValue param : functionParameters)
            if (param.getType().equals(ColumnOrValue.Type.COLUMN))
                inputColNames.add(param.getColumnName());

        // Only those columns that the default environment cannot provide yet have to be waited for.
        columnsThatStillNeedToBeBuilt = new ConcurrentSkipListSet<>(inputColNames);
        columnsThatStillNeedToBeBuilt.removeIf(colName -> defaultEnv.getColumnShard(colName) != null);

        // The input consumer flags allColumnsBuilt only when a column is reported as built. If there is nothing to wait
        // for right from the start, no such report is guaranteed to arrive, so set the flag here. Otherwise a wired
        // step would skip the projection although all of its inputs are available.
        if (columnsThatStillNeedToBeBuilt.isEmpty())
            allColumnsBuilt.set(true);

        columnShardBuilderManagerSupplier = (outputColType) -> {
            LoaderColumnInfo columnInfo = new LoaderColumnInfo(outputColType);
            return columnShardBuilderFactory.createColumnShardBuilderManager(columnInfo,
                    defaultEnv.getFirstRowIdInShard());
        };
    }

    /**
     * Accepts {@link DoneConsumer}s, {@link ColumnBuiltConsumer}s and {@link ColumnVersionBuiltConsumer}s as output
     * consumers.
     *
     * @throws IllegalArgumentException
     *           if the consumer is of none of these types.
     */
    @Override
    protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException {
        boolean supportedConsumer = consumer instanceof DoneConsumer || consumer instanceof ColumnBuiltConsumer
                || consumer instanceof ColumnVersionBuiltConsumer;
        if (supportedConsumer)
            return;

        throw new IllegalArgumentException("Only ColumnBuiltConsumer and ColumnVersionBuiltConsumer supported.");
    }

    /**
     * Executes one projection attempt. Depending on the current state, one of the following happens:
     * <ul>
     * <li>There are no input columns (only literals): the function is executed once and a constant column is
     * created.</li>
     * <li>No {@link ColumnBuiltConsumer} is wired or all awaited input columns are built: the final column is built
     * based on the default environment.</li>
     * <li>The input reported to be done although not all input columns were built: the projection is skipped and the
     * output consumers are informed that this source is done.</li>
     * <li>Otherwise: if an intermediary environment that contains all input columns is available and a
     * {@link ColumnVersionBuiltConsumer} is wired as output, an intermediary version of the column is built.</li>
     * </ul>
     * A built column is published to the {@link ColumnVersionBuiltConsumer}s as new version (if applicable) and, if it
     * is the final column, stored in the default environment and reported to the {@link ColumnBuiltConsumer}s.
     */
    @Override
    protected void execute() {
        // Did we fill the output column completely and are we done?
        boolean columnFullyBuilt = false;

        ColumnShard column = null;

        VersionedExecutionEnvironment temporaryEnv;
        Set<Long> curAdjustedRowIds;
        synchronized (newestSync) {
            temporaryEnv = newestTemporaryEnv;
            // Always take over the collected row IDs and start a fresh set. The set we continue to work with (and that
            // is handed to the output consumers) must not be the one the input consumer keeps adding to from another
            // thread.
            curAdjustedRowIds = newestAdjustedRowIds;
            newestAdjustedRowIds = new HashSet<>();
        }

        if (inputColNames.size() == 0) {
            // we do not have input columns, just literals. The resulting column will likely end up being a column with only
            // one row, a 'constant' row. This is handled accordingly in ResolveColumnDictIdsStep.

            // The type of the first literal decides which implementation of the function is looked up.
            Object firstParamValue = functionParameters[0].getValue();
            ColumnType inputColType = null;
            if (firstParamValue instanceof Long)
                inputColType = ColumnType.LONG;
            else if (firstParamValue instanceof String)
                inputColType = ColumnType.STRING;
            else if (firstParamValue instanceof Double)
                inputColType = ColumnType.DOUBLE;

            ProjectionFunction<Object, Object> fn = functionFactory.createProjectionFunction(functionNameLowerCase,
                    inputColType);

            if (fn == null)
                throw new ExecutablePlanExecutionException("Cannot find function '" + functionNameLowerCase
                        + "' with input data type " + inputColType);

            for (int paramIdx = 0; paramIdx < functionParameters.length; paramIdx++)
                fn.provideConstantParameter(paramIdx, functionParameters[paramIdx].getValue());

            Object[] fnResult = fn.execute();

            switch (fn.getOutputType()) {
            case LONG:
                column = columnShardFactory.createConstantLongColumnShard(outputColName, (Long) fnResult[0],
                        defaultEnv.getFirstRowIdInShard());
                break;
            case STRING:
                column = columnShardFactory.createConstantStringColumnShard(outputColName, (String) fnResult[0],
                        defaultEnv.getFirstRowIdInShard());
                break;
            case DOUBLE:
                column = columnShardFactory.createConstantDoubleColumnShard(outputColName, (Double) fnResult[0],
                        defaultEnv.getFirstRowIdInShard());
                break;
            }
            columnFullyBuilt = true;
            logger.trace("Build constant column {} as there are no column inputs. Value: {}", outputColName,
                    fnResult[0]);
        } else if (columnBuiltConsumer.getNumberOfTimesWired() == 0
                || (columnBuiltConsumer.getNumberOfTimesWired() > 0 && allColumnsBuilt.get())) {
            // We waited enough, all our source columns are built fully and are available in the defaultEnv.

            logger.trace("Build standard column {} based on default environment (= last run).", outputColName);
            column = buildColumnBasedProjection(defaultEnv);
            columnFullyBuilt = true;
        } else if (columnBuiltConsumer.getNumberOfTimesWired() > 0 && inputSourcesDone.get()
                && !allColumnsBuilt.get()) {
            // we need to wait for columns to be built, but the columnBuiltConsumer reported to be done, but not all columns
            // have been built. Therefore we cannot execute the projection, but just report "done".
            logger.debug("Projection waited for columns to be built, but some won't be built. Skipping.");
            forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
            doneProcessing();
            return;
        } else {
            // not all columns are yet fully available. Let's see if we have enough information to at least project some parts
            // for the time being.

            if (temporaryEnv != null && existsOutputConsumerOfType(ColumnVersionBuiltConsumer.class)) {
                boolean allInputColsAvailable = inputColNames.stream()
                        .allMatch(colName -> temporaryEnv.getColumnShard(colName) != null);

                if (allInputColsAvailable) {
                    // we have data for all input columns available, which means that we can start projection at least
                    // /something/.

                    logger.trace("Build intermediary column {} after following rowIds were adjusted (limit) {}",
                            outputColName, Iterables.limit(curAdjustedRowIds, 100));

                    // execute full projection, although we have specific row IDs that have been altered.
                    // TODO #8 cache intermediary results and use that to not again apply the projection function to all elements
                    // again.
                    column = buildColumnBasedProjection(temporaryEnv);
                }
            }
        }

        if (column != null) {
            if (temporaryEnv != null && columnVersionManager != null
                    && existsOutputConsumerOfType(ColumnVersionBuiltConsumer.class)) {
                logger.trace("Will store new version of {}", outputColName);
                // inform ColumnVersionBuiltConsumer
                VersionedExecutionEnvironment newEnv = columnVersionManager.createNewVersion(column);

                forEachOutputConsumerOfType(ColumnVersionBuiltConsumer.class,
                        c -> c.columnVersionBuilt(newEnv, outputColName, curAdjustedRowIds));
            }

            // if done, inform other consumers.
            if (columnFullyBuilt) {
                logger.trace("Will store final column {}", outputColName);

                switch (column.getColumnType()) {
                case STRING:
                    defaultEnv.storeTemporaryStringColumnShard((StringColumnShard) column);
                    break;
                case LONG:
                    defaultEnv.storeTemporaryLongColumnShard((LongColumnShard) column);
                    break;
                case DOUBLE:
                    defaultEnv.storeTemporaryDoubleColumnShard((DoubleColumnShard) column);
                    break;
                }
                forEachOutputConsumerOfType(ColumnBuiltConsumer.class, c -> c.columnBuilt(outputColName));
                forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
                doneProcessing();
            }
        }
    }

    /**
     * Executes a projection that is based on at least one {@link ColumnShard}, which is/are available in the given
     * {@link ExecutionEnvironment}.
     *
     * @param env
     *          the environment to resolve the input columns from.
     * @return The created column: a {@link ConstantColumnShard} in case none of the column parameters had to be
     *         resolved from a non-constant column shard, otherwise the column built by the
     *         {@link ColumnShardBuilderManager}.
     * @throws ExecutablePlanExecutionException
     *           if the function cannot be found for the input type or if an input column does not provide as many rows
     *           as requested.
     */
    private ColumnShard buildColumnBasedProjection(ExecutionEnvironment env) {
        // Row ID ranges that are projected in one go. Left of pair: first row ID of the range, right: number of rows.
        Set<Pair<Long, Integer>> buckets;

        boolean hasStandardInput = inputColNames.stream()
                .anyMatch(colName -> env.getPureStandardColumnShard(colName) != null);
        if (!hasStandardInput) {
            // only ConstantColumnShard objects: a single one-row bucket is enough.
            buckets = new HashSet<Pair<Long, Integer>>();
            buckets.add(new Pair<Long, Integer>(defaultEnv.getFirstRowIdInShard(), 1));
        } else {
            // On the query master each column might have a different number of rows. Use the standard column with the
            // fewest rows as reference, as that is the number of rows that can be processed for all of them.
            String referenceColName = inputColNames.stream()
                    .filter(colName -> env.getPureStandardColumnShard(colName) != null)
                    .map(name -> new Pair<String, Long>(name,
                            env.getPureStandardColumnShard(name).getNumberOfRowsInColumnShard()))
                    .min((p1, p2) -> p1.getRight().compareTo(p2.getRight())).get().getLeft();

            buckets = env.getColumnShard(referenceColName).getGoodResolutionPairs();
        }

        // Any input column can be used to identify the input type, as all input columns and constants need to be of
        // equal type anyway.
        ColumnType inputColumnType = env.getColumnType(inputColNames.stream().findAny().get());

        // This instance is only used to validate that the function exists and to find out its output type.
        ProjectionFunction<?, ?> probeFunction = functionFactory.createProjectionFunction(functionNameLowerCase,
                inputColumnType);
        if (probeFunction == null)
            throw new ExecutablePlanExecutionException(
                    "Cannot find function '" + functionNameLowerCase + "' with input data type " + inputColumnType);

        ColumnShardBuilderManager columnShardBuilderManager = columnShardBuilderManagerSupplier
                .apply(probeFunction.getOutputType());

        // Set only if all column inputs turn out to be constant column shards.
        ConstantColumnShard constantResult = null;

        // execute a fresh ProjectionFunction for each bucket of rowIds.
        for (Pair<Long, Integer> bucket : buckets) {
            long bucketFirstRowId = bucket.getLeft();
            int bucketLength = bucket.getRight();

            ProjectionFunction<Object, Object> fn = functionFactory.createProjectionFunction(functionNameLowerCase,
                    inputColumnType);

            boolean usedStandardColumn = false;
            for (int idx = 0; idx < functionParameters.length; idx++) {
                ColumnOrValue param = functionParameters[idx];

                if (param.getType() == ColumnOrValue.Type.LITERAL) {
                    fn.provideConstantParameter(idx, param.getValue());
                    continue;
                }

                ConstantColumnShard constantInput = env.getPureConstantColumnShard(param.getColumnName());
                if (constantInput != null) {
                    fn.provideConstantParameter(idx, constantInput.getValue());
                    continue;
                }

                // Not a literal and not a constant column: resolve the values of the bucket's rows.
                usedStandardColumn = true;
                Object[] inputValues = fn.createEmptyInputArray(bucketLength);
                int resolved = resolveValuesFromColumn(env.getColumnShard(param.getColumnName()), bucketFirstRowId,
                        bucketLength, inputValues);
                if (resolved != bucketLength)
                    throw new ExecutablePlanExecutionException("Column " + param.getColumnName()
                            + " does not contain the same number of rows as other columns; cannot execute function "
                            + functionNameLowerCase + " to produce output column " + outputColName);
                fn.provideParameter(idx, inputValues);
            }

            Object[] fnResult = fn.execute();

            if (usedStandardColumn) {
                columnShardBuilderManager.addValues(outputColName, fnResult, bucketFirstRowId);
                continue;
            }

            // All column inputs were constant column shards (this method is not executed if there is no column input at
            // all), therefore the result is again a constant column. Creating it right inside the loop is fine, because
            // in that case there is exactly one bucket to iterate over.
            switch (fn.getOutputType()) {
            case LONG:
                constantResult = columnShardFactory.createConstantLongColumnShard(outputColName,
                        (Long) fnResult[0], defaultEnv.getFirstRowIdInShard());
                break;
            case STRING:
                constantResult = columnShardFactory.createConstantStringColumnShard(outputColName,
                        (String) fnResult[0], defaultEnv.getFirstRowIdInShard());
                break;
            case DOUBLE:
                constantResult = columnShardFactory.createConstantDoubleColumnShard(outputColName,
                        (Double) fnResult[0], defaultEnv.getFirstRowIdInShard());
                break;
            }
        }

        if (constantResult != null)
            return constantResult;

        return columnShardBuilderManager.buildAndFree(outputColName);
    }

    /**
     * Resolves values of a specific row ID range from the given column and takes care of fetching those values from the
     * {@link ColumnPage}s that contain them.
     *
     * @param column
     *          the column to resolve the values from.
     * @param firstRowId
     *          first row ID of the requested range. If it lies before the first row of the column, the range is
     *          shortened to start at the first row of the column.
     * @param length
     *          number of rows of the requested range.
     * @param result
     *          array the resolved values are written to, starting at index 0.
     * @return number of elements resolved - this might be smaller than 'length' in case source column did not provide
     *         enough data.
     */
    private int resolveValuesFromColumn(QueryableColumnShard column, long firstRowId, int length, Object[] result) {
        if (column.getFirstRowId() > firstRowId) {
            // make sure firstRowId is inside the column shard: skip the rows that lie in front of the shard.
            long delta = column.getFirstRowId() - firstRowId;
            if (delta >= length)
                // the whole requested range lies in front of the shard.
                return 0;
            length -= (int) delta;
            firstRowId += delta;
        }

        if (length <= 0)
            return 0;

        Long[] columnValueIds = column.resolveColumnValueIdsForRowsFlat( //
                LongStream.range(firstRowId, firstRowId + length).mapToObj(Long::valueOf)
                        .collect(Collectors.toList()));

        // resolveColumnValueIdsForRowsFlat returns -1 for rowIds not contained in the column shard. This can happen if
        // the length parameter of this method is too high. As we provided a sorted input, these -1s can show up only at
        // the end of the array. Find the number of leading entries that are != -1.
        int resolvedCount = columnValueIds.length;
        if (resolvedCount > 0 && columnValueIds[resolvedCount - 1] == -1L) {
            // Binary search for the first -1: everything before 'lo' is known to be != -1, everything from 'hi' on is
            // known to be -1.
            int lo = 0;
            int hi = resolvedCount - 1;
            while (lo < hi) {
                int mid = (lo + hi) >>> 1;
                if (columnValueIds[mid] == -1L)
                    hi = mid;
                else
                    lo = mid + 1;
            }
            resolvedCount = lo;
        }

        if (resolvedCount == 0)
            // nothing resolved at all.
            return 0;

        if (resolvedCount < columnValueIds.length)
            columnValueIds = Arrays.copyOf(columnValueIds, resolvedCount);

        Object[] values = column.getColumnShardDictionary().decompressValues(columnValueIds);
        System.arraycopy(values, 0, result, 0, values.length);

        return values.length;
    }

    /**
     * @return a new, modifiable list containing the two input consumers of this step: the
     *         {@link ColumnBuiltConsumer} followed by the {@link ColumnVersionBuiltConsumer}.
     */
    @Override
    protected List<GenericConsumer> inputConsumers() {
        List<GenericConsumer> consumers = new ArrayList<>();
        consumers.add(columnBuiltConsumer);
        consumers.add(columnVersionBuiltConsumer);
        return consumers;
    }

    /**
     * Does not validate anything: this step accepts being wired to inputs as well as not being wired at all.
     */
    @Override
    protected void validateWiredStatus() throws ExecutablePlanBuildException {
        // noop. Both are fine: having an input and not having an input.
    }

    /**
     * @return the name of the function and the name of the output column, formatted as
     *         {@code funcName=..., outputCol=...}.
     */
    @Override
    protected String getAdditionalToStringDetails() {
        StringBuilder details = new StringBuilder();
        details.append("funcName=").append(functionNameLowerCase);
        details.append(", outputCol=").append(outputColName);
        return details.toString();
    }

}