org.diqube.execution.steps.ResolveColumnDictIdsStep.java Source code

Introduction

Here is the source code for org.diqube.execution.steps.ResolveColumnDictIdsStep.java
Source

/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.execution.steps;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicBoolean;

import org.diqube.execution.ColumnVersionBuiltHelper;
import org.diqube.execution.consumers.AbstractThreadedColumnBuiltConsumer;
import org.diqube.execution.consumers.AbstractThreadedColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.AbstractThreadedRowIdConsumer;
import org.diqube.execution.consumers.ColumnBuiltConsumer;
import org.diqube.execution.consumers.ColumnDictIdConsumer;
import org.diqube.execution.consumers.ColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.DoneConsumer;
import org.diqube.execution.consumers.GenericConsumer;
import org.diqube.execution.consumers.RowIdConsumer;
import org.diqube.execution.exception.ExecutablePlanBuildException;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.VersionedExecutionEnvironment;
import org.diqube.queries.QueryRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;

/**
 * Resolves Column shard Dictionary IDs for the rowIds in a specific column.
 * 
 * <p>
 * This step can optionally be executed on a column that still needs to be constructed. In that case, a
 * {@link ColumnBuiltConsumer} input needs to be specified which keeps this step up to date with the construction of
 * that column. In that case, an additional {@link ColumnVersionBuiltConsumer} could be specified. If no
 * {@link ColumnBuiltConsumer} is specified, then it is expected that the column is already available through the
 * default {@link ExecutionEnvironment}.
 * 
 * <p>
 * Input: 1 {@link RowIdConsumer} and 1 optional {@link ColumnBuiltConsumer}, 1 optional
 * {@link ColumnVersionBuiltConsumer} <br>
 * Output: {@link ColumnDictIdConsumer}s.
 * 
 * @author Bastian Gloeckle
 */
public class ResolveColumnDictIdsStep extends AbstractThreadedExecutablePlanStep {

    private static final Logger logger = LoggerFactory.getLogger(ResolveColumnDictIdsStep.class);

    private AtomicBoolean rowIdSourceIsEmpty = new AtomicBoolean(false);

    private ConcurrentLinkedDeque<Long> rowIds = new ConcurrentLinkedDeque<>();

    private RowIdConsumer rowIdConsumer = new AbstractThreadedRowIdConsumer(this) {
        @Override
        public void allSourcesAreDone() {
            ResolveColumnDictIdsStep.this.rowIdSourceIsEmpty.set(true);
        }

        @Override
        protected void doConsume(Long[] rowIds) {
            for (long rowId : rowIds)
                ResolveColumnDictIdsStep.this.rowIds.add(rowId);
        }
    };

    /** Only important if {@link #colBuiltConsumer} is wired */
    private AtomicBoolean sourceColumnIsBuilt = new AtomicBoolean(false);

    private AtomicBoolean colBuiltConsumerIsDone = new AtomicBoolean(false);

    private AbstractThreadedColumnBuiltConsumer colBuiltConsumer = new AbstractThreadedColumnBuiltConsumer(this) {
        @Override
        protected void doColumnBuilt(String colName) {
            if (colName.equals(ResolveColumnDictIdsStep.this.colName))
                ResolveColumnDictIdsStep.this.sourceColumnIsBuilt.set(true);
        }

        @Override
        protected void allSourcesAreDone() {
            colBuiltConsumerIsDone.set(true);
        }
    };

    private Object newestSync = new Object();
    /**
     * The {@link VersionedExecutionEnvironment} with the highest ID that has been provided up until now. Use this
     * {@link ExecutionEnvironment} for resolving any valus of columns when based on intermediary values. Sync access with
     * {@link #newestSync}.
     */
    private VersionedExecutionEnvironment newestTemporaryEnv = null;
    /**
     * Those row IDs that have been reported since the last run of {@link #execute()} as having their values changed..
     * Sync access with {@link #newestSync}.
     */
    private NavigableSet<Long> newestAdjustedRowIds = new ConcurrentSkipListSet<>();

    private AbstractThreadedColumnVersionBuiltConsumer columnVersionBuiltConsumer = new AbstractThreadedColumnVersionBuiltConsumer(
            this) {
        @Override
        protected void allSourcesAreDone() {
        }

        @Override
        protected void doColumnBuilt(VersionedExecutionEnvironment env, String colName, Set<Long> adjustedRowIds) {
            // TODO #8 act only if colName.equals(this.colName).
            synchronized (newestSync) {
                if (newestTemporaryEnv == null)
                    newestTemporaryEnv = env;
                else if (newestTemporaryEnv.getVersion() < env.getVersion())
                    newestTemporaryEnv = env;
                newestAdjustedRowIds.addAll(adjustedRowIds);
            }
        }
    };

    /** name of the col to resolve values of. */
    private String colName;

    private ExecutionEnvironment defaultEnv;

    /**
     * Row IDs that have been reported by {@link RowIdConsumer} for resolving. But up until now, there were no values
     * available for these rowIds, so we remember them to be resolved later. This can happen if
     * {@link ColumnVersionBuiltConsumer} is wired and we base our execution on intermediary values.
     */
    private NavigableSet<Long> notYetProcessedRowIds = new TreeSet<>();
    /**
     * All rowIds that we already resolved values of. We need to remember those in case any of these rowIds changes its
     * values (as reported by input {@link ColumnVersionBuiltConsumer}s) and we need to resolve it again.
     */
    private Set<Long> processedRowIds = new HashSet<>();

    public ResolveColumnDictIdsStep(int stepId, QueryRegistry queryRegistry, ExecutionEnvironment defaultEnv,
            String colName) {
        super(stepId, queryRegistry);
        this.defaultEnv = defaultEnv;
        this.colName = colName;
    }

    @Override
    public void execute() {
        boolean intermediateRun = !(colBuiltConsumer.getNumberOfTimesWired() == 0 || sourceColumnIsBuilt.get());

        if (colBuiltConsumer.getNumberOfTimesWired() > 0 && colBuiltConsumerIsDone.get()
                && !sourceColumnIsBuilt.get()) {
            logger.debug("Waited for column {} to  be built, but it won't be built. Skipping.", colName);
            forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
            doneProcessing();
            return;
        }

        NavigableSet<Long> curAdjustedRowIds;
        synchronized (newestSync) {
            // Fetch rowIds whose values have been adjusted. Note that this is not 100% thread-safe in case intermediateRun ==
            // true. Because in that case we will resolve the corresponding ExecutionEnvironment that should be used later
            // with another sync block - in between a new env might have arrived with new adjustedRowIds - as the set of
            // rowIds being reported only increases though, it is no problem to only execute on a set of adjustedRows on a
            // newer env, as we will resolve those other reported rowIds just one execution later.
            curAdjustedRowIds = newestAdjustedRowIds;
            newestAdjustedRowIds = new TreeSet<>();
        }

        ExecutionEnvironment env;
        if (!intermediateRun)
            env = defaultEnv;
        else {
            synchronized (newestSync) {
                env = newestTemporaryEnv;
                if (env == null || env.getColumnShard(colName) == null) {
                    // re-remember those IDs we removed from the set already.
                    newestAdjustedRowIds.addAll(curAdjustedRowIds);
                    return;
                }
            }
        }

        // fetch row IDs whose columndictid should be resolved.
        NavigableSet<Long> activeRowIds = new TreeSet<>();
        Long rowId;
        while ((rowId = rowIds.poll()) != null)
            activeRowIds.add(rowId);

        if (intermediateRun) {
            // restrict active row IDs to only contain available rows and include & publish notYetProcessedRowIds.
            long maxAvailableRowId = new ColumnVersionBuiltHelper().publishActiveRowIds(env, Arrays.asList(colName),
                    activeRowIds, notYetProcessedRowIds);

            if (maxAvailableRowId == -1L) {
                // our column is not built. Should not happen, but just to be sure...
                logger.warn(
                        "ColumnVersionBuiltHelper told us that our column is notr built. This should not happen.");
                return;
            }

            // adjust set of rows that have been adjusted - shrink them to the row IDs that are available. If other rowIds
            // have changed their value this is not interesting to us, because we did notyet resolve their values anyway.
            curAdjustedRowIds = curAdjustedRowIds.headSet(maxAvailableRowId, true);
        } else {
            activeRowIds.addAll(notYetProcessedRowIds);
            notYetProcessedRowIds.clear();
        }

        // be sure to resolve those row IDs fresh that we resolved already but whose value changed.
        activeRowIds.addAll(Sets.intersection(curAdjustedRowIds, processedRowIds));

        if (activeRowIds.size() > 0) {
            logger.trace("Resolving column dict IDs of col {} based on ExecutionEnv {} at row IDs (limit, {}) {}",
                    colName, env, activeRowIds.size(), Iterables.limit(activeRowIds, 500));

            if (env.getPureConstantColumnShard(colName) != null) {
                long columnValueId = env.getPureConstantColumnShard(colName).getSingleColumnDictId();

                Map<Long, Long> rowIdToDictIdMap = new HashMap<>();
                for (Long curRowId : activeRowIds)
                    rowIdToDictIdMap.put(curRowId, columnValueId);
                logger.trace(
                        "Resolving column dict IDs of col {} done, was easy as it was a constant col, sending out updates",
                        colName);
                forEachOutputConsumerOfType(ColumnDictIdConsumer.class,
                        c -> c.consume(env, colName, rowIdToDictIdMap));
            } else {
                Map<Long, Long> rowIdToColumnValueId = env.getColumnShard(colName)
                        .resolveColumnValueIdsForRows(activeRowIds);

                logger.trace("Resolving column dict IDs of col {} done, sending out updates (limit): {}", colName,
                        Iterables.limit(rowIdToColumnValueId.entrySet(), 100));
                forEachOutputConsumerOfType(ColumnDictIdConsumer.class,
                        c -> c.consume(env, colName, rowIdToColumnValueId));
            }

            processedRowIds.addAll(activeRowIds);
        }

        if (!intermediateRun && rowIdSourceIsEmpty.get() && rowIds.isEmpty() && newestAdjustedRowIds.isEmpty()) {
            forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
            doneProcessing();
        }
    }

    @Override
    public List<GenericConsumer> inputConsumers() {
        return Arrays.asList(new GenericConsumer[] { rowIdConsumer, colBuiltConsumer, columnVersionBuiltConsumer });
    }

    @Override
    protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException {
        if (!(consumer instanceof DoneConsumer) && !(consumer instanceof ColumnDictIdConsumer))
            throw new IllegalArgumentException("Only ColumnDictIdConsumer supported!");
    }

    @Override
    protected void validateWiredStatus() throws ExecutablePlanBuildException {
        if (rowIdConsumer.getNumberOfTimesWired() == 0)
            throw new ExecutablePlanBuildException("RowID consumer is not wired on " + this.toString());
    }

    @Override
    protected String getAdditionalToStringDetails() {
        return "colName=" + colName;
    }
}