org.diqube.execution.steps.BuildColumnFromValuesStep.java Source code

Java tutorial

Introduction

Here is the source code for org.diqube.execution.steps.BuildColumnFromValuesStep.java

Source

/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.execution.steps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

import org.diqube.data.column.ColumnShard;
import org.diqube.data.types.dbl.DoubleColumnShard;
import org.diqube.data.types.lng.LongColumnShard;
import org.diqube.data.types.str.StringColumnShard;
import org.diqube.execution.ColumnVersionManager;
import org.diqube.execution.consumers.AbstractThreadedColumnValueConsumer;
import org.diqube.execution.consumers.ColumnBuiltConsumer;
import org.diqube.execution.consumers.ColumnValueConsumer;
import org.diqube.execution.consumers.ColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.DoneConsumer;
import org.diqube.execution.consumers.GenericConsumer;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.VersionedExecutionEnvironment;
import org.diqube.loader.columnshard.ColumnShardBuilderFactory;
import org.diqube.loader.columnshard.SparseColumnShardBuilder;
import org.diqube.queries.QueryRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;

/**
 * Builds a temporary sparse column out of values provided by a {@link ColumnValueConsumer}.
 * 
 * <p>
 * It builds a final column shard as soon as the input {@link ColumnValueConsumer} is fully done - if one is interested
 * in more updates, the {@link ColumnVersionBuiltConsumer} should be wired. The latter will receive as many updates as
 * possible with intermediate {@link ExecutionEnvironment}s containing the built column.
 * 
 * <p>
 * Input: 1 {@link ColumnValueConsumer}<br>
 * Output: {@link ColumnBuiltConsumer} and {@link ColumnVersionBuiltConsumer}
 *
 * @author Bastian Gloeckle
 */
public class BuildColumnFromValuesStep extends AbstractThreadedExecutablePlanStep {

    private static final Logger logger = LoggerFactory.getLogger(BuildColumnFromValuesStep.class);

    /** Name of the column this step collects values for and builds. */
    private final String colName;

    /** Set to <code>true</code> once the input consumer reported that all its sources are done. */
    private final AtomicBoolean sourceIsDone = new AtomicBoolean(false);

    /** Monitor guarding {@link #columnValues} and {@link #updatedRowIds}. */
    private final Object columnSync = new Object();
    /** All values of the column we're interested in, keyed by rowId. Sync access with {@link #columnSync}. */
    private final Map<Long, Object> columnValues = new HashMap<>();
    /**
     * Those rowIds that have been updated since the last run of {@link #execute()}. Sync access with {@link #columnSync}.
     * The reference is swapped for a fresh set on each run of {@link #execute()} that builds a column.
     */
    private Set<Long> updatedRowIds = new HashSet<>();
    /** <code>true</code> if there was at least one update for our col since the last run of {@link #execute()} */
    private final AtomicBoolean atLeastOneInterestingUpdate = new AtomicBoolean(false);

    /**
     * Input consumer: collects the values of {@link #colName} and records when the source is done. Values of other
     * columns are ignored.
     */
    private final AbstractThreadedColumnValueConsumer columnValueConsumer = new AbstractThreadedColumnValueConsumer(
            this) {
        @Override
        protected void allSourcesAreDone() {
            BuildColumnFromValuesStep.this.sourceIsDone.set(true);
        }

        @Override
        protected void doConsume(String colName, Map<Long, Object> values) {
            // the parameter shadows the field, therefore the qualified access.
            if (!colName.equals(BuildColumnFromValuesStep.this.colName))
                return;

            synchronized (columnSync) {
                columnValues.putAll(values);
                updatedRowIds.addAll(values.keySet());
                atLeastOneInterestingUpdate.set(true);
            }
        }
    };

    private final ColumnShardBuilderFactory columnShardBuilderFactory;

    /** Environment the final column shard is stored in after the source is done. */
    private final ExecutionEnvironment defaultEnv;

    /** Used to create a new {@link VersionedExecutionEnvironment} for each column version that is built. */
    private final ColumnVersionManager columnVersionManager;

    /**
     * @param stepId
     *            Passed on to the super constructor.
     * @param queryRegistry
     *            Passed on to the super constructor.
     * @param defaultEnv
     *            The environment the final column shard will be stored in.
     * @param colName
     *            Name of the column to build; only values consumed for this column name are used.
     * @param columnShardBuilderFactory
     *            Factory used to create the sparse column shard builder.
     * @param columnVersionManager
     *            Used to create new versions of the environment for {@link ColumnVersionBuiltConsumer}s.
     */
    public BuildColumnFromValuesStep(int stepId, QueryRegistry queryRegistry, ExecutionEnvironment defaultEnv,
            String colName, ColumnShardBuilderFactory columnShardBuilderFactory,
            ColumnVersionManager columnVersionManager) {
        super(stepId, queryRegistry);
        this.defaultEnv = defaultEnv;
        this.colName = colName;
        this.columnShardBuilderFactory = columnShardBuilderFactory;
        this.columnVersionManager = columnVersionManager;
    }

    /**
     * Accepts {@link DoneConsumer}s, {@link ColumnBuiltConsumer}s and {@link ColumnVersionBuiltConsumer}s.
     *
     * @throws IllegalArgumentException
     *             if the consumer is of none of these types.
     */
    @Override
    protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException {
        boolean supported = consumer instanceof DoneConsumer || consumer instanceof ColumnBuiltConsumer
                || consumer instanceof ColumnVersionBuiltConsumer;
        if (!supported)
            throw new IllegalArgumentException(
                    "Only DoneConsumer, ColumnBuiltConsumer and ColumnVersionBuiltConsumer supported.");
    }

    /**
     * Builds a column shard from a snapshot of all values collected so far.
     *
     * <p>
     * While the source is not done yet ("intermediate run"), a column is only built if a
     * {@link ColumnVersionBuiltConsumer} is wired and there was at least one update since the last run. Once the source
     * is done, the column is built one final time, stored in the default environment and all output consumers are
     * informed. If the source is done but no values were received at all, no column is built and only "done" is
     * reported.
     */
    @Override
    protected void execute() {
        // this is the last run of this execute method if the input source is fully done.
        boolean intermediateRun = !sourceIsDone.get();

        if (intermediateRun && !existsOutputConsumerOfType(ColumnVersionBuiltConsumer.class))
            // if this is NOT the last run (= there are more values to be provided), but there is no-one who'd listen to
            // intermediary updates, do not calculate them.
            return;

        if (intermediateRun && !atLeastOneInterestingUpdate.get())
            return;

        // Take a snapshot under the lock, but do NOT call any consumers while holding it: the callbacks are foreign code
        // and doConsume needs the same monitor.
        final Map<Long, Object> values;
        final Set<Long> curUpdatedRowIds;
        synchronized (columnSync) {
            atLeastOneInterestingUpdate.set(false);

            if (columnValues.isEmpty()) {
                values = null;
                curUpdatedRowIds = null;
            } else {
                values = new HashMap<>(columnValues);
                // hand the current set over to this run and start a fresh one for subsequent updates.
                curUpdatedRowIds = updatedRowIds;
                updatedRowIds = new HashSet<>();
            }
        }

        if (values == null) {
            // no data available (yet). If the source is done, do not build a column, just report "done".
            if (!intermediateRun)
                reportDoneToConsumersAndFinish();
            return;
        }

        // rowIds are used as 0-based indices here, so the number of rows is the highest rowId plus one. "values" is
        // guaranteed to be non-empty at this point.
        long numberOfRows = values.keySet().stream().mapToLong(Long::longValue).max().getAsLong() + 1;

        SparseColumnShardBuilder<Object> columnShardBuilder = columnShardBuilderFactory
                .createSparseColumnShardBuilder(colName);

        columnShardBuilder.withValues(values);
        columnShardBuilder.withNumberOfRows(numberOfRows);
        ColumnShard newColumn = columnShardBuilder.build();

        // inform ColumnVersionBuiltConsumers
        if (existsOutputConsumerOfType(ColumnVersionBuiltConsumer.class)) {
            logger.trace("Building new column version for {} after adjusting rows (limt) {}", colName,
                    Iterables.limit(curUpdatedRowIds, 500));
            VersionedExecutionEnvironment newEnv = columnVersionManager.createNewVersion(newColumn);
            forEachOutputConsumerOfType(ColumnVersionBuiltConsumer.class,
                    c -> c.columnVersionBuilt(newEnv, colName, curUpdatedRowIds));
        }

        // if done, inform other consumers.
        if (!intermediateRun) {
            storeFinalColumnShard(newColumn);

            logger.trace("Built column {} from values received from a ColumnValueConsumer.", colName);
            forEachOutputConsumerOfType(ColumnBuiltConsumer.class, c -> c.columnBuilt(colName));
            reportDoneToConsumersAndFinish();
        }
    }

    /** Stores the given shard as temporary column in {@link #defaultEnv}, according to the column type of the shard. */
    private void storeFinalColumnShard(ColumnShard newColumn) {
        // only the three types handled below are stored; there is no fallback for any other type.
        switch (newColumn.getColumnType()) {
        case STRING:
            defaultEnv.storeTemporaryStringColumnShard((StringColumnShard) newColumn);
            break;
        case LONG:
            defaultEnv.storeTemporaryLongColumnShard((LongColumnShard) newColumn);
            break;
        case DOUBLE:
            defaultEnv.storeTemporaryDoubleColumnShard((DoubleColumnShard) newColumn);
            break;
        }
    }

    /** Tells all output consumers that this source is done and marks this step as done processing. */
    private void reportDoneToConsumersAndFinish() {
        forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
        doneProcessing();
    }

    /**
     * @return a new, modifiable list containing the single {@link ColumnValueConsumer} this step reads its values from.
     */
    @Override
    protected List<GenericConsumer> inputConsumers() {
        return new ArrayList<>(Arrays.<GenericConsumer> asList(columnValueConsumer));
    }

    @Override
    protected String getAdditionalToStringDetails() {
        return "colName=" + colName;
    }

}