Java tutorial
/** * diqube: Distributed Query Base. * * Copyright (C) 2015 Bastian Gloeckle * * This file is part of diqube. * * diqube is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.diqube.execution.steps; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.NavigableSet; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ConcurrentLinkedDeque; import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.atomic.AtomicBoolean; import org.diqube.execution.ColumnVersionBuiltHelper; import org.diqube.execution.consumers.AbstractThreadedColumnBuiltConsumer; import org.diqube.execution.consumers.AbstractThreadedColumnVersionBuiltConsumer; import org.diqube.execution.consumers.AbstractThreadedRowIdConsumer; import org.diqube.execution.consumers.ColumnBuiltConsumer; import org.diqube.execution.consumers.ColumnDictIdConsumer; import org.diqube.execution.consumers.ColumnVersionBuiltConsumer; import org.diqube.execution.consumers.DoneConsumer; import org.diqube.execution.consumers.GenericConsumer; import org.diqube.execution.consumers.RowIdConsumer; import org.diqube.execution.exception.ExecutablePlanBuildException; import org.diqube.executionenv.ExecutionEnvironment; import org.diqube.executionenv.VersionedExecutionEnvironment; import org.diqube.queries.QueryRegistry; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; /** * Resolves Column shard Dictionary IDs for the rowIds in a specific column. * * <p> * This step can optionally be executed on a column that still needs to be constructed. In that case, a * {@link ColumnBuiltConsumer} input needs to be specified which keeps this step up to date with the construction of * that column. In that case, an additional {@link ColumnVersionBuiltConsumer} could be specified. If no * {@link ColumnBuiltConsumer} is specified, then it is expected that the column is already available through the * default {@link ExecutionEnvironment}. * * <p> * Input: 1 {@link RowIdConsumer} and 1 optional {@link ColumnBuiltConsumer}, 1 optional * {@link ColumnVersionBuiltConsumer} <br> * Output: {@link ColumnDictIdConsumer}s. * * @author Bastian Gloeckle */ public class ResolveColumnDictIdsStep extends AbstractThreadedExecutablePlanStep { private static final Logger logger = LoggerFactory.getLogger(ResolveColumnDictIdsStep.class); private AtomicBoolean rowIdSourceIsEmpty = new AtomicBoolean(false); private ConcurrentLinkedDeque<Long> rowIds = new ConcurrentLinkedDeque<>(); private RowIdConsumer rowIdConsumer = new AbstractThreadedRowIdConsumer(this) { @Override public void allSourcesAreDone() { ResolveColumnDictIdsStep.this.rowIdSourceIsEmpty.set(true); } @Override protected void doConsume(Long[] rowIds) { for (long rowId : rowIds) ResolveColumnDictIdsStep.this.rowIds.add(rowId); } }; /** Only important if {@link #colBuiltConsumer} is wired */ private AtomicBoolean sourceColumnIsBuilt = new AtomicBoolean(false); private AtomicBoolean colBuiltConsumerIsDone = new AtomicBoolean(false); private AbstractThreadedColumnBuiltConsumer colBuiltConsumer = new AbstractThreadedColumnBuiltConsumer(this) { @Override protected void doColumnBuilt(String colName) { if (colName.equals(ResolveColumnDictIdsStep.this.colName)) ResolveColumnDictIdsStep.this.sourceColumnIsBuilt.set(true); } @Override protected void allSourcesAreDone() { colBuiltConsumerIsDone.set(true); } }; private Object newestSync = new Object(); /** * The {@link VersionedExecutionEnvironment} with the highest ID that has been provided up until now. Use this * {@link ExecutionEnvironment} for resolving any valus of columns when based on intermediary values. Sync access with * {@link #newestSync}. */ private VersionedExecutionEnvironment newestTemporaryEnv = null; /** * Those row IDs that have been reported since the last run of {@link #execute()} as having their values changed.. * Sync access with {@link #newestSync}. */ private NavigableSet<Long> newestAdjustedRowIds = new ConcurrentSkipListSet<>(); private AbstractThreadedColumnVersionBuiltConsumer columnVersionBuiltConsumer = new AbstractThreadedColumnVersionBuiltConsumer( this) { @Override protected void allSourcesAreDone() { } @Override protected void doColumnBuilt(VersionedExecutionEnvironment env, String colName, Set<Long> adjustedRowIds) { // TODO #8 act only if colName.equals(this.colName). synchronized (newestSync) { if (newestTemporaryEnv == null) newestTemporaryEnv = env; else if (newestTemporaryEnv.getVersion() < env.getVersion()) newestTemporaryEnv = env; newestAdjustedRowIds.addAll(adjustedRowIds); } } }; /** name of the col to resolve values of. */ private String colName; private ExecutionEnvironment defaultEnv; /** * Row IDs that have been reported by {@link RowIdConsumer} for resolving. But up until now, there were no values * available for these rowIds, so we remember them to be resolved later. This can happen if * {@link ColumnVersionBuiltConsumer} is wired and we base our execution on intermediary values. */ private NavigableSet<Long> notYetProcessedRowIds = new TreeSet<>(); /** * All rowIds that we already resolved values of. We need to remember those in case any of these rowIds changes its * values (as reported by input {@link ColumnVersionBuiltConsumer}s) and we need to resolve it again. */ private Set<Long> processedRowIds = new HashSet<>(); public ResolveColumnDictIdsStep(int stepId, QueryRegistry queryRegistry, ExecutionEnvironment defaultEnv, String colName) { super(stepId, queryRegistry); this.defaultEnv = defaultEnv; this.colName = colName; } @Override public void execute() { boolean intermediateRun = !(colBuiltConsumer.getNumberOfTimesWired() == 0 || sourceColumnIsBuilt.get()); if (colBuiltConsumer.getNumberOfTimesWired() > 0 && colBuiltConsumerIsDone.get() && !sourceColumnIsBuilt.get()) { logger.debug("Waited for column {} to be built, but it won't be built. Skipping.", colName); forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone()); doneProcessing(); return; } NavigableSet<Long> curAdjustedRowIds; synchronized (newestSync) { // Fetch rowIds whose values have been adjusted. Note that this is not 100% thread-safe in case intermediateRun == // true. Because in that case we will resolve the corresponding ExecutionEnvironment that should be used later // with another sync block - in between a new env might have arrived with new adjustedRowIds - as the set of // rowIds being reported only increases though, it is no problem to only execute on a set of adjustedRows on a // newer env, as we will resolve those other reported rowIds just one execution later. curAdjustedRowIds = newestAdjustedRowIds; newestAdjustedRowIds = new TreeSet<>(); } ExecutionEnvironment env; if (!intermediateRun) env = defaultEnv; else { synchronized (newestSync) { env = newestTemporaryEnv; if (env == null || env.getColumnShard(colName) == null) { // re-remember those IDs we removed from the set already. newestAdjustedRowIds.addAll(curAdjustedRowIds); return; } } } // fetch row IDs whose columndictid should be resolved. NavigableSet<Long> activeRowIds = new TreeSet<>(); Long rowId; while ((rowId = rowIds.poll()) != null) activeRowIds.add(rowId); if (intermediateRun) { // restrict active row IDs to only contain available rows and include & publish notYetProcessedRowIds. long maxAvailableRowId = new ColumnVersionBuiltHelper().publishActiveRowIds(env, Arrays.asList(colName), activeRowIds, notYetProcessedRowIds); if (maxAvailableRowId == -1L) { // our column is not built. Should not happen, but just to be sure... logger.warn( "ColumnVersionBuiltHelper told us that our column is notr built. This should not happen."); return; } // adjust set of rows that have been adjusted - shrink them to the row IDs that are available. If other rowIds // have changed their value this is not interesting to us, because we did notyet resolve their values anyway. curAdjustedRowIds = curAdjustedRowIds.headSet(maxAvailableRowId, true); } else { activeRowIds.addAll(notYetProcessedRowIds); notYetProcessedRowIds.clear(); } // be sure to resolve those row IDs fresh that we resolved already but whose value changed. activeRowIds.addAll(Sets.intersection(curAdjustedRowIds, processedRowIds)); if (activeRowIds.size() > 0) { logger.trace("Resolving column dict IDs of col {} based on ExecutionEnv {} at row IDs (limit, {}) {}", colName, env, activeRowIds.size(), Iterables.limit(activeRowIds, 500)); if (env.getPureConstantColumnShard(colName) != null) { long columnValueId = env.getPureConstantColumnShard(colName).getSingleColumnDictId(); Map<Long, Long> rowIdToDictIdMap = new HashMap<>(); for (Long curRowId : activeRowIds) rowIdToDictIdMap.put(curRowId, columnValueId); logger.trace( "Resolving column dict IDs of col {} done, was easy as it was a constant col, sending out updates", colName); forEachOutputConsumerOfType(ColumnDictIdConsumer.class, c -> c.consume(env, colName, rowIdToDictIdMap)); } else { Map<Long, Long> rowIdToColumnValueId = env.getColumnShard(colName) .resolveColumnValueIdsForRows(activeRowIds); logger.trace("Resolving column dict IDs of col {} done, sending out updates (limit): {}", colName, Iterables.limit(rowIdToColumnValueId.entrySet(), 100)); forEachOutputConsumerOfType(ColumnDictIdConsumer.class, c -> c.consume(env, colName, rowIdToColumnValueId)); } processedRowIds.addAll(activeRowIds); } if (!intermediateRun && rowIdSourceIsEmpty.get() && rowIds.isEmpty() && newestAdjustedRowIds.isEmpty()) { forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone()); doneProcessing(); } } @Override public List<GenericConsumer> inputConsumers() { return Arrays.asList(new GenericConsumer[] { rowIdConsumer, colBuiltConsumer, columnVersionBuiltConsumer }); } @Override protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException { if (!(consumer instanceof DoneConsumer) && !(consumer instanceof ColumnDictIdConsumer)) throw new IllegalArgumentException("Only ColumnDictIdConsumer supported!"); } @Override protected void validateWiredStatus() throws ExecutablePlanBuildException { if (rowIdConsumer.getNumberOfTimesWired() == 0) throw new ExecutablePlanBuildException("RowID consumer is not wired on " + this.toString()); } @Override protected String getAdditionalToStringDetails() { return "colName=" + colName; } }