Java tutorial
/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.plan.planner;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;

import org.diqube.diql.request.FunctionRequest;
import org.diqube.execution.ColumnVersionManager;
import org.diqube.execution.ExecutablePlanFactory;
import org.diqube.execution.ExecutablePlanStep;
import org.diqube.execution.consumers.ColumnBuiltConsumer;
import org.diqube.execution.consumers.ColumnValueConsumer;
import org.diqube.execution.consumers.ColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.GroupIntermediaryAggregationConsumer;
import org.diqube.execution.steps.BuildColumnFromValuesStep;
import org.diqube.execution.steps.GroupFinalAggregationStep;
import org.diqube.execution.steps.ProjectStep;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.plan.PlannerColumnInfo;
import org.diqube.util.ColumnOrValue;

import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;

/**
 * {@link ColumnManager} for the Query Master node.
 *
 * <p>
 * Collects, per output column name, the {@link ExecutablePlanStep}s that are executed on the query master and wires
 * them together. The expected call order is: {@link #produceColumn(FunctionRequest)} /
 * {@link #ensureColumnAvailable(String)} / {@link #wireOutputOfColumnIfAvailable(String, ExecutablePlanStep)} as
 * needed, then {@link #prepareBuild()}, then {@link #provideColumnValuesProvidingStep(ExecutablePlanStep)} and finally
 * {@link #build()}.
 *
 * @author Bastian Gloeckle
 */
public class MasterColumnManager implements ColumnManager<ExecutablePlanStep> {
  /** Steps executed on the query master, keyed by the name of the column they produce. */
  private final Map<String, List<ExecutablePlanStep>> functionMasterSteps = new HashMap<>();

  private final ExecutionEnvironment env;

  private final Supplier<Integer> nextMasterStepIdSupplier;

  private final ExecutablePlanFactory executablePlanFactory;

  private final Map<String, PlannerColumnInfo> columnInfo;

  /** Names of all columns for which {@link #ensureColumnAvailable(String)} was called. */
  private final Set<String> columnsThatNeedToBeAvailable = new HashSet<>();

  /**
   * Wires that could not be established yet because the source column (key) had no producing step at the time they
   * were requested. The values are the steps waiting for that column. Replayed in {@link #build()}.
   */
  private final Map<String, List<ExecutablePlanStep>> delayedWires = new HashMap<>();

  private final RemoteResolveManager remoteResolveManager;

  private final MasterWireManager masterWireManager;

  /** Set through {@link #provideColumnValuesProvidingStep(ExecutablePlanStep)}; required by {@link #build()}. */
  private ExecutablePlanStep columnValuesProvidingStep;

  private final ColumnVersionManager columnVersionManager;

  /**
   * @param remoteResolveManager
   *          This {@link RemoteResolveManager} will be fed with those columns that need to be available on the master
   *          and therefore need to be resolved on the remotes. Please note the JavaDoc of {@link #prepareBuild()}.
   */
  public MasterColumnManager(ExecutionEnvironment masterExecutuionEnvironment,
      Supplier<Integer> nextMasterStepIdSupplier, ExecutablePlanFactory executablePlanFactory,
      ColumnVersionManager columnVersionManager, Map<String, PlannerColumnInfo> columnInfo,
      RemoteResolveManager remoteResolveManager, MasterWireManager masterWireManager) {
    this.env = masterExecutuionEnvironment;
    this.nextMasterStepIdSupplier = nextMasterStepIdSupplier;
    this.executablePlanFactory = executablePlanFactory;
    this.columnInfo = columnInfo;
    this.remoteResolveManager = remoteResolveManager;
    this.masterWireManager = masterWireManager;
    this.columnVersionManager = columnVersionManager;
  }

  /**
   * Creates the query master step for the given function request and registers it under the request's output column.
   *
   * <p>
   * For {@link FunctionRequest.Type#PROJECTION} a {@link ProjectStep} is created and every input parameter that is a
   * column is marked as needing to be available on the query master. For
   * {@link FunctionRequest.Type#AGGREGATION_ROW} a {@link GroupFinalAggregationStep} is created which receives the
   * literal input parameters as constant function parameters. Requests of any other type create no step.
   *
   * @param fnReq
   *          The function request whose output column should be produced.
   */
  @Override
  public void produceColumn(FunctionRequest fnReq) {
    if (fnReq.getType().equals(FunctionRequest.Type.PROJECTION)) {
      ProjectStep projectStep = executablePlanFactory.createProjectStep(nextMasterStepIdSupplier.get(), env,
          fnReq.getFunctionName(), fnReq.getOutputColumn(),
          fnReq.getInputParameters().toArray(new ColumnOrValue[fnReq.getInputParameters().size()]),
          columnVersionManager);

      // ensure all input columns are fully available on master
      for (ColumnOrValue input : fnReq.getInputParameters()) {
        if (input.getType().equals(ColumnOrValue.Type.COLUMN))
          ensureColumnAvailable(input.getColumnName());
      }

      registerMasterStep(fnReq.getOutputColumn(), projectStep);
    } else if (fnReq.getType().equals(FunctionRequest.Type.AGGREGATION_ROW)) {
      List<Object> constantFunctionParams = new ArrayList<>();
      for (ColumnOrValue param : fnReq.getInputParameters()) {
        if (param.getType().equals(ColumnOrValue.Type.LITERAL))
          constantFunctionParams.add(param.getValue());
      }

      // TODO #28 do NOT calculate all Grouped results on query master, but distribute groups according to group hash
      // along all cluster nodes. That means that those clusternodes would fully process specific sets of groups and
      // they would need to have all the data needed for calculating those groups transferred to them. This though
      // would decrease the load on the query master heavily, especially if the results are ordered by grouped
      // columns early.
      GroupFinalAggregationStep finalStep =
          executablePlanFactory.createGroupFinalAggregationStep(nextMasterStepIdSupplier.get(), env,
              fnReq.getFunctionName(), fnReq.getOutputColumn(), columnVersionManager, constantFunctionParams);

      registerMasterStep(fnReq.getOutputColumn(), finalStep);
    }
    // note that the query master does not execute anything for AGGREGATION_COL!
  }

  /**
   * Marks the given column as one that has to be available on the query master. Columns that are still not produced
   * by a master step when {@link #prepareBuild()} runs will be built from values resolved on the remotes.
   *
   * @param colName
   *          Name of the column.
   */
  @Override
  public void ensureColumnAvailable(String colName) {
    columnsThatNeedToBeAvailable.add(colName);
  }

  /**
   * Wires the output of the step producing the given column to the given target step.
   *
   * <p>
   * If a step for the column is already registered, its {@link ColumnVersionBuiltConsumer} and
   * {@link ColumnBuiltConsumer} outputs are wired to the target immediately. If no step is registered but the column
   * was marked through {@link #ensureColumnAvailable(String)}, the wire is remembered and established in
   * {@link #build()}. Otherwise nothing happens.
   *
   * @param colName
   *          Name of the source column.
   * @param targetStep
   *          The step that wants to be informed about the column.
   */
  @Override
  public void wireOutputOfColumnIfAvailable(String colName, ExecutablePlanStep targetStep) {
    if (functionMasterSteps.containsKey(colName)) {
      ExecutablePlanStep previousStep = Iterables.getLast(functionMasterSteps.get(colName));
      masterWireManager.wire(ColumnVersionBuiltConsumer.class, previousStep, targetStep);
      masterWireManager.wire(ColumnBuiltConsumer.class, previousStep, targetStep);
    } else if (columnsThatNeedToBeAvailable.contains(colName)) {
      // The producing step will only be created in #prepareBuild, remember the wire until #build.
      delayedWires.computeIfAbsent(colName, key -> new ArrayList<>()).add(targetStep);
    }
  }

  /**
   * Accepts a step that provides a {@link GroupIntermediaryAggregationConsumer} - these intermediate results are
   * provided by cluster nodes and need to be finalized on the query master.
   *
   * @param groupIntermediateAggregateSourceStep
   *          The source step; it is wired to every {@link GroupFinalAggregationStep} registered so far.
   */
  @Override
  public void wireGroupInput(ExecutablePlanStep groupIntermediateAggregateSourceStep) {
    Consumer<ExecutablePlanStep> wireToIntermediateSource = groupFinalAggStep -> masterWireManager
        .wire(GroupIntermediaryAggregationConsumer.class, groupIntermediateAggregateSourceStep, groupFinalAggStep);

    functionMasterSteps.values().stream() //
        .flatMap(List::stream) //
        .filter(step -> step instanceof GroupFinalAggregationStep) //
        .forEach(wireToIntermediateSource);
  }

  /**
   * Prepares the call to {@link #build()}. Execute before building the steps of the {@link RemoteResolveManager} that
   * was specified in the constructor: This method will add any needed resolve steps in that
   * {@link RemoteResolveManager} if the query master needs additional columns to be resolved!
   */
  @Override
  public void prepareBuild() {
    // ensure the source columns of the columns that are calculated on the query master are available
    // Row Aggregation columns do not need the whole columns on query master, as they will receive the
    // groupIntermediary results from the cluster nodes
    for (String producedColName : functionMasterSteps.keySet()) {
      PlannerColumnInfo colInfo = columnInfo.get(producedColName);
      if (colInfo != null && !colInfo.getType().equals(FunctionRequest.Type.AGGREGATION_ROW)) {
        for (String prevColumnName : colInfo.getDependsOnColumns())
          ensureColumnAvailable(prevColumnName);
      }
    }

    // take care of the columns we need to ensure are available on the query master.
    // Sets.difference returns a live view backed by functionMasterSteps, which the loop below adds to; iterate over a
    // snapshot so the set of columns to transfer is fixed before we start modifying the map.
    List<String> transferColNames =
        new ArrayList<>(Sets.difference(columnsThatNeedToBeAvailable, functionMasterSteps.keySet()));

    for (String transferColName : transferColNames) {
      ExecutablePlanStep masterCreationStep = executablePlanFactory
          .createBuildColumnFromValuesStep(nextMasterStepIdSupplier.get(), env, transferColName, columnVersionManager);
      registerMasterStep(transferColName, masterCreationStep);

      // ensure the remote provides values for that column
      remoteResolveManager.resolveValuesOfColumn(transferColName);
    }
  }

  /**
   * @param columnValuesProvidingStep
   *          The {@link ExecutablePlanStep} that provides all values of all columns that the cluster nodes have
   *          resolved. This is needed for those columns that need to be created on the query master (=
   *          {@link #ensureColumnAvailable(String)} was called).
   */
  public void provideColumnValuesProvidingStep(ExecutablePlanStep columnValuesProvidingStep) {
    this.columnValuesProvidingStep = columnValuesProvidingStep;
  }

  /**
   * Wires all steps that were created on the query master and returns them.
   *
   * <p>
   * Call {@link #prepareBuild()} and {@link #provideColumnValuesProvidingStep(ExecutablePlanStep)} before this!
   *
   * @return All steps that are registered for execution on the query master.
   * @throws IllegalStateException
   *           If no column values providing step was provided, or if a step is waiting for a column that no
   *           registered step produces (e.g. the column was requested after {@link #prepareBuild()}).
   */
  @Override
  public List<ExecutablePlanStep> build() {
    if (columnValuesProvidingStep == null)
      throw new IllegalStateException("Column values step was not provided.");

    // wire the columns that have been created; the source columns are available on the query master already, see
    // #prepareBuild.
    // Aggregation columns do not need to wait for column builts of parameters on query master, as they will receive
    // the groupIntermediary results from the cluster nodes and will not work on the column values directly.
    for (Entry<String, List<ExecutablePlanStep>> masterEntry : functionMasterSteps.entrySet()) {
      PlannerColumnInfo colInfo = columnInfo.get(masterEntry.getKey());
      if (colInfo != null && !colInfo.getType().equals(FunctionRequest.Type.AGGREGATION_ROW)) {
        ExecutablePlanStep inputStep = Iterables.getFirst(masterEntry.getValue(), null);
        for (String prevColumnName : colInfo.getDependsOnColumns())
          wireOutputOfColumnIfAvailable(prevColumnName, inputStep);
      }
    }

    // wire the BuildColumnFromValues steps inputs to the step that is providing the column values.
    functionMasterSteps.values().stream() //
        .flatMap(List::stream) //
        .filter(step -> step instanceof BuildColumnFromValuesStep) //
        .forEach(buildColumnFromValuesStep -> masterWireManager.wire(ColumnValueConsumer.class,
            columnValuesProvidingStep, buildColumnFromValuesStep));

    // execute delayedWires, as now all columns should be represented in functionMasterSteps.
    for (Entry<String, List<ExecutablePlanStep>> delayedEntry : delayedWires.entrySet()) {
      String sourceColName = delayedEntry.getKey();

      // If the column is still not produced, wireOutputOfColumnIfAvailable would append to the very list we are
      // iterating and fail with a ConcurrentModificationException. Report the real problem instead.
      if (!functionMasterSteps.containsKey(sourceColName))
        throw new IllegalStateException("Column '" + sourceColName
            + "' is needed on the query master, but no step produces it. Was prepareBuild() called?");

      for (ExecutablePlanStep targetStep : delayedEntry.getValue())
        wireOutputOfColumnIfAvailable(sourceColName, targetStep);
    }

    return functionMasterSteps.values().stream().flatMap(List::stream).collect(Collectors.toList());
  }

  /**
   * @param colName
   *          Name of the column.
   * @return <code>true</code> if a step on the query master is registered for the given column.
   */
  @Override
  public boolean isColumnProduced(String colName) {
    return functionMasterSteps.containsKey(colName);
  }

  /** Registers the given step as the (so far only) query master step producing the given column. */
  private void registerMasterStep(String colName, ExecutablePlanStep step) {
    functionMasterSteps.put(colName, new ArrayList<ExecutablePlanStep>(Arrays.asList(step)));
  }
}