/*
 * (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 * file except in compliance with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under
 * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.
 */
package com.linkedin.cubert.analyzer.physical;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.ArrayNode;
import org.codehaus.jackson.node.ObjectNode;
import org.joda.time.DateTime;
import org.joda.time.Days;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

import com.linkedin.cubert.analyzer.physical.SemanticAnalyzer.Node;
import com.linkedin.cubert.operator.aggregate.AggregationType;
import com.linkedin.cubert.analyzer.physical.Lineage;
import com.linkedin.cubert.analyzer.physical.LineageGraph.LineageGraphVertex;
import com.linkedin.cubert.analyzer.physical.LineageGraph.LineagePath;
import com.linkedin.cubert.analyzer.physical.Lineage.LineageException;
import com.linkedin.cubert.analyzer.physical.Lineage.OperatorVisitor;
import com.linkedin.cubert.analyzer.physical.Lineage.OutputColumn;
import com.linkedin.cubert.analyzer.physical.OperatorLineage;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.utils.AvroUtils;
import com.linkedin.cubert.utils.CommonUtils;
import com.linkedin.cubert.utils.DateTimeUtilities;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.Pair;
import com.linkedin.cubert.utils.RewriteUtils;
import com.linkedin.cubert.plan.physical.JobExecutor;
import com.linkedin.cubert.utils.CubertMD;

public abstract class AggregateRewriter {

    public abstract void rewriteFactBlockgenPath(ObjectNode cubeNode,
                                                 LineagePath opSequencePath,
                                                 ObjectNode factNode,
                                                 Pair<ObjectNode, ObjectNode> bgInfo) throws AggregateRewriteException;

    public abstract void transformFactPreBlockgen(ObjectNode programNode,
                                                  Pair<ObjectNode, ObjectNode> bgInfo) throws AggregateRewriteException;

    public abstract ObjectNode transformMVPreCombine(String mvName);

    public abstract ObjectNode postCombineGroupBy();

    public abstract ObjectNode postCombineFilter();

    public abstract boolean isRewritable(ObjectNode operatorNode);

    public abstract ObjectNode preCubeTransform();

    public abstract String[] getMeasureColumns(ObjectNode cubeOperatorNode);

    public class AggregateRewriteException extends Exception {
        public AggregateRewriteException(String msg) {
            super(msg);
        }
    }

    public AggregateRewriter() {
        this.lineage = new Lineage();
    }

    /*
     * The main control flow for the rewrite is encapsulated in the base class. A cube or
     * group-by operator that is a candidate for rewrite is identified, and a rewrite is
     * initiated if one is found.
     */
    public JsonNode rewrite(JsonNode plan) {
        JsonNode clonedPlan = JsonUtils.cloneNode(plan);
        AggregateRewriterVisitor sVisitor = new AggregateRewriterVisitor((ObjectNode) clonedPlan, this);
        Lineage.visitOperators((ObjectNode) clonedPlan, null, sVisitor);

        // if the rewrite failed, return the original plan
        if (sVisitor.rewriteFailed)
            return plan;

        // return the modified plan
        return clonedPlan;
    }

    private static class AggregateRewriterVisitor implements Lineage.OperatorVisitor {
        private AggregateRewriter srewriter = null;
        private ObjectNode programNode = null;
        private boolean rewriteFailed = false;

        public AggregateRewriterVisitor(ObjectNode programNode, AggregateRewriter srewriter) {
            this.srewriter = srewriter;
            this.programNode = programNode;
        }

        @Override
        public boolean inspect(ObjectNode jobNode, JsonNode phaseNode, ObjectNode operatorNode) {
            if (operatorNode.get("operator") == null)
                return true;

            if (srewriter.isRewritable(operatorNode) && operatorNode.get("summaryRewrite") != null) {
                try {
                    srewriter.injectRewrite(programNode,
                                            jobNode,
                                            phaseNode,
                                            operatorNode,
                                            operatorNode.get("mvName").getTextValue(),
                                            operatorNode.get("mvPath").getTextValue());
                } catch (AggregateRewriter.AggregateRewriteException e) {
                    System.out.println("Summary rewrite failed due to exception " + e);
                    this.rewriteFailed = true;
                    return false;
                } catch (IOException e) {
                    System.out.println("Summary rewrite failed due to IO exception " + e);
                    this.rewriteFailed = true;
                    return false;
                }
            }
            return true;
        }
    }

    protected String combinedRelation;
    protected String[] preCubeLoadColumns;
    protected static final String DATE_COLUMN_NAME = "__dateValue";
    protected HashMap<String, Integer> mvRefreshMap = new HashMap<String, Integer>();
    protected int factStartDate = 0;
    protected int factEndDate = 0;
    protected HashMap<Integer, List<OutputColumn>> measureLineageMap = new HashMap<Integer, List<OutputColumn>>();
    protected List<ObjectNode> factNodes = new ArrayList<ObjectNode>();
    protected ObjectNode preCubeFactLoad;
    protected HashSet<OutputColumn> factColumns = new HashSet<OutputColumn>();
    protected String factBgInput;
    protected String storedFactPathName;
    public boolean mvExists = true;
    protected String dateColumnAlias = null; // the column name currently assigned to the date column
    protected boolean formattedDateColumn = false; // a formatted date column is available in the critical path
    protected ObjectNode tNode = null; // timespec node for current fact
    protected List<String> incFactTables = new ArrayList<String>(); // explicitly identified fact tables
    protected int mvRefreshDate = 0;
    protected int mvRefreshDateOverride = -1;
    protected DateTime incLoadDate = null;
    protected int mvHorizonDate;
    protected Lineage lineage;
    protected ObjectNode programNode;
    protected ObjectNode cubeJobNode;
    protected JsonNode cubePhaseNode;
    protected ObjectNode cubeOperatorNode;
    protected Pair<ObjectNode, ObjectNode> bgInfo = null;
    protected ArrayList<Pair<ObjectNode, ObjectNode>> bgInfoList = new ArrayList<Pair<ObjectNode, ObjectNode>>();
    protected ArrayList<ObjectNode> combineNodes = null;
    protected String mvName;
    protected String mvPath;

    public void injectRewrite(ObjectNode programNode,
                              ObjectNode jobNode,
                              JsonNode phaseNode,
                              ObjectNode cubeOperatorNode,
                              String mvName,
                              String mvPath) throws AggregateRewriteException, IOException {
        this.programNode = programNode;
        this.cubeJobNode = jobNode;
        this.cubePhaseNode = phaseNode;
        this.cubeOperatorNode = cubeOperatorNode;
        this.mvName = mvName;
        this.mvPath = mvPath;

        try {
            this.lineage.buildLineage(programNode);
        } catch (LineageException e) {
            throw new AggregateRewriteException(e.toString());
        }

        readSummaryMetaData();

        String[] measures = getMeasureColumns(cubeOperatorNode);
        String[] dimensions = getDimensionColumns(cubeOperatorNode);
        System.out.println("Measure columns = " + Arrays.toString(measures) + " Dimensions=" + Arrays.toString(dimensions));

        traceFactLoads(measures);
        traceFactBlockgens();
        calculateIncrementalFactLoadDates();
        rewriteFactBlockgenPaths();

        // Perform any pre-blockgen xforms needed on the fact.
        transformFactPreBlockgen(programNode, bgInfo);

        createMVBlockgen(dimensions, measures);
        insertPreCubeCombine((String[]) (ArrayUtils.addAll(dimensions, measures)));
        incrementalizeFactLoads();
        insertMVRefreshDateJobHook();
        rewritePreCubeMeasureJoins();

        programNode.put("summaryRewrite", "true");
    }

    /*
     * From information about the measure columns, trace the fact table loads.
     */
    protected void traceFactLoads(String[] measures) throws AggregateRewriteException {
        for (int measureIndex = 0; measureIndex < measures.length; measureIndex++) {
            List<Lineage.OutputColumn> inputColumns;
            try {
                ObjectNode inputNode =
                        lineage.getLineage()
                               .getPreLineage()
                               .findOperatorSource(cubeOperatorNode, cubeOperatorNode.get("input").getTextValue());
                inputColumns = lineage.traceLoadColumn(programNode, new OutputColumn(inputNode, measures[measureIndex]));
            } catch (LineageException e1) {
                throw new AggregateRewriteException("Lineage exception when tracing measure column load" + e1.toString());
            }

            if (inputColumns == null || inputColumns.size() == 0)
                throw new AggregateRewriteException("NULL or empty inputColumns");

            List<OutputColumn> inputFactColumns = filterFactTableColumns(inputColumns);
            if (inputFactColumns == null || inputFactColumns.size() == 0)
                throw new AggregateRewriteException("NULL or empty input fact columns");

            measureLineageMap.put(new Integer(measureIndex), inputFactColumns);
            for (OutputColumn factColumn : inputFactColumns) {
                factNodes.add(factColumn.opNode);
                factColumns.add(factColumn);
            }
        }

        if (factNodes.size() == 0)
            throw new AggregateRewriteException("Could not locate Fact tables");

        // Trace (factLoad, optional BIJ, CREATE BLOCK, CUBE) path from each fact table
        for (ObjectNode fnode : factNodes) {
            System.out.println("Discovered fact node " + fnode);
            validateDateRanges(fnode);
        }
    }

    /*
     * From the known array of fact nodes, trace the list of blockgen paths to all of the
     * fact tables prior to the cubing jobs.
     */
    private void traceFactBlockgens() throws AggregateRewriteException {
        ArrayList<String> nodeSequence1 = new ArrayList<String>();
        nodeSequence1.add("BLOCK_INDEX_JOIN");
        nodeSequence1.add("CREATE_BLOCK");
        ArrayList<String> nodeSequence2 = new ArrayList<String>();
        nodeSequence2.add("CREATE_BLOCK");

        for (int i = 0; i < factNodes.size(); i++) {
            ObjectNode factNode = factNodes.get(i);
            List<LineagePath> match1 = lineage.traceMatchingPaths(factNode, nodeSequence1, cubeOperatorNode, true);
            List<LineagePath> match2 = lineage.traceMatchingPaths(factNode, nodeSequence2, cubeOperatorNode, true);

            if (isEmptyList(match1) && isEmptyList(match2))
                throw new AggregateRewriteException("Found NO matching paths from LOAD fact -> CUBE");

            // if (!isEmptyList(match1) && !isEmptyList(match2) || match1.size() > 1)
            //     throw new AggregateRewriteException("Found multiple matching paths from LOAD fact -> CUBE");

            LineagePath matchingPath = (!isEmptyList(match1) ?
                    match1.get(0) : match2.get(0));
            Pair<ObjectNode, ObjectNode> bgInfo1 = extractFactBlockgenInfo(matchingPath);
            if (bgInfo1 == null)
                throw new AggregateRewriteException("Could not locate blockgen info for fact");
            if (bgInfo != null && !bgInfo1.equals(bgInfo))
                throw new AggregateRewriteException("Found inconsistent Blockgen information for fact");
            bgInfo = bgInfo1;
            bgInfoList.add(bgInfo);
        }
    }

    private void readSummaryMetaData() throws IOException, AggregateRewriteException {
        FileSystem fs = FileSystem.get(new JobConf());
        this.mvExists = false;
        FileStatus[] files = fs.globStatus(new Path(mvPath + "/avro/*.avro"));
        if (files != null && files.length > 0) {
            this.mvExists = true;
            processSummaryMetaData(mvPath);
        }
    }

    private void rewriteFactBlockgenPaths() throws AggregateRewriteException {
        for (int i = 0; i < factNodes.size(); i++) {
            // walk the path from the fact load to the blockgen, validate that the operators are expected,
            // and introduce a projection for the date column
            ObjectNode factNode = factNodes.get(i);
            LineagePath criticalPath = lineage.tracePath(factNode, bgInfoList.get(i).getSecond());
            rewriteFactBlockgenPath(cubeOperatorNode, criticalPath, factNode, bgInfoList.get(i));
        }
    }

    private void createMVBlockgen(String[] dimensions, String[] measures) throws AggregateRewriteException {
        String[] mvDimensions = (dimensions);
        String[] mvMeasures = (measures);
        String[] mvColumns = (String[]) ArrayUtils.addAll(mvDimensions, mvMeasures);

        // Step 1: At the very beginning of the program, create a blockgen by index for the historical MV
        if (this.mvExists) {
            ObjectNode mvBlockgenJobNode =
                    (ObjectNode) createBlockgenForMV(programNode, cubeOperatorNode, bgInfo, mvName, mvPath, mvColumns);
            ArrayNode jobsListNode =
                    JsonUtils.insertNodeListBefore((ArrayNode) (programNode.get("jobs")),
                                                   cubeJobNode,
                                                   Arrays.asList(new ObjectNode[] { mvBlockgenJobNode }));
            programNode.put("jobs", jobsListNode);
        }
    }

    private void insertPreCubeCombine(String[] mvColumns) throws AggregateRewriteException {
        getLoadFromCubingJob(programNode, cubeJobNode, bgInfo);
        BlockSchema mvSchema = new BlockSchema(this.preCubeFactLoad.get("schema"));

        /*
         * Prior to the OLAP cube count distinct, introduce (a) a LOAD of the historical MV, and (b) a
         * COMBINE with the input fact relation and a merge of the MV with a TEE operator to store
         * the updated MV tuples.
         */
        combineNodes = createCombineWithHistorical(programNode,
                                                   cubeJobNode,
                                                   cubePhaseNode,
                                                   cubeOperatorNode,
                                                   bgInfo,
                                                   mvName,
                                                   mvPath,
                                                   mvSchema.toJson(),
                                                   mvColumns);
    }

    private void incrementalizeFactLoads() throws AggregateRewriteException {
        // Step 2: Change the date parameters for the LOAD of the Fact relation (i.e. inputNode) to
        // LAST_MV_REFRESH_DATE, $endDate
        try {
            incrementalizeInputLoad(programNode, this.factNodes, cubeOperatorNode, mvName, mvPath);
        } catch (IOException e) {
            throw new AggregateRewriteException("IO exception when trying to incrementalize input load " + e);
        }
    }

    private void insertMVRefreshDateJobHook() {
        // The MV refresh map is updated AFTER incrementalizeInputLoad is called.
        String refreshCmd = "METAFILE UPDATE " + mvPath + " "
                + "mv.refresh.time " + this.mvRefreshMap.get(mvName)
                + " mv.refresh.time.override " + this.mvRefreshMap.get(mvName)
                + " mv.horizon.time " + this.factStartDate;
        ((ArrayNode) (cubeJobNode.get("postJobHooks"))).add(refreshCmd);
    }

    private void rewritePreCubeMeasureJoins() throws AggregateRewriteException {
        for (OutputColumn inputFactColumn : this.factColumns) {
            List<ObjectNode> measureJoins = null;
            try {
                measureJoins = lineage.traceColumnJoins(programNode, inputFactColumn);
            } catch (LineageException e) {
                throw new AggregateRewriteException("Lineage exception when tracing column joins for " + inputFactColumn);
            }

            List<ObjectNode> allJoins = lineage.computeJoinsInJob(programNode, cubeJobNode);
            JsonNode joinPhase = validateMeasureJoins(measureJoins, cubeJobNode, allJoins, inputFactColumn.opNode);

            // the last operator in the COMBINE chain will be a TEE whose input relation is the combined MV
            String combinedRelationName = combineNodes.get(combineNodes.size() - 1).get("output").getTextValue();
            ArrayList<ObjectNode> newMeasureJoins =
                    rewriteMeasureJoins(programNode, cubeOperatorNode, combinedRelationName, measureJoins, inputFactColumn.opNode);

            // Delete all current measure Joins.
            ArrayList<ObjectNode> joinOpList = new ArrayList<ObjectNode>();
            joinOpList.addAll(measureJoins);
            deleteOperatorNodes(programNode, joinOpList);

            ArrayNode finalOps = null;
            // Insert the new Measure joins right after the combine nodes.
            finalOps = JsonUtils.insertNodeListAfter(lineage.getPhaseOperators(joinPhase),
                                                     combineNodes.get(combineNodes.size() - 1),
                                                     newMeasureJoins);
            lineage.setPhaseOperators(cubeJobNode, joinPhase, finalOps);
        }
    }

    private void getLoadFromCubingJob(ObjectNode programNode,
                                      ObjectNode jobNode,
                                      Pair<ObjectNode, ObjectNode> blockgenInfo) throws AggregateRewriteException {
        ObjectNode blockgenNode = blockgenInfo.getSecond();
        this.storedFactPathName = lineage.getBlockgenStorePath(programNode, blockgenInfo.getSecond());
        if (storedFactPathName == null)
            throw new AggregateRewriteException("Unknown or ambiguous output path name for fact table in OLAP_CUBE phase");
        this.preCubeFactLoad = lineage.getMatchingLoadInJob(jobNode, storedFactPathName);
        if (preCubeFactLoad == null)
            throw new AggregateRewriteException("Cannot find matching LOAD for " + storedFactPathName + " in job " + jobNode);
    }

    protected ObjectNode getFactTimeSpecNode(ObjectNode factNode, ObjectNode cubeNode) throws AggregateRewriteException {
        List<String> paths = lineage.getPaths(factNode.get("path"));
        for (JsonNode timeSpecNode : (ArrayNode) (cubeNode.get("timeColumnSpec"))) {
            String elementPath = ((ObjectNode) timeSpecNode).get("factPath").getTextValue();
            if (paths.indexOf(elementPath) != -1) {
                tNode = (ObjectNode) timeSpecNode;
                return tNode;
            }
        }
        throw new AggregateRewriteException("No matching time column specification found for FACT load at " + factNode.toString());
    }

    private Pair<ObjectNode, ObjectNode> extractFactBlockgenInfo(LineagePath matchingPath) {
        List<LineageGraphVertex> nodes = matchingPath.nodes;
        int size = nodes.size();
        if (size == 3) {
            ObjectNode bijNode = ((OperatorLineage) (nodes.get(0))).node;
            return new Pair<ObjectNode, ObjectNode>(bijNode, ((OperatorLineage) (nodes.get(1))).node);
        } else {
            return new Pair<ObjectNode, ObjectNode>(null, ((OperatorLineage) (nodes.get(0))).node);
        }
    }

    private boolean isEmptyList(List<LineagePath> match1) {
        return (match1 == null || match1.size() == 0 ?
                true : false);
    }

    private String[] getDimensionColumns(ObjectNode cubeOperatorNode) {
        String operatorType = cubeOperatorNode.get("operator").getTextValue();
        if (operatorType.equalsIgnoreCase("CUBE"))
            return JsonUtils.asArray(cubeOperatorNode.get("dimensions"));
        else if (operatorType.equalsIgnoreCase("GROUP_BY"))
            return JsonUtils.asArray(cubeOperatorNode.get("groupBy"));
        return null;
    }

    private boolean isDatedPath(ArrayNode pathArray) {
        for (JsonNode pathNode : pathArray) {
            if (pathNode instanceof ObjectNode && ((ObjectNode) pathNode).get("startDate") != null)
                return true;
        }
        return false;
    }

    // All base relations that load a column are present in "inputColumns".
    // We test whether a relation is a fact table based on its dated path.
    private List<OutputColumn> filterFactTableColumns(List<OutputColumn> inputColumns) {
        List<OutputColumn> result = new ArrayList<OutputColumn>();
        for (OutputColumn candCol : inputColumns) {
            System.out.println("traceFactTableColumns: printing candCol = " + candCol.toString());
            if (isDatedPath((ArrayNode) candCol.opNode.get("path"))) {
                if (!isAnIncrementalCandidate(candCol.opNode.get("path")))
                    continue;
                result.add(candCol);
            }
        }
        return result;
    }

    private boolean isAnIncrementalCandidate(JsonNode pathsNode) {
        List<String> paths = LineageHelper.getPaths(pathsNode);
        for (String s : paths) {
            if (incFactTables != null && incFactTables.size() > 0 && incFactTables.indexOf(s) == -1)
                return false;
            String[] splits = s.split("/");
            String fname = splits[splits.length - 1];
            if (fname.startsWith("dim") || fname.startsWith("DIM"))
                return false;
        }
        return true;
    }

    private JsonNode validateMeasureJoins(List<ObjectNode> measureJoins,
                                          JsonNode jobNode,
                                          List<ObjectNode> allJoins,
                                          ObjectNode factNode) throws AggregateRewriteException {
        JsonNode joinPhaseResult = null;
        List<JsonNode> mapArray = JsonUtils.toArrayList((ArrayNode) (jobNode.get("map")));
        System.out.println("MeasureJoins = \n" + CommonUtils.listAsString(measureJoins, "\n"));

        // All joins on a measure column must be in the COMBINE or OLAP CUBE phase.
        for (ObjectNode measureJoin : measureJoins) {
            JsonNode joinPhase = lineage.getPhase(measureJoin);
            if (joinPhaseResult != null && joinPhase != joinPhaseResult)
                throw new AggregateRewriteException("Measure Joins placed at incorrect place");
            joinPhaseResult = joinPhase;
        }

        // Check that every join which occurs in this phase is a measure join.
        for (ObjectNode joinOp : allJoins) {
            boolean found = false;
            if (CommonUtils.indexOfByRef(measureJoins, joinOp) == -1) {
                System.out.println("Problematic join = " + joinOp.toString());
                throw new AggregateRewriteException("Unsupported Join operator in Rewrite phase");
            }
        }

        // ensure all joins are in the map phase
        if (CommonUtils.indexOfByRef(mapArray, joinPhaseResult) == -1)
            throw new AggregateRewriteException("Measure Joins found in reduce phase");

        // walk all operators after preCubeFactLoad.
        // expect to find all measure joins
        List<JsonNode> mapOps = JsonUtils.toArrayList((ArrayNode) joinPhaseResult.get("operators"));
        int matchedJoins = 0;
        int preCubeFactLoadIdx = -1, firstJoinIdx = -1, opIdx = 0;
        System.out.println("Map operators are " + CommonUtils.listAsString(mapOps, "\n"));

        for (JsonNode mapOp : mapOps) {
            if (preCubeFactLoadIdx != -1) {
                if (lineage.isJoinOperator((ObjectNode) mapOp)) {
                    if (CommonUtils.indexOfByRef(measureJoins, (ObjectNode) mapOp) != -1)
                        matchedJoins++;
                    if (firstJoinIdx == -1)
                        firstJoinIdx = opIdx;
                } else if (firstJoinIdx != -1)
                    break;
                opIdx++;
            }
            if (mapOp == this.preCubeFactLoad)
                preCubeFactLoadIdx = opIdx;
        }

        // the sequence of measure joins has to follow the fact load.
        if (matchedJoins != measureJoins.size() || !checkOtherJoinConstraints())
            throw new AggregateRewriteException("Measure Joins not placed contiguously in map phase, matchedJoins = "
                    + matchedJoins + " firstJoinIdx = " + firstJoinIdx + " preCubeFactLoadIdx = " + preCubeFactLoadIdx);

        return joinPhaseResult;
    }

    private boolean checkOtherJoinConstraints() {
        // TODO Auto-generated method stub
        return true;
    }

    private void deleteOperatorNodes(ObjectNode programNode, List<ObjectNode> opNodeList) {
        ArrayNode reduceOps = null;
        ArrayNode mapOps = null;
        for (ObjectNode operatorNode : opNodeList) {
            JsonNode phaseNode = lineage.getPhase(operatorNode);
            if (phaseNode instanceof ArrayNode) {
                JsonUtils.deleteFromArrayNode((ArrayNode) phaseNode, operatorNode);
            } else {
                ArrayNode operatorListNode = (ArrayNode) phaseNode.get("operators");
                JsonUtils.deleteFromArrayNode(operatorListNode, operatorNode);
            }
        }
    }

    private ArrayList<ObjectNode> rewriteMeasureJoins(ObjectNode programNode,
                                                      ObjectNode cubeOperatorNode,
                                                      String combinedRelationName,
                                                      List<ObjectNode> measureJoins,
                                                      ObjectNode factNode) {
        ArrayList<ObjectNode> newMeasureJoins = new ArrayList<ObjectNode>();
        String[] newInputs;
        int inputIndex = 0;
        for (ObjectNode measureJoin : measureJoins) {
            ObjectNode newJoin = (ObjectNode) (measureJoin);
            String replacedInput = null;
            newInputs = JsonUtils.asArray(measureJoin.get("input"));
            int replacedInputIndex = lineage.getDescendantInputIndex(newInputs, measureJoin, factNode);
            replacedInput = newInputs[replacedInputIndex];
            newInputs[replacedInputIndex] = combinedRelationName;
            System.out.println(String.format("Replaced join-input %s with %s", replacedInput, combinedRelationName));
            newJoin.put("input", JsonUtils.createArrayNode(newInputs));
            if (newJoin.get("leftBlock").equals(replacedInput))
                newJoin.put("leftBlock", combinedRelationName);
            else
                newJoin.put("rightBlock", combinedRelationName);
            newMeasureJoins.add(newJoin);
        }
        return newMeasureJoins;
    }

    private ArrayList<ObjectNode> createCombineWithHistorical(ObjectNode programNode,
                                                              ObjectNode jobNode,
                                                              JsonNode phaseNode,
                                                              ObjectNode cubeOperatorNode,
                                                              Pair<ObjectNode, ObjectNode> blockgenInfo,
                                                              String mvName,
                                                              String mvPath,
                                                              JsonNode mvSchemaJson,
                                                              String[] mvColumns) throws AggregateRewriteException {
        ArrayNode cacheIndexNode = null;
        if (jobNode.get("cacheIndex") == null)
            jobNode.put("cacheIndex", JsonUtils.createArrayNode());
        cacheIndexNode = (ArrayNode) jobNode.get("cacheIndex");

        ArrayList<ObjectNode> combineOps = new ArrayList<ObjectNode>();
        String factName = preCubeFactLoad.get("output").getTextValue();
        combinedRelation = factName;

        if (this.mvExists) {
            // We are loading the MV blocks using the MV index.
            cacheIndexNode.add(RewriteUtils.createObjectNode("name", factName + "MV_BLOCKGEN_INDEX",
                                                             "path", mvPath + "/blockgen"));
            // XXX - omitted type definitions here.
            ObjectNode loadMvNode = RewriteUtils.createObjectNode("operator", "LOAD_BLOCK",
                                                                  "input", preCubeFactLoad.get("output"),
                                                                  "output", factName + "_MV",
                                                                  "index", factName + "MV_BLOCKGEN_INDEX",
                                                                  "path", mvPath + "/blockgen");
            combineOps.add(loadMvNode);

            preCubeLoadColumns = lineage.getSchemaOutputColumns(preCubeFactLoad);
            ObjectNode transformMVNode = transformMVPreCombine(factName + "_MV");
            if (transformMVNode != null)
                combineOps.add(transformMVNode);

            ArrayNode combineInput = JsonUtils.createArrayNode();
            combineInput.add(preCubeFactLoad.get("output").getTextValue());
            combineInput.add(factName + "_MV");
            ObjectNode combineNode = RewriteUtils.createObjectNode("operator", "COMBINE",
                                                                   "input", combineInput,
                                                                   "output", combinedRelation,
                                                                   "pivotBy", JsonUtils.createArrayNode(preCubeLoadColumns));
            combineOps.add(combineNode);

            ObjectNode postCombineGby = postCombineGroupBy();
            if (postCombineGby != null)
                combineOps.add(postCombineGby);
        }

        ObjectNode postCombineFilter = postCombineFilter();
        if (postCombineFilter != null)
            combineOps.add(postCombineFilter);

        ObjectNode storeMVNode = createMVStorageNode();
        combineOps.add(storeMVNode);

        // A pre-cube transform can be provided to handle special xforms, for instance for time series.
        ObjectNode preCubeTransform = preCubeTransform();
        if (preCubeTransform != null)
            combineOps.add(preCubeTransform);

        JsonNode loadPhase = lineage.getPhase(preCubeFactLoad);
        ArrayNode phaseOps = lineage.getPhaseOperators(loadPhase);
        phaseOps = JsonUtils.insertNodeListAfter(phaseOps, preCubeFactLoad, combineOps);
        lineage.setPhaseOperators(jobNode, loadPhase, phaseOps);

        // add a post-job hook to rename the existing (old MV) directory to avro_old
        cubeJobNode.put("postJobHooks", JsonUtils.createArrayNode());
        ArrayNode jobHooks = (ArrayNode) (jobNode.get("postJobHooks"));
        addCubeJobHooks(jobHooks);
        return combineOps;
    }

    private void incrementalizeInputLoad(ObjectNode programNode,
                                         List<ObjectNode> factNodes2,
                                         ObjectNode cubeOperatorNode,
                                         String mvName,
                                         String mvPath) throws IOException, AggregateRewriteException {
        for (ObjectNode inputFactNode : factNodes2)
            incrementalizeInputLoad(programNode, inputFactNode, cubeOperatorNode, mvName, mvPath);
    }

    // check that all fact table path loads are in the same date range.
    private void validateDateRanges(ObjectNode inputNode) throws AggregateRewriteException {
        // TODO: validate that the date range spans less than or equal to a month, and
        // add future support for longer time spans.
        ArrayNode paths = (ArrayNode) inputNode.get("path");
        for (JsonNode pathNode : paths) {
            if (!(pathNode instanceof ObjectNode))
                continue;
            int startDate = Integer.parseInt(((ObjectNode) pathNode).get("startDate").getTextValue());
            if (this.factStartDate == 0) {
                this.factStartDate = startDate;
                System.out.println("Setting fact startDate to " + startDate);
            } else if (startDate != this.factStartDate)
                throw new AggregateRewriteException("Inconsistent fact start dates");

            int endDate = Integer.parseInt(((ObjectNode) pathNode).get("endDate").getTextValue());
            if (this.factEndDate == 0)
                this.factEndDate = endDate;
            else if (endDate != this.factEndDate)
                throw new AggregateRewriteException("Inconsistent fact end dates");
        }

        // restricting the date range to one month.
        // TODO: how does the Rubix runtime know that the computation is iterative, occurs daily,
        // and that startDate is incremented by 1 each day?
        String es = Integer.toString(factEndDate);
        String ss = Integer.toString(factStartDate);
        if (DateTimeUtilities.getDateTime(es).minusDays(30).isAfter(DateTimeUtilities.getDateTime(ss)))
            throw new AggregateRewriteException(String.format("[sd=%d, ed=%d] Time spans larger than a month are currently not supported",
                                                              factStartDate,
                                                              factEndDate));
    }

    private void processSummaryMetaData(String mvPath) throws AggregateRewriteException {
        // TODO: retrieve the MV horizon
        HashMap<String, String> metaEntries;
        try {
            metaEntries = CubertMD.readMetafile(mvPath);
        } catch (IOException e) {
            throw new AggregateRewriteException("Cannot read Meta file for summary at " + mvPath);
        }

        // if the user explicitly indicated incremental candidates
        String incCandidates;
        if ((incCandidates = metaEntries.get("mv.incremental.candidates")) != null) {
            String[] incFactTablesArray = incCandidates.split(",");
            for (String s : incFactTablesArray)
                incFactTables.add(s);
        }

        if (metaEntries == null || metaEntries.get("mv.refresh.time") == null)
            throw new AggregateRewriteException("MV metaEntries not visible");

        mvHorizonDate = Integer.parseInt(metaEntries.get("mv.horizon.time"));
        mvRefreshDate = Integer.parseInt(metaEntries.get("mv.refresh.time"));
        if (metaEntries.get("mv.refresh.time.override") != null)
            mvRefreshDateOverride = Integer.parseInt(metaEntries.get("mv.refresh.time.override"));
    }

    private void calculateIncrementalFactLoadDates() throws AggregateRewriteException {
        if (!this.mvExists)
            return;

        DateTime dt = null;

        /*
         * The difference between factStartDate and mvHorizonDate determines the number of bit shifts.
         * The new horizon date is always determined by the factStartDate.
         */
        if (this.factStartDate > mvHorizonDate)
            throw new AggregateRewriteException(String.format("Fact start date(%d) in the future of mv horizon date(%d) ",
                                                              factStartDate,
                                                              mvHorizonDate));

        dt = DateTimeUtilities.getDateTime(Integer.toString(mvRefreshDate));
        incLoadDate = dt.plusDays(1);

        // Handle the override case.
        if (mvRefreshDateOverride != -1) {
            // if the overridden time is before the physical MV refresh time
            if (mvRefreshDateOverride != 0 && mvRefreshDateOverride < mvRefreshDate)
                incLoadDate = DateTimeUtilities.getDateTime(Integer.toString(mvRefreshDateOverride)).plusDays(1);
            // An mvRefreshDateOverride of 0 is treated as a hint to turn off incrementalization.
            else if (mvRefreshDateOverride == 0)
                incLoadDate = null;
        }

        if (mvRefreshDate != 0 && incLoadDate != null) {
            if (!(DateTimeUtilities.getDateTime(factStartDate).isBefore(incLoadDate)
                    && DateTimeUtilities.getDateTime(factEndDate).isAfter(incLoadDate)))
                throw new AggregateRewriteException(String.format("MV date range mis-matches load range[%s, %s] mvRefreshDate=%s ",
                                                                  factStartDate,
                                                                  factEndDate,
                                                                  mvRefreshDate));
        }
    }

    private void incrementalizeInputLoad(ObjectNode programNode,
                                         ObjectNode inputNode,
                                         ObjectNode cubeOperatorNode,
                                         String mvName,
                                         String mvPath) throws IOException, AggregateRewriteException {
        // extract input paths from inputNode and adjust the start date to the MV refresh date + 1.
        ArrayNode paths = (ArrayNode) inputNode.get("path");
        System.out.println("Incrementalize InputNode = " + inputNode.toString());
        int newMvRefreshTime = 0;

        for (int i = 0; i < paths.size(); i++) {
            JsonNode pathNode = paths.get(i);
            if (pathNode instanceof ObjectNode) {
                String startDate = ((ObjectNode) pathNode).get("startDate").getTextValue();
                // System.out.println("startDate = " + startDate);
                DateTime loadStart = DateTimeUtilities.getDateTime((startDate));
                String endDate = ((ObjectNode) pathNode).get("endDate").getTextValue();
                DateTime loadEnd = DateTimeUtilities.getDateTime(endDate);

                if (mvRefreshDate != 0 && incLoadDate != null) {
                    if (loadStart.isBefore(incLoadDate) && loadEnd.isAfter(incLoadDate)) {
                        ((ObjectNode) pathNode).put("origStartDate", startDate);
                        ((ObjectNode) pathNode).put("startDate", Integer.toString(DateTimeUtilities.asInt(incLoadDate)));
                    } else
                        throw new AggregateRewriteException(String.format("MV date range mis-matches load range[%s, %s] ",
                                                                          startDate,
                                                                          endDate));
                }

                newMvRefreshTime = Math.max(Integer.parseInt(((ObjectNode) pathNode).get("endDate").getTextValue()),
                                            newMvRefreshTime);
            }
        }

        System.out.println("Setting MV refresh time for " + mvName + " to " + newMvRefreshTime);
        mvRefreshMap.put(mvName, newMvRefreshTime);
    }

    private JsonNode createBlockgenForMV(ObjectNode programNode,
                                         ObjectNode cubeOperatorNode,
                                         Pair<ObjectNode, ObjectNode> bgInfo,
                                         String mvName,
                                         String mvPath,
                                         String[] mvColumns) throws AggregateRewriteException {
        String bgFactPath = null;
        String[] partitionKeys = null;
        String[] pivotKeys = null;
        String[] shufflePivotKeys = null;
        String mvInputPath = mvPath + "/avro";

        if (lineage.isBlockgenByIndex(bgInfo.getSecond())) {
            partitionKeys = JsonUtils.asArray(bgInfo.getFirst().get("partitionKeys"));
            String indexName = bgInfo.getFirst().get("index").getTextValue();
            ObjectNode jobNode = lineage.getOperatorJobNode(bgInfo.getSecond());
            // This should include BLOCK_ID, else assert.
            shufflePivotKeys = JsonUtils.asArray(((ObjectNode) (jobNode.get("shuffle"))).get("pivotKeys"));
            String indexPath = lineage.traceIndexPath(jobNode, indexName);
            System.out.println("Traced blockgen index " + indexName + " path as " + indexPath);
            System.out.println("job node = " + jobNode.toString());
            bgFactPath = indexPath;
        } else {
            bgFactPath = lineage.getDatedPathRoot((ArrayNode) (bgInfo.getSecond().get("path")));
            partitionKeys = JsonUtils.asArray(bgInfo.getSecond().get("partitionKeys"));
            pivotKeys = JsonUtils.asArray(bgInfo.getSecond().get("pivotKeys"));
            shufflePivotKeys = (String[]) ArrayUtils.addAll(new String[] { "BLOCK_ID" }, pivotKeys);
        }

        ArrayNode cacheIndexNode = JsonUtils.createArrayNode();
        cacheIndexNode.add(RewriteUtils.createObjectNode("name", mvName + "_fact_index", "path", bgFactPath));

        JsonNode mapNode = JsonUtils.makeJson(String.format("[{'input' : {'name':'%s', 'type': 'AVRO', 'path':['%s']}}]",
                                                            mvName,
                                                            mvInputPath));
        System.out.println("Blockgen partition keys = " + Arrays.toString(partitionKeys));

        ObjectNode blockIndexJoin = RewriteUtils.createObjectNode("operator", "BLOCK_INDEX_JOIN",
                                                                  "input", mvName,
                                                                  "output", mvName,
                                                                  "index", mvName + "_fact_index",
                                                                  "partitionKeys", JsonUtils.createArrayNode(partitionKeys));
        ObjectNode mapperNode = (ObjectNode) (((ArrayNode) mapNode).get(0));
        mapperNode.put("operators", JsonUtils.createArrayNode());
        ((ArrayNode) (mapperNode.get("operators"))).add(blockIndexJoin);

        JsonNode shuffleNode = RewriteUtils.createObjectNode("name", mvName,
                                                             "type", "SHUFFLE",
                                                             "partitionKeys", JsonUtils.createArrayNode(new String[] { "BLOCK_ID" }),
                                                             "pivotKeys", JsonUtils.createArrayNode(shufflePivotKeys));
        JsonNode reduceOpNode = RewriteUtils.createObjectNode("operator", "CREATE_BLOCK",
                                                              "input", mvName,
                                                              "output", mvName + "_blockgen",
                                                              "blockgenType", "BY_INDEX",
                                                              "index", mvName + "_fact_index",
                                                              "indexPath", bgFactPath,
                                                              "partitionKeys", JsonUtils.createArrayNode(new String[] { "BLOCK_ID" }),
                                                              "pivotKeys", JsonUtils.createArrayNode(shufflePivotKeys),
                                                              "originalPartitionKeys", JsonUtils.createArrayNode(partitionKeys));
        ArrayNode reduceNode = JsonUtils.createArrayNode();
        reduceNode.add(reduceOpNode);

        ObjectNode outputNode = RewriteUtils.createObjectNode("name", mvName + "_blockgen",
                                                              "path", mvPath + "/blockgen",
                                                              "type", "RUBIX",
                                                              "params", RewriteUtils.createObjectNode("overwrite", "true"));
        ObjectNode jobNode = RewriteUtils.createObjectNode("name", "BLOCKGEN FOR MV",
                                                           "map", mapNode,
                                                           "shuffle", shuffleNode,
                                                           "reduce", reduceNode,
                                                           "output", outputNode);
        jobNode.put("cacheIndex", cacheIndexNode);
        jobNode.put("reducers", 100);
        System.out.println("JOB json = " + jobNode.toString());
        return jobNode;
    }

    /* Default implementation provided. A derived class can override it. */
    protected void addCubeJobHooks(ArrayNode jobHooks) {
        if (this.mvExists) {
            String renameCmd1 = String.format("HDFS RENAME %s %s", mvPath + "/avro", mvPath + "/avro_old");
            jobHooks.add(renameCmd1);
        }
        String renameCmd2 = String.format("HDFS RENAME %s %s", mvPath + "/avro_new", mvPath + "/avro");
        jobHooks.add(renameCmd2);
    }

    /*
     * Default implementation provided in the base class. A rewriter can override it as
     * needed.
     */
    protected ObjectNode createMVStorageNode() {
        return RewriteUtils.createObjectNode("operator", "TEE",
                                             "input", combinedRelation,
                                             "output", combinedRelation,
                                             "type", "AVRO",
                                             "path", mvPath + "/avro_new",
                                             "passthrough", true);
    }
}
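
/*
 * Illustrative sketch only (not part of the original file): a minimal, hypothetical
 * concrete rewriter showing how the abstract hooks above plug into the rewrite() control
 * flow. The class name, the "measures" field read in getMeasureColumns(), and the no-op
 * hook bodies below are assumptions made for illustration; a real rewriter (for example,
 * one handling count-distinct aggregates) would supply operator-specific logic in these
 * hooks. Such a rewriter would typically be driven as:
 *
 *     JsonNode rewritten = new ExampleNoOpAggregateRewriter().rewrite(plan);
 */
class ExampleNoOpAggregateRewriter extends AggregateRewriter {
    @Override
    public void rewriteFactBlockgenPath(ObjectNode cubeNode,
                                        LineagePath opSequencePath,
                                        ObjectNode factNode,
                                        Pair<ObjectNode, ObjectNode> bgInfo) throws AggregateRewriteException {
        // No changes to the fact LOAD -> CREATE_BLOCK path in this sketch.
    }

    @Override
    public void transformFactPreBlockgen(ObjectNode programNode,
                                         Pair<ObjectNode, ObjectNode> bgInfo) throws AggregateRewriteException {
        // No pre-blockgen transform of the fact in this sketch.
    }

    @Override
    public ObjectNode transformMVPreCombine(String mvName) {
        return null; // no extra operator between the MV LOAD_BLOCK and the COMBINE
    }

    @Override
    public ObjectNode postCombineGroupBy() {
        return null; // no re-aggregation after the COMBINE
    }

    @Override
    public ObjectNode postCombineFilter() {
        return null; // keep all combined tuples
    }

    @Override
    public boolean isRewritable(ObjectNode operatorNode) {
        // Only consider GROUP_BY operators in this sketch; a real rewriter would also
        // check the aggregate types it knows how to incrementalize.
        return "GROUP_BY".equalsIgnoreCase(operatorNode.get("operator").getTextValue());
    }

    @Override
    public ObjectNode preCubeTransform() {
        return null; // no transform immediately before the cube/group-by
    }

    @Override
    public String[] getMeasureColumns(ObjectNode cubeOperatorNode) {
        // Hypothetical: assumes the operator node lists its measure columns under "measures".
        return JsonUtils.asArray(cubeOperatorNode.get("measures"));
    }
}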