/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.analyzer.physical;

import static com.linkedin.cubert.utils.CommonUtils.generateVariableName;
import static com.linkedin.cubert.utils.JsonUtils.asArray;
import static com.linkedin.cubert.utils.JsonUtils.cloneNode;
import static com.linkedin.cubert.utils.JsonUtils.createArrayNode;
import static com.linkedin.cubert.utils.JsonUtils.createObjectNode;
import static com.linkedin.cubert.utils.JsonUtils.getText;

import java.io.IOException;
import java.util.Set;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ArrayNode;
import org.codehaus.jackson.node.ObjectNode;

import com.linkedin.cubert.utils.CommonUtils;
import com.linkedin.cubert.utils.JsonUtils;

/**
 * Rewrites BLOCKGEN or CUBE-COUNT-DISTINCT as shuffle operators.
 *
 * @author Maneesh Varshney
 *
 */
public class ShuffleRewriter implements PlanRewriter
{
    private final ObjectMapper mapper = new ObjectMapper();
    private Set<String> namesUsed;

    @Override
    public JsonNode rewrite(JsonNode plan,
                            Set<String> namesUsed,
                            boolean debugMode,
                            boolean revisit)
    {
        this.namesUsed = namesUsed;

        ObjectNode newPlan = (ObjectNode) cloneNode(plan);
        ArrayNode jobs = mapper.createArrayNode();

        for (JsonNode job : plan.path("jobs"))
        {
            jobs.add(rewriteJob(job));
        }

        newPlan.remove("jobs");
        newPlan.put("jobs", jobs);

        return newPlan;
    }

    private JsonNode rewriteJob(JsonNode job)
    {
        // no rewriting, if the job does not have shuffle
        if (!job.has("shuffle") || job.get("shuffle").isNull())
            return job;

        JsonNode shuffle = job.get("shuffle");

        // no rewriting if the shuffle is not a macro command
        if (!shuffle.has("type"))
            return job;

        String type = getText(shuffle, "type");

        if (type.equals("SHUFFLE"))
            return job;
        else if (type.equals("BLOCKGEN"))
            return rewriteBlockgen(job);
        else if (type.equals("CUBE"))
            return rewriteCube(job);
        else if (type.equals("CREATE-DICTIONARY"))
            return rewriteDictionary(job);
        else if (type.equals("DISTINCT"))
            return rewriteDistinct(job);

        throw new RuntimeException("Cannot rewrite shuffle type " + type);
    }

    private JsonNode rewriteDictionary(JsonNode job)
    {
        ObjectNode newJob = (ObjectNode) cloneNode(job);
        ObjectNode shuffle = (ObjectNode) newJob.get("shuffle");
        if (shuffle == null)
            throw new RuntimeException("Shuffle description missing. Cannot rewrite.");
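        // CREATE-DICTIONARY becomes a single-reducer SHUFFLE: a dictionary-refresh
        // operator is appended to every map phase below, and its reduce-side
        // counterpart is placed at the head of the reduce phase.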
Cannot rewrite."); newJob.put("reducers", 1); // Determine if this is a refresh job or a fresh dictionary creation by looking at // STORE location String storePath = job.get("output").get("path").getTextValue(); String dictionaryPath = storePath + "/part-r-00000.avro"; boolean refresh = false; try { FileSystem fs = FileSystem.get(new JobConf()); refresh = fs.exists(new Path(dictionaryPath)); } catch (IOException e) { // we will not refresh } // Rewrite map JsonNode relationName = shuffle.get("name"); ObjectNode mapSideOperator = JsonUtils.createObjectNode("operator", "USER_DEFINED_TUPLE_OPERATOR", "class", "com.linkedin.cubert.operator.DictionaryRefreshMapSideOperator", "input", JsonUtils.createArrayNode(relationName), "output", relationName, "columns", shuffle.get("columns")); copyLine(shuffle, mapSideOperator, "[MAP] "); for (JsonNode map : newJob.path("map")) { if (!map.has("operators") || map.get("operators").isNull()) ((ObjectNode) map).put("operators", JsonUtils.createArrayNode()); ArrayNode operators = (ArrayNode) map.get("operators"); operators.add(mapSideOperator); } // Rewrite shuffle shuffle.put("name", relationName); shuffle.put("type", "SHUFFLE"); shuffle.put("partitionKeys", JsonUtils.createArrayNode(CommonUtils.array("colname", "colvalue"))); shuffle.put("pivotKeys", JsonUtils.createArrayNode(CommonUtils.array("colname", "colvalue"))); if (shuffle.has("columns")) shuffle.remove("columns"); if (shuffle.has("dictionaryPath")) shuffle.remove("dictionaryPath"); shuffle.remove("input"); // Rewrite reduce if (!newJob.has("reduce") || newJob.get("reduce").isNull()) newJob.put("reduce", JsonUtils.createArrayNode()); ArrayNode reduceJob = (ArrayNode) newJob.get("reduce"); ObjectNode reduceSideOperator = (ObjectNode) cloneNode(mapSideOperator); reduceSideOperator.put("class", "com.linkedin.cubert.operator.DictionaryRefreshReduceSideOperator"); copyLine(shuffle, reduceSideOperator, "[REDUCE] "); reduceJob.insert(0, reduceSideOperator); // Rewrite cached files if (refresh) { String newStorePath = storePath + "/tmp"; String newDictionaryPath = newStorePath + "/part-r-00000.avro"; // put the existing dictionary file in dist cache if (!newJob.has("cachedFiles") || newJob.get("cachedFiles").isNull()) newJob.put("cachedFiles", JsonUtils.createArrayNode()); ArrayNode cachedFiles = (ArrayNode) newJob.get("cachedFiles"); cachedFiles.add(dictionaryPath + "#dictionary"); // tell the operators to use cached dictionary mapSideOperator.put("dictionary", dictionaryPath + "#dictionary"); reduceSideOperator.put("dictionary", dictionaryPath + "#dictionary"); // the output path is changed to <original path>/tmp ((ObjectNode) newJob.get("output")).put("path", newStorePath); // put onCompletion for this job to move the new dictionary to the parent // folder ArrayNode onCompletion = mapper.createArrayNode(); // onCompletion.add(JsonUtils.createObjectNode("type", // "rm", // "paths", // JsonUtils.createArrayNode(storePath // + "/dictionary.avro"))); onCompletion.add(JsonUtils.createObjectNode("type", "mv", "paths", JsonUtils.createArrayNode(new String[] { newDictionaryPath, dictionaryPath }))); onCompletion.add( JsonUtils.createObjectNode("type", "rm", "paths", JsonUtils.createArrayNode(newStorePath))); newJob.put("onCompletion", onCompletion); } return newJob; } private JsonNode rewriteBlockgen(JsonNode job) { String blockgenType = job.get("shuffle").get("blockgenType").getTextValue(); if (blockgenType.equals("BY_INDEX")) return rewriteBlockgenByIndex(job); // else: following is the rewrite of BLOCKGEN 
        ObjectNode newJob = (ObjectNode) cloneNode(job);
        ObjectNode shuffle = (ObjectNode) newJob.get("shuffle");
        JsonNode blockgenTypeNode = shuffle.get("blockgenType");
        JsonNode blockgenValueNode = shuffle.get("blockgenValue");

        if (!shuffle.has("pivotKeys"))
            throw new PlanRewriteException("PivotKeys are not defined in SHUFFLE");

        // add CREATE_BLOCK operator in the reducer
        if (!newJob.has("reduce") || newJob.get("reduce").isNull())
            newJob.put("reduce", mapper.createArrayNode());
        ArrayNode reduce = (ArrayNode) newJob.get("reduce");
        ObjectNode createBlockOperator =
                createObjectNode("operator",
                                 "CREATE_BLOCK",
                                 "input",
                                 shuffle.get("name"),
                                 "output",
                                 shuffle.get("name"),
                                 "blockgenType",
                                 blockgenTypeNode,
                                 "blockgenValue",
                                 blockgenValueNode,
                                 "partitionKeys",
                                 shuffle.get("partitionKeys"));
        copyLine(shuffle, createBlockOperator, "[REDUCE] ");
        reduce.insert(0, createBlockOperator);

        // add DISTINCT operator, if requested
        boolean isDistinct =
                shuffle.has("distinct") && shuffle.get("distinct").getBooleanValue();
        if (isDistinct)
        {
            ObjectNode distinct =
                    createObjectNode("operator",
                                     "DISTINCT",
                                     "input",
                                     shuffle.get("name"),
                                     "output",
                                     shuffle.get("name"));
            copyLine(shuffle, distinct, "[REDUCE DISTINCT]");
            reduce.insert(0, distinct);
        }

        // the sort keys for the SHUFFLE are set to the actual
        // blockgen PARTITION KEYS. These sort keys are configured into the JsonNode for
        // the CREATE_BLOCK operator

        // clean up shuffle
        shuffle.remove("blockgenType");
        shuffle.remove("blockgenValue");
        shuffle.put("type", "SHUFFLE");
        shuffle.put("distinct", isDistinct);
        if (!CommonUtils.isPrefix(asArray(shuffle, "pivotKeys"),
                                  asArray(shuffle, "partitionKeys")))
        {
            createBlockOperator.put("pivotKeys", shuffle.get("pivotKeys"));
            shuffle.put("pivotKeys", shuffle.get("partitionKeys"));
        }

        return newJob;
    }

    private JsonNode rewriteBlockgenByIndex(JsonNode job)
    {
        ObjectNode newJob = (ObjectNode) cloneNode(job);
        ObjectNode shuffle = (ObjectNode) newJob.get("shuffle");
        String path = getText(shuffle, "relation");

        // add a cache index
        String indexName = generateVariableName(namesUsed);
        if (!newJob.has("cacheIndex") || newJob.get("cacheIndex").isNull())
            newJob.put("cacheIndex", mapper.createArrayNode());
        ArrayNode cacheIndex = (ArrayNode) newJob.get("cacheIndex");
        cacheIndex.add(createObjectNode("name", indexName, "path", path));

        // create BLOCK-INDEX-JOIN operator
        ObjectNode blockIndexJoin =
                createObjectNode("operator",
                                 "BLOCK_INDEX_JOIN",
                                 "input",
                                 shuffle.get("name"),
                                 "output",
                                 shuffle.get("name"),
                                 "partitionKeys",
                                 shuffle.get("partitionKeys"),
                                 "index",
                                 indexName);
        copyLine(shuffle, blockIndexJoin, "[MAP] ");
        // add it as the last operator for all mapper
        for (JsonNode map : newJob.path("map"))
        {
            if (!map.has("operators") || map.get("operators").isNull())
                ((ObjectNode) map).put("operators", mapper.createArrayNode());
            ArrayNode operators = (ArrayNode) map.get("operators");
            // we need unique references for all blockIndexJoin
            operators.add(JsonUtils.cloneNode(blockIndexJoin));
        }

        // create CREATE-BLOCK operator
        ObjectNode createBlock =
                createObjectNode("operator",
                                 "CREATE_BLOCK",
                                 "input",
                                 shuffle.get("name"),
                                 "output",
                                 shuffle.get("name"),
                                 "blockgenType",
                                 "BY_INDEX",
                                 "index",
                                 indexName,
                                 "partitionKeys",
                                 createArrayNode("BLOCK_ID"),
                                 "indexPath",
                                 path);
        copyLine(shuffle, createBlock, "[REDUCE] ");
        // add it as first operator in reduce
        if (!newJob.has("reduce") || newJob.get("reduce").isNull())
            newJob.put("reduce", mapper.createArrayNode());
        ArrayNode reduce = (ArrayNode) newJob.get("reduce");
        reduce.insert(0, createBlock);
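        // What remains: optionally add a DISTINCT operator, then turn the macro node
        // itself into a plain SHUFFLE that partitions on BLOCK_ID and pivots on
        // BLOCK_ID followed by any original pivot keys.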
        // add DISTINCT operator, if requested
        boolean isDistinct =
                shuffle.has("distinct") && shuffle.get("distinct").getBooleanValue();
        if (isDistinct)
        {
            ObjectNode distinct =
                    createObjectNode("operator",
                                     "DISTINCT",
                                     "input",
                                     shuffle.get("name"),
                                     "output",
                                     shuffle.get("name"));
            copyLine(shuffle, distinct, "[REDUCE DISTINCT] ");
            reduce.insert(0, distinct);
        }

        // clean up shuffle
        shuffle.put("type", "SHUFFLE");
        shuffle.put("partitionKeys", createArrayNode("BLOCK_ID"));
        shuffle.put("distinct", isDistinct);
        shuffle.remove("blockgenType");
        shuffle.remove("relation");

        ArrayNode pivotKeys = mapper.createArrayNode();
        pivotKeys.add("BLOCK_ID");
        if (shuffle.has("pivotKeys"))
        {
            for (JsonNode key : shuffle.path("pivotKeys"))
                pivotKeys.add(key);
        }
        shuffle.put("pivotKeys", pivotKeys);

        return newJob;
    }

    private JsonNode rewriteCube(JsonNode job)
    {
        ObjectNode newJob = (ObjectNode) cloneNode(job);
        ObjectNode shuffle = (ObjectNode) newJob.get("shuffle");
        String name = getText(shuffle, "name");
        JsonNode aggregates = shuffle.get("aggregates");

        // create the OLAP_CUBE_COUNT_DISTINCT operator
        ObjectNode cube =
                createObjectNode("operator",
                                 "CUBE",
                                 "input",
                                 name,
                                 "output",
                                 name,
                                 "dimensions",
                                 shuffle.get("dimensions"),
                                 "aggregates",
                                 cloneNode(aggregates));
        if (shuffle.has("groupingSets"))
            cube.put("groupingSets", shuffle.get("groupingSets"));
        if (shuffle.has("innerDimensions"))
            cube.put("innerDimensions", shuffle.get("innerDimensions"));
        copyLine(shuffle, cube, "[MAP] ");

        // add it as the last operator for all mapper
        for (JsonNode map : newJob.path("map"))
        {
            if (!map.has("operators") || map.get("operators").isNull())
                ((ObjectNode) map).put("operators", mapper.createArrayNode());
            ArrayNode operators = (ArrayNode) map.get("operators");
            operators.add(cube);
        }

        rewriteGroupByAggregateForCube(aggregates);

        // create the GROUP BY operator at the reducer
        ObjectNode groupBy =
                createObjectNode("operator",
                                 "GROUP_BY",
                                 "input",
                                 name,
                                 "output",
                                 name,
                                 "groupBy",
                                 shuffle.get("dimensions"),
                                 "aggregates",
                                 aggregates);
        copyLine(shuffle, groupBy, "[REDUCE] ");

        // add it as first operator in reduce
        if (!newJob.has("reduce") || newJob.get("reduce").isNull())
            newJob.put("reduce", mapper.createArrayNode());
        ArrayNode reduce = (ArrayNode) newJob.get("reduce");
        reduce.insert(0, groupBy);

        // clean up shuffle
        shuffle.put("type", "SHUFFLE");
        shuffle.put("aggregates", aggregates);
        shuffle.put("partitionKeys", shuffle.get("dimensions"));
        shuffle.put("pivotKeys", shuffle.get("dimensions"));
        shuffle.remove("dimensions");
        shuffle.remove("groupingSets");
        shuffle.remove("innerDimensions");

        return newJob;
    }

    private void rewriteGroupByAggregateForCube(JsonNode aggregates)
    {
        // modify the aggregates JsonNode object (to be used in SHUFFLE, and GBY in
        // reducer):
        // a) if it is dual agg, use the outer aggregator
        // b) if it is COUNT or COUNT_DISTINCT agg, switch it to SUM
        // c) use the original output column name as input column name
        for (JsonNode aggNode : aggregates)
        {
            String type;
            JsonNode typeJson = aggNode.get("type");
            if (typeJson.isArray())
                type = typeJson.get(0).getTextValue();
            else
                type = typeJson.getTextValue();

            String outputColName = getText(aggNode, "output");
            ObjectNode onode = (ObjectNode) aggNode;

            // see if the aggregation type has to be changed
            if (type.equals("COUNT") || type.equals("COUNT_DISTINCT"))
                onode.put("type", "SUM");
            else
                onode.put("type", type);

            // change the input column name
            onode.put("input", outputColName);
        }
    }

    private JsonNode rewriteDistinct(JsonNode job)
    {
        ObjectNode newJob = (ObjectNode) cloneNode(job);
        ObjectNode shuffle = (ObjectNode) newJob.get("shuffle");
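        // DISTINCT rewrite: mark the shuffle as a distinct shuffle and prepend a
        // DISTINCT operator to the reduce phase.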
newJob.get("shuffle"); String name = getText(shuffle, "name"); ObjectNode distinctOp = JsonUtils.createObjectNode("operator", "DISTINCT", "input", name, "output", name); if (!newJob.has("reduce") || newJob.get("reduce").isNull()) newJob.put("reduce", mapper.createArrayNode()); ArrayNode reduce = (ArrayNode) newJob.get("reduce"); reduce.insert(0, distinctOp); shuffle.put("type", "SHUFFLE"); shuffle.put("distinctShuffle", true); return newJob; } private void copyLine(ObjectNode from, ObjectNode to, String prefix) { if (from.has("line")) to.put("line", prefix + getText(from, "line")); } }