Java tutorial
/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.operator; import com.linkedin.cubert.block.Block; import com.linkedin.cubert.block.BlockProperties; import com.linkedin.cubert.block.BlockSchema; import com.linkedin.cubert.block.DataType; import com.linkedin.cubert.block.PivotedBlock; import com.linkedin.cubert.memory.CompactHashTableBase; import com.linkedin.cubert.memory.IntIterator; import com.linkedin.cubert.memory.IntSet; import com.linkedin.cubert.operator.cube.CountDistinctCubeAggregator; import com.linkedin.cubert.operator.cube.CubeAggregator; import com.linkedin.cubert.operator.cube.CubeDimensions; import com.linkedin.cubert.operator.cube.DefaultCubeAggregator; import com.linkedin.cubert.operator.cube.DefaultDupleCubeAggregator; import com.linkedin.cubert.operator.cube.DimensionKey; import com.linkedin.cubert.operator.cube.DupleCubeAggregator; import com.linkedin.cubert.operator.cube.EasyCubeAggregator; import com.linkedin.cubert.operator.cube.EasyCubeAggregatorBridge; import com.linkedin.cubert.operator.cube.ValueAggregationType; import com.linkedin.cubert.operator.cube.ValueAggregator; import com.linkedin.cubert.operator.cube.ValueAggregatorFactory; import com.linkedin.cubert.utils.ClassCache; import com.linkedin.cubert.utils.CommonUtils; import com.linkedin.cubert.utils.JsonUtils; import com.linkedin.cubert.utils.Pair; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.mapreduce.Counter; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.codehaus.jackson.JsonNode; /** * * @author Maneesh Varshney * */ public class CubeOperator implements TupleOperator { private static final Log LOG = LogFactory.getLog(CubeOperator.class.getName()); // Default operator configurations private static final int DEFAULT_HASH_TABLE_SIZE = 2000000; // inputs private boolean hasInnerDimensions = false; private Block inputBlock; // outputs private Tuple outputTuple; // aggregators private final List<CubeAggregator> aggregators = new ArrayList<CubeAggregator>(); private final List<DupleCubeAggregator> dupleAggregators = new ArrayList<DupleCubeAggregator>(); // hash table related private int hashTableSize = DEFAULT_HASH_TABLE_SIZE; private double flushThreshold = 0.95; private CompactHashTableBase hashTable; private Iterator<Pair<DimensionKey, Integer>> iterator; private final IntSet indexSet = new IntSet(); // dimension key related private CubeDimensions dimensions; // runtime state management private boolean inputAvailable = true; private Counter flushCounter; @Override public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props) throws IOException, InterruptedException { // get the input block and the input schema inputBlock = input.values().iterator().next(); BlockSchema inputSchema = inputBlock.getProperties().getSchema(); // get the output block schema BlockSchema outputSchema = props.getSchema(); // read configurations from json String[] dimensionNames = JsonUtils.asArray(json, "dimensions"); String[] innerDimensions = JsonUtils.asArray(json, "innerDimensions"); if (json.has("hashTableSize") && !json.get("hashTableSize").isNull()) hashTableSize = json.get("hashTableSize").getIntValue(); hasInnerDimensions = (innerDimensions != null); if (hasInnerDimensions) inputBlock = new PivotedBlock(inputBlock, innerDimensions); // create aggregators List<CubeAggInfo> aggs = new ArrayList<CubeAggInfo>(); List<DupleCubeAggInfo> dupleAggs = new ArrayList<DupleCubeAggInfo>(); try { createAggregators(json, inputSchema, hasInnerDimensions, aggs, dupleAggs); } catch (PreconditionException e) { // this will not happen, since this method is also called in the // getPostCondition method, and any PreconditionException will be caught at // compile time throw new RuntimeException(e); } // initialize and allocate additive aggregates for (CubeAggInfo info : aggs) { info.getFirst().setup(inputBlock, outputSchema, info.getSecond()); info.getFirst().allocate(hashTableSize); aggregators.add(info.getFirst()); } // initialize and allocate partitioned additive aggregates for (DupleCubeAggInfo info : dupleAggs) { info.getFirst().setup(inputBlock, outputSchema, info.getSecond()); info.getFirst().allocate(hashTableSize); dupleAggregators.add(info.getFirst()); } // create the single copy of output tuple outputTuple = TupleFactory.getInstance().newTuple(outputSchema.getNumColumns()); // initialize CubeDimensions dimensions = new CubeDimensions(inputSchema, outputSchema, dimensionNames, json.get("groupingSets")); // create compact hash table hashTable = new CompactHashTableBase(dimensions.getDimensionKeyLength(), hashTableSize); // set the flush threshold (if defined in conf) flushThreshold = PhaseContext.getConf().getFloat("cubert.cube.flush.threshold", (float) flushThreshold); flushCounter = CubertCounter.CUBE_FLUSH_COUNTER.getCounter(); } /** * Process input tuples for cubing without inner dimensions. Note that * DupleCubeAggregators cannot be used here (any attempt to use such aggregators would * have be caught at the compile time). * * @return boolean flag to indicate if there is more input to be processed * @throws IOException * @throws InterruptedException */ private boolean processWithoutInnerDimensions() throws IOException, InterruptedException { if (!inputAvailable) return false; Tuple tuple; while ((tuple = inputBlock.next()) != null) { // only the additive aggregators can be handled for (CubeAggregator agg : aggregators) agg.processTuple(tuple); DimensionKey[] ancestors = dimensions.ancestors(tuple); for (DimensionKey ancestor : ancestors) { Pair<Integer, Boolean> idx = hashTable.lookupOrCreateIndex(ancestor); for (CubeAggregator agg : aggregators) agg.aggregate(idx.getFirst()); } if (hashTable.size() >= hashTableSize * flushThreshold) { flushCounter.increment(1); break; } } if (tuple == null) inputAvailable = false; iterator = hashTable.getIterator(); return true; } /** * Process input tuples for cubing WITH inner dimensions. * * @return boolean flag to indicate if there is more input to be processed * @return * @throws IOException * @throws InterruptedException */ private boolean processWithInnerDimensions() throws IOException, InterruptedException { if (!inputAvailable) return false; while (true) { Tuple tuple; indexSet.clear(); int maxIndex = 0; while ((tuple = inputBlock.next()) != null) { for (CubeAggregator agg : aggregators) agg.processTuple(tuple); for (CubeAggregator agg : dupleAggregators) agg.processTuple(tuple); DimensionKey[] ancestors = dimensions.ancestors(tuple); for (DimensionKey ancestor : ancestors) { Pair<Integer, Boolean> idx = hashTable.lookupOrCreateIndex(ancestor); for (CubeAggregator agg : aggregators) agg.aggregate(idx.getFirst()); for (DupleCubeAggregator agg : dupleAggregators) agg.innerAggregate(idx.getFirst()); Integer index = idx.getFirst(); maxIndex = Math.max(maxIndex, index.intValue()); indexSet.add(index); } } // Ensure capacity if (maxIndex + 1 > hashTableSize) { for (DupleCubeAggregator agg : dupleAggregators) agg.allocate(maxIndex); } IntIterator it = indexSet.iterator(); while (it.hasNext()) { int index = it.next(); for (DupleCubeAggregator agg : dupleAggregators) agg.aggregate(index); } if (!((PivotedBlock) inputBlock).advancePivot()) { inputAvailable = false; break; } if (hashTable.size() >= hashTableSize * flushThreshold) { flushCounter.increment(1); break; } } iterator = hashTable.getIterator(); return true; } private boolean process() throws IOException, InterruptedException { hashTable.clear(); for (CubeAggregator agg : this.aggregators) agg.clear(); for (DupleCubeAggregator agg : this.dupleAggregators) agg.clear(); if (hasInnerDimensions) return processWithInnerDimensions(); else return processWithoutInnerDimensions(); } @Override public Tuple next() throws IOException, InterruptedException { if (iterator == null) { if (!process()) return null; } if (iterator.hasNext()) { Pair<DimensionKey, Integer> pair = iterator.next(); DimensionKey key = pair.getFirst(); int index = pair.getSecond(); dimensions.outputKey(key, outputTuple); for (CubeAggregator agg : aggregators) agg.outputTuple(outputTuple, index); for (CubeAggregator agg : dupleAggregators) agg.outputTuple(outputTuple, index); return outputTuple; } else { iterator = null; return next(); } } @Override public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json) throws PreconditionException { PostCondition condition = preConditions.values().iterator().next(); BlockSchema inputSchema = condition.getSchema(); String[] dimensions = JsonUtils.asArray(JsonUtils.get(json, "dimensions")); String[] innerDimensions = JsonUtils.asArray(json, "innerDimensions"); // validate that dimensions columns exist and are INT, LONG or STRING for (String dim : dimensions) { if (!inputSchema.hasIndex(dim)) throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT, dim); DataType type = inputSchema.getType(inputSchema.getIndex(dim)); if (!type.isIntOrLong() && type != DataType.BOOLEAN && !type.equals(DataType.STRING)) throw new PreconditionException(PreconditionExceptionType.INVALID_DIMENSION_TYPE, "Expecting type: BOOLEAN, INT, LONG or STRING. Found: " + type); } // validate inner dimensions (if specified) if (innerDimensions != null) { // validate that innerDimensions exist for (String dim : innerDimensions) { if (!inputSchema.hasIndex(dim)) throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT, dim); } // validate that block is partitioned on inner dimensions String[] partitionKeys = condition.getPartitionKeys(); if (partitionKeys == null || partitionKeys.length == 0 || !CommonUtils.isPrefix(innerDimensions, partitionKeys)) { String msg = String.format("Expected: %s. Found: %s", Arrays.toString(innerDimensions), Arrays.toString(partitionKeys)); throw new PreconditionException(PreconditionExceptionType.INVALID_PARTITION_KEYS, msg); } // validate that block is sorted on inner dimensions String[] sortKeys = condition.getSortKeys(); if (sortKeys == null || sortKeys.length == 0 || !CommonUtils.isPrefix(sortKeys, innerDimensions)) { String msg = String.format("Expected: %s. Found: %s", Arrays.toString(innerDimensions), Arrays.toString(sortKeys)); throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS, msg); } } // validate that dimensions in groupingSets are valid dimensions JsonNode gsJson = json.get("groupingSets"); if (gsJson != null && !gsJson.isNull() && gsJson.size() > 0) { String[] gsInput = JsonUtils.asArray(gsJson); Set<String> dimensionSet = new HashSet<String>(); for (int i = 0; i < dimensions.length; i++) dimensionSet.add(dimensions[i]); for (int i = 0; i < gsInput.length; i++) { String[] fields = gsInput[i].split(","); for (String field : fields) { if (field.equals("")) continue; // roll up everything TODO: check ROLLUP clause (?) if (!dimensionSet.contains(field)) { String msg = String.format("Dimension \"%s\" in grouping set (%s) is not a valid dimension", field, gsInput[i]); throw new PreconditionException(PreconditionExceptionType.INVALID_DIMENSION_TYPE, msg); } } } } // generate output schema BlockSchema outputSchema = createOutputSchema(inputSchema, json); // create post condition return new PostCondition(outputSchema, condition.getPartitionKeys(), null); } private static final class CubeAggInfo extends Pair<CubeAggregator, JsonNode> { private static final long serialVersionUID = 3313689844388231187L; public CubeAggInfo(CubeAggregator x, JsonNode y) { super(x, y); } } private static final class DupleCubeAggInfo extends Pair<DupleCubeAggregator, JsonNode> { private static final long serialVersionUID = -550007348499616264L; public DupleCubeAggInfo(DupleCubeAggregator x, JsonNode y) { super(x, y); } } private static BlockSchema createOutputSchema(BlockSchema inputSchema, JsonNode json) throws PreconditionException { List<CubeAggInfo> additiveAggs = new ArrayList<CubeAggInfo>(); List<DupleCubeAggInfo> partitionedAdditiveAggs = new ArrayList<DupleCubeAggInfo>(); final String[] innerDimensions = JsonUtils.asArray(json, "innerDimensions"); createAggregators(json, inputSchema, innerDimensions != null, additiveAggs, partitionedAdditiveAggs); Map<JsonNode, BlockSchema> aggMap = new HashMap<JsonNode, BlockSchema>(); for (CubeAggInfo info : additiveAggs) { JsonNode aggNode = info.getSecond(); aggMap.put(aggNode, info.getFirst().outputSchema(inputSchema, aggNode)); } for (DupleCubeAggInfo info : partitionedAdditiveAggs) { JsonNode aggNode = info.getSecond(); aggMap.put(aggNode, info.getFirst().outputSchema(inputSchema, aggNode)); } BlockSchema outputSchema = inputSchema.getSubset(JsonUtils.asArray(JsonUtils.get(json, "dimensions"))); for (JsonNode aggregateJson : json.get("aggregates")) outputSchema = outputSchema.append(aggMap.get(aggregateJson)); return outputSchema; } private static void createAggregators(JsonNode json, BlockSchema inputSchema, boolean hasInnerDimensions, List<CubeAggInfo> aggs, List<DupleCubeAggInfo> dupleAggs) throws PreconditionException { for (JsonNode aggregateJson : json.get("aggregates")) { JsonNode typeJson = aggregateJson.get("type"); // validate that type is defined in json if (typeJson == null || typeJson.isNull()) throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG, "<type> property not defined in Json: " + typeJson.toString()); // validate that type is a string or array if (!typeJson.isTextual() && !typeJson.isArray()) throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG, "<type> property not text or array: " + typeJson.toString()); // if array, validate that type has one or two items if (typeJson.isArray() && !(typeJson.size() == 1 || typeJson.size() == 2)) throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG, "<type> property as array can have either one or two items: " + typeJson.toString()); // validate that the input columns are present in input schema String[] inputColNames = null; DataType[] inputColTypes = null; if (aggregateJson.has("input") && !aggregateJson.get("input").isNull()) { inputColNames = JsonUtils.asArray(aggregateJson, "input"); inputColTypes = new DataType[inputColNames.length]; int idx = 0; for (String colName : inputColNames) { if (!inputSchema.hasIndex(colName)) throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT, colName); inputColTypes[idx++] = inputSchema.getType(inputSchema.getIndex(colName)); } } // handle first the special case of array with two items if (typeJson.isArray() && typeJson.size() == 2) { String[] aggregators = JsonUtils.asArray(typeJson); ValueAggregationType outerType = getCubeAggregationType(aggregators[0], true); ValueAggregationType innerType = getCubeAggregationType(aggregators[1], true); // the "type" of inner aggregate is the type of input column ValueAggregator innerAggregator = ValueAggregatorFactory.get(innerType, inputColTypes[0], inputColNames[0]); // the "type" of outer aggregate is the output type of inner aggregate ValueAggregator outerAggregator = ValueAggregatorFactory.get(outerType, innerAggregator.outputType(), inputColNames[0]); DupleCubeAggregator cubeAggregator = new DefaultDupleCubeAggregator(outerAggregator, innerAggregator); if (!hasInnerDimensions) errorInnerDimensionsNotSpecified(java.util.Arrays.toString(aggregators)); dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson)); } else { String type = typeJson.isArray() ? typeJson.get(0).getTextValue() : typeJson.getTextValue(); ValueAggregationType aggType = getCubeAggregationType(type, false); // if this is builtin aggregator if (aggType != null) { ValueAggregator aggregator = ValueAggregatorFactory.get(aggType, inputColTypes == null ? null : inputColTypes[0], inputColNames == null ? null : inputColNames[0]); CubeAggregator cubeggregator = new DefaultCubeAggregator(aggregator); aggs.add(new CubeAggInfo(cubeggregator, aggregateJson)); } else if (type.equals("COUNT_DISTINCT")) { if (!hasInnerDimensions) errorInnerDimensionsNotSpecified(type); DupleCubeAggregator cubeAggregator = new CountDistinctCubeAggregator(inputColNames[0]); dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson)); } // this is udaf else { Object object = null; try { Class<?> cls = ClassCache.forName(type); object = instantiateObject(cls, aggregateJson.get("constructorArgs")); } catch (ClassNotFoundException e) { throw new PreconditionException(PreconditionExceptionType.CLASS_NOT_FOUND, type); } catch (Exception e) { throw new PreconditionException(PreconditionExceptionType.MISC_ERROR, e.getClass().getSimpleName() + " " + e.getMessage() + " for class: " + type); } if (object instanceof DupleCubeAggregator) { DupleCubeAggregator cubeAggregator = (DupleCubeAggregator) object; if (!hasInnerDimensions) errorInnerDimensionsNotSpecified(type); dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson)); } else if (object instanceof CubeAggregator) { CubeAggregator cubeAggregator = (CubeAggregator) object; aggs.add(new CubeAggInfo(cubeAggregator, aggregateJson)); } else if (object instanceof EasyCubeAggregator) { EasyCubeAggregatorBridge cubeAggregator = new EasyCubeAggregatorBridge( (EasyCubeAggregator) object); if (!hasInnerDimensions) errorInnerDimensionsNotSpecified(type); dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson)); } else { String msg = String.format( "%s should implement one of these interfaces: AdditiveCubeAggregate, PartitionedAdditiveAggregate, EasyCubeAggregate", type); throw new PreconditionException(PreconditionExceptionType.MISC_ERROR, msg); } } } } } private static void errorInnerDimensionsNotSpecified(String aggName) throws PreconditionException { String msg = String.format("INNER dimensions must be specified for the %s PartitionedAdditive aggregator", aggName); throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG, msg); } private static Object instantiateObject(Class<?> cls, JsonNode constructorArgs) throws InstantiationException, IllegalAccessException, IllegalArgumentException, SecurityException, InvocationTargetException, NoSuchMethodException { if (constructorArgs == null || constructorArgs.isNull()) return cls.newInstance(); Object[] args = new Object[constructorArgs.size()]; Class<?>[] argClasses = new Class[args.length]; for (int i = 0; i < args.length; i++) { args[i] = JsonUtils.asObject(constructorArgs.get(i)); argClasses[i] = args[i].getClass(); } return cls.getConstructor(argClasses).newInstance(args); } private static ValueAggregationType getCubeAggregationType(String name, boolean errorOnMissing) throws PreconditionException { try { return ValueAggregationType.valueOf(name.toUpperCase()); } catch (IllegalArgumentException e) { if (errorOnMissing) { String msg = String.format("Aggregator [%s] not found. Valid aggregators: %s", name, java.util.Arrays.toString(ValueAggregationType.values())); throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG, msg); } } return null; } }