Java tutorial: building a star-tree index with Pinot's OffHeapStarTreeBuilder
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.startree;

import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.TimeFieldSpec;
import com.linkedin.pinot.common.utils.Pairs.IntPair;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.startree.hll.HllUtil;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.joda.time.DateTime;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Uses a file to build the star tree. Each row is divided into dimensions and metrics; the time
 * column is added to the dimension list.
 * We use the split order to build the tree. In most cases, the split order is ranked by
 * cardinality (descending).
 * The time column is either excluded from the split order or is its last entry, irrespective of
 * its cardinality.
 * This is a recursive algorithm where we branch on one dimension at every level.
 *
 * <b>Pseudo-code</b>
 * <code>
 * build() {
 *   let table(1, N) consist of the N input rows
 *   table.sort(1, N)  // sort the table on all dimensions, according to split order
 *   constructTree(table, 0, N, 0);
 * }
 *
 * constructTree(table, start, end, level) {
 *   splitDimensionName = dimensionsSplitOrder[level]
 *   // returns the number of rows for each value of the split dimension
 *   groupByResult&lt;dimValue, length&gt; = table.groupBy(splitDimensionName);
 *   int rangeStart = 0;
 *   for each (entry&lt;dimValue, length&gt; in groupByResult) {
 *     if (entry.length > minThreshold) {
 *       constructTree(table, rangeStart, rangeStart + entry.length, level + 1);
 *     }
 *     rangeStart = rangeStart + entry.length;
 *     updateStarTree();  // add new child
 *   }
 *
 *   // create a star-tree node
 *   aggregatedRows = table.uniqueAfterRemovingAttributeAndAggregateMetrics(start, end, splitDimensionName);
 *   for (each row in aggregatedRows)
 *     table.add(row);
 *   if (aggregatedRows.size > minThreshold) {
 *     table.sort(end, end + aggregatedRows.size);
 *     constructTree(table, end, end + aggregatedRows.size, level + 1);
 *   }
 * }
 * </code>
 */
public class OffHeapStarTreeBuilder implements StarTreeBuilder {
  private static final Logger LOG = LoggerFactory.getLogger(OffHeapStarTreeBuilder.class);
  File dataFile;
  private Schema schema;
  private DataOutputStream dataBuffer;
  int rawRecordCount = 0;
  int aggRecordCount = 0;
  private List<String> dimensionsSplitOrder;
  private Set<String> skipStarNodeCreationForDimensions;
  private Set<String> skipMaterializationForDimensions;
  private int maxLeafRecords;
  private StarTree starTree;
  private StarTreeIndexNode starTreeRootIndexNode;
  private int numDimensions;
  private int numMetrics;
  private List<String> dimensionNames;
  private List<String> metricNames;
  private String timeColumnName;
  private List<DataType> dimensionTypes;
  private Map<String, Object> dimensionNameToStarValueMap;
  private HashBiMap<String, Integer> dimensionNameToIndexMap;
  private Map<String, Integer> metricNameToIndexMap;
  private int dimensionSizeBytes;
  private int metricSizeBytes;
  private File outDir;
  private Map<String, HashBiMap<Object, Integer>> dictionaryMap;
  boolean debugMode = false;
  private int[] sortOrder;
  private int skipMaterializationCardinalityThreshold;
  private boolean enableOffHeapFormat;
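  // --------------------------------------------------------------------------
  // Editor's aside: the pseudo-code from the class Javadoc, made concrete. This
  // nested class is a minimal, self-contained sketch (NOT part of Pinot) that
  // applies the same sort / group-by / recurse pattern to an in-memory table of
  // dictionary-encoded rows; the names and the threshold are hypothetical.
  static final class PseudoCodeSketch {
    static final int MAX_LEAF_RECORDS = 1; // recurse only into ranges larger than this

    public static void main(String[] args) {
      // five rows, two dimensions, already dictionary-encoded as ints
      List<int[]> table = new ArrayList<>(java.util.Arrays.asList(
          new int[]{0, 0}, new int[]{0, 1}, new int[]{1, 0}, new int[]{1, 1}, new int[]{1, 1}));
      sortLexicographically(table);
      constructTree(table, 0, table.size(), 0, "root");
    }

    static void sortLexicographically(List<int[]> table) {
      Collections.sort(table, new Comparator<int[]>() {
        @Override
        public int compare(int[] a, int[] b) {
          for (int i = 0; i < a.length; i++) {
            if (a[i] != b[i]) {
              return a[i] - b[i];
            }
          }
          return 0;
        }
      });
    }

    static void constructTree(List<int[]> table, int start, int end, int level, String path) {
      if (level == 2) {
        return; // no more dimensions to split on
      }
      // Group-by on the split dimension: within a sorted range, equal values are contiguous.
      int rangeStart = start;
      while (rangeStart < end) {
        int value = table.get(rangeStart)[level];
        int rangeEnd = rangeStart;
        while (rangeEnd < end && table.get(rangeEnd)[level] == value) {
          rangeEnd++;
        }
        System.out.println(path + " -> child " + value + ": docs [" + rangeStart + ", " + rangeEnd + ")");
        if (rangeEnd - rangeStart > MAX_LEAF_RECORDS) {
          constructTree(table, rangeStart, rangeEnd, level + 1, path + "/" + value);
        }
        rangeStart = rangeEnd;
      }
      // The real builder also adds a star child here: it removes the split dimension,
      // de-duplicates the remaining combinations, aggregates their metrics, and recurses
      // (see constructStarTree() and uniqueCombinations() below).
    }
  }
  // --------------------------------------------------------------------------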
  public void init(StarTreeBuilderConfig builderConfig) throws Exception {
    schema = builderConfig.schema;
    timeColumnName = schema.getTimeColumnName();
    this.dimensionsSplitOrder = builderConfig.dimensionsSplitOrder;
    skipStarNodeCreationForDimensions = builderConfig.getSkipStarNodeCreationForDimensions();
    skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();
    skipMaterializationCardinalityThreshold = builderConfig.getSkipMaterializationCardinalityThreshold();
    enableOffHeapFormat = builderConfig.isEnableOffHealpFormat(); // (sic: spelling comes from the config API)
    this.maxLeafRecords = builderConfig.maxLeafRecords;
    this.outDir = builderConfig.getOutDir();
    if (outDir == null) {
      outDir = new File(System.getProperty("java.io.tmpdir"),
          V1Constants.STAR_TREE_INDEX_DIR + "_" + DateTime.now());
    }
    LOG.info("Index output directory:{}", outDir);

    dimensionTypes = new ArrayList<>();
    dimensionNames = new ArrayList<>();
    dimensionNameToIndexMap = HashBiMap.create();
    dimensionNameToStarValueMap = new HashMap<>();
    dictionaryMap = new HashMap<>();

    // READ DIMENSION COLUMNS
    List<DimensionFieldSpec> dimensionFieldSpecs = schema.getDimensionFieldSpecs();
    for (int index = 0; index < dimensionFieldSpecs.size(); index++) {
      DimensionFieldSpec spec = dimensionFieldSpecs.get(index);
      String dimensionName = spec.getName();
      dimensionNames.add(dimensionName);
      dimensionNameToIndexMap.put(dimensionName, index);
      Object starValue = getAllStarValue(spec);
      dimensionNameToStarValueMap.put(dimensionName, starValue);
      dimensionTypes.add(spec.getDataType());
      HashBiMap<Object, Integer> dictionary = HashBiMap.create();
      dictionaryMap.put(dimensionName, dictionary);
    }

    // Treat the time column as just another dimension; the only difference is that we never
    // split on this dimension unless it is explicitly listed in the split order.
    if (timeColumnName != null) {
      dimensionNames.add(timeColumnName);
      TimeFieldSpec timeFieldSpec = schema.getTimeFieldSpec();
      dimensionTypes.add(timeFieldSpec.getDataType());
      int index = dimensionNameToIndexMap.size();
      dimensionNameToIndexMap.put(timeColumnName, index);
      Object starValue = getAllStarValue(timeFieldSpec);
      dimensionNameToStarValueMap.put(timeColumnName, starValue);
      HashBiMap<Object, Integer> dictionary = HashBiMap.create();
      dictionaryMap.put(timeColumnName, dictionary);
    }
    dimensionSizeBytes = dimensionNames.size() * Integer.SIZE / 8;
    this.numDimensions = dimensionNames.size();

    // READ METRIC COLUMNS
    this.metricNames = new ArrayList<>();
    this.metricNameToIndexMap = new HashMap<>();
    this.metricSizeBytes = 0;
    List<MetricFieldSpec> metricFieldSpecs = schema.getMetricFieldSpecs();
    for (int index = 0; index < metricFieldSpecs.size(); index++) {
      MetricFieldSpec spec = metricFieldSpecs.get(index);
      String metricName = spec.getName();
      metricNames.add(metricName);
      metricNameToIndexMap.put(metricName, index);
      metricSizeBytes += spec.getFieldSize();
    }
    numMetrics = metricNames.size();

    // Use outDir here: builderConfig.getOutDir() may be null (we fall back to the tmpdir above).
    outDir.mkdirs();
    dataFile = new File(outDir, "star-tree.buf");
    LOG.info("StarTree output data file: {}", dataFile.getAbsolutePath());
    dataBuffer = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(dataFile)));

    // INITIALIZE THE ROOT NODE
    this.starTreeRootIndexNode = new StarTreeIndexNode();
    this.starTreeRootIndexNode.setDimensionName(StarTreeIndexNodeInterf.ALL);
    this.starTreeRootIndexNode.setDimensionValue(StarTreeIndexNodeInterf.ALL);
    this.starTreeRootIndexNode.setLevel(0);
    LOG.info("dimensionNames:{}", dimensionNames);
    LOG.info("metricNames:{}", metricNames);
  }

  private Object getAllStarValue(FieldSpec spec) throws Exception {
    switch (spec.getDataType()) {
      case STRING:
        return "ALL";
      case BOOLEAN:
      case BYTE:
      case CHAR:
      case DOUBLE:
      case FLOAT:
      case INT:
      case LONG:
        return spec.getDefaultNullValue();
      case OBJECT:
      case SHORT:
      case DOUBLE_ARRAY:
      case CHAR_ARRAY:
      case FLOAT_ARRAY:
      case INT_ARRAY:
      case LONG_ARRAY:
      case SHORT_ARRAY:
      case STRING_ARRAY:
      case BYTE_ARRAY:
      default:
        throw new Exception("Unsupported dimension data type: " + spec);
    }
  }
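  // --------------------------------------------------------------------------
  // Editor's aside: a minimal, self-contained sketch (not Pinot code) of the
  // dictionary encoding set up in init() above. Guava's HashBiMap assigns each
  // raw value a dense integer id, and the inverse view recovers the raw value
  // later (see toGenericRow() below). The sample values are hypothetical.
  static final class DictionaryEncodingSketch {
    public static void main(String[] args) {
      HashBiMap<Object, Integer> dictionary = HashBiMap.create();
      for (Object rawValue : new Object[]{"chrome", "firefox", "chrome", "safari"}) {
        if (!dictionary.containsKey(rawValue)) {
          dictionary.put(rawValue, dictionary.size()); // ids are assigned in arrival order
        }
      }
      System.out.println(dictionary.get("firefox"));   // 1
      System.out.println(dictionary.inverse().get(2)); // safari
    }
  }
  // --------------------------------------------------------------------------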
  public GenericRow toGenericRow(DimensionBuffer dimensionKey, MetricBuffer metricsHolder) {
    GenericRow row = new GenericRow();
    Map<String, Object> map = new HashMap<>();
    for (int i = 0; i < dimensionNames.size(); i++) {
      String dimName = dimensionNames.get(i);
      BiMap<Integer, Object> inverseDictionary = dictionaryMap.get(dimName).inverse();
      Object dimValue = inverseDictionary.get(dimensionKey.getDimension(i));
      if (dimValue == null) {
        dimValue = dimensionNameToStarValueMap.get(dimName);
      }
      map.put(dimName, dimValue);
    }
    for (int i = 0; i < numMetrics; i++) {
      String metName = metricNames.get(i);
      map.put(metName, metricsHolder.getValueConformToDataType(i));
    }
    row.init(map);
    return row;
  }

  public void append(GenericRow row) throws Exception {
    DimensionBuffer dimension = new DimensionBuffer(numDimensions);
    for (int i = 0; i < dimensionNames.size(); i++) {
      String dimName = dimensionNames.get(i);
      Map<Object, Integer> dictionary = dictionaryMap.get(dimName);
      Object dimValue = row.getValue(dimName);
      if (dimValue == null) {
        // TODO: Have another default value to represent STAR. Using the default null value to
        // represent STAR for now. It does not matter during query execution, since we know from
        // the star tree that the value is STAR.
        dimValue = dimensionNameToStarValueMap.get(dimName);
      }
      if (!dictionary.containsKey(dimValue)) {
        dictionary.put(dimValue, dictionary.size());
      }
      dimension.setDimension(i, dictionary.get(dimValue));
    }
    // initialize raw data row
    Object[] metrics = new Object[numMetrics];
    for (int i = 0; i < numMetrics; i++) {
      String metName = metricNames.get(i);
      if (schema.getMetricFieldSpecs().get(i).getDerivedMetricType() == MetricFieldSpec.DerivedMetricType.HLL) {
        // HLL fields arrive in string format; convert to the HLL data type first
        metrics[i] = HllUtil.convertStringToHll((String) row.getValue(metName));
      } else {
        // no conversion for standard data types
        metrics[i] = row.getValue(metName);
      }
    }
    MetricBuffer metricBuffer = new MetricBuffer(metrics, schema.getMetricFieldSpecs());
    appendToRawBuffer(dimension, metricBuffer);
  }

  private void appendToRawBuffer(DimensionBuffer dimension, MetricBuffer metrics) throws IOException {
    appendToBuffer(dataBuffer, dimension, metrics);
    rawRecordCount++;
  }

  private void appendToAggBuffer(DimensionBuffer dimension, MetricBuffer metrics) throws IOException {
    appendToBuffer(dataBuffer, dimension, metrics);
    aggRecordCount++;
  }

  private void appendToBuffer(DataOutputStream dos, DimensionBuffer dimensions, MetricBuffer metricHolder)
      throws IOException {
    for (int i = 0; i < numDimensions; i++) {
      dos.writeInt(dimensions.getDimension(i));
    }
    dos.write(metricHolder.toBytes(metricSizeBytes));
  }
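  // --------------------------------------------------------------------------
  // Editor's aside: appendToBuffer() above writes fixed-width records -- one
  // 4-byte int per dimension followed by a fixed-size metric payload -- which is
  // what makes the off-heap sort and the docId offset arithmetic possible. A
  // minimal, self-contained round-trip sketch (not Pinot code):
  static final class FixedWidthRecordSketch {
    public static void main(String[] args) throws IOException {
      int[] dimensionIds = {3, 0, 7};     // dictionary ids for one row
      byte[] metricBytes = {0, 0, 0, 42}; // opaque, fixed-size metric payload
      java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
      DataOutputStream dos = new DataOutputStream(baos);
      for (int id : dimensionIds) {
        dos.writeInt(id);
      }
      dos.write(metricBytes);
      dos.close();

      byte[] record = baos.toByteArray();
      // every record has the same size, so record i starts at offset i * recordSize
      int recordSize = dimensionIds.length * Integer.SIZE / 8 + metricBytes.length;
      System.out.println(record.length == recordSize); // true
    }
  }
  // --------------------------------------------------------------------------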
  public void build() throws Exception {
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
      skipMaterializationForDimensions = computeDefaultDimensionsToSkipMaterialization();
    }

    // For the default split order, give preference to skipMaterializationForDimensions.
    // For a user-defined split order, give preference to the split order.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
      dimensionsSplitOrder = computeDefaultSplitOrder();
      dimensionsSplitOrder.removeAll(skipMaterializationForDimensions);
    } else {
      skipMaterializationForDimensions.removeAll(dimensionsSplitOrder);
    }

    LOG.info("Split order: {}", dimensionsSplitOrder);
    LOG.info("Skip materialization for dimensions: {}", skipMaterializationForDimensions);

    long start = System.currentTimeMillis();
    dataBuffer.flush();

    // Sort the data based on the default sort order (split order + remaining dimensions)
    sort(dataFile, 0, rawRecordCount);

    // Recursively construct the star tree, continuously sorting the data
    constructStarTree(starTreeRootIndexNode, 0, rawRecordCount, 0, dataFile);

    // Split the leaf nodes on the time column. This is only possible if we have not split on the
    // time column yet, and the time column is still preserved (i.e. not replaced by StarTreeNode.all()).
    if (timeColumnName != null && !skipMaterializationForDimensions.contains(timeColumnName)
        && !dimensionsSplitOrder.contains(timeColumnName)) {
      splitLeafNodesOnTimeColumn();
    }

    // Create aggregate rows for all nodes in the tree
    createAggDocForAllNodes(starTreeRootIndexNode);
    long end = System.currentTimeMillis();
    LOG.info("Took {} ms to build star tree index. Original records:{} Materialized records:{}",
        (end - start), rawRecordCount, aggRecordCount);

    starTree = new StarTree(starTreeRootIndexNode, dimensionNameToIndexMap);
    File treeBinary = new File(outDir, "star-tree.bin");
    if (enableOffHeapFormat) {
      LOG.info("Saving tree in off-heap binary format at: {} ", treeBinary);
      StarTreeSerDe.writeTreeOffHeapFormat(starTree, treeBinary);
    } else {
      LOG.info("Saving tree in on-heap binary format at: {} ", treeBinary);
      StarTreeSerDe.writeTreeOnHeapFormat(starTree, treeBinary);
    }

    printTree(starTreeRootIndexNode, 0);
    LOG.info("Finished building tree. Output dir: {} ", outDir);
    dataBuffer.close();
  }
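  // --------------------------------------------------------------------------
  // Editor's aside: the precedence rule at the top of build(), in isolation. A
  // minimal sketch (not Pinot code) with hypothetical dimension names: when the
  // user supplies a split order, it wins, so any dimension listed there is
  // dropped from the skip-materialization set.
  static final class SplitOrderPrecedenceSketch {
    public static void main(String[] args) {
      Set<String> skip = new HashSet<>(java.util.Arrays.asList("userId", "country"));
      List<String> userDefinedSplitOrder = new ArrayList<>(java.util.Arrays.asList("country", "browser"));
      skip.removeAll(userDefinedSplitOrder); // split order takes preference
      System.out.println(skip);              // [userId]
    }
  }
  // --------------------------------------------------------------------------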
  /**
   * Create the aggregated document for every node, bottom-up via a post-order traversal.
   * @param node root of the subtree to aggregate
   */
  private MetricBuffer createAggDocForAllNodes(StarTreeIndexNode node) throws Exception {
    MetricBuffer aggMetricBuffer = null;
    if (node.isLeaf()) {
      StarTreeDataTable leafDataTable = new StarTreeDataTable(dataFile, dimensionSizeBytes, metricSizeBytes, null);
      Iterator<Pair<byte[], byte[]>> iterator =
          leafDataTable.iterator(node.getStartDocumentId(), node.getEndDocumentId());
      Pair<byte[], byte[]> first = iterator.next();
      aggMetricBuffer = MetricBuffer.fromBytes(first.getRight(), schema.getMetricFieldSpecs());
      while (iterator.hasNext()) {
        Pair<byte[], byte[]> next = iterator.next();
        MetricBuffer metricBuffer = MetricBuffer.fromBytes(next.getRight(), schema.getMetricFieldSpecs());
        aggMetricBuffer.aggregate(metricBuffer);
      }
    } else {
      Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
      while (childrenIterator.hasNext()) {
        StarTreeIndexNode child = childrenIterator.next();
        MetricBuffer childMetricBuffer = createAggDocForAllNodes(child);
        // don't use the star node value to compute the aggregate for the parent
        if (child.getDimensionValue() == StarTreeIndexNodeInterf.ALL) {
          continue;
        }
        if (aggMetricBuffer == null) {
          aggMetricBuffer = new MetricBuffer(childMetricBuffer);
        } else {
          aggMetricBuffer.aggregate(childMetricBuffer);
        }
      }
    }
    // Compute the dimension values for this node from its path; this could be optimized by
    // passing the path in the method call.
    Map<Integer, Integer> pathValues = node.getPathValues();
    DimensionBuffer dimensionBuffer = new DimensionBuffer(numDimensions);
    for (int i = 0; i < numDimensions; i++) {
      if (pathValues.containsKey(i)) {
        dimensionBuffer.setDimension(i, pathValues.get(i));
      } else {
        dimensionBuffer.setDimension(i, StarTreeIndexNodeInterf.ALL);
      }
    }
    node.setAggregatedDocumentId(rawRecordCount + aggRecordCount);
    appendToAggBuffer(dimensionBuffer, aggMetricBuffer);
    return aggMetricBuffer;
  }
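  // --------------------------------------------------------------------------
  // Editor's aside: the traversal above in miniature -- a parent's aggregate is
  // the sum of its non-star children, computed bottom-up. A minimal sketch (not
  // Pinot code) with a single long metric standing in for a MetricBuffer:
  static final class PostOrderAggregationSketch {
    static final int STAR = -1; // stands in for StarTreeIndexNodeInterf.ALL
    int dimensionValue;
    long metric;                // leaves carry raw sums; parents are filled in here
    List<PostOrderAggregationSketch> children = new ArrayList<>();

    long aggregate() {
      if (children.isEmpty()) {
        return metric; // leaf: already aggregated
      }
      metric = 0;
      for (PostOrderAggregationSketch child : children) {
        long childValue = child.aggregate(); // recurse first (post-order)
        if (child.dimensionValue == STAR) {
          continue; // a star child duplicates its siblings; skip it to avoid double counting
        }
        metric += childValue;
      }
      return metric;
    }

    public static void main(String[] args) {
      PostOrderAggregationSketch root = new PostOrderAggregationSketch();
      root.children.addAll(java.util.Arrays.asList(leaf(0, 10), leaf(1, 5), leaf(STAR, 15)));
      System.out.println(root.aggregate()); // 15, not 30: the star child is not double-counted
    }

    static PostOrderAggregationSketch leaf(int dimensionValue, long metric) {
      PostOrderAggregationSketch node = new PostOrderAggregationSketch();
      node.dimensionValue = dimensionValue;
      node.metric = metric;
      return node;
    }
  }
  // --------------------------------------------------------------------------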
  /**
   * Helper method that visits each leaf node and does the following:
   * - re-orders the docIds belonging to the leaf node with respect to the time column,
   * - creates a child node for each time value under the leaf node.
   * @throws Exception
   */
  private void splitLeafNodesOnTimeColumn() throws Exception {
    Queue<StarTreeIndexNode> nodes = new LinkedList<>();
    nodes.add(starTreeRootIndexNode);
    StarTreeDataSorter dataSorter = new StarTreeDataSorter(dataFile, dimensionSizeBytes, metricSizeBytes);
    while (!nodes.isEmpty()) {
      StarTreeIndexNode node = nodes.remove();
      if (node.isLeaf()) {
        // Split on the time column; this helps with time-based filtering
        if (timeColumnName != null) {
          int level = node.getLevel();
          int[] newSortOrder = moveColumnInSortOrder(timeColumnName, getSortOrder(), level);

          int startDocId = node.getStartDocumentId();
          int endDocId = node.getEndDocumentId();
          dataSorter.sort(startDocId, endDocId, newSortOrder);
          int timeColIndex = dimensionNameToIndexMap.get(timeColumnName);
          Map<Integer, IntPair> timeColumnRangeMap =
              dataSorter.groupByIntColumnCount(startDocId, endDocId, timeColIndex);

          node.setChildDimensionName(timeColIndex);
          node.setChildren(new HashMap<Integer, StarTreeIndexNode>());

          for (int timeValue : timeColumnRangeMap.keySet()) {
            IntPair range = timeColumnRangeMap.get(timeValue);
            StarTreeIndexNode child = new StarTreeIndexNode();
            child.setDimensionName(timeColIndex);
            child.setDimensionValue(timeValue);
            child.setParent(node);
            child.setLevel(node.getLevel() + 1);
            child.setStartDocumentId(range.getLeft());
            child.setEndDocumentId(range.getRight());
            node.addChild(child, timeValue);
          }
        }
      } else {
        Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
        while (childrenIterator.hasNext()) {
          nodes.add(childrenIterator.next());
        }
      }
    }
    dataSorter.close();
  }

  /**
   * Helper method that moves the given column from its current position in the sort order to the
   * specified new position, preserving the relative order of the other columns.
   * Assumes the column to move does not occur before the new position (true here: the levels
   * above a leaf are split-order dimensions, and the time column is not in the split order).
   * @param columnToMove name of the column to move (the time column)
   * @param origSortOrder original sort order (array of column indexes)
   * @param newPositionForTimeColumn target position for the time column
   * @return the new sort order
   */
  private int[] moveColumnInSortOrder(String columnToMove, int[] origSortOrder, int newPositionForTimeColumn) {
    Preconditions.checkArgument(columnToMove != null);
    Preconditions.checkArgument(newPositionForTimeColumn >= 0 && newPositionForTimeColumn < origSortOrder.length);

    int timeDimensionIndex = dimensionNameToIndexMap.get(columnToMove);
    int[] newSortOrder = new int[origSortOrder.length];
    int index = 0;

    // Retain the sort order based on the path to this leaf node
    for (int i = 0; i < newPositionForTimeColumn; i++) {
      newSortOrder[index++] = origSortOrder[i];
    }

    // Insert the time column at its new position
    newSortOrder[index++] = timeDimensionIndex;

    // Append the remaining columns, skipping the time column. The skip must compare the entry's
    // value, not the loop index: the sort order is generally not the identity permutation.
    for (int i = newPositionForTimeColumn; i < numDimensions; i++) {
      if (origSortOrder[i] != timeDimensionIndex) {
        newSortOrder[index++] = origSortOrder[i];
      }
    }
    return newSortOrder;
  }
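  // --------------------------------------------------------------------------
  // Editor's aside: moveColumnInSortOrder() in isolation. A minimal sketch (not
  // Pinot code) that moves one column id to a chosen position while preserving
  // the relative order of the others; the ids below are hypothetical.
  static final class MoveColumnSketch {
    public static void main(String[] args) {
      int[] sortOrder = {2, 0, 1, 3}; // column ids; say the time column id is 3
      int timeColumnId = 3;
      int newPosition = 1;
      int[] result = new int[sortOrder.length];
      int index = 0;
      for (int i = 0; i < newPosition; i++) {
        result[index++] = sortOrder[i];   // keep the prefix (the path to the leaf)
      }
      result[index++] = timeColumnId;     // place the time column here
      for (int i = newPosition; i < sortOrder.length; i++) {
        if (sortOrder[i] != timeColumnId) {
          result[index++] = sortOrder[i]; // append the rest, skipping the time column
        }
      }
      System.out.println(java.util.Arrays.toString(result)); // [2, 3, 0, 1]
    }
  }
  // --------------------------------------------------------------------------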
  /**
   * Debug method to print the tree.
   * @param node subtree root
   * @param level depth of the node, used for indentation
   */
  private void printTree(StarTreeIndexNode node, int level) {
    StringBuilder indent = new StringBuilder();
    for (int i = 0; i < level; i++) {
      indent.append("  ");
    }
    BiMap<Integer, String> inverse = dimensionNameToIndexMap.inverse();
    String dimName = "ALL";
    Object dimValue = "ALL";
    if (node.getDimensionName() != StarTreeIndexNodeInterf.ALL) {
      dimName = inverse.get(node.getDimensionName());
    }
    if (node.getDimensionValue() != StarTreeIndexNodeInterf.ALL) {
      dimValue = dictionaryMap.get(dimName).inverse().get(node.getDimensionValue());
    }
    String formattedOutput = Objects.toStringHelper(node)
        .add("nodeId", node.getNodeId())
        .add("level", level)
        .add("dimensionName", dimName)
        .add("dimensionValue", dimValue)
        .add("childDimensionName", inverse.get(node.getChildDimensionName()))
        .add("childCount", node.getNumChildren())
        .add("startDocumentId", node.getStartDocumentId())
        .add("endDocumentId", node.getEndDocumentId())
        .add("documentCount", (node.getEndDocumentId() - node.getStartDocumentId()))
        .toString();
    LOG.debug("{}{}", indent, formattedOutput);
    if (!node.isLeaf()) {
      Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
      while (childrenIterator.hasNext()) {
        printTree(childrenIterator.next(), level + 1);
      }
    }
  }

  private List<String> computeDefaultSplitOrder() {
    ArrayList<String> defaultSplitOrder = new ArrayList<>();
    // Include all dimensions except the time column and the skip-materialization dimensions.
    // Assumes skipMaterializationForDimensions has already been computed.
    for (String dimensionName : dimensionNames) {
      if (skipMaterializationForDimensions != null && !skipMaterializationForDimensions.contains(dimensionName)) {
        defaultSplitOrder.add(dimensionName);
      }
    }
    if (timeColumnName != null) {
      defaultSplitOrder.remove(timeColumnName);
    }
    Collections.sort(defaultSplitOrder, new Comparator<String>() {
      @Override
      public int compare(String o1, String o2) {
        return dictionaryMap.get(o2).size() - dictionaryMap.get(o1).size(); // descending cardinality
      }
    });
    return defaultSplitOrder;
  }

  private Set<String> computeDefaultDimensionsToSkipMaterialization() {
    Set<String> skipDimensions = new HashSet<String>();
    for (String dimensionName : dimensionNames) {
      if (dictionaryMap.get(dimensionName).size() > skipMaterializationCardinalityThreshold) {
        skipDimensions.add(dimensionName);
      }
    }
    return skipDimensions;
  }

  /*
   * Sorts the file on all dimensions
   */
  private void sort(File file, int startDocId, int endDocId) throws IOException {
    if (debugMode) {
      LOG.info("BEFORE SORTING");
      printFile(file, startDocId, endDocId);
    }
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    dataSorter.sort(startDocId, endDocId, 0, dimensionSizeBytes);
    if (debugMode) {
      LOG.info("AFTER SORTING");
      printFile(file, startDocId, endDocId);
    }
  }

  private int[] getSortOrder() {
    if (sortOrder == null) {
      sortOrder = new int[dimensionNames.size()];
      for (int i = 0; i < dimensionsSplitOrder.size(); i++) {
        sortOrder[i] = dimensionNameToIndexMap.get(dimensionsSplitOrder.get(i));
      }
      // add the remaining dimensions that are not part of dimensionsSplitOrder
      int counter = 0;
      for (String dimName : dimensionNames) {
        if (!dimensionsSplitOrder.contains(dimName)) {
          sortOrder[dimensionsSplitOrder.size() + counter] = dimensionNameToIndexMap.get(dimName);
          counter = counter + 1;
        }
      }
    }
    return sortOrder;
  }

  private void printFile(File file, int startDocId, int endDocId) throws IOException {
    LOG.info("Contents of file:{} from:{} to:{}", file.getName(), startDocId, endDocId);
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    Iterator<Pair<byte[], byte[]>> iterator = dataSorter.iterator(startDocId, endDocId);
    int numRecordsToPrint = 100;
    int counter = 0;
    while (iterator.hasNext() && counter++ < numRecordsToPrint) {
      Pair<byte[], byte[]> next = iterator.next();
      LOG.info("{}, {}", DimensionBuffer.fromBytes(next.getLeft()),
          MetricBuffer.fromBytes(next.getRight(), schema.getMetricFieldSpecs()));
    }
  }
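  // --------------------------------------------------------------------------
  // Editor's aside: the default split order above, in isolation -- dimensions
  // sorted by dictionary size (cardinality), descending, so the tree branches
  // on high-cardinality dimensions first. A minimal sketch (not Pinot code)
  // with hypothetical dimensions and cardinalities:
  static final class DefaultSplitOrderSketch {
    public static void main(String[] args) {
      final Map<String, Integer> cardinality = new HashMap<>();
      cardinality.put("country", 196);
      cardinality.put("browser", 25);
      cardinality.put("gender", 3);
      List<String> splitOrder = new ArrayList<>(cardinality.keySet());
      Collections.sort(splitOrder, new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
          return cardinality.get(o2) - cardinality.get(o1); // descending
        }
      });
      System.out.println(splitOrder); // [country, browser, gender]
    }
  }
  // --------------------------------------------------------------------------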
  private int constructStarTree(StarTreeIndexNode node, int startDocId, int endDocId, int level, File file)
      throws Exception {
    // node.setStartDocumentId(startDocId);
    int docsAdded = 0;
    if (level == dimensionsSplitOrder.size()) {
      return 0;
    }
    String splitDimensionName = dimensionsSplitOrder.get(level);
    Integer splitDimensionId = dimensionNameToIndexMap.get(splitDimensionName);
    LOG.debug("Building tree at level:{} using file:{} from startDoc:{} endDocId:{} splitting on dimension:{}",
        level, file.getName(), startDocId, endDocId, splitDimensionName);
    Map<Integer, IntPair> sortGroupBy = groupBy(startDocId, endDocId, splitDimensionId, file);
    LOG.debug("Group stats:{}", sortGroupBy);
    node.setChildDimensionName(splitDimensionId);
    node.setChildren(new HashMap<Integer, StarTreeIndexNode>());
    for (int childDimensionValue : sortGroupBy.keySet()) {
      StarTreeIndexNode child = new StarTreeIndexNode();
      child.setDimensionName(splitDimensionId);
      child.setDimensionValue(childDimensionValue);
      child.setParent(node);
      child.setLevel(node.getLevel() + 1);

      // n.b. we will number the nodes later using BFS, after the tree is fully split

      // Add child to parent
      node.addChild(child, childDimensionValue);

      int childDocs = 0;
      IntPair range = sortGroupBy.get(childDimensionValue);
      if (range.getRight() - range.getLeft() > maxLeafRecords) {
        childDocs = constructStarTree(child, range.getLeft(), range.getRight(), level + 1, file);
        docsAdded += childDocs;
      }

      // Either range <= maxLeafRecords, or we did not split further (last level).
      if (childDocs == 0) {
        child.setStartDocumentId(range.getLeft());
        child.setEndDocumentId(range.getRight());
      }
    }

    // Return if the star node does not need to be created.
    if (skipStarNodeCreationForDimensions != null && skipStarNodeCreationForDimensions.contains(splitDimensionName)) {
      return docsAdded;
    }

    // create the star node
    StarTreeIndexNode starChild = new StarTreeIndexNode();
    starChild.setDimensionName(splitDimensionId);
    starChild.setDimensionValue(StarTreeIndexNodeInterf.ALL);
    starChild.setParent(node);
    starChild.setLevel(node.getLevel() + 1);
    // n.b. we will number the nodes later using BFS, after the tree is fully split

    // Add child to parent
    node.addChild(starChild, StarTreeIndexNodeInterf.ALL);

    Iterator<Pair<DimensionBuffer, MetricBuffer>> iterator =
        uniqueCombinations(startDocId, endDocId, file, splitDimensionId);
    int rowsAdded = 0;
    int startOffset = rawRecordCount + aggRecordCount;
    while (iterator.hasNext()) {
      Pair<DimensionBuffer, MetricBuffer> next = iterator.next();
      DimensionBuffer dimension = next.getLeft();
      MetricBuffer metricsHolder = next.getRight();
      LOG.debug("Adding row:{}", dimension);
      appendToAggBuffer(dimension, metricsHolder);
      rowsAdded++;
    }
    docsAdded += rowsAdded;
    LOG.debug("Added {} additional records at level {}", rowsAdded, level);

    // flush
    dataBuffer.flush();

    int childDocs = 0;
    if (rowsAdded >= maxLeafRecords) {
      sort(dataFile, startOffset, startOffset + rowsAdded);
      childDocs = constructStarTree(starChild, startOffset, startOffset + rowsAdded, level + 1, dataFile);
      docsAdded += childDocs;
    }

    // Either rowsAdded < maxLeafRecords, or we did not split further (last level).
    if (childDocs == 0) {
      starChild.setStartDocumentId(startOffset);
      starChild.setEndDocumentId(startOffset + rowsAdded);
    }
    // node.setEndDocumentId(endDocId + docsAdded);
    return docsAdded;
  }
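  // --------------------------------------------------------------------------
  // Editor's aside: groupBy() (defined below) returns, for each value of the
  // split dimension, the contiguous [startDocId, endDocId) range it occupies --
  // which only works because the range was sorted first. A minimal sketch (not
  // Pinot code) over a sorted int column:
  static final class GroupByRangesSketch {
    public static void main(String[] args) {
      int[] sortedColumn = {0, 0, 0, 1, 1, 2};
      Map<Integer, int[]> ranges = new HashMap<>(); // value -> {start, end}
      int start = 0;
      for (int i = 1; i <= sortedColumn.length; i++) {
        if (i == sortedColumn.length || sortedColumn[i] != sortedColumn[start]) {
          ranges.put(sortedColumn[start], new int[]{start, i}); // close the current run
          start = i;
        }
      }
      System.out.println(ranges.get(1)[0] + ".." + ranges.get(1)[1]); // 3..5
    }
  }
  // --------------------------------------------------------------------------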
  /**
   * Assumes the file is already sorted; returns the unique combinations after removing the
   * specified dimension.
   * Aggregates the metrics for each unique combination; currently only SUM is supported by default.
   * @param startDocId start document id (inclusive)
   * @param endDocId end document id (exclusive)
   * @param file file containing the sorted records
   * @param splitDimensionId index of the dimension to remove
   * @return iterator over the unique, aggregated combinations
   * @throws Exception
   */
  private Iterator<Pair<DimensionBuffer, MetricBuffer>> uniqueCombinations(int startDocId, int endDocId, File file,
      int splitDimensionId) throws Exception {
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    Iterator<Pair<byte[], byte[]>> iterator1 = dataSorter.iterator(startDocId, endDocId);
    File tempFile = new File(outDir, file.getName() + "_" + startDocId + "_" + endDocId + ".unique.tmp");
    DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)));
    while (iterator1.hasNext()) {
      Pair<byte[], byte[]> next = iterator1.next();
      byte[] dimensionBuffer = next.getLeft();
      byte[] metricBuffer = next.getRight();
      DimensionBuffer dimensions = DimensionBuffer.fromBytes(dimensionBuffer);
      for (int i = 0; i < numDimensions; i++) {
        String dimensionName = dimensionNameToIndexMap.inverse().get(i);
        if (i == splitDimensionId || (skipMaterializationForDimensions != null
            && skipMaterializationForDimensions.contains(dimensionName))) {
          dos.writeInt(StarTreeIndexNodeInterf.ALL);
        } else {
          dos.writeInt(dimensions.getDimension(i));
        }
      }
      dos.write(metricBuffer);
    }
    dos.close();

    dataSorter = new StarTreeDataTable(tempFile, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    dataSorter.sort(0, endDocId - startDocId);
    if (debugMode) {
      printFile(tempFile, 0, endDocId - startDocId);
    }
    final Iterator<Pair<byte[], byte[]>> iterator = dataSorter.iterator(0, endDocId - startDocId);
    return new Iterator<Pair<DimensionBuffer, MetricBuffer>>() {
      Pair<DimensionBuffer, MetricBuffer> prev = null;
      boolean done = false;

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }

      @Override
      public boolean hasNext() {
        return !done;
      }

      @Override
      public Pair<DimensionBuffer, MetricBuffer> next() {
        while (iterator.hasNext()) {
          Pair<byte[], byte[]> next = iterator.next();
          byte[] dimBuffer = next.getLeft();
          byte[] metricBuffer = next.getRight();
          if (prev == null) {
            prev = Pair.of(DimensionBuffer.fromBytes(dimBuffer),
                MetricBuffer.fromBytes(metricBuffer, schema.getMetricFieldSpecs()));
          } else {
            Pair<DimensionBuffer, MetricBuffer> current = Pair.of(DimensionBuffer.fromBytes(dimBuffer),
                MetricBuffer.fromBytes(metricBuffer, schema.getMetricFieldSpecs()));
            if (!current.getLeft().equals(prev.getLeft())) {
              Pair<DimensionBuffer, MetricBuffer> ret = prev;
              prev = current;
              LOG.debug("Returning unique {}", ret.getLeft());
              return ret;
            } else {
              prev.getRight().aggregate(current.getRight());
            }
          }
        }
        done = true;
        LOG.debug("Returning unique {}", prev.getLeft());
        return prev;
      }
    };
  }

  /**
   * Group-by on a dimension column; assumes the data is already sorted on this dimension from
   * start to end document id.
   * @param startDocId start document id (inclusive)
   * @param endDocId end document id (exclusive)
   * @param dimension index of the dimension to group on
   * @param file file containing the sorted records
   * @return map from dimension value to its [start, end) document-id range
   */
  private Map<Integer, IntPair> groupBy(int startDocId, int endDocId, Integer dimension, File file) {
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    return dataSorter.groupByIntColumnCount(startDocId, endDocId, dimension);
  }
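  // --------------------------------------------------------------------------
  // Editor's aside: the iterator returned by uniqueCombinations() above streams
  // over sorted records, merging adjacent duplicates and aggregating their
  // metrics, so one pass and O(1) state suffice. A minimal sketch (not Pinot
  // code) over sorted {key, metric} pairs:
  static final class DedupAggregateSketch {
    public static void main(String[] args) {
      int[][] sorted = {{1, 10}, {1, 5}, {2, 7}, {3, 1}, {3, 2}}; // sorted by key
      int prevKey = sorted[0][0];
      long sum = sorted[0][1];
      for (int i = 1; i < sorted.length; i++) {
        if (sorted[i][0] != prevKey) {
          System.out.println(prevKey + " -> " + sum); // emit the finished group
          prevKey = sorted[i][0];
          sum = 0;
        }
        sum += sorted[i][1];
      }
      System.out.println(prevKey + " -> " + sum); // emit the last group
      // prints: 1 -> 15, 2 -> 7, 3 -> 3
    }
  }
  // --------------------------------------------------------------------------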
  /**
   * Iterator to iterate over the records from startDocId to endDocId
   */
  @Override
  public Iterator<GenericRow> iterator(final int startDocId, final int endDocId) throws Exception {
    StarTreeDataTable dataSorter = new StarTreeDataTable(dataFile, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    final Iterator<Pair<byte[], byte[]>> iterator = dataSorter.iterator(startDocId, endDocId);
    return new Iterator<GenericRow>() {
      @Override
      public boolean hasNext() {
        return iterator.hasNext();
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }

      @Override
      public GenericRow next() {
        Pair<byte[], byte[]> pair = iterator.next();
        DimensionBuffer dimensionKey = DimensionBuffer.fromBytes(pair.getLeft());
        MetricBuffer metricsHolder = MetricBuffer.fromBytes(pair.getRight(), schema.getMetricFieldSpecs());
        return toGenericRow(dimensionKey, metricsHolder);
      }
    };
  }

  public JSONObject getStarTreeAsJSON() throws Exception {
    JSONObject json = new JSONObject();
    toJson(json, starTreeRootIndexNode, dictionaryMap);
    return json;
  }

  private void toJson(JSONObject json, StarTreeIndexNode node, Map<String, HashBiMap<Object, Integer>> dictionaryMap)
      throws Exception {
    String dimName = "ALL";
    Object dimValue = "ALL";
    if (node.getDimensionName() != StarTreeIndexNodeInterf.ALL) {
      dimName = dimensionNames.get(node.getDimensionName());
    }
    if (node.getDimensionValue() != StarTreeIndexNodeInterf.ALL) {
      dimValue = dictionaryMap.get(dimName).inverse().get(node.getDimensionValue());
    }
    json.put("title", dimName + ":" + dimValue);
    Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
    if (childrenIterator != null) {
      JSONObject[] childJsons = new JSONObject[node.getNumChildren()];
      int index = 0;
      while (childrenIterator.hasNext()) {
        StarTreeIndexNode childNode = childrenIterator.next();
        JSONObject childJson = new JSONObject();
        toJson(childJson, childNode, dictionaryMap);
        childJsons[index++] = childJson;
      }
      json.put("nodes", childJsons);
    }
  }

  @Override
  public void cleanup() {
    if (outDir != null) {
      FileUtils.deleteQuietly(outDir);
    }
  }

  @Override
  public StarTree getTree() {
    return starTree;
  }

  @Override
  public int getTotalRawDocumentCount() {
    return rawRecordCount;
  }

  @Override
  public int getTotalAggregateDocumentCount() {
    return aggRecordCount;
  }

  @Override
  public int getMaxLeafRecords() {
    return maxLeafRecords;
  }

  @Override
  public List<String> getDimensionsSplitOrder() {
    return dimensionsSplitOrder;
  }

  public Map<String, HashBiMap<Object, Integer>> getDictionaryMap() {
    return dictionaryMap;
  }

  public HashBiMap<String, Integer> getDimensionNameToIndexMap() {
    return dimensionNameToIndexMap;
  }

  @Override
  public Set<String> getSkipMaterializationForDimensions() {
    return skipMaterializationForDimensions;
  }
}
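// ----------------------------------------------------------------------------
// Editor's aside: how this builder is driven end-to-end, as far as this file
// shows -- init() with a config, append() once per input row, then build().
// This is a hedged sketch, not Pinot's documented segment-creation path: the
// Schema and rows are assumed to be supplied by the caller, the config's no-arg
// constructor is an assumption, and only config members that init() above
// actually reads are set. With no output dir configured, init() falls back to
// a temp directory.
class OffHeapStarTreeBuilderUsageSketch {
  static StarTree buildStarTree(Schema schema, Iterable<GenericRow> rows) throws Exception {
    StarTreeBuilderConfig config = new StarTreeBuilderConfig(); // assumed no-arg constructor
    config.schema = schema;              // direct field access, as in init() above
    config.dimensionsSplitOrder = null;  // null/empty -> default order by descending cardinality
    config.maxLeafRecords = 10000;       // only ranges larger than this are split further

    OffHeapStarTreeBuilder builder = new OffHeapStarTreeBuilder();
    builder.init(config);
    for (GenericRow row : rows) {
      builder.append(row); // dictionary-encodes dimensions and buffers the fixed-width record
    }
    builder.build();       // sort, construct the tree, write star-tree.buf / star-tree.bin
    return builder.getTree();
  }
}
// ----------------------------------------------------------------------------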