Java tutorial: building a star-tree index with Pinot's OffHeapStarTreeBuilder
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.startree;

import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.TimeFieldSpec;
import com.linkedin.pinot.common.utils.Pairs.IntPair;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.startree.hll.HllUtil;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.joda.time.DateTime;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Uses a file to build the star tree. Each row is divided into dimensions and metrics; the time
 * column is added to the dimension list.
 * We use the split order to build the tree. In most cases, the split order is ranked by
 * cardinality (descending).
 * The time column is either excluded from the split order or is its last entry, irrespective of
 * its cardinality.
 * This is a recursive algorithm where we branch on one dimension at every level.
 *
 * <b>Pseudo-code</b>
 * <code>
 * build() {
 *   let table(1, N) consist of the N input rows
 *   table.sort(1, N)  // sort the table on all dimensions, according to split order
 *   constructTree(table, 0, N, 0);
 * }
 *
 * constructTree(table, start, end, level) {
 *   splitDimensionName = dimensionsSplitOrder[level]
 *   // returns the number of rows for each value of the split dimension
 *   groupByResult&lt;dimValue, length&gt; = table.groupBy(splitDimensionName);
 *   int rangeStart = 0;
 *   for each (entry&lt;dimValue, length&gt; in groupByResult) {
 *     if (entry.length > minThreshold) {
 *       constructTree(table, rangeStart, rangeStart + entry.length, level + 1);
 *     }
 *     rangeStart = rangeStart + entry.length;
 *     updateStarTree();  // add new child
 *   }
 *
 *   // create a star-tree node
 *   aggregatedRows = table.uniqueAfterRemovingAttributeAndAggregateMetrics(start, end, splitDimensionName);
 *   for (each row in aggregatedRows)
 *     table.add(row);
 *   if (aggregatedRows.size > minThreshold) {
 *     table.sort(end, end + aggregatedRows.size);
 *     constructTree(table, end, end + aggregatedRows.size, level + 1);
 *   }
 * }
 * </code>
 */
public class OffHeapStarTreeBuilder implements StarTreeBuilder {
  private static final Logger LOG = LoggerFactory.getLogger(OffHeapStarTreeBuilder.class);
  File dataFile;
  private Schema schema;
  private DataOutputStream dataBuffer;
  int rawRecordCount = 0;
  int aggRecordCount = 0;
  private List<String> dimensionsSplitOrder;
  private Set<String> skipStarNodeCreationForDimensions;
  private Set<String> skipMaterializationForDimensions;
  private int maxLeafRecords;
  private StarTree starTree;
  private StarTreeIndexNode starTreeRootIndexNode;
  private int numDimensions;
  private int numMetrics;
  private List<String> dimensionNames;
  private List<String> metricNames;
  private String timeColumnName;
  private List<DataType> dimensionTypes;
  private Map<String, Object> dimensionNameToStarValueMap;
  private HashBiMap<String, Integer> dimensionNameToIndexMap;
  private Map<String, Integer> metricNameToIndexMap;
  private int dimensionSizeBytes;
  private int metricSizeBytes;
  private File outDir;
  private Map<String, HashBiMap<Object, Integer>> dictionaryMap;
  boolean debugMode = false;
  private int[] sortOrder;
  private int skipMaterializationCardinalityThreshold;
  private boolean enableOffHeapFormat;
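  // --------------------------------------------------------------------------
  // Editor's aside: the pseudo-code from the class Javadoc, made concrete. This
  // nested class is a minimal, self-contained sketch (NOT part of Pinot) that
  // applies the same sort / group-by / recurse pattern to an in-memory table of
  // dictionary-encoded rows; the names and the threshold are hypothetical.
  static final class PseudoCodeSketch {
    static final int MAX_LEAF_RECORDS = 1; // recurse only into ranges larger than this

    public static void main(String[] args) {
      // five rows, two dimensions, already dictionary-encoded as ints
      List<int[]> table = new ArrayList<>(java.util.Arrays.asList(
          new int[]{0, 0}, new int[]{0, 1}, new int[]{1, 0}, new int[]{1, 1}, new int[]{1, 1}));
      sortLexicographically(table);
      constructTree(table, 0, table.size(), 0, "root");
    }

    static void sortLexicographically(List<int[]> table) {
      Collections.sort(table, new Comparator<int[]>() {
        @Override
        public int compare(int[] a, int[] b) {
          for (int i = 0; i < a.length; i++) {
            if (a[i] != b[i]) {
              return a[i] - b[i];
            }
          }
          return 0;
        }
      });
    }

    static void constructTree(List<int[]> table, int start, int end, int level, String path) {
      if (level == 2) {
        return; // no more dimensions to split on
      }
      // Group-by on the split dimension: within a sorted range, equal values are contiguous.
      int rangeStart = start;
      while (rangeStart < end) {
        int value = table.get(rangeStart)[level];
        int rangeEnd = rangeStart;
        while (rangeEnd < end && table.get(rangeEnd)[level] == value) {
          rangeEnd++;
        }
        System.out.println(path + " -> child " + value + ": docs [" + rangeStart + ", " + rangeEnd + ")");
        if (rangeEnd - rangeStart > MAX_LEAF_RECORDS) {
          constructTree(table, rangeStart, rangeEnd, level + 1, path + "/" + value);
        }
        rangeStart = rangeEnd;
      }
      // The real builder also adds a star child here: it removes the split dimension,
      // de-duplicates the remaining combinations, aggregates their metrics, and recurses
      // (see constructStarTree() and uniqueCombinations() below).
    }
  }
  // --------------------------------------------------------------------------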
  public void init(StarTreeBuilderConfig builderConfig) throws Exception {
    schema = builderConfig.schema;
    timeColumnName = schema.getTimeColumnName();
    this.dimensionsSplitOrder = builderConfig.dimensionsSplitOrder;
    skipStarNodeCreationForDimensions = builderConfig.getSkipStarNodeCreationForDimensions();
    skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();
    skipMaterializationCardinalityThreshold = builderConfig.getSkipMaterializationCardinalityThreshold();
    enableOffHeapFormat = builderConfig.isEnableOffHealpFormat(); // (sic: spelling comes from the config API)
    this.maxLeafRecords = builderConfig.maxLeafRecords;
    this.outDir = builderConfig.getOutDir();
    if (outDir == null) {
      outDir = new File(System.getProperty("java.io.tmpdir"),
          V1Constants.STAR_TREE_INDEX_DIR + "_" + DateTime.now());
    }
    LOG.info("Index output directory:{}", outDir);

    dimensionTypes = new ArrayList<>();
    dimensionNames = new ArrayList<>();
    dimensionNameToIndexMap = HashBiMap.create();
    dimensionNameToStarValueMap = new HashMap<>();
    dictionaryMap = new HashMap<>();

    // READ DIMENSION COLUMNS
    List<DimensionFieldSpec> dimensionFieldSpecs = schema.getDimensionFieldSpecs();
    for (int index = 0; index < dimensionFieldSpecs.size(); index++) {
      DimensionFieldSpec spec = dimensionFieldSpecs.get(index);
      String dimensionName = spec.getName();
      dimensionNames.add(dimensionName);
      dimensionNameToIndexMap.put(dimensionName, index);
      Object starValue = getAllStarValue(spec);
      dimensionNameToStarValueMap.put(dimensionName, starValue);
      dimensionTypes.add(spec.getDataType());
      HashBiMap<Object, Integer> dictionary = HashBiMap.create();
      dictionaryMap.put(dimensionName, dictionary);
    }

    // Treat the time column as just another dimension; the only difference is that we never
    // split on this dimension unless it is explicitly listed in the split order.
    if (timeColumnName != null) {
      dimensionNames.add(timeColumnName);
      TimeFieldSpec timeFieldSpec = schema.getTimeFieldSpec();
      dimensionTypes.add(timeFieldSpec.getDataType());
      int index = dimensionNameToIndexMap.size();
      dimensionNameToIndexMap.put(timeColumnName, index);
      Object starValue = getAllStarValue(timeFieldSpec);
      dimensionNameToStarValueMap.put(timeColumnName, starValue);
      HashBiMap<Object, Integer> dictionary = HashBiMap.create();
      dictionaryMap.put(timeColumnName, dictionary);
    }
    dimensionSizeBytes = dimensionNames.size() * Integer.SIZE / 8;
    this.numDimensions = dimensionNames.size();

    // READ METRIC COLUMNS
    this.metricNames = new ArrayList<>();
    this.metricNameToIndexMap = new HashMap<>();
    this.metricSizeBytes = 0;
    List<MetricFieldSpec> metricFieldSpecs = schema.getMetricFieldSpecs();
    for (int index = 0; index < metricFieldSpecs.size(); index++) {
      MetricFieldSpec spec = metricFieldSpecs.get(index);
      String metricName = spec.getName();
      metricNames.add(metricName);
      metricNameToIndexMap.put(metricName, index);
      metricSizeBytes += spec.getFieldSize();
    }
    numMetrics = metricNames.size();

    // Use outDir here: builderConfig.getOutDir() may be null (we fall back to the tmpdir above).
    outDir.mkdirs();
    dataFile = new File(outDir, "star-tree.buf");
    LOG.info("StarTree output data file: {}", dataFile.getAbsolutePath());
    dataBuffer = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(dataFile)));

    // INITIALIZE THE ROOT NODE
    this.starTreeRootIndexNode = new StarTreeIndexNode();
    this.starTreeRootIndexNode.setDimensionName(StarTreeIndexNodeInterf.ALL);
    this.starTreeRootIndexNode.setDimensionValue(StarTreeIndexNodeInterf.ALL);
    this.starTreeRootIndexNode.setLevel(0);
    LOG.info("dimensionNames:{}", dimensionNames);
    LOG.info("metricNames:{}", metricNames);
  }

  private Object getAllStarValue(FieldSpec spec) throws Exception {
    switch (spec.getDataType()) {
      case STRING:
        return "ALL";
      case BOOLEAN:
      case BYTE:
      case CHAR:
      case DOUBLE:
      case FLOAT:
      case INT:
      case LONG:
        return spec.getDefaultNullValue();
      case OBJECT:
      case SHORT:
      case DOUBLE_ARRAY:
      case CHAR_ARRAY:
      case FLOAT_ARRAY:
      case INT_ARRAY:
      case LONG_ARRAY:
      case SHORT_ARRAY:
      case STRING_ARRAY:
      case BYTE_ARRAY:
      default:
        throw new Exception("Unsupported dimension data type: " + spec);
    }
  }
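  // --------------------------------------------------------------------------
  // Editor's aside: a minimal, self-contained sketch (not Pinot code) of the
  // dictionary encoding set up in init() above. Guava's HashBiMap assigns each
  // raw value a dense integer id, and the inverse view recovers the raw value
  // later (see toGenericRow() below). The sample values are hypothetical.
  static final class DictionaryEncodingSketch {
    public static void main(String[] args) {
      HashBiMap<Object, Integer> dictionary = HashBiMap.create();
      for (Object rawValue : new Object[]{"chrome", "firefox", "chrome", "safari"}) {
        if (!dictionary.containsKey(rawValue)) {
          dictionary.put(rawValue, dictionary.size()); // ids are assigned in arrival order
        }
      }
      System.out.println(dictionary.get("firefox"));   // 1
      System.out.println(dictionary.inverse().get(2)); // safari
    }
  }
  // --------------------------------------------------------------------------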
  public GenericRow toGenericRow(DimensionBuffer dimensionKey, MetricBuffer metricsHolder) {
    GenericRow row = new GenericRow();
    Map<String, Object> map = new HashMap<>();
    for (int i = 0; i < dimensionNames.size(); i++) {
      String dimName = dimensionNames.get(i);
      BiMap<Integer, Object> inverseDictionary = dictionaryMap.get(dimName).inverse();
      Object dimValue = inverseDictionary.get(dimensionKey.getDimension(i));
      if (dimValue == null) {
        dimValue = dimensionNameToStarValueMap.get(dimName);
      }
      map.put(dimName, dimValue);
    }
    for (int i = 0; i < numMetrics; i++) {
      String metName = metricNames.get(i);
      map.put(metName, metricsHolder.getValueConformToDataType(i));
    }
    row.init(map);
    return row;
  }

  public void append(GenericRow row) throws Exception {
    DimensionBuffer dimension = new DimensionBuffer(numDimensions);
    for (int i = 0; i < dimensionNames.size(); i++) {
      String dimName = dimensionNames.get(i);
      Map<Object, Integer> dictionary = dictionaryMap.get(dimName);
      Object dimValue = row.getValue(dimName);
      if (dimValue == null) {
        // TODO: Have another default value to represent STAR. Using the default null value to
        // represent STAR for now. It does not matter during query execution, since we know from
        // the star tree that the value is STAR.
        dimValue = dimensionNameToStarValueMap.get(dimName);
      }
      if (!dictionary.containsKey(dimValue)) {
        dictionary.put(dimValue, dictionary.size());
      }
      dimension.setDimension(i, dictionary.get(dimValue));
    }
    // initialize raw data row
    Object[] metrics = new Object[numMetrics];
    for (int i = 0; i < numMetrics; i++) {
      String metName = metricNames.get(i);
      if (schema.getMetricFieldSpecs().get(i).getDerivedMetricType() == MetricFieldSpec.DerivedMetricType.HLL) {
        // HLL fields arrive in string format; convert to the HLL data type first
        metrics[i] = HllUtil.convertStringToHll((String) row.getValue(metName));
      } else {
        // no conversion for standard data types
        metrics[i] = row.getValue(metName);
      }
    }
    MetricBuffer metricBuffer = new MetricBuffer(metrics, schema.getMetricFieldSpecs());
    appendToRawBuffer(dimension, metricBuffer);
  }

  private void appendToRawBuffer(DimensionBuffer dimension, MetricBuffer metrics) throws IOException {
    appendToBuffer(dataBuffer, dimension, metrics);
    rawRecordCount++;
  }

  private void appendToAggBuffer(DimensionBuffer dimension, MetricBuffer metrics) throws IOException {
    appendToBuffer(dataBuffer, dimension, metrics);
    aggRecordCount++;
  }

  private void appendToBuffer(DataOutputStream dos, DimensionBuffer dimensions, MetricBuffer metricHolder)
      throws IOException {
    for (int i = 0; i < numDimensions; i++) {
      dos.writeInt(dimensions.getDimension(i));
    }
    dos.write(metricHolder.toBytes(metricSizeBytes));
  }
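  // --------------------------------------------------------------------------
  // Editor's aside: appendToBuffer() above writes fixed-width records -- one
  // 4-byte int per dimension followed by a fixed-size metric payload -- which is
  // what makes the off-heap sort and the docId offset arithmetic possible. A
  // minimal, self-contained round-trip sketch (not Pinot code):
  static final class FixedWidthRecordSketch {
    public static void main(String[] args) throws IOException {
      int[] dimensionIds = {3, 0, 7};     // dictionary ids for one row
      byte[] metricBytes = {0, 0, 0, 42}; // opaque, fixed-size metric payload
      java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
      DataOutputStream dos = new DataOutputStream(baos);
      for (int id : dimensionIds) {
        dos.writeInt(id);
      }
      dos.write(metricBytes);
      dos.close();

      byte[] record = baos.toByteArray();
      // every record has the same size, so record i starts at offset i * recordSize
      int recordSize = dimensionIds.length * Integer.SIZE / 8 + metricBytes.length;
      System.out.println(record.length == recordSize); // true
    }
  }
  // --------------------------------------------------------------------------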
  public void build() throws Exception {
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
      skipMaterializationForDimensions = computeDefaultDimensionsToSkipMaterialization();
    }

    // For the default split order, give preference to skipMaterializationForDimensions.
    // For a user-defined split order, give preference to the split order.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
      dimensionsSplitOrder = computeDefaultSplitOrder();
      dimensionsSplitOrder.removeAll(skipMaterializationForDimensions);
    } else {
      skipMaterializationForDimensions.removeAll(dimensionsSplitOrder);
    }

    LOG.info("Split order: {}", dimensionsSplitOrder);
    LOG.info("Skip materialization for dimensions: {}", skipMaterializationForDimensions);

    long start = System.currentTimeMillis();
    dataBuffer.flush();

    // Sort the data based on the default sort order (split order + remaining dimensions)
    sort(dataFile, 0, rawRecordCount);

    // Recursively construct the star tree, continuously sorting the data
    constructStarTree(starTreeRootIndexNode, 0, rawRecordCount, 0, dataFile);

    // Split the leaf nodes on the time column. This is only possible if we have not split on the
    // time column yet, and the time column is still preserved (i.e. not replaced by StarTreeNode.all()).
    if (timeColumnName != null && !skipMaterializationForDimensions.contains(timeColumnName)
        && !dimensionsSplitOrder.contains(timeColumnName)) {
      splitLeafNodesOnTimeColumn();
    }

    // Create aggregate rows for all nodes in the tree
    createAggDocForAllNodes(starTreeRootIndexNode);
    long end = System.currentTimeMillis();
    LOG.info("Took {} ms to build star tree index. Original records:{} Materialized records:{}",
        (end - start), rawRecordCount, aggRecordCount);

    starTree = new StarTree(starTreeRootIndexNode, dimensionNameToIndexMap);
    File treeBinary = new File(outDir, "star-tree.bin");
    if (enableOffHeapFormat) {
      LOG.info("Saving tree in off-heap binary format at: {} ", treeBinary);
      StarTreeSerDe.writeTreeOffHeapFormat(starTree, treeBinary);
    } else {
      LOG.info("Saving tree in on-heap binary format at: {} ", treeBinary);
      StarTreeSerDe.writeTreeOnHeapFormat(starTree, treeBinary);
    }

    printTree(starTreeRootIndexNode, 0);
    LOG.info("Finished building tree. Output dir: {} ", outDir);
    dataBuffer.close();
  }
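  // --------------------------------------------------------------------------
  // Editor's aside: the precedence rule at the top of build(), in isolation. A
  // minimal sketch (not Pinot code) with hypothetical dimension names: when the
  // user supplies a split order, it wins, so any dimension listed there is
  // dropped from the skip-materialization set.
  static final class SplitOrderPrecedenceSketch {
    public static void main(String[] args) {
      Set<String> skip = new HashSet<>(java.util.Arrays.asList("userId", "country"));
      List<String> userDefinedSplitOrder = new ArrayList<>(java.util.Arrays.asList("country", "browser"));
      skip.removeAll(userDefinedSplitOrder); // split order takes preference
      System.out.println(skip);              // [userId]
    }
  }
  // --------------------------------------------------------------------------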
  /**
   * Create the aggregated document for every node, bottom-up via a post-order traversal.
   * @param node root of the subtree to aggregate
   */
  private MetricBuffer createAggDocForAllNodes(StarTreeIndexNode node) throws Exception {
    MetricBuffer aggMetricBuffer = null;
    if (node.isLeaf()) {
      StarTreeDataTable leafDataTable = new StarTreeDataTable(dataFile, dimensionSizeBytes, metricSizeBytes, null);
      Iterator<Pair<byte[], byte[]>> iterator =
          leafDataTable.iterator(node.getStartDocumentId(), node.getEndDocumentId());
      Pair<byte[], byte[]> first = iterator.next();
      aggMetricBuffer = MetricBuffer.fromBytes(first.getRight(), schema.getMetricFieldSpecs());
      while (iterator.hasNext()) {
        Pair<byte[], byte[]> next = iterator.next();
        MetricBuffer metricBuffer = MetricBuffer.fromBytes(next.getRight(), schema.getMetricFieldSpecs());
        aggMetricBuffer.aggregate(metricBuffer);
      }
    } else {
      Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
      while (childrenIterator.hasNext()) {
        StarTreeIndexNode child = childrenIterator.next();
        MetricBuffer childMetricBuffer = createAggDocForAllNodes(child);
        // don't use the star node value to compute the aggregate for the parent
        if (child.getDimensionValue() == StarTreeIndexNodeInterf.ALL) {
          continue;
        }
        if (aggMetricBuffer == null) {
          aggMetricBuffer = new MetricBuffer(childMetricBuffer);
        } else {
          aggMetricBuffer.aggregate(childMetricBuffer);
        }
      }
    }
    // Compute the dimension values for this node from its path; this could be optimized by
    // passing the path in the method call.
    Map<Integer, Integer> pathValues = node.getPathValues();
    DimensionBuffer dimensionBuffer = new DimensionBuffer(numDimensions);
    for (int i = 0; i < numDimensions; i++) {
      if (pathValues.containsKey(i)) {
        dimensionBuffer.setDimension(i, pathValues.get(i));
      } else {
        dimensionBuffer.setDimension(i, StarTreeIndexNodeInterf.ALL);
      }
    }
    node.setAggregatedDocumentId(rawRecordCount + aggRecordCount);
    appendToAggBuffer(dimensionBuffer, aggMetricBuffer);
    return aggMetricBuffer;
  }
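  // --------------------------------------------------------------------------
  // Editor's aside: the traversal above in miniature -- a parent's aggregate is
  // the sum of its non-star children, computed bottom-up. A minimal sketch (not
  // Pinot code) with a single long metric standing in for a MetricBuffer:
  static final class PostOrderAggregationSketch {
    static final int STAR = -1; // stands in for StarTreeIndexNodeInterf.ALL
    int dimensionValue;
    long metric;                // leaves carry raw sums; parents are filled in here
    List<PostOrderAggregationSketch> children = new ArrayList<>();

    long aggregate() {
      if (children.isEmpty()) {
        return metric; // leaf: already aggregated
      }
      metric = 0;
      for (PostOrderAggregationSketch child : children) {
        long childValue = child.aggregate(); // recurse first (post-order)
        if (child.dimensionValue == STAR) {
          continue; // a star child duplicates its siblings; skip it to avoid double counting
        }
        metric += childValue;
      }
      return metric;
    }

    public static void main(String[] args) {
      PostOrderAggregationSketch root = new PostOrderAggregationSketch();
      root.children.addAll(java.util.Arrays.asList(leaf(0, 10), leaf(1, 5), leaf(STAR, 15)));
      System.out.println(root.aggregate()); // 15, not 30: the star child is not double-counted
    }

    static PostOrderAggregationSketch leaf(int dimensionValue, long metric) {
      PostOrderAggregationSketch node = new PostOrderAggregationSketch();
      node.dimensionValue = dimensionValue;
      node.metric = metric;
      return node;
    }
  }
  // --------------------------------------------------------------------------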
  /**
   * Helper method that visits each leaf node and does the following:
   * - re-orders the docIds belonging to the leaf node with respect to the time column,
   * - creates a child node for each time value under the leaf node.
   * @throws Exception
   */
  private void splitLeafNodesOnTimeColumn() throws Exception {
    Queue<StarTreeIndexNode> nodes = new LinkedList<>();
    nodes.add(starTreeRootIndexNode);
    StarTreeDataSorter dataSorter = new StarTreeDataSorter(dataFile, dimensionSizeBytes, metricSizeBytes);
    while (!nodes.isEmpty()) {
      StarTreeIndexNode node = nodes.remove();
      if (node.isLeaf()) {
        // Split on the time column; this helps with time-based filtering
        if (timeColumnName != null) {
          int level = node.getLevel();
          int[] newSortOrder = moveColumnInSortOrder(timeColumnName, getSortOrder(), level);

          int startDocId = node.getStartDocumentId();
          int endDocId = node.getEndDocumentId();
          dataSorter.sort(startDocId, endDocId, newSortOrder);
          int timeColIndex = dimensionNameToIndexMap.get(timeColumnName);
          Map<Integer, IntPair> timeColumnRangeMap =
              dataSorter.groupByIntColumnCount(startDocId, endDocId, timeColIndex);

          node.setChildDimensionName(timeColIndex);
          node.setChildren(new HashMap<Integer, StarTreeIndexNode>());

          for (int timeValue : timeColumnRangeMap.keySet()) {
            IntPair range = timeColumnRangeMap.get(timeValue);
            StarTreeIndexNode child = new StarTreeIndexNode();
            child.setDimensionName(timeColIndex);
            child.setDimensionValue(timeValue);
            child.setParent(node);
            child.setLevel(node.getLevel() + 1);
            child.setStartDocumentId(range.getLeft());
            child.setEndDocumentId(range.getRight());
            node.addChild(child, timeValue);
          }
        }
      } else {
        Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
        while (childrenIterator.hasNext()) {
          nodes.add(childrenIterator.next());
        }
      }
    }
    dataSorter.close();
  }

  /**
   * Helper method that moves the given column from its current position in the sort order to the
   * specified new position, preserving the relative order of the other columns.
   * Assumes the column to move does not occur before the new position (true here: the levels
   * above a leaf are split-order dimensions, and the time column is not in the split order).
   * @param columnToMove name of the column to move (the time column)
   * @param origSortOrder original sort order (array of column indexes)
   * @param newPositionForTimeColumn target position for the time column
   * @return the new sort order
   */
  private int[] moveColumnInSortOrder(String columnToMove, int[] origSortOrder, int newPositionForTimeColumn) {
    Preconditions.checkArgument(columnToMove != null);
    Preconditions.checkArgument(newPositionForTimeColumn >= 0 && newPositionForTimeColumn < origSortOrder.length);

    int timeDimensionIndex = dimensionNameToIndexMap.get(columnToMove);
    int[] newSortOrder = new int[origSortOrder.length];
    int index = 0;

    // Retain the sort order based on the path to this leaf node
    for (int i = 0; i < newPositionForTimeColumn; i++) {
      newSortOrder[index++] = origSortOrder[i];
    }

    // Insert the time column at its new position
    newSortOrder[index++] = timeDimensionIndex;

    // Append the remaining columns, skipping the time column. The skip must compare the entry's
    // value, not the loop index: the sort order is generally not the identity permutation.
    for (int i = newPositionForTimeColumn; i < numDimensions; i++) {
      if (origSortOrder[i] != timeDimensionIndex) {
        newSortOrder[index++] = origSortOrder[i];
      }
    }
    return newSortOrder;
  }
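  // --------------------------------------------------------------------------
  // Editor's aside: moveColumnInSortOrder() in isolation. A minimal sketch (not
  // Pinot code) that moves one column id to a chosen position while preserving
  // the relative order of the others; the ids below are hypothetical.
  static final class MoveColumnSketch {
    public static void main(String[] args) {
      int[] sortOrder = {2, 0, 1, 3}; // column ids; say the time column id is 3
      int timeColumnId = 3;
      int newPosition = 1;
      int[] result = new int[sortOrder.length];
      int index = 0;
      for (int i = 0; i < newPosition; i++) {
        result[index++] = sortOrder[i];   // keep the prefix (the path to the leaf)
      }
      result[index++] = timeColumnId;     // place the time column here
      for (int i = newPosition; i < sortOrder.length; i++) {
        if (sortOrder[i] != timeColumnId) {
          result[index++] = sortOrder[i]; // append the rest, skipping the time column
        }
      }
      System.out.println(java.util.Arrays.toString(result)); // [2, 3, 0, 1]
    }
  }
  // --------------------------------------------------------------------------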
  /**
   * Debug method to print the tree.
   * @param node subtree root
   * @param level depth of the node, used for indentation
   */
  private void printTree(StarTreeIndexNode node, int level) {
    StringBuilder indent = new StringBuilder();
    for (int i = 0; i < level; i++) {
      indent.append("  ");
    }
    BiMap<Integer, String> inverse = dimensionNameToIndexMap.inverse();
    String dimName = "ALL";
    Object dimValue = "ALL";
    if (node.getDimensionName() != StarTreeIndexNodeInterf.ALL) {
      dimName = inverse.get(node.getDimensionName());
    }
    if (node.getDimensionValue() != StarTreeIndexNodeInterf.ALL) {
      dimValue = dictionaryMap.get(dimName).inverse().get(node.getDimensionValue());
    }
    String formattedOutput = Objects.toStringHelper(node)
        .add("nodeId", node.getNodeId())
        .add("level", level)
        .add("dimensionName", dimName)
        .add("dimensionValue", dimValue)
        .add("childDimensionName", inverse.get(node.getChildDimensionName()))
        .add("childCount", node.getNumChildren())
        .add("startDocumentId", node.getStartDocumentId())
        .add("endDocumentId", node.getEndDocumentId())
        .add("documentCount", (node.getEndDocumentId() - node.getStartDocumentId()))
        .toString();
    LOG.debug("{}{}", indent, formattedOutput);
    if (!node.isLeaf()) {
      Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
      while (childrenIterator.hasNext()) {
        printTree(childrenIterator.next(), level + 1);
      }
    }
  }

  private List<String> computeDefaultSplitOrder() {
    ArrayList<String> defaultSplitOrder = new ArrayList<>();
    // Include all dimensions except the time column and the skip-materialization dimensions.
    // Assumes skipMaterializationForDimensions has already been computed.
    for (String dimensionName : dimensionNames) {
      if (skipMaterializationForDimensions != null && !skipMaterializationForDimensions.contains(dimensionName)) {
        defaultSplitOrder.add(dimensionName);
      }
    }
    if (timeColumnName != null) {
      defaultSplitOrder.remove(timeColumnName);
    }
    Collections.sort(defaultSplitOrder, new Comparator<String>() {
      @Override
      public int compare(String o1, String o2) {
        return dictionaryMap.get(o2).size() - dictionaryMap.get(o1).size(); // descending cardinality
      }
    });
    return defaultSplitOrder;
  }

  private Set<String> computeDefaultDimensionsToSkipMaterialization() {
    Set<String> skipDimensions = new HashSet<String>();
    for (String dimensionName : dimensionNames) {
      if (dictionaryMap.get(dimensionName).size() > skipMaterializationCardinalityThreshold) {
        skipDimensions.add(dimensionName);
      }
    }
    return skipDimensions;
  }

  /*
   * Sorts the file on all dimensions
   */
  private void sort(File file, int startDocId, int endDocId) throws IOException {
    if (debugMode) {
      LOG.info("BEFORE SORTING");
      printFile(file, startDocId, endDocId);
    }
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    dataSorter.sort(startDocId, endDocId, 0, dimensionSizeBytes);
    if (debugMode) {
      LOG.info("AFTER SORTING");
      printFile(file, startDocId, endDocId);
    }
  }

  private int[] getSortOrder() {
    if (sortOrder == null) {
      sortOrder = new int[dimensionNames.size()];
      for (int i = 0; i < dimensionsSplitOrder.size(); i++) {
        sortOrder[i] = dimensionNameToIndexMap.get(dimensionsSplitOrder.get(i));
      }
      // add the remaining dimensions that are not part of dimensionsSplitOrder
      int counter = 0;
      for (String dimName : dimensionNames) {
        if (!dimensionsSplitOrder.contains(dimName)) {
          sortOrder[dimensionsSplitOrder.size() + counter] = dimensionNameToIndexMap.get(dimName);
          counter = counter + 1;
        }
      }
    }
    return sortOrder;
  }

  private void printFile(File file, int startDocId, int endDocId) throws IOException {
    LOG.info("Contents of file:{} from:{} to:{}", file.getName(), startDocId, endDocId);
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    Iterator<Pair<byte[], byte[]>> iterator = dataSorter.iterator(startDocId, endDocId);
    int numRecordsToPrint = 100;
    int counter = 0;
    while (iterator.hasNext() && counter++ < numRecordsToPrint) {
      Pair<byte[], byte[]> next = iterator.next();
      LOG.info("{}, {}", DimensionBuffer.fromBytes(next.getLeft()),
          MetricBuffer.fromBytes(next.getRight(), schema.getMetricFieldSpecs()));
    }
  }
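  // --------------------------------------------------------------------------
  // Editor's aside: the default split order above, in isolation -- dimensions
  // sorted by dictionary size (cardinality), descending, so the tree branches
  // on high-cardinality dimensions first. A minimal sketch (not Pinot code)
  // with hypothetical dimensions and cardinalities:
  static final class DefaultSplitOrderSketch {
    public static void main(String[] args) {
      final Map<String, Integer> cardinality = new HashMap<>();
      cardinality.put("country", 196);
      cardinality.put("browser", 25);
      cardinality.put("gender", 3);
      List<String> splitOrder = new ArrayList<>(cardinality.keySet());
      Collections.sort(splitOrder, new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
          return cardinality.get(o2) - cardinality.get(o1); // descending
        }
      });
      System.out.println(splitOrder); // [country, browser, gender]
    }
  }
  // --------------------------------------------------------------------------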
  private int constructStarTree(StarTreeIndexNode node, int startDocId, int endDocId, int level, File file)
      throws Exception {
    // node.setStartDocumentId(startDocId);
    int docsAdded = 0;
    if (level == dimensionsSplitOrder.size()) {
      return 0;
    }
    String splitDimensionName = dimensionsSplitOrder.get(level);
    Integer splitDimensionId = dimensionNameToIndexMap.get(splitDimensionName);
    LOG.debug("Building tree at level:{} using file:{} from startDoc:{} endDocId:{} splitting on dimension:{}",
        level, file.getName(), startDocId, endDocId, splitDimensionName);
    Map<Integer, IntPair> sortGroupBy = groupBy(startDocId, endDocId, splitDimensionId, file);
    LOG.debug("Group stats:{}", sortGroupBy);
    node.setChildDimensionName(splitDimensionId);
    node.setChildren(new HashMap<Integer, StarTreeIndexNode>());
    for (int childDimensionValue : sortGroupBy.keySet()) {
      StarTreeIndexNode child = new StarTreeIndexNode();
      child.setDimensionName(splitDimensionId);
      child.setDimensionValue(childDimensionValue);
      child.setParent(node);
      child.setLevel(node.getLevel() + 1);

      // n.b. we will number the nodes later using BFS, after the tree is fully split

      // Add child to parent
      node.addChild(child, childDimensionValue);

      int childDocs = 0;
      IntPair range = sortGroupBy.get(childDimensionValue);
      if (range.getRight() - range.getLeft() > maxLeafRecords) {
        childDocs = constructStarTree(child, range.getLeft(), range.getRight(), level + 1, file);
        docsAdded += childDocs;
      }

      // Either range <= maxLeafRecords, or we did not split further (last level).
      if (childDocs == 0) {
        child.setStartDocumentId(range.getLeft());
        child.setEndDocumentId(range.getRight());
      }
    }

    // Return if the star node does not need to be created.
    if (skipStarNodeCreationForDimensions != null && skipStarNodeCreationForDimensions.contains(splitDimensionName)) {
      return docsAdded;
    }

    // create the star node
    StarTreeIndexNode starChild = new StarTreeIndexNode();
    starChild.setDimensionName(splitDimensionId);
    starChild.setDimensionValue(StarTreeIndexNodeInterf.ALL);
    starChild.setParent(node);
    starChild.setLevel(node.getLevel() + 1);
    // n.b. we will number the nodes later using BFS, after the tree is fully split

    // Add child to parent
    node.addChild(starChild, StarTreeIndexNodeInterf.ALL);

    Iterator<Pair<DimensionBuffer, MetricBuffer>> iterator =
        uniqueCombinations(startDocId, endDocId, file, splitDimensionId);
    int rowsAdded = 0;
    int startOffset = rawRecordCount + aggRecordCount;
    while (iterator.hasNext()) {
      Pair<DimensionBuffer, MetricBuffer> next = iterator.next();
      DimensionBuffer dimension = next.getLeft();
      MetricBuffer metricsHolder = next.getRight();
      LOG.debug("Adding row:{}", dimension);
      appendToAggBuffer(dimension, metricsHolder);
      rowsAdded++;
    }
    docsAdded += rowsAdded;
    LOG.debug("Added {} additional records at level {}", rowsAdded, level);

    // flush
    dataBuffer.flush();

    int childDocs = 0;
    if (rowsAdded >= maxLeafRecords) {
      sort(dataFile, startOffset, startOffset + rowsAdded);
      childDocs = constructStarTree(starChild, startOffset, startOffset + rowsAdded, level + 1, dataFile);
      docsAdded += childDocs;
    }

    // Either rowsAdded < maxLeafRecords, or we did not split further (last level).
    if (childDocs == 0) {
      starChild.setStartDocumentId(startOffset);
      starChild.setEndDocumentId(startOffset + rowsAdded);
    }
    // node.setEndDocumentId(endDocId + docsAdded);
    return docsAdded;
  }
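  // --------------------------------------------------------------------------
  // Editor's aside: groupBy() (defined below) returns, for each value of the
  // split dimension, the contiguous [startDocId, endDocId) range it occupies --
  // which only works because the range was sorted first. A minimal sketch (not
  // Pinot code) over a sorted int column:
  static final class GroupByRangesSketch {
    public static void main(String[] args) {
      int[] sortedColumn = {0, 0, 0, 1, 1, 2};
      Map<Integer, int[]> ranges = new HashMap<>(); // value -> {start, end}
      int start = 0;
      for (int i = 1; i <= sortedColumn.length; i++) {
        if (i == sortedColumn.length || sortedColumn[i] != sortedColumn[start]) {
          ranges.put(sortedColumn[start], new int[]{start, i}); // close the current run
          start = i;
        }
      }
      System.out.println(ranges.get(1)[0] + ".." + ranges.get(1)[1]); // 3..5
    }
  }
  // --------------------------------------------------------------------------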
  /**
   * Assumes the file is already sorted; returns the unique combinations after removing the
   * specified dimension.
   * Aggregates the metrics for each unique combination; currently only SUM is supported by default.
   * @param startDocId start document id (inclusive)
   * @param endDocId end document id (exclusive)
   * @param file file containing the sorted records
   * @param splitDimensionId index of the dimension to remove
   * @return iterator over the unique, aggregated combinations
   * @throws Exception
   */
  private Iterator<Pair<DimensionBuffer, MetricBuffer>> uniqueCombinations(int startDocId, int endDocId, File file,
      int splitDimensionId) throws Exception {
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    Iterator<Pair<byte[], byte[]>> iterator1 = dataSorter.iterator(startDocId, endDocId);
    File tempFile = new File(outDir, file.getName() + "_" + startDocId + "_" + endDocId + ".unique.tmp");
    DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)));
    while (iterator1.hasNext()) {
      Pair<byte[], byte[]> next = iterator1.next();
      byte[] dimensionBuffer = next.getLeft();
      byte[] metricBuffer = next.getRight();
      DimensionBuffer dimensions = DimensionBuffer.fromBytes(dimensionBuffer);
      for (int i = 0; i < numDimensions; i++) {
        String dimensionName = dimensionNameToIndexMap.inverse().get(i);
        if (i == splitDimensionId || (skipMaterializationForDimensions != null
            && skipMaterializationForDimensions.contains(dimensionName))) {
          dos.writeInt(StarTreeIndexNodeInterf.ALL);
        } else {
          dos.writeInt(dimensions.getDimension(i));
        }
      }
      dos.write(metricBuffer);
    }
    dos.close();

    dataSorter = new StarTreeDataTable(tempFile, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    dataSorter.sort(0, endDocId - startDocId);
    if (debugMode) {
      printFile(tempFile, 0, endDocId - startDocId);
    }
    final Iterator<Pair<byte[], byte[]>> iterator = dataSorter.iterator(0, endDocId - startDocId);
    return new Iterator<Pair<DimensionBuffer, MetricBuffer>>() {
      Pair<DimensionBuffer, MetricBuffer> prev = null;
      boolean done = false;

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }

      @Override
      public boolean hasNext() {
        return !done;
      }

      @Override
      public Pair<DimensionBuffer, MetricBuffer> next() {
        while (iterator.hasNext()) {
          Pair<byte[], byte[]> next = iterator.next();
          byte[] dimBuffer = next.getLeft();
          byte[] metricBuffer = next.getRight();
          if (prev == null) {
            prev = Pair.of(DimensionBuffer.fromBytes(dimBuffer),
                MetricBuffer.fromBytes(metricBuffer, schema.getMetricFieldSpecs()));
          } else {
            Pair<DimensionBuffer, MetricBuffer> current = Pair.of(DimensionBuffer.fromBytes(dimBuffer),
                MetricBuffer.fromBytes(metricBuffer, schema.getMetricFieldSpecs()));
            if (!current.getLeft().equals(prev.getLeft())) {
              Pair<DimensionBuffer, MetricBuffer> ret = prev;
              prev = current;
              LOG.debug("Returning unique {}", ret.getLeft());
              return ret;
            } else {
              prev.getRight().aggregate(current.getRight());
            }
          }
        }
        done = true;
        LOG.debug("Returning unique {}", prev.getLeft());
        return prev;
      }
    };
  }

  /**
   * Group-by on a dimension column; assumes the data is already sorted on this dimension from
   * start to end document id.
   * @param startDocId start document id (inclusive)
   * @param endDocId end document id (exclusive)
   * @param dimension index of the dimension to group on
   * @param file file containing the sorted records
   * @return map from dimension value to its [start, end) document-id range
   */
  private Map<Integer, IntPair> groupBy(int startDocId, int endDocId, Integer dimension, File file) {
    StarTreeDataTable dataSorter = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    return dataSorter.groupByIntColumnCount(startDocId, endDocId, dimension);
  }
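  // --------------------------------------------------------------------------
  // Editor's aside: the iterator returned by uniqueCombinations() above streams
  // over sorted records, merging adjacent duplicates and aggregating their
  // metrics, so one pass and O(1) state suffice. A minimal sketch (not Pinot
  // code) over sorted {key, metric} pairs:
  static final class DedupAggregateSketch {
    public static void main(String[] args) {
      int[][] sorted = {{1, 10}, {1, 5}, {2, 7}, {3, 1}, {3, 2}}; // sorted by key
      int prevKey = sorted[0][0];
      long sum = sorted[0][1];
      for (int i = 1; i < sorted.length; i++) {
        if (sorted[i][0] != prevKey) {
          System.out.println(prevKey + " -> " + sum); // emit the finished group
          prevKey = sorted[i][0];
          sum = 0;
        }
        sum += sorted[i][1];
      }
      System.out.println(prevKey + " -> " + sum); // emit the last group
      // prints: 1 -> 15, 2 -> 7, 3 -> 3
    }
  }
  // --------------------------------------------------------------------------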
  /**
   * Iterator to iterate over the records from startDocId to endDocId
   */
  @Override
  public Iterator<GenericRow> iterator(final int startDocId, final int endDocId) throws Exception {
    StarTreeDataTable dataSorter = new StarTreeDataTable(dataFile, dimensionSizeBytes, metricSizeBytes, getSortOrder());
    final Iterator<Pair<byte[], byte[]>> iterator = dataSorter.iterator(startDocId, endDocId);
    return new Iterator<GenericRow>() {
      @Override
      public boolean hasNext() {
        return iterator.hasNext();
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }

      @Override
      public GenericRow next() {
        Pair<byte[], byte[]> pair = iterator.next();
        DimensionBuffer dimensionKey = DimensionBuffer.fromBytes(pair.getLeft());
        MetricBuffer metricsHolder = MetricBuffer.fromBytes(pair.getRight(), schema.getMetricFieldSpecs());
        return toGenericRow(dimensionKey, metricsHolder);
      }
    };
  }

  public JSONObject getStarTreeAsJSON() throws Exception {
    JSONObject json = new JSONObject();
    toJson(json, starTreeRootIndexNode, dictionaryMap);
    return json;
  }

  private void toJson(JSONObject json, StarTreeIndexNode node, Map<String, HashBiMap<Object, Integer>> dictionaryMap)
      throws Exception {
    String dimName = "ALL";
    Object dimValue = "ALL";
    if (node.getDimensionName() != StarTreeIndexNodeInterf.ALL) {
      dimName = dimensionNames.get(node.getDimensionName());
    }
    if (node.getDimensionValue() != StarTreeIndexNodeInterf.ALL) {
      dimValue = dictionaryMap.get(dimName).inverse().get(node.getDimensionValue());
    }
    json.put("title", dimName + ":" + dimValue);
    Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
    if (childrenIterator != null) {
      JSONObject[] childJsons = new JSONObject[node.getNumChildren()];
      int index = 0;
      while (childrenIterator.hasNext()) {
        StarTreeIndexNode childNode = childrenIterator.next();
        JSONObject childJson = new JSONObject();
        toJson(childJson, childNode, dictionaryMap);
        childJsons[index++] = childJson;
      }
      json.put("nodes", childJsons);
    }
  }

  @Override
  public void cleanup() {
    if (outDir != null) {
      FileUtils.deleteQuietly(outDir);
    }
  }

  @Override
  public StarTree getTree() {
    return starTree;
  }

  @Override
  public int getTotalRawDocumentCount() {
    return rawRecordCount;
  }

  @Override
  public int getTotalAggregateDocumentCount() {
    return aggRecordCount;
  }

  @Override
  public int getMaxLeafRecords() {
    return maxLeafRecords;
  }

  @Override
  public List<String> getDimensionsSplitOrder() {
    return dimensionsSplitOrder;
  }

  public Map<String, HashBiMap<Object, Integer>> getDictionaryMap() {
    return dictionaryMap;
  }

  public HashBiMap<String, Integer> getDimensionNameToIndexMap() {
    return dimensionNameToIndexMap;
  }

  @Override
  public Set<String> getSkipMaterializationForDimensions() {
    return skipMaterializationForDimensions;
  }
}
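// ----------------------------------------------------------------------------
// Editor's aside: how this builder is driven end-to-end, as far as this file
// shows -- init() with a config, append() once per input row, then build().
// This is a hedged sketch, not Pinot's documented segment-creation path: the
// Schema and rows are assumed to be supplied by the caller, the config's no-arg
// constructor is an assumption, and only config members that init() above
// actually reads are set. With no output dir configured, init() falls back to
// a temp directory.
class OffHeapStarTreeBuilderUsageSketch {
  static StarTree buildStarTree(Schema schema, Iterable<GenericRow> rows) throws Exception {
    StarTreeBuilderConfig config = new StarTreeBuilderConfig(); // assumed no-arg constructor
    config.schema = schema;              // direct field access, as in init() above
    config.dimensionsSplitOrder = null;  // null/empty -> default order by descending cardinality
    config.maxLeafRecords = 10000;       // only ranges larger than this are split further

    OffHeapStarTreeBuilder builder = new OffHeapStarTreeBuilder();
    builder.init(config);
    for (GenericRow row : rows) {
      builder.append(row); // dictionary-encodes dimensions and buffers the fixed-width record
    }
    builder.build();       // sort, construct the tree, write star-tree.buf / star-tree.bin
    return builder.getTree();
  }
}
// ----------------------------------------------------------------------------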