com.linkedin.pinot.core.startree.hll.OffheapStarTreeBuilderWithHllFieldTest.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.pinot.core.startree.hll.OffheapStarTreeBuilderWithHllFieldTest.java

Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.startree.hll;

import com.linkedin.pinot.common.data.*;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder;
import com.linkedin.pinot.core.startree.StarTreeBuilderConfig;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.util.*;
import java.util.concurrent.TimeUnit;

public class OffheapStarTreeBuilderWithHllFieldTest {

    private static final Logger LOGGER = LoggerFactory.getLogger(OffheapStarTreeBuilderWithHllFieldTest.class);
    private static final long randomSeed = 31; // a fixed value

    private final String memberIdFieldName = "id";
    private final String hllDeriveFieldSuffix = HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX;
    private final int log2m = 8; //HllUtil.Constants.DEFAULT_LOG2M;

    private void testSimpleCore(int numDimensions, int numMetrics, int numSkipMaterializationDimensions,
            int[] memberIdColumnValues, long preciseCardinality) throws Exception {
        StarTreeBuilderConfig builderConfig = null;
        try {
            builderConfig = new StarTreeBuilderConfig();
            Schema schema = new Schema();
            builderConfig.dimensionsSplitOrder = new ArrayList<>();
            builderConfig.setSkipMaterializationForDimensions(new HashSet<String>());
            Set<String> skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();

            // add member id dimension spec
            String dimName = memberIdFieldName;
            DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.INT, true);
            schema.addField(dimensionFieldSpec);
            // add other dimension specs
            for (int i = 1; i < numDimensions; i++) {
                dimName = "d" + (i + 1);
                dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.STRING, true);
                schema.addField(dimensionFieldSpec);

                if (i < (numDimensions - numSkipMaterializationDimensions)) {
                    builderConfig.dimensionsSplitOrder.add(dimName);
                } else {
                    builderConfig.getSkipMaterializationForDimensions().add(dimName);
                }
            }

            schema.setTimeFieldSpec(new TimeFieldSpec("daysSinceEpoch", DataType.INT, TimeUnit.DAYS));
            // add other metric specs
            for (int i = 0; i < numMetrics - 1; i++) {
                String metricName = "m" + (i + 1);
                MetricFieldSpec metricFieldSpec = new MetricFieldSpec(metricName, DataType.INT);
                schema.addField(metricFieldSpec);
            }
            // add hll metric
            String hllMetricName = memberIdFieldName + hllDeriveFieldSuffix;
            MetricFieldSpec hllDerivedFieldSpec = new MetricFieldSpec(hllMetricName, FieldSpec.DataType.STRING,
                    HllUtil.getHllFieldSizeFromLog2m(log2m), MetricFieldSpec.DerivedMetricType.HLL);
            schema.addField(hllDerivedFieldSpec);
            //
            builderConfig.maxLeafRecords = 10;
            builderConfig.schema = schema;
            builderConfig.setOutDir(new File("/tmp/startree"));
            //
            OffHeapStarTreeBuilder builder = new OffHeapStarTreeBuilder();
            builder.init(builderConfig);
            // fill values
            HashMap<String, Object> map = new HashMap<>();
            for (int row = 0; row < memberIdColumnValues.length; row++) {
                // add member id column
                dimName = memberIdFieldName;
                map.put(dimName, memberIdColumnValues[row]);
                // add other dimensions
                for (int i = 1; i < numDimensions; i++) {
                    dimName = schema.getDimensionFieldSpecs().get(i).getName();
                    map.put(dimName, dimName + "-v" + row % (numDimensions - i));
                }
                // add time column
                map.put("daysSinceEpoch", 1);
                // add other metrics
                for (int i = 0; i < numMetrics - 1; i++) {
                    String metName = schema.getMetricFieldSpecs().get(i).getName();
                    map.put(metName, 1);
                }
                // add hll column value
                map.put(hllMetricName, HllUtil.singleValueHllAsString(log2m, memberIdColumnValues[row]));
                //
                GenericRow genericRow = new GenericRow();
                genericRow.init(map);
                builder.append(genericRow);
            }
            builder.build();

            int totalDocs = builder.getTotalRawDocumentCount() + builder.getTotalAggregateDocumentCount();
            Iterator<GenericRow> iterator = builder.iterator(0, totalDocs);
            while (iterator.hasNext()) {
                GenericRow row = iterator.next();
                LOGGER.info(HllUtil.inspectGenericRow(row, hllDeriveFieldSuffix));
            }

            iterator = builder.iterator(builder.getTotalRawDocumentCount(), totalDocs);
            GenericRow lastRow = null;
            while (iterator.hasNext()) {
                GenericRow row = iterator.next();
                for (String skipDimension : skipMaterializationForDimensions) {
                    String rowValue = (String) row.getValue(skipDimension);
                    assert (rowValue.equals("ALL"));
                }
                lastRow = row;
            }

            assertApproximation(HllUtil.convertStringToHll((String) lastRow.getValue(hllMetricName)).cardinality(),
                    preciseCardinality, 0.1);
        } finally {
            if (builderConfig != null) {
                FileUtils.deleteDirectory(builderConfig.getOutDir());
            }
        }
    }

    private static void assertApproximation(double estimate, double actual, double precision) {
        estimate = Math.abs(estimate);
        actual = Math.abs(actual);

        double errorRate = 1;
        if (actual > 0) {
            errorRate = Math.abs((actual - estimate) / actual);
        }
        LOGGER.info("estimate: " + estimate + " actual: " + actual + " error (in rate): " + errorRate);
        Assert.assertEquals(errorRate < precision, true);
    }

    private static class RandomNumberArray {
        private static Random _rnd = new Random(randomSeed);

        private final int[] arr;
        private final HashSet<Integer> set = new HashSet<Integer>();

        /**
         * Data ranges between [0, size)
         * @param size
         * @param duplicationPerItem
         */
        RandomNumberArray(int size, int duplicationPerItem) {
            List<Integer> lst = new ArrayList<Integer>();
            for (int i = 0; i < size / duplicationPerItem; i++) {
                Integer item = _rnd.nextInt(size);
                for (int j = 0; j < duplicationPerItem; j++) {
                    lst.add(item); // add duplicates
                }
            }
            // add remaining items
            int st = lst.size();
            for (int i = st; i < size; i++) {
                Integer item = _rnd.nextInt(size);
                lst.add(item);
            }
            // add to set
            set.addAll(lst);
            // shuffle
            Collections.shuffle(lst, new Random(10L));
            // toIntArray
            arr = convertToIntArray(lst);
            if (arr.length != size) {
                throw new RuntimeException("should not happen");
            }
        }

        private int[] convertToIntArray(List<Integer> list) {
            int[] ret = new int[list.size()];
            for (int i = 0; i < ret.length; i++)
                ret[i] = list.get(i);
            return ret;
        }

        public int[] toIntArray() {
            return arr;
        }

        public int size() {
            return arr.length;
        }

        public int getPreciseCardinality() {
            return set.size();
        }
    }

    @Test
    public void testSmallDuplicates() throws Exception {
        RandomNumberArray rand = new RandomNumberArray(500, 1);
        testSimpleCore(3, 3, 0, rand.toIntArray(), rand.getPreciseCardinality());
    }

    @Test
    public void testMediumDuplicates() throws Exception {
        RandomNumberArray rand = new RandomNumberArray(500, 5);
        testSimpleCore(3, 3, 0, rand.toIntArray(), rand.getPreciseCardinality());
    }

    @Test
    public void testLargeDuplicates() throws Exception {
        RandomNumberArray rand = new RandomNumberArray(500, 50);
        testSimpleCore(3, 3, 0, rand.toIntArray(), rand.getPreciseCardinality());
    }

    @Test
    public void testSkipMaterialization() throws Exception {
        RandomNumberArray rand = new RandomNumberArray(250, 3);
        testSimpleCore(6, 4, 2, rand.toIntArray(), rand.getPreciseCardinality());
    }

}