com.linkedin.pinot.queries.FastHllQueriesTest.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.pinot.queries.FastHllQueriesTest.java

Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.queries;

import com.clearspring.analytics.stream.cardinality.HyperLogLog;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.response.broker.BrokerResponseNative;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.data.manager.offline.OfflineSegmentDataManager;
import com.linkedin.pinot.core.data.manager.offline.SegmentDataManager;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.indexsegment.columnar.ColumnarSegmentLoader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.operator.ExecutionStatistics;
import com.linkedin.pinot.core.operator.blocks.IntermediateResultsBlock;
import com.linkedin.pinot.core.operator.query.AggregationGroupByOperator;
import com.linkedin.pinot.core.operator.query.AggregationOperator;
import com.linkedin.pinot.core.query.aggregation.groupby.AggregationGroupByResult;
import com.linkedin.pinot.core.query.aggregation.groupby.GroupKeyGenerator;
import com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.startree.hll.HllConfig;
import java.io.File;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

/**
 * The <code>FastHllQueriesTest</code> class sets up the index segment and creates fastHll on 'column17' and
 * 'column18'.
 * <p>The original Avro file contains 18 columns and 30000 records, of which 11 columns are selected to build the
 * index segment. The selected columns are listed below in the format
 * <code>ColumnName, FieldType, DataType, Cardinality, IsSorted, HasInvertedIndex</code>:
 * <ul>
 *   <li>column1, METRIC, INT, 6582, F, F</li>
 *   <li>column3, METRIC, INT, 21910, F, F</li>
 *   <li>column5, DIMENSION, STRING, 1, T, F</li>
 *   <li>column6, DIMENSION, INT, 608, F, T</li>
 *   <li>column7, DIMENSION, INT, 146, F, T</li>
 *   <li>column9, DIMENSION, INT, 1737, F, F</li>
 *   <li>column11, DIMENSION, STRING, 5, F, T</li>
 *   <li>column12, DIMENSION, STRING, 5, F, F</li>
 *   <li>column17, METRIC, INT, 24, F, T</li>
 *   <li>column18, METRIC, INT, 1440, F, T</li>
 *   <li>daysSinceEpoch, TIME, INT, 2, T, F</li>
 * </ul>
 * <p>Each test builds its own segment under {@link #INDEX_DIR} and always cleans it up afterwards, even when an
 * assertion fails.
 */
@SuppressWarnings("ConstantConditions")
public class FastHllQueriesTest extends BaseQueriesTest {
    private static final String AVRO_DATA_WITHOUT_PRE_GENERATED_HLL_COLUMNS = "data" + File.separator
            + "test_data-sv.avro";
    private static final String AVRO_DATA_WITH_PRE_GENERATED_HLL_COLUMNS = "data" + File.separator
            + "test_data-sv_hll.avro";
    private static final String SEGMENT_NAME = "testTable_126164076_167572854";
    private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), "FastHllQueriesTest");
    // First argument handed to every HllConfig created by this test
    private static final int HLL_LOG2M = 6;

    private static final String BASE_QUERY = "SELECT FASTHLL(column17_HLL), FASTHLL(column18_HLL) FROM testTable";
    private static final String GROUP_BY = " group by column11";
    private static final String QUERY_FILTER = " WHERE column1 > 100000000"
            + " AND column3 BETWEEN 20000000 AND 1000000000" + " AND column5 = 'gFuH'"
            + " AND (column6 < 500000000 OR column11 NOT IN ('t', 'P'))" + " AND daysSinceEpoch = 126164076";

    private IndexSegment _indexSegment;
    // Contains 2 identical index segments
    private List<SegmentDataManager> _segmentDataManagers;

    @Override
    protected String getFilter() {
        return QUERY_FILTER;
    }

    @Override
    protected IndexSegment getIndexSegment() {
        return _indexSegment;
    }

    @Override
    protected List<SegmentDataManager> getSegmentDataManagers() {
        return _segmentDataManagers;
    }

    /**
     * Runs the FASTHLL queries against a segment built from the Avro file without pre-generated HLL columns, where
     * the HLL columns are derived during segment creation.
     *
     * @throws Exception if the segment cannot be built or loaded
     */
    @Test
    public void testFastHllWithoutPreGeneratedHllColumns() throws Exception {
        // The finally block guarantees the segment and the temp directory are released even on assertion failure
        try {
            buildAndLoadSegment(false);

            // Test inner segment queries
            // Test base query
            AggregationOperator aggregationOperator = getOperatorForQuery(BASE_QUERY);
            IntermediateResultsBlock resultsBlock = (IntermediateResultsBlock) aggregationOperator.nextBlock();
            ExecutionStatistics executionStatistics = aggregationOperator.getExecutionStatistics();
            QueriesTestUtils.testInnerSegmentExecutionStatistics(executionStatistics, 1L, 0L, 2L, 30000L);
            assertHllCardinalities(resultsBlock.getAggregationResult(), 21L, 1762L);
            // Test query with filter
            aggregationOperator = getOperatorForQueryWithFilter(BASE_QUERY);
            resultsBlock = (IntermediateResultsBlock) aggregationOperator.nextBlock();
            executionStatistics = aggregationOperator.getExecutionStatistics();
            QueriesTestUtils.testInnerSegmentExecutionStatistics(executionStatistics, 6129L, 112472L, 12258L,
                    30000L);
            assertHllCardinalities(resultsBlock.getAggregationResult(), 17L, 1197L);
            // Test query with group-by
            AggregationGroupByOperator aggregationGroupByOperator = getOperatorForQuery(BASE_QUERY + GROUP_BY);
            resultsBlock = (IntermediateResultsBlock) aggregationGroupByOperator.nextBlock();
            executionStatistics = aggregationGroupByOperator.getExecutionStatistics();
            QueriesTestUtils.testInnerSegmentExecutionStatistics(executionStatistics, 4613L, 0L, 13839L, 30000L);
            assertFirstGroupResult(resultsBlock.getAggregationGroupByResult(), "", 21L, 691L);

            // Test inter segments base query
            BrokerResponseNative brokerResponse = getBrokerResponseForQuery(BASE_QUERY);
            QueriesTestUtils.testInterSegmentAggregationResult(brokerResponse, 4L, 0L, 8L, 120000L,
                    new String[] { "21", "1762" });
            // Test inter segments query with filter
            brokerResponse = getBrokerResponseForQueryWithFilter(BASE_QUERY);
            QueriesTestUtils.testInterSegmentAggregationResult(brokerResponse, 24516L, 449888L, 49032L, 120000L,
                    new String[] { "17", "1197" });
            // Test inter segments query with group-by
            brokerResponse = getBrokerResponseForQuery(BASE_QUERY + GROUP_BY);
            QueriesTestUtils.testInterSegmentAggregationResult(brokerResponse, 18452L, 0L, 55356L, 120000L,
                    new String[] { "21", "1762" });
        } finally {
            deleteSegment();
        }
    }

    /**
     * Runs the FASTHLL queries against a segment built from the Avro file that already contains the
     * 'column17_HLL' and 'column18_HLL' columns.
     *
     * @throws Exception if the segment cannot be built or loaded
     */
    @Test
    public void testFastHllWithPreGeneratedHllColumns() throws Exception {
        // The finally block guarantees the segment and the temp directory are released even on assertion failure
        try {
            buildAndLoadSegment(true);

            // Test inner segment queries
            // Test base query
            AggregationOperator aggregationOperator = getOperatorForQuery(BASE_QUERY);
            IntermediateResultsBlock resultsBlock = (IntermediateResultsBlock) aggregationOperator.nextBlock();
            ExecutionStatistics executionStatistics = aggregationOperator.getExecutionStatistics();
            QueriesTestUtils.testInnerSegmentExecutionStatistics(executionStatistics, 30000L, 0L, 60000L, 30000L);
            assertHllCardinalities(resultsBlock.getAggregationResult(), 21L, 1762L);
            // Test query with filter
            aggregationOperator = getOperatorForQueryWithFilter(BASE_QUERY);
            resultsBlock = (IntermediateResultsBlock) aggregationOperator.nextBlock();
            executionStatistics = aggregationOperator.getExecutionStatistics();
            QueriesTestUtils.testInnerSegmentExecutionStatistics(executionStatistics, 6129L, 84134L, 12258L,
                    30000L);
            assertHllCardinalities(resultsBlock.getAggregationResult(), 17L, 1197L);
            // Test query with group-by
            AggregationGroupByOperator aggregationGroupByOperator = getOperatorForQuery(BASE_QUERY + GROUP_BY);
            resultsBlock = (IntermediateResultsBlock) aggregationGroupByOperator.nextBlock();
            executionStatistics = aggregationGroupByOperator.getExecutionStatistics();
            QueriesTestUtils.testInnerSegmentExecutionStatistics(executionStatistics, 30000L, 0L, 90000L, 30000L);
            assertFirstGroupResult(resultsBlock.getAggregationGroupByResult(), "", 21L, 691L);

            // Test inter segments base query
            BrokerResponseNative brokerResponse = getBrokerResponseForQuery(BASE_QUERY);
            QueriesTestUtils.testInterSegmentAggregationResult(brokerResponse, 120000L, 0L, 240000L, 120000L,
                    new String[] { "21", "1762" });
            // Test inter segments query with filter
            brokerResponse = getBrokerResponseForQueryWithFilter(BASE_QUERY);
            QueriesTestUtils.testInterSegmentAggregationResult(brokerResponse, 24516L, 336536L, 49032L, 120000L,
                    new String[] { "17", "1197" });
            // Test inter segments query with group-by
            brokerResponse = getBrokerResponseForQuery(BASE_QUERY + GROUP_BY);
            QueriesTestUtils.testInterSegmentAggregationResult(brokerResponse, 120000L, 0L, 360000L, 120000L,
                    new String[] { "21", "1762" });
        } finally {
            deleteSegment();
        }
    }

    /**
     * Asserts the cardinalities of the two HyperLogLog results of an aggregation-only query.
     *
     * @param aggregationResult aggregation results; index 0 is for 'column17_HLL', index 1 for 'column18_HLL'
     * @param expectedColumn17 expected cardinality of the first result
     * @param expectedColumn18 expected cardinality of the second result
     */
    private static void assertHllCardinalities(List<Object> aggregationResult, long expectedColumn17,
            long expectedColumn18) {
        Assert.assertEquals(((HyperLogLog) aggregationResult.get(0)).cardinality(), expectedColumn17);
        Assert.assertEquals(((HyperLogLog) aggregationResult.get(1)).cardinality(), expectedColumn18);
    }

    /**
     * Asserts the key and the two HyperLogLog cardinalities of the first group returned by the group-key iterator.
     *
     * @param groupByResult group-by result of the query
     * @param expectedKey expected string key of the first group
     * @param expectedColumn17 expected cardinality of the first aggregation for that group
     * @param expectedColumn18 expected cardinality of the second aggregation for that group
     */
    private static void assertFirstGroupResult(AggregationGroupByResult groupByResult, String expectedKey,
            long expectedColumn17, long expectedColumn18) {
        GroupKeyGenerator.GroupKey firstGroupKey = groupByResult.getGroupKeyIterator().next();
        Assert.assertEquals(firstGroupKey._stringKey, expectedKey);
        Assert.assertEquals(((HyperLogLog) groupByResult.getResultForKey(firstGroupKey, 0)).cardinality(),
                expectedColumn17);
        Assert.assertEquals(((HyperLogLog) groupByResult.getResultForKey(firstGroupKey, 1)).cardinality(),
                expectedColumn18);
    }

    /**
     * Builds the index segment from the selected Avro resource, loads it, and wraps it in two segment data
     * managers.
     *
     * @param hasPreGeneratedHllColumns whether to use the Avro file that already contains the HLL columns; when
     *     false, the star-tree index is enabled and the HLL config names the origin columns and the suffix
     * @throws Exception if the resource cannot be resolved or the segment cannot be built or loaded
     */
    private void buildAndLoadSegment(boolean hasPreGeneratedHllColumns) throws Exception {
        FileUtils.deleteQuietly(INDEX_DIR);

        // Get resource file path
        URL resource;
        if (hasPreGeneratedHllColumns) {
            resource = getClass().getClassLoader().getResource(AVRO_DATA_WITH_PRE_GENERATED_HLL_COLUMNS);
        } else {
            resource = getClass().getClassLoader().getResource(AVRO_DATA_WITHOUT_PRE_GENERATED_HLL_COLUMNS);
        }
        Assert.assertNotNull(resource);
        // Go through the URI so that percent-encoded characters (e.g. spaces) in the path are decoded;
        // URL.getFile() would return the still-encoded path
        String filePath = new File(resource.toURI()).getAbsolutePath();

        // Build the segment schema
        Schema.SchemaBuilder schemaBuilder = new Schema.SchemaBuilder().setSchemaName("testTable")
                .addMetric("column1", FieldSpec.DataType.INT).addMetric("column3", FieldSpec.DataType.INT)
                .addSingleValueDimension("column5", FieldSpec.DataType.STRING)
                .addSingleValueDimension("column6", FieldSpec.DataType.INT)
                .addSingleValueDimension("column7", FieldSpec.DataType.INT)
                .addSingleValueDimension("column9", FieldSpec.DataType.INT)
                .addSingleValueDimension("column11", FieldSpec.DataType.STRING)
                .addSingleValueDimension("column12", FieldSpec.DataType.STRING)
                .addMetric("column17", FieldSpec.DataType.INT).addMetric("column18", FieldSpec.DataType.INT)
                .addTime("daysSinceEpoch", TimeUnit.DAYS, FieldSpec.DataType.INT);
        if (hasPreGeneratedHllColumns) {
            // The HLL columns come straight from the input file, so they must be part of the schema
            schemaBuilder.addSingleValueDimension("column17_HLL", FieldSpec.DataType.STRING)
                    .addSingleValueDimension("column18_HLL", FieldSpec.DataType.STRING);
        }

        // Create the segment generator config
        SegmentGeneratorConfig segmentGeneratorConfig = new SegmentGeneratorConfig(schemaBuilder.build());
        segmentGeneratorConfig.setInputFilePath(filePath);
        segmentGeneratorConfig.setTableName("testTable");
        segmentGeneratorConfig.setOutDir(INDEX_DIR.getAbsolutePath());
        segmentGeneratorConfig.setInvertedIndexCreationColumns(
                Arrays.asList("column6", "column7", "column11", "column17", "column18"));
        if (hasPreGeneratedHllColumns) {
            segmentGeneratorConfig.setHllConfig(new HllConfig(HLL_LOG2M));
        } else {
            segmentGeneratorConfig.enableStarTreeIndex(null);
            // Intentionally use the non-default suffix
            segmentGeneratorConfig.setHllConfig(
                    new HllConfig(HLL_LOG2M, new HashSet<>(Arrays.asList("column17", "column18")), "_HLL"));
        }

        // Build the index segment
        SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
        driver.init(segmentGeneratorConfig);
        driver.build();

        _indexSegment = ColumnarSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME), ReadMode.heap);
        _segmentDataManagers = Arrays.<SegmentDataManager>asList(new OfflineSegmentDataManager(_indexSegment),
                new OfflineSegmentDataManager(_indexSegment));
    }

    /**
     * Destroys the loaded segment (if any) and removes the index directory. Safe to call when the segment was
     * never loaded, e.g. after a failed build.
     */
    private void deleteSegment() {
        if (_indexSegment != null) {
            _indexSegment.destroy();
            // Drop the references so a later cleanup cannot destroy the same segment twice
            _indexSegment = null;
        }
        _segmentDataManagers = null;
        FileUtils.deleteQuietly(INDEX_DIR);
    }
}