com.linkedin.pinot.perf.StringDictionaryPerfTest.java Source code

Introduction

Here is the source code for com.linkedin.pinot.perf.StringDictionaryPerfTest.java
Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.perf;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.data.readers.RecordReader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.index.IndexSegmentImpl;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.index.readers.ImmutableDictionaryReader;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.mutable.MutableLong;

/**
 * Performance test for lookup in string dictionary.
 */
public class StringDictionaryPerfTest {
    private static final int MAX_STRING_LENGTH = 100;
    private static final String TMP_DIR = System.getProperty("java.io.tmpdir");
    private static final String COLUMN_NAME = "test";
    private static final int TOTAL_NUM_LOOKUPS = 100_000;

    String[] _inputStrings;
    private File _indexDir;
    private int _dictLength;

    /**
     * Helper method to build a segment:
     * <ul>
     *   <li> Segment contains one string column </li>
     *   <li> Row values for the column are randomly generated strings of length 1 to 100 </li>
     * </ul>
     *
     * @param dictLength Length of the dictionary
     * @throws Exception
     */
    public void buildSegment(int dictLength) throws Exception {
        Schema schema = new Schema();
        String segmentName = "perfTestSegment" + System.currentTimeMillis();
        _indexDir = new File(TMP_DIR + File.separator + segmentName);
        _indexDir.deleteOnExit();

        FieldSpec fieldSpec = new DimensionFieldSpec(COLUMN_NAME, FieldSpec.DataType.STRING, true);
        schema.addField(fieldSpec);

        _dictLength = dictLength;
        _inputStrings = new String[dictLength];

        SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
        config.setOutDir(_indexDir.getParent());
        config.setFormat(FileFormat.AVRO);
        config.setSegmentName(segmentName);

        Random random = new Random(System.nanoTime());
        final List<GenericRow> data = new ArrayList<>();
        Set<String> uniqueStrings = new HashSet<>(dictLength);

        int i = 0;
        while (i < dictLength) {
            HashMap<String, Object> map = new HashMap<>();
            String randomString = RandomStringUtils.randomAlphanumeric(1 + random.nextInt(MAX_STRING_LENGTH));

            if (uniqueStrings.contains(randomString)) {
                continue;
            }

            _inputStrings[i] = randomString;
            uniqueStrings.add(randomString);
            map.put("test", _inputStrings[i++]);

            GenericRow genericRow = new GenericRow();
            genericRow.init(map);
            data.add(genericRow);
        }

        SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
        RecordReader reader = getGenericRowRecordReader(schema, data);
        driver.init(config, reader);
        driver.build();
    }

    /**
     * Measures the performance of string dictionary lookups by performing the provided
     * number of lookups to random indices.
     *
     * @param numLookups Number of lookups to perform
     * @throws Exception
     */
    public void perfTestLookups(int numLookups) throws Exception {
        IndexSegmentImpl segment = (IndexSegmentImpl) Loaders.IndexSegment.load(_indexDir, ReadMode.heap);
        ImmutableDictionaryReader dictionary = segment.getDictionaryFor(COLUMN_NAME);

        Random random = new Random(System.nanoTime());
        long start = System.currentTimeMillis();

        for (int i = 0; i < numLookups; i++) {
            int index = 1 + random.nextInt(_dictLength);
            dictionary.indexOf(_inputStrings[index]);
        }

        FileUtils.deleteQuietly(_indexDir);
        System.out.println(
                "Total time for " + TOTAL_NUM_LOOKUPS + " lookups: " + (System.currentTimeMillis() - start));
    }

    /**
     * Returns an implementation of GenericRow record reader.
     *
     * @param schema Schema for the data
     * @param data Data
     * @return GenericRow record reader
     */
    private static RecordReader getGenericRowRecordReader(final Schema schema, final List<GenericRow> data) {
        return new RecordReader() {
            int _counter = 0;

            @Override
            public void rewind() throws Exception {
                _counter = 0;
            }

            @Override
            public GenericRow next() {
                return data.get(_counter++);
            }

            @Override
            public GenericRow next(GenericRow row) {
                return next();
            }

            @Override
            public void init() throws Exception {
            }

            @Override
            public boolean hasNext() {
                return _counter < data.size();
            }

            @Override
            public Schema getSchema() {
                return schema;
            }

            @Override
            public Map<String, MutableLong> getNullCountMap() {
                return null;
            }

            @Override
            public void close() throws Exception {
            }
        };
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("Usage: StringDictionaryPerfRunner <dictionary_length> <num_lookups> ");
        }

        int dictLength = Integer.valueOf(args[0]);
        int numLookups = Integer.valueOf(args[1]);

        StringDictionaryPerfTest test = new StringDictionaryPerfTest();
        test.buildSegment(dictLength);
        test.perfTestLookups(numLookups);
    }
}