com.linkedin.pinot.perf.ForwardIndexReaderBenchmark.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.pinot.perf.ForwardIndexReaderBenchmark.java

Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.perf;

import com.google.common.collect.Lists;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion;
import com.linkedin.pinot.core.io.reader.BaseSingleColumnMultiValueReader;
import com.linkedin.pinot.core.io.reader.BaseSingleColumnSingleValueReader;
import com.linkedin.pinot.core.io.reader.impl.v1.MultiValueReaderContext;
import com.linkedin.pinot.core.segment.index.ColumnMetadata;
import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl;
import com.linkedin.pinot.core.segment.memory.PinotDataBuffer;
import java.io.File;
import java.io.FileWriter;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import me.lemire.integercompression.BitPacking;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;

/**
 * Given a pinot segment directory, it benchmarks forward index scan speed
 */
public class ForwardIndexReaderBenchmark {
    static int MAX_RUNS = 10;

    public static void singleValuedReadBenchMarkV1(File file, int numDocs, int columnSizeInBits) throws Exception {
        boolean signed = false;
        boolean isMmap = false;
        PinotDataBuffer heapBuffer = PinotDataBuffer.fromFile(file, ReadMode.heap, FileChannel.MapMode.READ_ONLY,
                "benchmark");
        BaseSingleColumnSingleValueReader reader = new com.linkedin.pinot.core.io.reader.impl.v1.FixedBitSingleValueReader(
                heapBuffer, numDocs, columnSizeInBits, signed);
        // sequential read
        long start, end;
        DescriptiveStatistics stats = new DescriptiveStatistics();
        for (int run = 0; run < MAX_RUNS; run++) {
            start = System.currentTimeMillis();
            for (int i = 0; i < numDocs; i++) {
                int value = reader.getInt(i);
            }
            end = System.currentTimeMillis();
            stats.addValue(end - start);
        }
        System.out.println(" v1 sequential read stats for " + file.getName());
        System.out.println(stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));
        reader.close();
        heapBuffer.close();
    }

    public static void singleValuedReadBenchMarkV2(File file, int numDocs, int numBits) throws Exception {
        boolean signed = false;
        boolean isMmap = false;
        long start, end;
        boolean fullScan = true;

        boolean batchRead = true;
        boolean singleRead = true;

        PinotDataBuffer heapBuffer = PinotDataBuffer.fromFile(file, ReadMode.heap, FileChannel.MapMode.READ_ONLY,
                "benchmarking");
        com.linkedin.pinot.core.io.reader.impl.v2.FixedBitSingleValueReader reader = new com.linkedin.pinot.core.io.reader.impl.v2.FixedBitSingleValueReader(
                heapBuffer, numDocs, numBits, signed);

        if (fullScan) {
            DescriptiveStatistics stats = new DescriptiveStatistics();
            ByteBuffer buffer = ByteBuffer.allocateDirect((int) file.length());
            RandomAccessFile raf = new RandomAccessFile(file, "r");
            raf.getChannel().read(buffer);
            raf.close();
            int[] input = new int[numBits];
            int[] output = new int[32];
            int numBatches = (numDocs + 31) / 32;
            for (int run = 0; run < MAX_RUNS; run++) {
                start = System.currentTimeMillis();
                for (int i = 0; i < numBatches; i++) {
                    for (int j = 0; j < numBits; j++) {
                        input[j] = buffer.getInt(i * numBits * 4 + j * 4);
                    }
                    BitPacking.fastunpack(input, 0, output, 0, numBits);
                }
                end = System.currentTimeMillis();
                stats.addValue((end - start));
            }
            System.out.println(" v2 full scan stats for " + file.getName());
            System.out.println(
                    stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));
        }
        if (singleRead) {
            DescriptiveStatistics stats = new DescriptiveStatistics();
            // sequential read
            for (int run = 0; run < MAX_RUNS; run++) {
                start = System.currentTimeMillis();
                for (int i = 0; i < numDocs; i++) {
                    int value = reader.getInt(i);
                }
                end = System.currentTimeMillis();
                stats.addValue((end - start));
            }
            System.out.println(" v2 sequential single read for " + file.getName());
            System.out.println(
                    stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));
        }
        if (batchRead) {
            DescriptiveStatistics stats = new DescriptiveStatistics();
            int batchSize = Math.min(5000, numDocs);
            int[] output = new int[batchSize];
            int[] rowIds = new int[batchSize];

            // sequential read
            for (int run = 0; run < MAX_RUNS; run++) {
                start = System.currentTimeMillis();
                int rowId = 0;
                while (rowId < numDocs) {
                    int length = Math.min(batchSize, numDocs - rowId);
                    for (int i = 0; i < length; i++) {
                        rowIds[i] = rowId + i;
                    }
                    reader.getIntBatch(rowIds, output, length);
                    rowId = rowId + length;
                }
                end = System.currentTimeMillis();
                stats.addValue((end - start));
            }
            System.out.println("v2 sequential batch read stats for " + file.getName());
            System.out.println(
                    stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));
        }
        reader.close();

    }

    public static void multiValuedReadBenchMarkV1(File file, int numDocs, int totalNumValues, int maxEntriesPerDoc,
            int columnSizeInBits) throws Exception {
        System.out.println("******************************************************************");
        System.out.println("Analyzing " + file.getName() + " numDocs:" + numDocs + ", totalNumValues:"
                + totalNumValues + ", maxEntriesPerDoc:" + maxEntriesPerDoc + ", numBits:" + columnSizeInBits);
        long start, end;
        boolean readFile = true;
        boolean randomRead = true;
        boolean contextualRead = true;
        boolean signed = false;
        boolean isMmap = false;
        PinotDataBuffer heapBuffer = PinotDataBuffer.fromFile(file, ReadMode.mmap, FileChannel.MapMode.READ_ONLY,
                "benchmarking");
        BaseSingleColumnMultiValueReader reader = new com.linkedin.pinot.core.io.reader.impl.v1.FixedBitMultiValueReader(
                heapBuffer, numDocs, totalNumValues, columnSizeInBits, signed);

        int[] intArray = new int[maxEntriesPerDoc];
        File outfile = new File("/tmp/" + file.getName() + ".raw");
        FileWriter fw = new FileWriter(outfile);
        for (int i = 0; i < numDocs; i++) {
            int length = reader.getIntArray(i, intArray);
            StringBuilder sb = new StringBuilder();
            String delim = "";
            for (int j = 0; j < length; j++) {
                sb.append(delim);
                sb.append(intArray[j]);
                delim = ",";
            }
            fw.write(sb.toString());
            fw.write("\n");
        }
        fw.close();

        // sequential read
        if (readFile) {
            DescriptiveStatistics stats = new DescriptiveStatistics();
            RandomAccessFile raf = new RandomAccessFile(file, "rw");
            ByteBuffer buffer = ByteBuffer.allocateDirect((int) file.length());
            raf.getChannel().read(buffer);
            for (int run = 0; run < MAX_RUNS; run++) {
                long length = file.length();
                start = System.currentTimeMillis();
                for (int i = 0; i < length; i++) {
                    byte b = buffer.get(i);
                }
                end = System.currentTimeMillis();
                stats.addValue((end - start));
            }
            System.out.println("v1 multi value read bytes stats for " + file.getName());
            System.out.println(
                    stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));

            raf.close();
        }
        if (randomRead) {
            DescriptiveStatistics stats = new DescriptiveStatistics();
            for (int run = 0; run < MAX_RUNS; run++) {
                start = System.currentTimeMillis();
                for (int i = 0; i < numDocs; i++) {
                    int length = reader.getIntArray(i, intArray);
                }
                end = System.currentTimeMillis();
                stats.addValue((end - start));
            }
            System.out.println("v1 multi value sequential read one stats for " + file.getName());
            System.out.println(
                    stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));
        }

        if (contextualRead) {
            DescriptiveStatistics stats = new DescriptiveStatistics();
            for (int run = 0; run < MAX_RUNS; run++) {
                MultiValueReaderContext context = (MultiValueReaderContext) reader.createContext();
                start = System.currentTimeMillis();
                for (int i = 0; i < numDocs; i++) {
                    int length = reader.getIntArray(i, intArray, context);
                }
                end = System.currentTimeMillis();
                // System.out.println("RUN:" + run + "Time:" + (end-start));
                stats.addValue((end - start));
            }
            System.out.println("v1 multi value sequential read one with context stats for " + file.getName());
            System.out.println(
                    stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));

        }
        reader.close();
        heapBuffer.close();
        System.out.println("******************************************************************");

    }

    public static void multiValuedReadBenchMarkV2(File file, int numDocs, int totalNumValues, int maxEntriesPerDoc,
            int columnSizeInBits) throws Exception {
        boolean signed = false;
        boolean isMmap = false;
        boolean readOneEachTime = true;
        PinotDataBuffer heapBuffer = PinotDataBuffer.fromFile(file, ReadMode.heap, FileChannel.MapMode.READ_ONLY,
                "benchmarking");
        com.linkedin.pinot.core.io.reader.impl.v2.FixedBitMultiValueReader reader = new com.linkedin.pinot.core.io.reader.impl.v2.FixedBitMultiValueReader(
                heapBuffer, numDocs, totalNumValues, columnSizeInBits, signed);

        int[] intArray = new int[maxEntriesPerDoc];
        long start, end;

        // read one entry at a time
        if (readOneEachTime) {
            DescriptiveStatistics stats = new DescriptiveStatistics();

            for (int run = 0; run < MAX_RUNS; run++) {
                start = System.currentTimeMillis();
                for (int i = 0; i < numDocs; i++) {
                    int length = reader.getIntArray(i, intArray);
                }
                end = System.currentTimeMillis();
                stats.addValue((end - start));
            }
            System.out.println("v2 multi value sequential read one stats for " + file.getName());
            System.out.println(
                    stats.toString().replaceAll("\n", ", ") + " raw:" + Arrays.toString(stats.getValues()));
        }
        reader.close();
        heapBuffer.close();
    }

    private static void benchmarkForwardIndex(String indexDir) throws Exception {
        benchmarkForwardIndex(indexDir, null);
    }

    private static void benchmarkForwardIndex(String indexDir, List<String> includeColumns) throws Exception {
        SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(new File(indexDir));
        String segmentVersion = segmentMetadata.getVersion();
        Set<String> columns = segmentMetadata.getAllColumns();
        for (String column : columns) {
            if (includeColumns != null && !includeColumns.isEmpty()) {
                if (!includeColumns.contains(column)) {
                    continue;
                }
            }

            ColumnMetadata columnMetadata = segmentMetadata.getColumnMetadataFor(column);
            if (columnMetadata.isSingleValue()) {
                continue;
            }
            if (!columnMetadata.isSingleValue()) {

                String fwdIndexFileName = segmentMetadata.getForwardIndexFileName(column, segmentVersion);
                File fwdIndexFile = new File(indexDir, fwdIndexFileName);
                multiValuedReadBenchMark(segmentVersion, fwdIndexFile, segmentMetadata.getTotalDocs(),
                        columnMetadata.getTotalNumberOfEntries(), columnMetadata.getMaxNumberOfMultiValues(),
                        columnMetadata.getBitsPerElement());
            } else if (columnMetadata.isSingleValue() && !columnMetadata.isSorted()) {
                String fwdIndexFileName = segmentMetadata.getForwardIndexFileName(column, segmentVersion);
                File fwdIndexFile = new File(indexDir, fwdIndexFileName);
                singleValuedReadBenchMark(segmentVersion, fwdIndexFile, segmentMetadata.getTotalDocs(),
                        columnMetadata.getBitsPerElement());
            }
        }
    }

    private static void multiValuedReadBenchMark(String segmentVersion, File fwdIndexFile, int totalDocs,
            int totalNumberOfEntries, int maxNumberOfMultiValues, int bitsPerElement) throws Exception {
        if (SegmentVersion.v1.name().equals(segmentVersion)) {
            multiValuedReadBenchMarkV1(fwdIndexFile, totalDocs, totalNumberOfEntries, maxNumberOfMultiValues,
                    bitsPerElement);
        } else if (SegmentVersion.v2.name().equals(segmentVersion)) {
            multiValuedReadBenchMarkV2(fwdIndexFile, totalDocs, totalNumberOfEntries, maxNumberOfMultiValues,
                    bitsPerElement);
        }
    }

    private static void singleValuedReadBenchMark(String segmentVersion, File fwdIndexFile, int totalDocs,
            int bitsPerElement) throws Exception {
        if (SegmentVersion.v1.name().equals(segmentVersion)) {
            singleValuedReadBenchMarkV1(fwdIndexFile, totalDocs, bitsPerElement);
        } else if (SegmentVersion.v2.name().equals(segmentVersion)) {
            singleValuedReadBenchMarkV2(fwdIndexFile, totalDocs, bitsPerElement);
        }
    }

    /**
     * USAGE ForwardIndexReaderBenchmark <indexDir> <comma delimited column_names(optional)>
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        String indexDir = args[0];
        if (args.length == 1) {
            benchmarkForwardIndex(indexDir);
        }
        if (args.length == 2) {
            benchmarkForwardIndex(indexDir, Lists.newArrayList(args[1].trim().split(",")));
        }
    }

}