com.linkedin.pinot.tools.segment.converter.DictionaryToRawIndexConverter.java Source code

Introduction

Here is the source code for com.linkedin.pinot.tools.segment.converter.DictionaryToRawIndexConverter.java, a Pinot command-line tool that converts dictionary-encoded single-value columns of a segment into raw (no-dictionary) forward indexes.
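For orientation, here is a minimal programmatic usage sketch. It only chains the setters and convert() defined in the listing below; the directory paths and column names are hypothetical placeholders, not values from the source.

import com.linkedin.pinot.tools.segment.converter.DictionaryToRawIndexConverter;

public class ConvertExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical paths: dataDir holds un-tarred segment directories,
        // outputDir receives the converted segments.
        boolean converted = new DictionaryToRawIndexConverter()
                .setDataDir("/path/to/untarred/segments")
                .setOutputDir("/path/to/output")
                .setColumns("columnA,columnB") // comma-separated, as parsed in convert()
                .setOverwrite(true)
                .convert();
        System.out.println("Conversion " + (converted ? "succeeded" : "failed"));
    }
}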

Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.tools.segment.converter;

import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.common.utils.TarGzCompressionUtils;
import com.linkedin.pinot.core.common.BlockSingleValIterator;
import com.linkedin.pinot.core.common.DataSource;
import com.linkedin.pinot.core.common.DataSourceMetadata;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.segment.creator.SingleValueRawIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.SegmentColumnarIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.index.readers.Dictionary;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.nio.charset.Charset;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.io.FileUtils;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class to convert dictionary-encoded columns of a segment to a raw index (without dictionary).
 */
@SuppressWarnings({ "FieldCanBeLocal", "unused" })
public class DictionaryToRawIndexConverter {
    private static final Logger LOGGER = LoggerFactory.getLogger(DictionaryToRawIndexConverter.class);
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    @Option(name = "-dataDir", required = true, usage = "Directory containing uncompressed segments")
    private String _dataDir = null;

    @Option(name = "-columns", required = true, usage = "Comma separated list of column names to convert")
    private String _columns = null;

    @Option(name = "-tableName", required = false, usage = "New table name, if different from original")
    private String _tableName = null;

    @Option(name = "-outputDir", required = true, usage = "Output directory for writing results")
    private String _outputDir = null;

    @Option(name = "-overwrite", required = false, usage = "Overwrite output directory")
    private boolean _overwrite = false;

    @Option(name = "-numThreads", required = false, usage = "Number of threads to launch for conversion")
    private int _numThreads = 4;

    @Option(name = "-compressOutput", required = false, usage = "Compress (tar + gzip) output segment")
    private boolean _compressOutput = false;

    @Option(name = "-help", required = false, help = true, aliases = { "-h" }, usage = "print this message")
    private boolean _help = false;

    /**
     * Setter for {@link #_dataDir}
     * @param dataDir Data directory containing un-tarred segments.
     * @return this
     */
    public DictionaryToRawIndexConverter setDataDir(String dataDir) {
        _dataDir = dataDir;
        return this;
    }

    /**
     * Setter for {@link #_outputDir}
     *
     * @param outputDir Directory where output segments should be written
     * @return this
     */
    public DictionaryToRawIndexConverter setOutputDir(String outputDir) {
        _outputDir = outputDir;
        return this;
    }

    /**
     * Setter for columns to convert.
     *
     * @param columns Comma separated list of columns
     * @return this
     */
    public DictionaryToRawIndexConverter setColumns(String columns) {
        _columns = columns;
        return this;
    }

    /**
     * Setter for {@link #_overwrite}
     * When set to true, an already existing output directory is overwritten.
     *
     * @param overwrite True for overwriting existing output dir, False otherwise
     * @return this
     */
    public DictionaryToRawIndexConverter setOverwrite(boolean overwrite) {
        _overwrite = overwrite;
        return this;
    }

    /**
     * Method to perform the conversion for a set of segments in the {@link #_dataDir}
     *
     * @return True if successful, False otherwise
     * @throws Exception
     */
    public boolean convert() throws Exception {
        if (_help) {
            printUsage();
            return true;
        }

        File dataDir = new File(_dataDir);
        File outputDir = new File(_outputDir);

        if (!dataDir.exists()) {
            LOGGER.error("Data directory '{}' does not exist.", _dataDir);
            return false;
        } else if (outputDir.exists()) {
            if (_overwrite) {
                LOGGER.info("Overwriting existing output directory '{}'", _outputDir);
                FileUtils.deleteQuietly(outputDir);
                outputDir = new File(_outputDir);
                outputDir.mkdir();
            } else {
                LOGGER.error("Output directory '{}' already exists, use -overwrite to overwrite", outputDir);
                return false;
            }
        }

        File[] segmentFiles = dataDir.listFiles();
        if (segmentFiles == null || segmentFiles.length == 0) {
            LOGGER.error("Empty data directory '{}'.", _dataDir);
            return false;
        }

        boolean ret = true;
        final File outDir = outputDir;
        ExecutorService executorService = Executors.newFixedThreadPool(_numThreads);
        for (final File segmentDir : segmentFiles) {
            executorService.execute(new Runnable() {
                @Override
                public void run() {
                    try {
                        convertSegment(segmentDir, _columns.split("\\s*,\\s*"), outDir, _compressOutput);
                    } catch (Exception e) {
                        LOGGER.error("Exception caught while converting segment {}", segmentDir.getName(), e);
                        e.printStackTrace();
                    }
                }
            });
        }

        executorService.shutdown();
        executorService.awaitTermination(1, TimeUnit.HOURS);
        return ret;
    }

    /**
     * This method converts the specified columns of the given segment from dictionary encoded
     * forward index to raw index without dictionary.
     *
     * @param segmentDir Segment directory
     * @param columns Columns to convert
     * @param outputDir Directory for writing output segment
     * @param compressOutput Tar/gzip the output segment
     * @return True if successful, False otherwise
     * @throws Exception
     */
    public boolean convertSegment(File segmentDir, String[] columns, File outputDir, boolean compressOutput)
            throws Exception {
        File newSegment;

        if (segmentDir.isFile()) {
            if (segmentDir.getName().endsWith(".tar.gz") || segmentDir.getName().endsWith(".tgz")) {
                LOGGER.info("Uncompressing input segment '{}'", segmentDir);
                newSegment = TarGzCompressionUtils.unTar(segmentDir, outputDir).get(0);
            } else {
                LOGGER.warn("Skipping non-segment file '{}'", segmentDir.getAbsoluteFile());
                return false;
            }
        } else {
            newSegment = new File(outputDir, segmentDir.getName());
            newSegment.mkdir();
            FileUtils.copyDirectory(segmentDir, newSegment);
        }

        IndexSegment segment = Loaders.IndexSegment.load(newSegment, ReadMode.mmap);
        for (String column : columns) {
            LOGGER.info("Converting column '{}' for segment '{}'.", column, segment.getSegmentName());
            convertOneColumn(segment, column, newSegment);
        }

        updateMetadata(newSegment, columns, _tableName);
        segment.destroy();

        if (compressOutput) {
            LOGGER.info("Compressing segment '{}'", newSegment);
            TarGzCompressionUtils.createTarGzOfDirectory(newSegment.getAbsolutePath(),
                    newSegment.getAbsolutePath());
            FileUtils.deleteQuietly(newSegment);
        }
        return true;
    }

    /**
     * Helper method to update the metadata.properties for the converted segment.
     *
     * @param segmentDir Segment directory
     * @param columns Converted columns
     * @param tableName New table name to be written in the metadata. Skipped if null.
     * @throws IOException
     * @throws ConfigurationException
     */
    private void updateMetadata(File segmentDir, String[] columns, String tableName)
            throws IOException, ConfigurationException {
        File metadataFile = new File(segmentDir, V1Constants.MetadataKeys.METADATA_FILE_NAME);
        PropertiesConfiguration properties = new PropertiesConfiguration(metadataFile);

        if (tableName != null) {
            properties.setProperty(V1Constants.MetadataKeys.Segment.TABLE_NAME, tableName);
        }

        for (String column : columns) {
            properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column,
                    V1Constants.MetadataKeys.Column.HAS_DICTIONARY), false);
            properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column,
                    V1Constants.MetadataKeys.Column.BITS_PER_ELEMENT), -1);
        }
        properties.save();
    }

    /**
     * Helper method to print usage at the command line interface.
     */
    private static void printUsage() {
        System.out.println("Usage: DictionaryToRawIndexConverter");
        for (Field field : DictionaryToRawIndexConverter.class.getDeclaredFields()) {
            if (field.isAnnotationPresent(Option.class)) {
                Option option = field.getAnnotation(Option.class);

                System.out.println(String.format("\t%-15s: %s (required=%s)", option.name(), option.usage(),
                        option.required()));
            }
        }
    }

    /**
     * Helper method to perform conversion for the specific column.
     *
     * @param segment Input segment to convert
     * @param column Column to convert
     * @param newSegment Directory where the raw index is to be written
     * @throws IOException
     */
    private void convertOneColumn(IndexSegment segment, String column, File newSegment) throws IOException {
        DataSource dataSource = segment.getDataSource(column);
        Dictionary dictionary = dataSource.getDictionary();

        if (dictionary == null) {
            LOGGER.error("Column '{}' does not have a dictionary, cannot convert to raw index.", column);
            return;
        }

        DataSourceMetadata dataSourceMetadata = dataSource.getDataSourceMetadata();
        if (!dataSourceMetadata.isSingleValue()) {
            LOGGER.error("Cannot convert multi-valued column '{}'", column);
            return;
        }

        int totalDocs = segment.getSegmentMetadata().getTotalDocs();
        BlockSingleValIterator bvIter = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet()
                .iterator();

        FieldSpec.DataType dataType = dataSourceMetadata.getDataType();
        int lengthOfLongestEntry = (dataType == FieldSpec.DataType.STRING)
                ? getLengthOfLongestEntry(bvIter, dictionary)
                : -1;

        SingleValueRawIndexCreator rawIndexCreator = SegmentColumnarIndexCreator
                .getRawIndexCreatorForColumn(newSegment, column, dataType, totalDocs, lengthOfLongestEntry);

        int docId = 0;
        bvIter.reset();
        while (bvIter.hasNext()) {
            int dictId = bvIter.nextIntVal();
            Object value = dictionary.get(dictId);
            rawIndexCreator.index(docId++, value);

            if (docId % 1000000 == 0) {
                LOGGER.info("Converted {} records.", docId);
            }
        }
        rawIndexCreator.close();
        deleteForwardIndex(newSegment.getParentFile(), column, dataSourceMetadata.isSorted());
    }

    /**
     * Helper method to remove the dictionary and forward index files for the given column.
     *
     * @param segmentDir Segment directory from which to remove the forward index.
     * @param column Column for which to remove the index.
     * @param sorted True if column is sorted, False otherwise
     */
    private void deleteForwardIndex(File segmentDir, String column, boolean sorted) {
        File dictionaryFile = new File(segmentDir, (column + V1Constants.Dict.FILE_EXTENTION));
        FileUtils.deleteQuietly(dictionaryFile);

        String fwdIndexFileExtension = (sorted) ? V1Constants.Indexes.SORTED_FWD_IDX_FILE_EXTENTION
                : V1Constants.Indexes.UN_SORTED_SV_FWD_IDX_FILE_EXTENTION;
        File fwdIndexFile = new File(segmentDir, (column + fwdIndexFileExtension));
        FileUtils.deleteQuietly(fwdIndexFile);
    }

    /**
     * Helper method to get the length (in UTF-8 bytes) of the longest String entry in the column.
     *
     * @param bvIter Block value set iterator for the column's data source
     * @param dictionary Column dictionary
     * @return Length of longest entry
     */
    private int getLengthOfLongestEntry(BlockSingleValIterator bvIter, Dictionary dictionary) {
        int lengthOfLongestEntry = 0;

        bvIter.reset();
        while (bvIter.hasNext()) {
            int dictId = bvIter.nextIntVal();
            String value = (String) dictionary.get(dictId);
            lengthOfLongestEntry = Math.max(lengthOfLongestEntry, value.getBytes(UTF_8).length);
        }

        return lengthOfLongestEntry;
    }

    /**
     * Main method for the class.
     *
     * @param args Arguments for the converter
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        DictionaryToRawIndexConverter converter = new DictionaryToRawIndexConverter();
        CmdLineParser parser = new CmdLineParser(converter);
        parser.parseArgument(args);
        converter.convert();
    }
}
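The same conversion can also be driven through the args4j entry point. A minimal sketch follows; the option names come straight from the @Option annotations in the listing, while the paths and column names are hypothetical placeholders.

import com.linkedin.pinot.tools.segment.converter.DictionaryToRawIndexConverter;

public class ConvertViaMainExample {
    public static void main(String[] args) throws Exception {
        // Equivalent to a command-line invocation; the boolean -overwrite flag
        // needs no value. Paths and column names are hypothetical.
        DictionaryToRawIndexConverter.main(new String[] {
                "-dataDir", "/path/to/untarred/segments",
                "-columns", "columnA,columnB",
                "-outputDir", "/path/to/output",
                "-overwrite"
        });
    }
}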