com.linkedin.pinot.hadoop.io.PinotOutputFormat.java Source code

Introduction

Here is the source code for com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.hadoop.io;

import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.StarTreeIndexSpec;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.startree.hll.HllConfig;
import com.linkedin.pinot.startree.hll.HllConstants;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;

/**
 * Generic Pinot Output Format implementation.
 *
 * <p>All settings are carried through the Hadoop job {@link Configuration} under the
 * {@code pinot.*} keys declared below. The static {@code setX(Job, ...)} methods write a setting
 * and the matching {@code getX(JobContext)} methods read it back. {@link
 * #getRecordWriter(TaskAttemptContext)} reads those settings into a {@link SegmentGeneratorConfig}
 * and hands it to a {@link PinotRecordWriter}.
 *
 * @param <K> key type passed through to the {@link RecordWriter}
 * @param <V> value type passed through to the {@link RecordWriter}
 */
public class PinotOutputFormat<K, V> extends FileOutputFormat<K, V> {

    // Segment generator settings; populated from the job configuration in initSegmentConfig().
    private final SegmentGeneratorConfig _segmentConfig;

    // Pinot temp directory to create segment.
    public static final String TEMP_SEGMENT_DIR = "pinot.temp.segment.dir";

    // Name of the table
    public static final String TABLE_NAME = "pinot.table.name";

    // Name of the segment.
    public static final String SEGMENT_NAME = "pinot.segment_name";

    // Schema for the data. Despite the key name, setSchema() stores the schema's JSON text here
    // (not a file path) and initSegmentConfig() parses it back with Schema.fromString().
    public static final String SCHEMA = "pinot.schema.file";

    // config file for the record reader
    public static final String READER_CONFIG = "pinot.reader.config.file";

    // boolean flag to enable Star Tree Index.
    public static final String ENABLE_STAR_TREE_INDEX = "pinot.enable.star.tree.index";

    // Config file for star tree index.
    public static final String STAR_TREE_INDEX_SPEC = "pinot.star.tree.index.spec.file";

    // HLL size (log scale), default is 9.
    public static final String HLL_SIZE = "pinot.hll.size";

    // HLL columns
    public static final String HLL_COLUMNS = "pinot.hll.columns";

    // Suffix for the derived HLL columns
    public static final String HLL_SUFFIX = "pinot.hll.suffix";

    // Fully qualified name of the PinotRecordSerialization implementation to instantiate.
    public static final String PINOT_RECORD_SERIALIZATION_CLASS = "pinot.record.serialization.class";

    public PinotOutputFormat() {
        _segmentConfig = new SegmentGeneratorConfig();
    }

    /** Stores the temp segment directory in the job configuration. */
    public static void setTempSegmentDir(Job job, String segmentDir) {
        job.getConfiguration().set(PinotOutputFormat.TEMP_SEGMENT_DIR, segmentDir);
    }

    /**
     * Returns the configured temp segment directory, or {@code ".data_" + tableName} when none is
     * configured.
     *
     * @throws RuntimeException if neither the temp directory nor the table name is set
     */
    public static String getTempSegmentDir(JobContext job) {
        String segmentDir = job.getConfiguration().get(PinotOutputFormat.TEMP_SEGMENT_DIR);
        if (segmentDir != null) {
            return segmentDir;
        }
        // Only derive the default when it is actually needed: getTableName() throws when the table
        // name is missing, which must not break callers that configured the directory explicitly.
        return ".data_" + getTableName(job);
    }

    /** Stores the table name in the job configuration. */
    public static void setTableName(Job job, String table) {
        job.getConfiguration().set(PinotOutputFormat.TABLE_NAME, table);
    }

    /**
     * Returns the configured table name.
     *
     * @throws RuntimeException if the table name is not set
     */
    public static String getTableName(JobContext job) {
        String table = job.getConfiguration().get(PinotOutputFormat.TABLE_NAME);
        if (table == null) {
            throw new RuntimeException("pinot table name not set.");
        }
        return table;
    }

    /** Stores the segment name in the job configuration. */
    public static void setSegmentName(Job job, String segmentName) {
        job.getConfiguration().set(PinotOutputFormat.SEGMENT_NAME, segmentName);
    }

    /**
     * Returns the configured segment name.
     *
     * @throws RuntimeException if the segment name is not set
     */
    public static String getSegmentName(JobContext context) {
        String segment = context.getConfiguration().get(PinotOutputFormat.SEGMENT_NAME);
        if (segment == null) {
            throw new RuntimeException("pinot segment name not set.");
        }
        return segment;
    }

    /** Stores the JSON form of {@code schema} in the job configuration. */
    public static void setSchema(Job job, Schema schema) {
        job.getConfiguration().set(PinotOutputFormat.SCHEMA, schema.getJSONSchema());
    }

    /**
     * Returns the schema string stored under {@link #SCHEMA}.
     *
     * @throws RuntimeException if no schema is set
     */
    public static String getSchema(JobContext context) {
        String schema = context.getConfiguration().get(PinotOutputFormat.SCHEMA);
        if (schema == null) {
            throw new RuntimeException("pinot schema file not set");
        }
        return schema;
    }

    /** Stores the record reader config file setting in the job configuration. */
    public static void setReaderConfig(Job job, String readConfig) {
        job.getConfiguration().set(PinotOutputFormat.READER_CONFIG, readConfig);
    }

    /** Returns the record reader config file setting, or {@code null} when not set. */
    public static String getReaderConfig(JobContext context) {
        return context.getConfiguration().get(PinotOutputFormat.READER_CONFIG);
    }

    /** Stores the star tree index enable flag in the job configuration. */
    public static void setEnableStarTreeIndex(Job job, boolean flag) {
        job.getConfiguration().setBoolean(PinotOutputFormat.ENABLE_STAR_TREE_INDEX, flag);
    }

    /** Returns the star tree index enable flag; {@code false} when not set. */
    public static boolean getEnableStarTreeIndex(JobContext context) {
        return context.getConfiguration().getBoolean(PinotOutputFormat.ENABLE_STAR_TREE_INDEX, false);
    }

    /** Stores the star tree index spec file setting in the job configuration. */
    public static void setStarTreeIndexSpec(Job job, String starTreeIndexSpec) {
        job.getConfiguration().set(PinotOutputFormat.STAR_TREE_INDEX_SPEC, starTreeIndexSpec);
    }

    /** Returns the star tree index spec file setting, or {@code null} when not set. */
    public static String getStarTreeIndexSpec(JobContext context) {
        return context.getConfiguration().get(PinotOutputFormat.STAR_TREE_INDEX_SPEC);
    }

    /** Stores the HLL size in the job configuration. */
    public static void setHllSize(Job job, int size) {
        job.getConfiguration().setInt(PinotOutputFormat.HLL_SIZE, size);
    }

    /**
     * Stores the HLL size in the job configuration.
     *
     * @deprecated misnamed setter kept for source compatibility; use {@link #setHllSize(Job, int)}.
     */
    @Deprecated
    public static void getHllSize(Job job, int size) {
        setHllSize(job, size);
    }

    /** Returns the configured HLL size; 9 when not set. */
    public static int getHllSize(JobContext context) {
        return context.getConfiguration().getInt(PinotOutputFormat.HLL_SIZE, 9);
    }

    /** Stores the HLL columns setting in the job configuration. */
    public static void setHllColumns(Job job, String columns) {
        job.getConfiguration().set(PinotOutputFormat.HLL_COLUMNS, columns);
    }

    /** Returns the HLL columns setting, or {@code null} when not set. */
    public static String getHllColumns(JobContext context) {
        return context.getConfiguration().get(PinotOutputFormat.HLL_COLUMNS);
    }

    /** Stores the suffix for derived HLL columns in the job configuration. */
    public static void setHllSuffix(Job job, String suffix) {
        job.getConfiguration().set(PinotOutputFormat.HLL_SUFFIX, suffix);
    }

    /**
     * Stores the suffix for derived HLL columns in the job configuration.
     *
     * @deprecated misnamed setter kept for source compatibility; use {@link
     *     #setHllSuffix(Job, String)}.
     */
    @Deprecated
    public static void getHllSuffix(Job job, String suffix) {
        setHllSuffix(job, suffix);
    }

    /**
     * Returns the suffix for derived HLL columns; {@link
     * HllConstants#DEFAULT_HLL_DERIVE_COLUMN_SUFFIX} when not set.
     */
    public static String getHllSuffix(JobContext context) {
        return context.getConfiguration().get(PinotOutputFormat.HLL_SUFFIX,
                HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX);
    }

    /** Stores the class name of the {@link PinotRecordSerialization} implementation to use. */
    public static void setDataWriteSupportClass(Job job,
            Class<? extends PinotRecordSerialization> pinotSerialization) {
        job.getConfiguration().set(PinotOutputFormat.PINOT_RECORD_SERIALIZATION_CLASS,
                pinotSerialization.getName());
    }

    /**
     * Loads the class named under {@link #PINOT_RECORD_SERIALIZATION_CLASS} through the job
     * configuration.
     *
     * @throws RuntimeException if the class name is not set or the class cannot be found
     */
    public static Class<?> getDataWriteSupportClass(JobContext context) {
        Configuration conf = context.getConfiguration();
        String className = conf.get(PinotOutputFormat.PINOT_RECORD_SERIALIZATION_CLASS);
        if (className == null) {
            throw new RuntimeException("pinot data write support class not set");
        }
        try {
            return conf.getClassByName(className);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds the record writer for a task attempt: calls {@link #configure(Configuration)},
     * instantiates the serialization class, fills the segment config from the job configuration and
     * resolves the default work file for the attempt.
     */
    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
        configure(context.getConfiguration());
        final PinotRecordSerialization dataWriteSupport = getDataWriteSupport(context);
        initSegmentConfig(context);
        Path workDir = getDefaultWorkFile(context, "");
        return new PinotRecordWriter<>(_segmentConfig, context, workDir, dataWriteSupport);
    }

    /**
     * Hook invoked by {@link #getRecordWriter(TaskAttemptContext)} before the {@link RecordWriter}
     * is initialized. Subclasses of {@link PinotOutputFormat} can override it to set additional
     * configuration properties. The default implementation does nothing.
     */
    public void configure(Configuration conf) {

    }

    /** Instantiates the configured serialization class via its no-argument constructor. */
    private PinotRecordSerialization getDataWriteSupport(TaskAttemptContext context) {
        try {
            return (PinotRecordSerialization) PinotOutputFormat.getDataWriteSupportClass(context).newInstance();
        } catch (Exception e) {
            throw new RuntimeException("Error initialize data write support class", e);
        }
    }

    /** Copies the job-level Pinot settings into the segment generator config. */
    private void initSegmentConfig(JobContext context) throws IOException {
        _segmentConfig.setFormat(FileFormat.JSON);
        _segmentConfig.setOutDir(PinotOutputFormat.getTempSegmentDir(context) + "/segmentDir");
        _segmentConfig.setOverwrite(true);
        _segmentConfig.setTableName(PinotOutputFormat.getTableName(context));
        _segmentConfig.setSegmentName(PinotOutputFormat.getSegmentName(context));
        _segmentConfig.setSchema(Schema.fromString(PinotOutputFormat.getSchema(context)));
        _segmentConfig.setReaderConfigFile(PinotOutputFormat.getReaderConfig(context));
        initStarTreeIndex(context);
        initHllConfig(context);
    }

    /**
     * Sets an {@link HllConfig} on the segment config when at least one HLL column is configured.
     * The column list is comma separated; all whitespace is stripped before splitting.
     */
    private void initHllConfig(JobContext context) {
        String configuredColumns = PinotOutputFormat.getHllColumns(context);
        if (configuredColumns == null) {
            return;
        }
        String[] hllColumns = StringUtils.split(StringUtils.deleteWhitespace(configuredColumns), ',');
        if (hllColumns.length == 0) {
            return;
        }
        HllConfig hllConfig = new HllConfig(PinotOutputFormat.getHllSize(context));
        hllConfig.setColumnsToDeriveHllFields(new HashSet<>(Arrays.asList(hllColumns)));
        hllConfig.setHllDeriveColumnSuffix(PinotOutputFormat.getHllSuffix(context));
        _segmentConfig.setHllConfig(hllConfig);
    }

    /**
     * Enables star tree index generation when either a spec file is configured or the enable flag
     * is set.
     */
    private void initStarTreeIndex(JobContext context) throws IOException {
        String specFile = PinotOutputFormat.getStarTreeIndexSpec(context);
        if (specFile != null) {
            StarTreeIndexSpec starTreeIndexSpec = StarTreeIndexSpec.fromFile(new File(specFile));
            // Specifying star-tree index file enables star tree generation, even if the enable flag is not specified.
            _segmentConfig.enableStarTreeIndex(starTreeIndexSpec);
        } else if (PinotOutputFormat.getEnableStarTreeIndex(context)) {
            _segmentConfig.enableStarTreeIndex(null);
        }
    }
}