com.streamsets.pipeline.lib.util.AvroToParquetConverterUtil.java Source code

Introduction

Here is the source code for com.streamsets.pipeline.lib.util.AvroToParquetConverterUtil.java, a StreamSets utility class that prepares a Parquet writer builder for converting Avro records to Parquet files.

Source

/*
 * Copyright 2018 StreamSets Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.util;

import com.streamsets.pipeline.lib.converter.AvroParquetConstants;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.SemanticVersion;
import org.apache.parquet.Version;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

public class AvroToParquetConverterUtil {
    private static final Logger LOG = LoggerFactory.getLogger(AvroToParquetConverterUtil.class);

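    // Private constructor; this is a static utility class and is not meant to be instantiated.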
    private AvroToParquetConverterUtil() {
    }

    public static ParquetWriter.Builder initializeWriter(Path tempFile, Schema avroSchema, Configuration conf)
            throws IOException {

        // Detect Parquet version to see if it supports logical types
        LOG.info("Detected Parquet version: " + Version.FULL_VERSION);

        // Parquet Avro pre-1.9 doesn't work with logical types, so in that case we use a custom Builder that injects our own
        // Avro schema -> Parquet schema generator class (a copy of the one provided in PARQUET-358).
        ParquetWriter.Builder builder = null;
        try {
            SemanticVersion parquetVersion = SemanticVersion.parse(Version.VERSION_NUMBER);
            if (parquetVersion.major > 1 || (parquetVersion.major == 1 && parquetVersion.minor >= 9)) {
                builder = AvroParquetWriter.builder(tempFile).withSchema(avroSchema);
            } else {
                builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
            }
        } catch (SemanticVersion.SemanticVersionParseException e) {
            LOG.warn("Can't parse parquet version string: " + Version.VERSION_NUMBER, e);
            builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
        }

        // Generic arguments from the Job
        if (propertyDefined(conf, AvroParquetConstants.COMPRESSION_CODEC_NAME)) {
            String codec = conf.get(AvroParquetConstants.COMPRESSION_CODEC_NAME);
            LOG.info("Using compression codec: {}", codec);
            builder.withCompressionCodec(CompressionCodecName.fromConf(codec));
        }
        if (propertyDefined(conf, AvroParquetConstants.ROW_GROUP_SIZE)) {
            int size = conf.getInt(AvroParquetConstants.ROW_GROUP_SIZE, -1);
            LOG.info("Using row group size: {}", size);
            builder.withRowGroupSize(size);
        }
        if (propertyDefined(conf, AvroParquetConstants.PAGE_SIZE)) {
            int size = conf.getInt(AvroParquetConstants.PAGE_SIZE, -1);
            LOG.info("Using page size: {}", size);
            builder.withPageSize(size);
        }
        if (propertyDefined(conf, AvroParquetConstants.DICTIONARY_PAGE_SIZE)) {
            int size = conf.getInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE, -1);
            LOG.info("Using dictionary page size: {}", size);
            builder.withDictionaryPageSize(size);
        }
        if (propertyDefined(conf, AvroParquetConstants.MAX_PADDING_SIZE)) {
            int size = conf.getInt(AvroParquetConstants.MAX_PADDING_SIZE, -1);
            LOG.info("Using max padding size: {}", size);
            builder.withMaxPaddingSize(size);
        }

        return builder;
    }

    // Return true if and only if the given property is defined with a non-empty, non-default value
    private static boolean propertyDefined(Configuration conf, String propertyName) {
        String prop = conf.get(propertyName);
        // String properties default to empty and integer properties to -1; we skip both
        return prop != null && !prop.isEmpty() && !prop.equals("-1");
    }
}
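
Usage example

The following is a minimal, hypothetical sketch (not part of the original source) showing how the builder returned by initializeWriter might be used to convert Avro records into a Parquet file. The class name, output path, and chosen configuration values are illustrative assumptions; only the AvroParquetConstants keys and the initializeWriter call come from the class above.

import com.streamsets.pipeline.lib.converter.AvroParquetConstants;
import com.streamsets.pipeline.lib.util.AvroToParquetConverterUtil;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetWriter;

import java.io.IOException;
import java.util.List;

public class AvroToParquetConverterUtilExample {

    @SuppressWarnings("unchecked") // initializeWriter returns a raw ParquetWriter.Builder
    public static void convert(Schema avroSchema, List<GenericRecord> records, String outputPath)
            throws IOException {
        Configuration conf = new Configuration();
        // Optional tuning properties; initializeWriter only applies values that are non-empty and not -1.
        conf.set(AvroParquetConstants.COMPRESSION_CODEC_NAME, "SNAPPY");
        conf.setInt(AvroParquetConstants.ROW_GROUP_SIZE, 128 * 1024 * 1024);

        Path targetFile = new Path(outputPath);
        // Build the writer from the returned builder and stream the Avro records into the Parquet file.
        try (ParquetWriter<GenericRecord> writer =
                 AvroToParquetConverterUtil.initializeWriter(targetFile, avroSchema, conf).build()) {
            for (GenericRecord record : records) {
                writer.write(record);
            }
        }
    }
}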