co.cask.hydrator.plugin.batch.source.TimePartitionedFileSetDatasetParquetSource.java Source code

Introduction

Here is the source code for co.cask.hydrator.plugin.batch.source.TimePartitionedFileSetDatasetParquetSource.java, a CDAP Hydrator batch source plugin that reads from a TimePartitionedFileSet whose data is stored in Parquet format.

Source

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin.batch.source;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.hydrator.common.HiveSchemaConverter;
import co.cask.hydrator.common.batch.JobUtils;
import co.cask.hydrator.plugin.common.AvroToStructuredTransformer;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import parquet.avro.AvroParquetInputFormat;
import parquet.avro.AvroParquetOutputFormat;

import java.io.IOException;
import java.util.Map;

/**
 * A {@link BatchSource} to read Parquet records from a {@link TimePartitionedFileSet}.
 */
@Plugin(type = "batchsource")
@Name("TPFSParquet")
@Description("Reads from a TimePartitionedFileSet whose data is in Parquet format.")
public class TimePartitionedFileSetDatasetParquetSource
        extends TimePartitionedFileSetSource<NullWritable, GenericRecord> {
    private final TPFSParquetConfig tpfsParquetConfig;

    private final AvroToStructuredTransformer recordTransformer = new AvroToStructuredTransformer();

    /**
     * Config for TimePartitionedFileSetDatasetParquetSource
     */
    public static class TPFSParquetConfig extends TPFSConfig {

        @Description("The Parquet schema of the record being read from the source as a JSON Object.")
        private String schema;

        @Override
        protected void validate() {
            super.validate();
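            // Fail fast if the configured schema string is not valid Avro JSON.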
            try {
                new org.apache.avro.Schema.Parser().parse(schema);
            } catch (Exception e) {
                throw new IllegalArgumentException("Unable to parse schema with error: " + e.getMessage(), e);
            }
        }
    }

    @Override
    public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
        super.configurePipeline(pipelineConfigurer);
        Preconditions.checkArgument(!Strings.isNullOrEmpty(tpfsParquetConfig.schema), "Schema must be specified.");
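        // Parse the configured JSON schema into a CDAP Schema and expose it as the stage's
        // output schema so downstream pipeline stages know the structure of emitted records.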
        try {
            co.cask.cdap.api.data.schema.Schema schema = co.cask.cdap.api.data.schema.Schema
                    .parseJson(tpfsParquetConfig.schema);
            pipelineConfigurer.getStageConfigurer().setOutputSchema(schema);
        } catch (Exception e) {
            throw new IllegalArgumentException("Invalid output schema: " + e.getMessage(), e);
        }
    }

    public TimePartitionedFileSetDatasetParquetSource(TPFSParquetConfig tpfsParquetConfig) {
        super(tpfsParquetConfig);
        this.tpfsParquetConfig = tpfsParquetConfig;
    }

    @Override
    protected void addFileSetProperties(FileSetProperties.Builder properties) {
        properties.setInputFormat(AvroParquetInputFormat.class).setOutputFormat(AvroParquetOutputFormat.class)
                .setEnableExploreOnCreate(true).setExploreFormat("parquet");
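        // Convert the configured schema to a Hive schema so the fileset is queryable through
        // Explore; substring(1, length - 1) drops the enclosing characters from the converted string.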
        try {
            String schema = tpfsParquetConfig.schema.toLowerCase();
            String hiveSchema = HiveSchemaConverter
                    .toHiveSchema(co.cask.cdap.api.data.schema.Schema.parseJson(schema));
            properties.setExploreSchema(hiveSchema.substring(1, hiveSchema.length() - 1));
            properties.add(DatasetProperties.SCHEMA, schema);
        } catch (UnsupportedTypeException e) {
            throw new IllegalArgumentException("schema " + tpfsParquetConfig.schema + " is not supported.", e);
        } catch (Exception e) {
            throw new IllegalArgumentException("schema " + tpfsParquetConfig.schema + " is invalid.", e);
        }
    }

    @Override
    protected void addInputFormatConfiguration(Map<String, String> config) {
        try {
            Job job = JobUtils.createInstance();
            Configuration hConf = job.getConfiguration();

            Schema avroSchema = new Schema.Parser().parse(tpfsParquetConfig.schema.toLowerCase());
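            // Register the Avro read schema with AvroParquetInputFormat on a throwaway Job,
            // then copy the resulting Hadoop configuration entries into the plugin's config map.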
            AvroParquetInputFormat.setAvroReadSchema(job, avroSchema);
            for (Map.Entry<String, String> entry : hConf) {
                config.put(entry.getKey(), entry.getValue());
            }
        } catch (IOException e) {
            // Shouldn't happen
            throw Throwables.propagate(e);
        }
    }

    @Override
    public void transform(KeyValue<NullWritable, GenericRecord> input, Emitter<StructuredRecord> emitter)
            throws Exception {
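        // Convert the Avro GenericRecord read from the Parquet files into a CDAP StructuredRecord.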
        emitter.emit(recordTransformer.transform(input.getValue()));
    }
}
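
As a usage sketch (not part of the original file), the standalone snippet below shows the kind of JSON schema string the plugin expects in its "schema" property and mirrors the two parse steps the source performs: TPFSParquetConfig.validate() parses it with the Avro parser, and configurePipeline() parses it as a CDAP schema. The record name "purchase", its fields, and the class name are hypothetical, and the example assumes the Avro and CDAP API jars are on the classpath.

import org.apache.avro.Schema;

public class TPFSParquetSchemaCheck {

    public static void main(String[] args) throws Exception {
        // A hypothetical Avro record schema, written as a JSON object, of the kind the
        // "schema" plugin property expects.
        String schemaJson =
            "{\"type\":\"record\",\"name\":\"purchase\",\"fields\":["
                + "{\"name\":\"ts\",\"type\":\"long\"},"
                + "{\"name\":\"customer\",\"type\":\"string\"},"
                + "{\"name\":\"price\",\"type\":\"double\"}]}";

        // TPFSParquetConfig.validate() parses the property with the Avro parser ...
        Schema avroSchema = new Schema.Parser().parse(schemaJson);

        // ... and configurePipeline() parses the same string as a CDAP schema before
        // setting it as the stage's output schema.
        co.cask.cdap.api.data.schema.Schema cdapSchema =
            co.cask.cdap.api.data.schema.Schema.parseJson(schemaJson);

        System.out.println("Avro schema: " + avroSchema.getFullName());
        System.out.println("CDAP schema fields: " + cdapSchema.getFields().size());
    }
}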