com.cohesionforce.AvroToParquet.java Source code

Java tutorial

Introduction

Here is the source code for com.cohesionforce.AvroToParquet.java

Source

/*******************************************************************************
 * Copyright (c) 2015 CohesionForce Inc
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     CohesionForce Inc - initial API and implementation
 *******************************************************************************/
package com.cohesionforce;

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.hadoop.fs.Path;

import parquet.avro.AvroParquetWriter;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.metadata.CompressionCodecName;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FilenameUtils;

/**
 * Command-line tool that copies every record of an Avro data file into a
 * Snappy-compressed Parquet file.
 *
 * <p>Options:
 * <ul>
 *   <li>{@code -i <file>} input Avro file (required)</li>
 *   <li>{@code -o <file>} output Parquet file (optional; when omitted the
 *       output is the input path with its extension replaced by
 *       {@code .parquet})</li>
 * </ul>
 */
public class AvroToParquet {

    /** Parquet block size passed to the writer: 256 MiB. */
    private static final int BLOCK_SIZE = 256 * 1024 * 1024;

    /** Parquet page size passed to the writer: 64 KiB. */
    private static final int PAGE_SIZE = 64 * 1024;

    /**
     * Parses the command line and runs the conversion.
     *
     * <p>Problems (bad arguments, missing input file, I/O failure) are
     * reported on {@code System.err} and the method simply returns; if the
     * {@code -i} option is absent the usage help is printed instead.
     *
     * @param args command-line arguments, see the class description
     */
    public static void main(String[] args) {

        String inputFile = null;
        String outputFile = null;

        HelpFormatter formatter = new HelpFormatter();
        Options options = new Options();

        // both options take an argument (the file name)
        options.addOption("i", true, "input avro file");
        options.addOption("o", true, "output Parquet file");
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
            inputFile = cmd.getOptionValue("i");
            if (inputFile == null) {
                // the input file is mandatory: show usage and stop
                formatter.printHelp("AvroToParquet", options);
                return;
            }
            outputFile = cmd.getOptionValue("o");
        } catch (ParseException exc) {
            System.err.println("Problem with command line parameters: " + exc.getMessage());
            return;
        }

        File avroFile = new File(inputFile);

        if (!avroFile.exists()) {
            System.err.println("Could not open file: " + inputFile);
            return;
        }
        try {
            convert(avroFile, outputFile);
        } catch (IOException e) {
            System.err.println("Caught exception: " + e.getMessage());
        }
    }

    /**
     * Reads all records from {@code avroFile} and writes them to a Parquet
     * file, using the schema stored in the Avro file.
     *
     * <p>The reader and the writer are closed even when reading or writing
     * fails part-way through.
     *
     * @param avroFile   existing Avro data file to read
     * @param outputFile path of the Parquet file to create, or {@code null}
     *                   to derive it from {@code avroFile}
     * @throws IOException if the input cannot be read or the output cannot
     *                     be written
     */
    private static void convert(File avroFile, String outputFile) throws IOException {
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
        DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
        try {
            Schema avroSchema = dataFileReader.getSchema();

            // choose compression scheme
            CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY;

            Path outputPath = new Path("file:///" + resolveOutputPath(avroFile, outputFile));

            // the ParquetWriter object that will consume Avro GenericRecords
            ParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<GenericRecord>(outputPath,
                    avroSchema, compressionCodecName, BLOCK_SIZE, PAGE_SIZE);
            try {
                for (GenericRecord record : dataFileReader) {
                    parquetWriter.write(record);
                }
            } finally {
                // always release the writer, also when a read or write fails
                parquetWriter.close();
            }
        } finally {
            dataFileReader.close();
        }
    }

    /**
     * Determines the absolute path of the Parquet file to write.
     *
     * @param avroFile   the input file, used to derive the default name
     * @param outputFile explicitly requested output file, or {@code null}
     * @return the absolute path of {@code outputFile} when given, otherwise
     *         the absolute path of {@code avroFile} with its extension
     *         replaced by {@code .parquet}
     */
    private static String resolveOutputPath(File avroFile, String outputFile) {
        if (outputFile != null) {
            return new File(outputFile).getAbsolutePath();
        }
        return FilenameUtils.removeExtension(avroFile.getAbsolutePath()) + ".parquet";
    }
}