net.iponweb.hadoop.streaming.parquet.ParquetAsTextOutputFormat.java Source code

Introduction

Here is the source code for net.iponweb.hadoop.streaming.parquet.ParquetAsTextOutputFormat.java, an OutputFormat for the classic Hadoop mapred API that accepts Text key/value pairs and writes them out as Parquet records, using a message schema taken from the job configuration or from a schema file.

Source

/**
 * Copyright 2014 IPONWEB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package net.iponweb.hadoop.streaming.parquet;

import com.google.common.base.Throwables;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.util.Progressable;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.ParquetRecordWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

public class ParquetAsTextOutputFormat extends FileOutputFormat<Text, Text> {

    private static final Log LOG = LogFactory.getLog(ParquetAsTextOutputFormat.class);
    protected ParquetOutputFormat<SimpleGroup> realOutputFormat = new ParquetOutputFormat<>();

    public static void setWriteSupportClass(Configuration configuration, Class<?> writeSupportClass) {
        configuration.set(ParquetOutputFormat.WRITE_SUPPORT_CLASS, writeSupportClass.getName());
    }

    public static void setBlockSize(Configuration configuration, int blockSize) {
        configuration.setInt(ParquetOutputFormat.BLOCK_SIZE, blockSize);
    }

    public static void setPageSize(Configuration configuration, int pageSize) {
        configuration.setInt(ParquetOutputFormat.PAGE_SIZE, pageSize);
    }

    public static void setCompression(Configuration configuration, CompressionCodecName compression) {
        configuration.set(ParquetOutputFormat.COMPRESSION, compression.name());
    }

    public static void setEnableDictionary(Configuration configuration, boolean enableDictionary) {
        configuration.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, enableDictionary);
    }

    private static Path getDefaultWorkFile(JobConf conf, String name, String extension) {
        String file = getUniqueName(conf, name) + extension;
        return new Path(getWorkOutputPath(conf), file);
    }

    private static CompressionCodecName getCodec(JobConf conf) {

        CompressionCodecName codec;

        if (ParquetOutputFormat.isCompressionSet(conf)) { // explicit parquet config
            codec = ParquetOutputFormat.getCompression(conf);
        } else if (getCompressOutput(conf)) { // from hadoop config
            // find the right codec
            Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
            LOG.info("Compression set through hadoop codec: " + codecClass.getName());
            codec = CompressionCodecName.fromCompressionCodec(codecClass);
        } else {
            codec = CompressionCodecName.UNCOMPRESSED;
        }

        LOG.info("Compression: " + codec.name());
        return codec;
    }

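    /*
     * Resolves the write schema in this order: the "iow.streaming.output.schema"
     * property if set, otherwise the contents of the file named by
     * "iow.streaming.output.schema.file" (default "streaming_output_schema").
     * When "iow.streaming.schema.use.prefix" is true, the task name is expected
     * in the form "schema:filename", where the prefix selects the schema file
     * and the keyword "default" keeps the configured one.
     */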
    @Override
    public RecordWriter<Text, Text> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
            throws IOException {

        // find and load schema

        String writeSchema = job.get("iow.streaming.output.schema");
        MessageType schema;

        if (writeSchema == null) {

            String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");
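            // Illustrative example (not in the original source): the schema file holds a
            // plain-text Parquet message definition, e.g.
            //   message example { required binary key (UTF8); required binary value (UTF8); }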

            if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
                // guess schema from file name
                // format is: schema:filename
                // with special keyword default - 'default:filename'

                String[] str = name.split(":");
                if (!str[0].equals("default"))
                    schemaFile = str[0];

                name = str[1];
            }

            LOG.info("Using schema: " + schemaFile);
            File f = new File(schemaFile);
            try (BufferedReader reader = new BufferedReader(new FileReader(f))) {
                StringBuilder r = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null)
                    r.append(line).append(' '); // keep a separator so tokens on adjacent lines don't merge

                writeSchema = r.toString();

            } catch (Throwable e) {
                LOG.error("Can't read schema file " + schemaFile);
                Throwables.propagateIfPossible(e, IOException.class);
                throw new RuntimeException(e);
            }
        }
        schema = MessageTypeParser.parseMessageType(writeSchema);

        setWriteSupportClass(job, GroupWriteSupport.class);
        GroupWriteSupport.setSchema(schema, job);

        CompressionCodecName codec = getCodec(job);
        String extension = codec.getExtension() + ".parquet";
        Path file = getDefaultWorkFile(job, name, extension);

        ParquetRecordWriter<SimpleGroup> realWriter;
        try {
            realWriter = (ParquetRecordWriter<SimpleGroup>) realOutputFormat.getRecordWriter(job, file, codec);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag rather than clearing it
            throw new IOException(e);
        }

        return createRecordWriter(realWriter, fs, job, name, progress);
    }

    protected RecordWriter<Text, Text> createRecordWriter(ParquetRecordWriter<SimpleGroup> w, FileSystem fs,
            JobConf job, String name, Progressable p) throws IOException {

        return new TextRecordWriterWrapper(w, fs, job, name, p);
    }
}
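
For context, below is a minimal sketch of how this output format might be wired into a classic mapred job. The class name, job name, paths, and inline schema string are illustrative assumptions, not part of the original source, and a real job would also need a mapper that emits Text keys and Text values.

import net.iponweb.hadoop.streaming.parquet.ParquetAsTextOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class TextToParquetJob {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(TextToParquetJob.class);
        conf.setJobName("text-to-parquet"); // hypothetical job name

        // The wrapper consumes Text/Text pairs and writes them as Parquet groups.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setOutputFormat(ParquetAsTextOutputFormat.class);

        // Inline schema for illustration; alternatively point
        // iow.streaming.output.schema.file at a schema file.
        conf.set("iow.streaming.output.schema",
                "message example { required binary key (UTF8); required binary value (UTF8); }");
        ParquetAsTextOutputFormat.setCompression(conf, CompressionCodecName.SNAPPY);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}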