org.apache.parquet.hadoop.TestInputFormatColumnProjection.java Source code

Here is the source code for org.apache.parquet.hadoop.TestInputFormatColumnProjection.java, a MapReduce-based test of Parquet column projection: a write job stores a two-column (uuid, char) dataset, a read job requests only the "char" column through parquet.read.schema, and the test asserts that the bytes read are less than 10% of the bytes written.

/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 */
package org.apache.parquet.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.example.ExampleInputFormat;
import org.apache.parquet.hadoop.example.ExampleOutputFormat;
import org.apache.parquet.hadoop.util.ContextUtil;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;

import static java.lang.Thread.sleep;
import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;

public class TestInputFormatColumnProjection {
    public static final String FILE_CONTENT = "" + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,"
            + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,"
            + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    public static final MessageType PARQUET_TYPE = Types.buildMessage()
            .required(BINARY).as(UTF8).named("uuid")
            .required(BINARY).as(UTF8).named("char")
            .named("FormatTestObject");

    public static class Writer extends Mapper<LongWritable, Text, Void, Group> {
        public static final SimpleGroupFactory GROUP_FACTORY = new SimpleGroupFactory(PARQUET_TYPE);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // writes each character of the line with a UUID
            String line = value.toString();
            for (int i = 0; i < line.length(); i += 1) {
                Group group = GROUP_FACTORY.newGroup();
                group.add(0, Binary.fromString(UUID.randomUUID().toString()));
                group.add(1, Binary.fromString(line.substring(i, i + 1)));
                context.write(null, group);
            }
        }
    }

    public static class Reader extends Mapper<Void, Group, LongWritable, Text> {

        public static Counter bytesReadCounter = null;

        public static void setBytesReadCounter(Counter bytesRead) {
            bytesReadCounter = bytesRead;
        }

        @Override
        protected void map(Void key, Group value, Context context) throws IOException, InterruptedException {
            // Do nothing with the record; just capture Parquet's "bytesread"
            // counter so the test can check how many bytes were actually read.
            setBytesReadCounter(ContextUtil.getCounter(context, "parquet", "bytesread"));
        }
    }

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    @Test
    public void testProjectionSize() throws Exception {
        Assume.assumeTrue( // only run this test for Hadoop 2
                org.apache.hadoop.mapreduce.JobContext.class.isInterface());

        File inputFile = temp.newFile();
        FileOutputStream out = new FileOutputStream(inputFile);
        out.write(FILE_CONTENT.getBytes("UTF-8"));
        out.close();

        // create, then delete, the output locations so the paths are free for
        // the output formats to create them
        File tempFolder = temp.newFolder();
        tempFolder.delete();
        Path tempPath = new Path(tempFolder.toURI());

        File outputFolder = temp.newFile();
        outputFolder.delete();

        Configuration conf = new Configuration();
        // set the projection schema
        conf.set("parquet.read.schema",
                Types.buildMessage().required(BINARY).as(UTF8).named("char").named("FormatTestObject").toString());
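
        // For reference, the projection string set above is:
        //
        //   message FormatTestObject {
        //     required binary char (UTF8);
        //   }
        //
        // so only the "char" column chunks should be read; the "uuid" column
        // is skipped entirely.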

        // disable summary metadata, it isn't needed
        conf.set("parquet.enable.summary-metadata", "false");
        conf.set("parquet.example.schema", PARQUET_TYPE.toString());

        {
            Job writeJob = new Job(conf, "write");
            writeJob.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

            writeJob.setOutputFormatClass(ExampleOutputFormat.class);
            writeJob.setMapperClass(Writer.class);
            writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
            ParquetOutputFormat.setBlockSize(writeJob, 10240);
            ParquetOutputFormat.setPageSize(writeJob, 512);
            ParquetOutputFormat.setDictionaryPageSize(writeJob, 1024);
            ParquetOutputFormat.setEnableDictionary(writeJob, true);
            ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
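            // These sizes are far below the defaults (128 MB row groups, 1 MB
            // pages), presumably so that even this tiny dataset is split across
            // many pages and row groups and the projection savings show up in
            // the byte counters.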
            ParquetOutputFormat.setOutputPath(writeJob, tempPath);

            waitForJob(writeJob);
        }

        long bytesWritten = 0;
        FileSystem fs = FileSystem.getLocal(conf);
        for (FileStatus file : fs.listStatus(tempPath)) {
            bytesWritten += file.getLen();
        }

        long bytesRead;
        {
            Job readJob = new Job(conf, "read");
            readJob.setInputFormatClass(ExampleInputFormat.class);
            ExampleInputFormat.addInputPath(readJob, tempPath);

            readJob.setOutputFormatClass(TextOutputFormat.class);
            readJob.setMapperClass(Reader.class);
            readJob.setNumReduceTasks(0); // no reduce phase
            TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

            waitForJob(readJob);

            bytesRead = Reader.bytesReadCounter.getValue();
        }

        Assert.assertTrue("Should read less than 10% of the input file size", bytesRead < (bytesWritten / 10));
    }

    private void waitForJob(Job job) throws Exception {
        job.submit();
        while (!job.isComplete()) {
            sleep(100);
        }
        if (!job.isSuccessful()) {
            throw new RuntimeException("job failed " + job.getJobName());
        }
    }
}
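
Beyond the MapReduce jobs above, the same projection mechanism can be used from a plain ParquetReader. The following is a minimal sketch, not part of the test file: it assumes a Parquet file written with the FormatTestObject schema above, and the file path /tmp/format-test.parquet is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ProjectionReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same projection as the test: request only the "char" column.
        // ReadSupport.PARQUET_READ_SCHEMA is the "parquet.read.schema" key.
        conf.set(ReadSupport.PARQUET_READ_SCHEMA,
                "message FormatTestObject { required binary char (UTF8); }");

        // Hypothetical path to a file written with the two-column schema.
        Path file = new Path("/tmp/format-test.parquet");

        try (ParquetReader<Group> reader = ParquetReader
                .builder(new GroupReadSupport(), file)
                .withConf(conf)
                .build()) {
            Group group;
            while ((group = reader.read()) != null) {
                // Only the projected "char" column is materialized.
                System.out.println(group.getString("char", 0));
            }
        }
    }
}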