com.facebook.hive.orc.TestInputOutputFormat.java Source code

Introduction

Here is the source code for com.facebook.hive.orc.TestInputOutputFormat.java, a JUnit test class that round-trips rows through OrcSerde, OrcOutputFormat, and OrcInputFormat.
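
Before the full listing, here is a condensed, hypothetical sketch of the round trip the tests below exercise: serialize rows with OrcSerde, write them through OrcOutputFormat, then split and read the file back with OrcInputFormat. It only reuses calls that appear in the listing itself; the class name OrcRoundTripSketch, its main method, and the output path target/sketch.orc are illustrative assumptions, and the sketch is assumed to live in the com.facebook.hive.orc package so it can reuse the test's MyRow helper class.

package com.facebook.hive.orc;

import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.serde2.ReaderWriterProfiler;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Illustrative sketch only; class name and output path are not part of the original file.
public class OrcRoundTripSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        Path file = new Path("target/sketch.orc");   // illustrative output location

        // Write side: build an object inspector for the row class, serialize rows with
        // OrcSerde, and push them through the Hive record writer from OrcOutputFormat.
        StructObjectInspector inspector = (StructObjectInspector)
                ObjectInspectorFactory.getReflectionObjectInspector(
                        TestInputOutputFormat.MyRow.class,
                        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        SerDe serde = new OrcSerde();
        Properties properties = new Properties();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(
                conf, file, TestInputOutputFormat.MyRow.class, true, properties, Reporter.NULL);
        ReaderWriterProfiler.setProfilerOptions(conf);
        writer.write(serde.serialize(new TestInputOutputFormat.MyRow(1, 2), inspector));
        writer.close(true);

        // Read side: re-initialize the SerDe with the table schema, split the file with
        // OrcInputFormat, and iterate the records.
        serde = new OrcSerde();
        properties.setProperty("columns", "x,y");
        properties.setProperty("columns.types", "int:int");
        serde.initialize(conf, properties);
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, file.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value)) {
            System.out.println(serde.deserialize(value));
        }
        reader.close();
    }
}

The tests in the listing follow this same pattern and additionally exercise column projection via hive.io.file.readcolumn.ids, nested structs, empty files, default column types, and a null file system.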

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.hive.orc;

import static org.junit.Assert.assertEquals;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.InputFormatChecker;
import org.apache.hadoop.hive.serde2.ReaderWriterProfiler;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;

import com.facebook.hive.orc.OrcSerde.OrcSerdeRow;
import com.facebook.hive.orc.lazy.OrcLazyRow;

public class TestInputOutputFormat {

    Path workDir = new Path(System.getProperty("test.tmp.dir", "target/test/tmp"));

    public static class MyRow implements Writable {
        int x;
        int y;

        MyRow(int x, int y) {
            this.x = x;
            this.y = y;
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");
        }
    }

    @Rule
    public TestName testCaseName = new TestName();
    JobConf conf;
    FileSystem fs;
    Path testFilePath;

    @Before
    public void openFileSystem() throws Exception {
        conf = new JobConf();
        fs = FileSystem.getLocal(conf);
        testFilePath = new Path(workDir, "TestInputOutputFormat." + testCaseName.getMethodName() + ".orc");
        fs.delete(testFilePath, false);
    }

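    /**
     * Writes three MyRow records through the Hive record writer and reads them back with
     * OrcInputFormat, checking split generation, InputFormatChecker.validateInput, full-row
     * reads, a single-column projection, and the empty column-id list that maps to all columns.
     */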
    @Test
    public void testInOutFormat() throws Exception {
        Properties properties = new Properties();
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
                    ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        }
        SerDe serde = new OrcSerde();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
                properties, Reporter.NULL);
        ReaderWriterProfiler.setProfilerOptions(conf);
        writer.write(serde.serialize(new MyRow(1, 2), inspector));
        writer.write(serde.serialize(new MyRow(2, 2), inspector));
        writer.write(serde.serialize(new MyRow(3, 2), inspector));
        writer.close(true);
        serde = new OrcSerde();
        properties.setProperty("columns", "x,y");
        properties.setProperty("columns.types", "int:int");
        serde.initialize(conf, properties);
        assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
        inspector = (StructObjectInspector) serde.getObjectInspector();
        assertEquals("struct<x:int,y:int>", inspector.getTypeName());
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);

        // test the validateInput method
        ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
        assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
        fileList.add(fs.getFileStatus(testFilePath));
        assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
        fileList.add(fs.getFileStatus(workDir));
        assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

        // read the whole file
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Writable value = (Writable) reader.createValue();
        int rowNum = 0;
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
        assertEquals(0.0, reader.getProgress(), 0.00001);
        assertEquals(0, reader.getPos());
        while (reader.next(key, value)) {
            assertEquals(++rowNum,
                    intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
            assertEquals(2,
                    intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
        }
        assertEquals(3, rowNum);
        assertEquals(1.0, reader.getProgress(), 0.00001);
        reader.close();

        // read just the first column
        conf.set("hive.io.file.readcolumn.ids", "0");
        reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        key = reader.createKey();
        value = (Writable) reader.createValue();
        rowNum = 0;
        fields = inspector.getAllStructFieldRefs();
        while (reader.next(key, value)) {
            assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
            assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
        }
        assertEquals(3, rowNum);
        reader.close();

        // test the mapping of empty string to all columns
        conf.set("hive.io.file.readcolumn.ids", "");
        reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        key = reader.createKey();
        value = (Writable) reader.createValue();
        rowNum = 0;
        fields = inspector.getAllStructFieldRefs();
        while (reader.next(key, value)) {
            assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
            assertEquals(2,
                    intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
        }
        assertEquals(3, rowNum);
        reader.close();
    }

    static class NestedRow implements Writable {
        int z;
        MyRow r;

        NestedRow(int x, int y, int z) {
            this.z = z;
            this.r = new MyRow(x, y);
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("unsupported");
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("unsupported");
        }
    }

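    /**
     * Writes NestedRow records through the plain MapReduce record writer and reads back only
     * the nested struct column (column id 1), verifying its inner fields and that the
     * unprojected column reads as null.
     */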
    @Test
    public void testMROutput() throws Exception {
        JobConf job = new JobConf(conf);
        Properties properties = new Properties();
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
                    ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        }
        SerDe serde = new OrcSerde();
        OutputFormat<?, ?> outFormat = new OrcOutputFormat();
        RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
        writer.close(Reporter.NULL);
        serde = new OrcSerde();
        properties.setProperty("columns", "z,r");
        properties.setProperty("columns.types", "int:struct<x:int,y:int>");
        serde.initialize(conf, properties);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);
        conf.set("hive.io.file.readcolumn.ids", "1");
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        int rowNum = 0;
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
        List<? extends StructField> inFields = inner.getAllStructFieldRefs();
        IntObjectInspector intInspector = (IntObjectInspector) inFields.get(0).getFieldObjectInspector();
        while (reader.next(key, value)) {
            assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
            Object sub = inspector.getStructFieldData(value, fields.get(1));
            assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
            assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
            rowNum += 1;
        }
        assertEquals(3, rowNum);
        reader.close();
    }

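    /**
     * Verifies that the output directory can be taken from mapred.work.output.dir when only a
     * file name is passed to getRecordWriter, by writing one StringRow and reading it back.
     */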
    @Test
    public void testMROutput2() throws Exception {
        JobConf job = new JobConf(conf);
        // Test that you can set the output directory using this config
        job.set("mapred.work.output.dir", testFilePath.getParent().toString());
        Properties properties = new Properties();
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                    ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        }
        SerDe serde = new OrcSerde();
        OutputFormat<?, ?> outFormat = new OrcOutputFormat();
        RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
        writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
        writer.close(Reporter.NULL);
        serde = new OrcSerde();
        properties.setProperty("columns", "col");
        properties.setProperty("columns.types", "string");
        serde.initialize(conf, properties);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        int rowNum = 0;
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        reader.next(key, value);
        assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
                .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        reader.close();

    }

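    /**
     * Writes an ORC file containing no rows and checks that a single split is still produced,
     * that the reader reports no records, and that getSerDeStats() returns null.
     */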
    @Test
    public void testEmptyFile() throws Exception {
        JobConf job = new JobConf(conf);
        Properties properties = new Properties();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
                properties, Reporter.NULL);
        writer.close(true);
        properties.setProperty("columns", "x,y");
        properties.setProperty("columns.types", "int:int");
        SerDe serde = new OrcSerde();
        serde.initialize(conf, properties);
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);

        // read the whole file
        conf.set("hive.io.file.readcolumn.ids", "0,1");
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        assertEquals(0.0, reader.getProgress(), 0.00001);
        assertEquals(0, reader.getPos());
        assertEquals(false, reader.next(key, value));
        reader.close();
        assertEquals(null, serde.getSerDeStats());
    }

    static class StringRow implements Writable {
        String str;
        String str2;

        StringRow(String s) {
            str = s;
            str2 = s;
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");
        }
    }

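    /**
     * Initializes the SerDe with only a "columns" property so the column types default to
     * string, then round-trips several StringRow values and reads them back in order.
     */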
    @Test
    public void testDefaultTypes() throws Exception {
        JobConf job = new JobConf(conf);
        Properties properties = new Properties();
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                    ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        }
        SerDe serde = new OrcSerde();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, StringRow.class,
                true, properties, Reporter.NULL);
        writer.write(serde.serialize(new StringRow("owen"), inspector));
        writer.write(serde.serialize(new StringRow("beth"), inspector));
        writer.write(serde.serialize(new StringRow("laurel"), inspector));
        writer.write(serde.serialize(new StringRow("hazen"), inspector));
        writer.write(serde.serialize(new StringRow("colin"), inspector));
        writer.write(serde.serialize(new StringRow("miles"), inspector));
        writer.close(true);
        serde = new OrcSerde();
        properties.setProperty("columns", "str,str2");
        serde.initialize(conf, properties);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        assertEquals("struct<str:string,str2:string>", inspector.getTypeName());
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);

        // read the whole file
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Writable value = (Writable) reader.createValue();
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
        assertEquals(true, reader.next(key, value));
        assertEquals("owen",
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true, reader.next(key, value));
        assertEquals("beth",
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true, reader.next(key, value));
        assertEquals("laurel",
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true, reader.next(key, value));
        assertEquals("hazen",
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true, reader.next(key, value));
        assertEquals("colin",
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true, reader.next(key, value));
        assertEquals("miles",
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(false, reader.next(key, value));
        reader.close();
    }

    /**
     * Tests that passing null as the file system to getRecordWriter works; this keeps ORC
     * compatible with the way SequenceFile and RCFile tolerate nulls.
     * @throws Exception
     */
    @Test
    public void testNullFileSystem() throws Exception {
        conf.set("mapred.work.output.dir", testFilePath.getParent().toString());
        JobConf job = new JobConf(conf);
        Properties properties = new Properties();
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                    ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        }
        OrcSerde serde = new OrcSerde();
        OrcOutputFormat outFormat = new OrcOutputFormat();
        RecordWriter<NullWritable, OrcSerdeRow> writer = outFormat.getRecordWriter(null, conf,
                testFilePath.getName(), Reporter.NULL);

        writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("a"), inspector));
        writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("b"), inspector));
        writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("c"), inspector));
        writer.close(Reporter.NULL);
        serde = new OrcSerde();
        properties.setProperty("columns", "str,str2");
        serde.initialize(conf, properties);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        OrcInputFormat in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);

        // read the whole file
        org.apache.hadoop.mapred.RecordReader<NullWritable, OrcLazyRow> reader = in.getRecordReader(splits[0], conf,
                Reporter.NULL);
        NullWritable key = reader.createKey();
        OrcLazyRow value = (OrcLazyRow) reader.createValue();
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
        assertEquals(true, reader.next(key, value));
        assertEquals("a", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true, reader.next(key, value));
        assertEquals("b", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true, reader.next(key, value));
        assertEquals("c", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(false, reader.next(key, value));
        reader.close();
    }
}