import static org.junit.Assert.*;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TimeZone;
import java.util.TreeSet;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.tez.ColumnarSplitSizeEstimator;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.shims.CombineHiveKey;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcProto;
import org.apache.orc.TypeDescription;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.esotericsoftware.kryo.Kryo;

@SuppressWarnings({ "deprecation", "unchecked", "rawtypes" })
public class TestInputOutputFormat {
    private static final Logger LOG = LoggerFactory.getLogger(TestInputOutputFormat.class);

    public static String toKryo(SearchArgument sarg) {
        Output out = new Output(4 * 1024, 10 * 1024 * 1024);
        new Kryo().writeObject(out, sarg);
        return Base64.encodeBase64String(out.toBytes());

    Path workDir = new Path(System.getProperty("test.tmp.dir", "target/tmp"));
    static final int MILLIS_IN_DAY = 1000 * 60 * 60 * 24;
    private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd");
    private static final SimpleDateFormat TIME_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss.SSS");
    private static final TimeZone LOCAL_TIMEZONE = TimeZone.getDefault();

    static {
        // Pin both formatters to GMT so that formatted dates and timestamps do not
        // depend on the default time zone of the JVM running the tests.
        TimeZone gmt = TimeZone.getTimeZone("GMT+0");
        DATE_FORMAT.setTimeZone(gmt);
        TIME_FORMAT.setTimeZone(gmt);
    }

    public static class BigRow implements Writable {
        boolean booleanValue;
        byte byteValue;
        short shortValue;
        int intValue;
        long longValue;
        float floatValue;
        double doubleValue;
        String stringValue;
        HiveDecimal decimalValue;
        Date dateValue;
        Timestamp timestampValue;

        BigRow(long x) {
            booleanValue = x % 2 == 0;
            byteValue = (byte) x;
            shortValue = (short) x;
            intValue = (int) x;
            longValue = x;
            floatValue = x;
            doubleValue = x;
            stringValue = Long.toHexString(x);
            decimalValue = HiveDecimal.create(x);
            long millisUtc = x * MILLIS_IN_DAY;
            millisUtc -= LOCAL_TIMEZONE.getOffset(millisUtc);
            dateValue = new Date(millisUtc);
            timestampValue = new Timestamp(millisUtc);

        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");

        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");

        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append("bigrow{booleanValue: ");
            builder.append(", byteValue: ");
            builder.append(", shortValue: ");
            builder.append(", intValue: ");
            builder.append(", longValue: ");
            builder.append(", floatValue: ");
            builder.append(", doubleValue: ");
            builder.append(", stringValue: ");
            builder.append(", decimalValue: ");
            builder.append(", dateValue: ");
            builder.append(", timestampValue: ");
            return builder.toString();

        static String getColumnNamesProperty() {
            return "booleanValue,byteValue,shortValue,intValue,longValue,floatValue,doubleValue,stringValue,decimalValue,dateValue,timestampValue";

        static String getColumnTypesProperty() {
            return "boolean:tinyint:smallint:int:bigint:float:double:string:decimal:date:timestamp";

    public static class BigRowField implements StructField {
        private final int id;
        private final String fieldName;
        private final ObjectInspector inspector;

        BigRowField(int id, String fieldName, ObjectInspector inspector) {
   = id;
            this.fieldName = fieldName;
            this.inspector = inspector;

        public String getFieldName() {
            return fieldName;

        public ObjectInspector getFieldObjectInspector() {
            return inspector;

        public String getFieldComment() {
            return null;

        public int getFieldID() {
            return id;

        public String toString() {
            return "field " + id + " " + fieldName;

    public static class BigRowInspector extends StructObjectInspector {
        static final List<BigRowField> FIELDS = new ArrayList<BigRowField>();
        static {
                    new BigRowField(0, "booleanValue", PrimitiveObjectInspectorFactory.javaBooleanObjectInspector));
            FIELDS.add(new BigRowField(1, "byteValue", PrimitiveObjectInspectorFactory.javaByteObjectInspector));
            FIELDS.add(new BigRowField(2, "shortValue", PrimitiveObjectInspectorFactory.javaShortObjectInspector));
            FIELDS.add(new BigRowField(3, "intValue", PrimitiveObjectInspectorFactory.javaIntObjectInspector));
            FIELDS.add(new BigRowField(4, "longValue", PrimitiveObjectInspectorFactory.javaLongObjectInspector));
            FIELDS.add(new BigRowField(5, "floatValue", PrimitiveObjectInspectorFactory.javaFloatObjectInspector));
                    new BigRowField(6, "doubleValue", PrimitiveObjectInspectorFactory.javaDoubleObjectInspector));
                    new BigRowField(7, "stringValue", PrimitiveObjectInspectorFactory.javaStringObjectInspector));
            FIELDS.add(new BigRowField(8, "decimalValue",
            FIELDS.add(new BigRowField(9, "dateValue", PrimitiveObjectInspectorFactory.javaDateObjectInspector));
            FIELDS.add(new BigRowField(10, "timestampValue",

        public List<? extends StructField> getAllStructFieldRefs() {
            return FIELDS;

        public StructField getStructFieldRef(String fieldName) {
            for (StructField field : FIELDS) {
                if (field.getFieldName().equals(fieldName)) {
                    return field;
            throw new IllegalArgumentException("Can't find field " + fieldName);

        public Object getStructFieldData(Object data, StructField fieldRef) {
            BigRow obj = (BigRow) data;
            switch (((BigRowField) fieldRef).id) {
            case 0:
                return obj.booleanValue;
            case 1:
                return obj.byteValue;
            case 2:
                return obj.shortValue;
            case 3:
                return obj.intValue;
            case 4:
                return obj.longValue;
            case 5:
                return obj.floatValue;
            case 6:
                return obj.doubleValue;
            case 7:
                return obj.stringValue;
            case 8:
                return obj.decimalValue;
            case 9:
                return obj.dateValue;
            case 10:
                return obj.timestampValue;
            throw new IllegalArgumentException("No such field " + fieldRef);

        public List<Object> getStructFieldsDataAsList(Object data) {
            BigRow obj = (BigRow) data;
            List<Object> result = new ArrayList<Object>(11);
            return result;

        public String getTypeName() {
            return "struct<booleanValue:boolean,byteValue:tinyint,"
                    + "shortValue:smallint,intValue:int,longValue:bigint,"
                    + "floatValue:float,doubleValue:double,stringValue:string," + "decimalValue:decimal>";

        public Category getCategory() {
            return Category.STRUCT;

    public static class MyRow implements Writable {
        int x;
        int y;

        MyRow(int x, int y) {
            this.x = x;
            this.y = y;

        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");

        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");

        static String getColumnNamesProperty() {
            return "x,y";

        static String getColumnTypesProperty() {
            return "int:int";


    public TestName testCaseName = new TestName();
    JobConf conf;
    FileSystem fs;
    Path testFilePath;

    public void openFileSystem() throws Exception {
        conf = new JobConf();
        fs = FileSystem.getLocal(conf);
        testFilePath = new Path(workDir, "TestInputOutputFormat." + testCaseName.getMethodName() + ".orc");
        fs.delete(testFilePath, false);

    public void testOverlap() throws Exception {
        assertEquals(0, OrcInputFormat.SplitGenerator.getOverlap(100, 100, 200, 100));
        assertEquals(0, OrcInputFormat.SplitGenerator.getOverlap(0, 1000, 2000, 100));
        assertEquals(100, OrcInputFormat.SplitGenerator.getOverlap(1000, 1000, 1500, 100));
        assertEquals(250, OrcInputFormat.SplitGenerator.getOverlap(1000, 250, 500, 2000));
        assertEquals(100, OrcInputFormat.SplitGenerator.getOverlap(1000, 1000, 1900, 1000));
        assertEquals(500, OrcInputFormat.SplitGenerator.getOverlap(2000, 1000, 2500, 2000));

    public void testGetInputPaths() throws Exception {
        conf.set("mapred.input.dir", "a,b,c");
        assertArrayEquals(new Path[] { new Path("a"), new Path("b"), new Path("c") },
        conf.set("mapred.input.dir", "/a/b/c/d/e");
        assertArrayEquals(new Path[] { new Path("/a/b/c/d/e") }, OrcInputFormat.getInputPaths(conf));
        conf.set("mapred.input.dir", "/a/b/c\\,d,/e/f\\,g/h");
        assertArrayEquals(new Path[] { new Path("/a/b/c,d"), new Path("/e/f,g/h") },

    private FileSystem generateMockFiles(final int count, final int size) {
        final byte[] data = new byte[size];
        MockFile[] files = new MockFile[count];
        for (int i = 0; i < count; i++) {
            files[i] = new MockFile(String.format("mock:/a/b/part-%d", i), size, data);
        return new MockFileSystem(conf, files);

    public void testSplitStrategySelection() throws Exception {

        conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
        conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb");
        final int[] counts = { 1, 10, 100, 256 };
        final int[] sizes = { 100, 1000 };
        final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
        final String[] strategyResults = new String[] { "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
                "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
                "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
                "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
                "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
                "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
                "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
                "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
                "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
                "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
                "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
                "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
                "BISplitStrategy", /* 10 files x 100 size for 1 splits */
                "BISplitStrategy", /* 10 files x 100 size for 9 splits */
                "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
                "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
                "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
                "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
                "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
                "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
                "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
                "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
                "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
                "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
                "BISplitStrategy", /* 100 files x 100 size for 1 splits */
                "BISplitStrategy", /* 100 files x 100 size for 9 splits */
                "BISplitStrategy", /* 100 files x 100 size for 10 splits */
                "BISplitStrategy", /* 100 files x 100 size for 11 splits */
                "BISplitStrategy", /* 100 files x 100 size for 99 splits */
                "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
                "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
                "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
                "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
                "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
                "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
                "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
                "BISplitStrategy", /* 256 files x 100 size for 1 splits */
                "BISplitStrategy", /* 256 files x 100 size for 9 splits */
                "BISplitStrategy", /* 256 files x 100 size for 10 splits */
                "BISplitStrategy", /* 256 files x 100 size for 11 splits */
                "BISplitStrategy", /* 256 files x 100 size for 99 splits */
                "BISplitStrategy", /* 256 files x 100 size for 111 splits */
                "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
                "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
                "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
                "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
                "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
                "ETLSplitStrategy", /* 256 files x 1000 size for 111 splits */

        int k = 0;

        for (int c : counts) {
            for (int s : sizes) {
                final FileSystem fs = generateMockFiles(c, s);
                for (int n : numSplits) {
                    final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
                    OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs,
                            new MockPath(fs, "mock:/a/b"), false, null);
                    List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
                    assertEquals(1, splitStrategies.size());
                    final SplitStrategy splitStrategy = splitStrategies.get(0);
                    assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n),

        k = 0;
        conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0");
        for (int c : counts) {
            for (int s : sizes) {
                final FileSystem fs = generateMockFiles(c, s);
                for (int n : numSplits) {
                    final OrcInputFormat.Context context = new OrcInputFormat.Context(conf, n);
                    OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs,
                            new MockPath(fs, "mock:/a/b"), false, null);
                    List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
                    assertEquals(1, splitStrategies.size());
                    final SplitStrategy splitStrategy = splitStrategies.get(0);
                    assertTrue(String.format("Split strategy for %d files x %d size for %d splits", c, s, n),

    public void testFileGenerator() throws Exception {
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
        MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/b/part-00", 1000, new byte[1]),
                new MockFile("mock:/a/b/part-01", 1000, new byte[1]),
                new MockFile("mock:/a/b/_part-02", 1000, new byte[1]),
                new MockFile("mock:/a/b/.part-03", 1000, new byte[1]),
                new MockFile("mock:/a/b/part-04", 1000, new byte[1]));
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs,
                new MockPath(fs, "mock:/a/b"), false, null);
        List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy);

        conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
        context = new OrcInputFormat.Context(conf);
        fs = new MockFileSystem(conf, new MockFile("mock:/a/b/part-00", 1000, new byte[1000]),
                new MockFile("mock:/a/b/part-01", 1000, new byte[1000]),
                new MockFile("mock:/a/b/_part-02", 1000, new byte[1000]),
                new MockFile("mock:/a/b/.part-03", 1000, new byte[1000]),
                new MockFile("mock:/a/b/part-04", 1000, new byte[1000]));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ETLSplitStrategy);

    public void testACIDSplitStrategy() throws Exception {
        conf.set("bucket_count", "2");
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
        MockFileSystem fs = new MockFileSystem(conf,
                new MockFile("mock:/a/delta_000_001/part-00", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/delta_000_001/part-01", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/delta_001_002/part-02", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/delta_001_002/part-03", 1000, new byte[1], new MockBlock("host1")));
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs,
                new MockPath(fs, "mock:/a"), false, null);
        List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
        List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
        ColumnarSplitSizeEstimator splitSizeEstimator = new ColumnarSplitSizeEstimator();
        for (OrcSplit split : splits) {
            assertEquals(Integer.MAX_VALUE, splitSizeEstimator.getEstimatedSize(split));
        assertEquals(2, splits.size());

    public void testACIDSplitStrategyForSplitUpdate() throws Exception {
        conf.set("bucket_count", "2");
        conf.set(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, "true");
        conf.set(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES, "default");
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);

        // Case 1: Test with just originals => Single split strategy with two splits.
        MockFileSystem fs = new MockFileSystem(conf,
                new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")));
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs,
                new MockPath(fs, "mock:/a"), false, null);
        List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
        List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
        assertEquals(2, splits.size());
        assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString());
        assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString());

        // Case 2: Test with originals and base => Single split strategy with two splits on compacted
        // base since the presence of a base will make the originals obsolete.
        fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/base_0000001/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/base_0000001/bucket_00001", 1000, new byte[1], new MockBlock("host1")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
        splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
        assertEquals(2, splits.size());
        assertEquals("mock:/a/base_0000001/bucket_00000", splits.get(0).getPath().toUri().toString());
        assertEquals("mock:/a/base_0000001/bucket_00001", splits.get(1).getPath().toUri().toString());

        // Case 3: Test with originals and deltas => Two split strategies with two splits for each.
        fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1],
                        new MockBlock("host1")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(2, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
        splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
        assertEquals(2, splits.size());
        assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString());
        assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString());
        assertEquals(true, splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy);
        splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(1)).getSplits();
        assertEquals(2, splits.size());
        assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString());
        assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00001", splits.get(1).getPath().toUri().toString());

        // Case 4: Test with originals and deltas but now with only one bucket covered, i.e. we will
        // have originals & insert_deltas for only one bucket, but the delete_deltas will be for two
        // buckets => Two strategies with one split for each.
        // When split-update is enabled, we do not need to account for buckets that aren't covered.
        // The reason why we are able to do so is because the valid user data has already been considered
        // as base for the covered buckets. Hence, the uncovered buckets do not have any relevant
        // data and we can just ignore them.
        fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1],
                        new MockBlock("host1")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(2, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
        splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
        assertEquals(1, splits.size());
        assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString());
        assertEquals(true, splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy);
        splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(1)).getSplits();
        assertEquals(1, splits.size());
        assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString());

        // Case 5: Test with originals, compacted_base, insert_deltas, delete_deltas (exhaustive test)
        // This should just generate one strategy with splits for base and insert_deltas.
        fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/base_0000001/bucket_00000", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/base_0000001/bucket_00001", 1000, new byte[1], new MockBlock("host1")),
                new MockFile("mock:/a/delta_0000002_0000002_0000/bucket_00000", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delta_0000002_0000002_0000/bucket_00001", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delete_delta_0000002_0000002_0000/bucket_00000", 1000, new byte[1],
                        new MockBlock("host1")),
                new MockFile("mock:/a/delete_delta_0000002_0000002_0000/bucket_00001", 1000, new byte[1],
                        new MockBlock("host1")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy);
        splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
        assertEquals(4, splits.size());
        assertEquals("mock:/a/base_0000001/bucket_00000", splits.get(0).getPath().toUri().toString());
        assertEquals("mock:/a/base_0000001/bucket_00001", splits.get(1).getPath().toUri().toString());
        assertEquals("mock:/a/delta_0000002_0000002_0000/bucket_00000", splits.get(2).getPath().toUri().toString());
        assertEquals("mock:/a/delta_0000002_0000002_0000/bucket_00001", splits.get(3).getPath().toUri().toString());

    /**
     * Exercises the "BI" split strategy on five mock files under {@code mock:/a/b} and checks
     * the total number of splits as the file content grows relative to the 1000-byte block size
     * passed to each {@code MockFile}.
     *
     * Each scenario rebuilds the context, the mock file system and the file generator, asserts
     * that exactly one {@code BISplitStrategy} is chosen, and then asserts the split count.
     */
    public void testBIStrategySplitBlockBoundary() throws Exception {
        conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
        // Scenario 1: 1-byte files with a single block each; 5 splits expected in total.
        MockFileSystem fs = new MockFileSystem(conf,
                new MockFile("mock:/a/b/part-00", 1000, new byte[1], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-01", 1000, new byte[1], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-02", 1000, new byte[1], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-03", 1000, new byte[1], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-04", 1000, new byte[1], new MockBlock("host1", "host2")));
        OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs,
                new MockPath(fs, "mock:/a/b"), false, null);
        List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy);
        List<OrcSplit> splits = ((OrcInputFormat.BISplitStrategy) splitStrategies.get(0)).getSplits();
        int numSplits = splits.size();
        assertEquals(5, numSplits);

        // Scenario 2: 1000-byte files (content length equals the block size); still 5 splits.
        context = new OrcInputFormat.Context(conf);
        fs = new MockFileSystem(conf,
                new MockFile("mock:/a/b/part-00", 1000, new byte[1000], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-01", 1000, new byte[1000], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-02", 1000, new byte[1000], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-03", 1000, new byte[1000], new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-04", 1000, new byte[1000], new MockBlock("host1", "host2")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy);
        splits = ((OrcInputFormat.BISplitStrategy) splitStrategies.get(0)).getSplits();
        numSplits = splits.size();
        assertEquals(5, numSplits);

        // Scenario 3: 1100-byte files with two blocks each; 10 splits expected in total.
        context = new OrcInputFormat.Context(conf);
        fs = new MockFileSystem(conf,
                new MockFile("mock:/a/b/part-00", 1000, new byte[1100], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-01", 1000, new byte[1100], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-02", 1000, new byte[1100], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-03", 1000, new byte[1100], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-04", 1000, new byte[1100], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy);
        splits = ((OrcInputFormat.BISplitStrategy) splitStrategies.get(0)).getSplits();
        numSplits = splits.size();
        assertEquals(10, numSplits);

        // Scenario 4: 2000-byte files with two blocks each; still 10 splits.
        context = new OrcInputFormat.Context(conf);
        fs = new MockFileSystem(conf,
                new MockFile("mock:/a/b/part-00", 1000, new byte[2000], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-01", 1000, new byte[2000], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-02", 1000, new byte[2000], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-03", 1000, new byte[2000], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-04", 1000, new byte[2000], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy);
        splits = ((OrcInputFormat.BISplitStrategy) splitStrategies.get(0)).getSplits();
        numSplits = splits.size();
        assertEquals(10, numSplits);

        // Scenario 5: 2200-byte files with three blocks each; 15 splits expected in total.
        context = new OrcInputFormat.Context(conf);
        fs = new MockFileSystem(conf,
                new MockFile("mock:/a/b/part-00", 1000, new byte[2200], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-01", 1000, new byte[2200], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-02", 1000, new byte[2200], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-03", 1000, new byte[2200], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
                new MockFile("mock:/a/b/part-04", 1000, new byte[2200], new MockBlock("host1", "host2"),
                        new MockBlock("host1", "host2"), new MockBlock("host1", "host2")));
        gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null);
        splitStrategies = createSplitStrategies(context, gen);
        assertEquals(1, splitStrategies.size());
        assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy);
        splits = ((OrcInputFormat.BISplitStrategy) splitStrategies.get(0)).getSplits();
        numSplits = splits.size();
        assertEquals(15, numSplits);

    public void testEtlCombinedStrategy() throws Exception {
        conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
        conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
        MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/1/part-00", 1000, new byte[1]),
                new MockFile("mock:/a/1/part-01", 1000, new byte[1]),
                new MockFile("mock:/a/2/part-00", 1000, new byte[1]),
                new MockFile("mock:/a/2/part-01", 1000, new byte[1]),
                new MockFile("mock:/a/3/base_0/1", 1000, new byte[1]),
                new MockFile("mock:/a/4/base_0/1", 1000, new byte[1]),
                new MockFile("mock:/a/5/base_0/1", 1000, new byte[1]),
                new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[1]));

        OrcInputFormat.CombinedCtx combineCtx = new OrcInputFormat.CombinedCtx();
        // The first directory becomes the base for combining.
        List<SplitStrategy<?>> ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx);
        assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
        OrcInputFormat.ETLSplitStrategy etlSs = combineCtx.combined;
        assertEquals(2, etlSs.files.size());
        assertEquals(1, etlSs.dirs.size());
        // The second one should be combined into the first.
        ss = createOrCombineStrategies(context, fs, "mock:/a/2", combineCtx);
        assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
        assertEquals(4, etlSs.files.size());
        assertEquals(2, etlSs.dirs.size());
        // The third one has the base file, so it shouldn't be combined but could be a base.
        ss = createOrCombineStrategies(context, fs, "mock:/a/3", combineCtx);
        assertEquals(1, ss.size());
        assertSame(etlSs, ss.get(0));
        assertEquals(4, etlSs.files.size());
        assertEquals(2, etlSs.dirs.size());
        assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
        etlSs = combineCtx.combined;
        assertEquals(1, etlSs.files.size());
        assertEquals(1, etlSs.dirs.size());
        // Try the first again, it would not be combined and we'd retain the old base (less files).
        ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx);
        assertEquals(1, ss.size());
        assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy);
        assertNotSame(etlSs, ss.get(0));
        OrcInputFormat.ETLSplitStrategy rejectedEtlSs = (OrcInputFormat.ETLSplitStrategy) ss.get(0);
        assertEquals(2, rejectedEtlSs.files.size());
        assertEquals(1, rejectedEtlSs.dirs.size());
        assertEquals(1, etlSs.files.size());
        assertEquals(1, etlSs.dirs.size());
        // The fourth could be combined again.
        ss = createOrCombineStrategies(context, fs, "mock:/a/4", combineCtx);
        assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy);
        assertEquals(2, etlSs.files.size());
        assertEquals(2, etlSs.dirs.size());
        // The fifth will not be combined because of delta files.
        ss = createOrCombineStrategies(context, fs, "mock:/a/5", combineCtx);
        assertEquals(1, ss.size());
        assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy);
        assertNotSame(etlSs, ss);
        assertEquals(2, etlSs.files.size());
        assertEquals(2, etlSs.dirs.size());

    /**
     * Builds the {@code AcidDirInfo} for {@code path} via {@code createAdi} and passes its
     * fields, together with {@code combineCtx}, to
     * {@code OrcInputFormat.determineSplitStrategies}.
     *
     * @param context the ORC input format context
     * @param fs the mock file system that contains {@code path}
     * @param path the directory to scan, as a string
     * @param combineCtx the combining context handed to {@code determineSplitStrategies}
     * @return the list returned by {@code determineSplitStrategies}
     * @throws IOException propagated from {@code createAdi} or {@code determineSplitStrategies}
     */
    public List<SplitStrategy<?>> createOrCombineStrategies(OrcInputFormat.Context context, MockFileSystem fs,
            String path, OrcInputFormat.CombinedCtx combineCtx) throws IOException {
        OrcInputFormat.AcidDirInfo adi = createAdi(context, fs, path);
        return OrcInputFormat.determineSplitStrategies(combineCtx, context, adi.fs, adi.splitPath, adi.baseFiles,
                adi.parsedDeltas, null, null, true);

    /**
     * Creates an {@code OrcInputFormat.FileGenerator} for {@code path} on the given mock file
     * system and returns the result of invoking its {@code call()} method.
     *
     * @param context the ORC input format context
     * @param fs the mock file system that {@code path} is wrapped with
     * @param path the directory to scan, as a string
     * @return the {@code AcidDirInfo} produced by the generator
     * @throws IOException propagated from the generator
     */
    public OrcInputFormat.AcidDirInfo createAdi(OrcInputFormat.Context context, MockFileSystem fs, String path)
            throws IOException {
        return new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, path), false, null).call();

    private List<OrcInputFormat.SplitStrategy<?>> createSplitStrategies(OrcInputFormat.Context context,
            OrcInputFormat.FileGenerator gen) throws IOException {
        OrcInputFormat.AcidDirInfo adi =;
        return OrcInputFormat.determineSplitStrategies(null, context, adi.fs, adi.splitPath, adi.baseFiles,
                adi.parsedDeltas, null, null, true);

    /**
     * A block of a {@code MockFile}: a byte range (offset and length) plus the host names that
     * are reported for it.
     */
    public static class MockBlock {
        // Start of the block within its file; assigned by MockFile's constructor and by
        // MockOutputStream.setBlocks.
        int offset;
        // Number of bytes in the block; assigned together with offset.
        int length;
        // Host names supplied at construction time.
        final String[] hosts;

        /**
         * @param hosts the host names to associate with this block
         */
        public MockBlock(String... hosts) {
            this.hosts = hosts;

        public void setOffset(int offset) {
            this.offset = offset;

        public void setLength(int length) {
            this.length = length;

        // NOTE(review): only the literal labels are appended below; the offset, length and
        // host values are not appended in the visible code and the loop body stops after the
        // separator. The statements look truncated - confirm against version control.
        public String toString() {
            StringBuilder buffer = new StringBuilder();
            buffer.append("block{offset: ");
            buffer.append(", length: ");
            buffer.append(", hosts: [");
            for (int i = 0; i < hosts.length; i++) {
                if (i != 0) {
                    buffer.append(", ");
            return buffer.toString();

    /**
     * An in-memory file for the mock file system: a path, a block size, the content bytes and
     * the blocks that cover the content.
     */
    public static class MockFile {
        final Path path;
        int blockSize;
        // Number of content bytes; set from content.length in the constructor and updated by
        // MockOutputStream.close().
        int length;
        MockBlock[] blocks;
        byte[] content;

        /**
         * Creates the file and lays the given blocks out back to back over the content.
         *
         * @param path the file's path, as a string
         * @param blockSize the maximum number of bytes assigned to one block
         * @param content the file's bytes; its length becomes the file length
         * @param blocks the blocks to lay out over the content, in order
         */
        public MockFile(String path, int blockSize, byte[] content, MockBlock... blocks) {
            this.path = new Path(path);
            this.blockSize = blockSize;
            this.blocks = blocks;
            this.content = content;
            this.length = content.length;
            int offset = 0;
            // Each block starts where the previous one ended and holds at most blockSize bytes
            // of whatever content remains.
            for (MockBlock block : blocks) {
                block.offset = offset;
                block.length = Math.min(length - offset, blockSize);
                offset += block.length;

        // Hash and equality are both based on the path and the length only.
        public int hashCode() {
            return path.hashCode() + 31 * length;

        public boolean equals(final Object obj) {
            if (!(obj instanceof MockFile)) {
                return false;
            return ((MockFile) obj).path.equals(this.path) && ((MockFile) obj).length == this.length;

        // NOTE(review): only the literal labels are appended below; the path, block size,
        // length and block values are not appended in the visible code. The statements look
        // truncated - confirm against version control.
        public String toString() {
            StringBuilder buffer = new StringBuilder();
            buffer.append("mockFile{path: ");
            buffer.append(", blkSize: ");
            buffer.append(", len: ");
            buffer.append(", blocks: [");
            for (int i = 0; i < blocks.length; i++) {
                if (i != 0) {
                    buffer.append(", ");
            return buffer.toString();

    /**
     * An {@code FSInputStream} that reads the content bytes of a {@code MockFile} from a
     * current offset.
     */
    static class MockInputStream extends FSInputStream {
        final MockFile file;
        // Index of the next byte of file.content to return from read().
        int offset = 0;

        public MockInputStream(MockFile file) throws IOException {
            this.file = file;

        // The requested position is narrowed to int; no bounds check is performed here.
        public void seek(long offset) throws IOException {
            this.offset = (int) offset;

        public long getPos() throws IOException {
            return offset;

        // Always reports that no alternate source was found.
        public boolean seekToNewSource(long l) throws IOException {
            return false;

        // Returns the next content byte as an unsigned value (0-255) and advances the offset,
        // or -1 once the offset has reached file.length.
        public int read() throws IOException {
            if (offset < file.length) {
                return file.content[offset++] & 0xff;
            return -1;

    /**
     * A {@code Path} that is bound to a specific {@code FileSystem} instance and returns that
     * instance from {@code getFileSystem}.
     */
    public static class MockPath extends Path {
        private final FileSystem fs;

        // NOTE(review): the path argument is not used in the visible constructor body; a
        // call passing it to the superclass appears to be missing - confirm.
        public MockPath(FileSystem fs, String path) {
            this.fs = fs;

        // Returns the file system given at construction; the conf argument is not consulted.
        public FileSystem getFileSystem(Configuration conf) {
            return fs;

    /**
     * An {@code FSDataOutputStream} over an in-memory {@code DataOutputBuffer}; on close the
     * buffered bytes are copied into the target {@code MockFile}.
     */
    public static class MockOutputStream extends FSDataOutputStream {
        private final MockFile file;

        public MockOutputStream(MockFile file) throws IOException {
            super(new DataOutputBuffer(), null);
            this.file = file;

        // NOTE(review): the four lines below read like the body of a Javadoc comment whose
        // opening and closing delimiters have been lost - confirm and restore.
         * Set the blocks and their location for the file.
         * Must be called after the stream is closed or the block length will be
         * wrong.
         * @param blocks the list of blocks
        public void setBlocks(MockBlock... blocks) {
            file.blocks = blocks;
            int offset = 0;
            int i = 0;
            // Lay blocks out back to back, each at most file.blockSize bytes, stopping when
            // either the file length is covered or the blocks run out.
            while (offset < file.length && i < blocks.length) {
                blocks[i].offset = offset;
                blocks[i].length = Math.min(file.length - offset, file.blockSize);
                offset += blocks[i].length;
                i += 1;

        // Copies the bytes written so far from the wrapped buffer into file.content and sets
        // file.length to the number of bytes written.
        public void close() throws IOException {
            DataOutputBuffer buf = (DataOutputBuffer) getWrappedStream();
            file.length = buf.getLength();
            file.content = new byte[file.length];
            // NOTE(review): this block is created but not used in the visible code; the
            // statements that attach it to the file may be missing - confirm.
            MockBlock block = new MockBlock("host1");
            System.arraycopy(buf.getData(), 0, file.content, 0, file.length);

        public String toString() {
            return "Out stream to " + file.toString();

    /**
     * An in-memory {@code FileSystem} for the {@code mock:} scheme, backed by a per-instance
     * list of {@code MockFile}s and a static list of "global" files. Lookups consult the
     * instance list first and then the global list.
     *
     * NOTE(review): several method bodies in this class appear truncated in this copy of the
     * file (empty bodies, unterminated statements); individual notes below mark them.
     */
    public static class MockFileSystem extends FileSystem {
        // Files owned by this instance.
        final List<MockFile> files = new ArrayList<MockFile>();
        // Cache of the FileStatus created for each file; touch() replaces entries in it.
        final Map<MockFile, FileStatus> fileStatusMap = new HashMap<>();
        Path workingDir = new Path("/");
        // statics for when the mock fs is created via FileSystem.get
        private static String blockedUgi = null;
        private final static List<MockFile> globalFiles = new ArrayList<MockFile>();
        protected Statistics statistics;

        public MockFileSystem() {
            // empty

        public void initialize(URI uri, Configuration conf) {
            statistics = getStatistics("mock", getClass());

        // NOTE(review): neither conf nor files is used in the visible body; the statements
        // that store them appear to be missing - confirm.
        public MockFileSystem(Configuration conf, MockFile... files) {
            statistics = getStatistics("mock", getClass());

        public static void setBlockedUgi(String s) {
            blockedUgi = s;

        // NOTE(review): no statements are visible in this body.
        void clear() {

        public URI getUri() {
            try {
                return new URI("mock:///");
            } catch (URISyntaxException err) {
                throw new IllegalArgumentException("huh?", err);

        // increments file modification time
        // Only files that already have a cached status are affected; the cached status is
        // replaced by a copy whose modification time is one greater.
        // NOTE(review): the FileStatus constructor call below is not terminated in the
        // visible code; its last argument appears to be missing.
        public void touch(MockFile file) {
            if (fileStatusMap.containsKey(file)) {
                FileStatus fileStatus = fileStatusMap.get(file);
                FileStatus fileStatusNew = new FileStatus(fileStatus.getLen(), fileStatus.isDirectory(),
                        fileStatus.getReplication(), fileStatus.getBlockSize(),
                        fileStatus.getModificationTime() + 1, fileStatus.getAccessTime(),
                        fileStatus.getPermission(), fileStatus.getOwner(), fileStatus.getGroup(),
                fileStatusMap.put(file, fileStatusNew);

        // Exception type thrown by checkAccess().
        public static class MockAccessDenied extends IOException {

        // Opens a stream over the file's content; the second argument is ignored.
        public FSDataInputStream open(Path path, int i) throws IOException {
            MockFile file = findFile(path);
            if (file != null)
                return new FSDataInputStream(new MockInputStream(file));
            throw new IOException("File not found: " + path);

        // Returns the first file whose path equals the argument, searching this instance's
        // files before the global files, or null when there is none.
        private MockFile findFile(Path path) {
            for (MockFile file : files) {
                if (file.path.equals(path)) {
                    return file;
            for (MockFile file : globalFiles) {
                if (file.path.equals(path)) {
                    return file;
            return null;

        // NOTE(review): as written, the two if statements have no bodies of their own, so
        // they nest and blockedUgi is dereferenced exactly when it is null. Early-return
        // statements appear to be missing after each condition - confirm.
        private void checkAccess() throws IOException {
            if (blockedUgi == null)
            if (!blockedUgi.equals(UserGroupInformation.getCurrentUser().getShortUserName()))
            throw new MockAccessDenied();

        // Returns an output stream for the file at path, creating an empty MockFile with the
        // requested block size when none exists. The permission, overwrite, buffer size,
        // replication and progress arguments are not used in the visible code.
        // NOTE(review): a newly created file is not added to any file list here - confirm.
        public FSDataOutputStream create(Path path, FsPermission fsPermission, boolean overwrite, int bufferSize,
                short replication, long blockSize, Progressable progressable) throws IOException {
            MockFile file = findFile(path);
            if (file == null) {
                file = new MockFile(path.toString(), (int) blockSize, new byte[0]);
            return new MockOutputStream(file);

        // Delegates to create() with default permission, overwrite enabled, replication 3
        // and a 256 KiB block size.
        public FSDataOutputStream append(Path path, int bufferSize, Progressable progressable) throws IOException {
            return create(path, FsPermission.getDefault(), true, bufferSize, (short) 3, 256 * 1024, progressable);

        // rename and both delete overloads do nothing and report failure.
        public boolean rename(Path path, Path path2) throws IOException {
            return false;

        public boolean delete(Path path) throws IOException {
            return false;

        public boolean delete(Path path, boolean b) throws IOException {
            return false;

        // Wraps the list built by listLocatedFileStatuses in a RemoteIterator.
        // NOTE(review): the body of next() is not visible.
        public RemoteIterator<LocatedFileStatus> listLocatedStatus(final Path f) throws IOException {
            return new RemoteIterator<LocatedFileStatus>() {
                private Iterator<LocatedFileStatus> iterator = listLocatedFileStatuses(f).iterator();

                public boolean hasNext() throws IOException {
                    return iterator.hasNext();

                public LocatedFileStatus next() throws IOException {

        // Lists the children of path as LocatedFileStatus objects: matching files are
        // gathered by findMatchingLocatedFiles, and each distinct child directory name is
        // added once, in sorted order.
        // NOTE(review): when path is itself a file, the visible code returns the result
        // list without adding anything to it; a statement may be missing - confirm.
        private List<LocatedFileStatus> listLocatedFileStatuses(Path path) throws IOException {
            path = path.makeQualified(this);
            List<LocatedFileStatus> result = new ArrayList<>();
            String pathname = path.toString();
            String pathnameAsDir = pathname + "/";
            Set<String> dirs = new TreeSet<String>();
            MockFile file = findFile(path);
            if (file != null) {
                return result;
            findMatchingLocatedFiles(files, pathnameAsDir, dirs, result);
            findMatchingLocatedFiles(globalFiles, pathnameAsDir, dirs, result);
            // for each directory add it once
            for (String dir : dirs) {
                result.add(createLocatedDirectory(new MockPath(this, pathnameAsDir + dir)));
            return result;

        // Lists path: a single-element array when path is a file; otherwise the matching
        // files gathered by findMatchingFiles followed by one directory status per distinct
        // child directory name, in sorted order.
        public FileStatus[] listStatus(Path path) throws IOException {
            path = path.makeQualified(this);
            List<FileStatus> result = new ArrayList<FileStatus>();
            String pathname = path.toString();
            String pathnameAsDir = pathname + "/";
            Set<String> dirs = new TreeSet<String>();
            MockFile file = findFile(path);
            if (file != null) {
                return new FileStatus[] { createStatus(file) };
            findMatchingFiles(files, pathnameAsDir, dirs, result);
            findMatchingFiles(globalFiles, pathnameAsDir, dirs, result);
            // for each directory add it once
            for (String dir : dirs) {
                result.add(createDirectory(new MockPath(this, pathnameAsDir + dir)));
            return result.toArray(new FileStatus[result.size()]);

        // For every file under pathnameAsDir: when the remainder of the name contains a
        // further '/', the first path component is recorded in dirs.
        // NOTE(review): the else branch (remainder without a further '/') has no visible
        // statements; presumably it adds the file's status to result - confirm.
        private void findMatchingFiles(List<MockFile> files, String pathnameAsDir, Set<String> dirs,
                List<FileStatus> result) {
            for (MockFile file : files) {
                String filename = file.path.toString();
                if (filename.startsWith(pathnameAsDir)) {
                    String tail = filename.substring(pathnameAsDir.length());
                    int nextSlash = tail.indexOf('/');
                    if (nextSlash > 0) {
                        dirs.add(tail.substring(0, nextSlash));
                    } else {

        // Same traversal as findMatchingFiles, for LocatedFileStatus results.
        // NOTE(review): the else branch has no visible statements here either - confirm.
        private void findMatchingLocatedFiles(List<MockFile> files, String pathnameAsDir, Set<String> dirs,
                List<LocatedFileStatus> result) throws IOException {
            for (MockFile file : files) {
                String filename = file.path.toString();
                if (filename.startsWith(pathnameAsDir)) {
                    String tail = filename.substring(pathnameAsDir.length());
                    int nextSlash = tail.indexOf('/');
                    if (nextSlash > 0) {
                        dirs.add(tail.substring(0, nextSlash));
                    } else {

        public void setWorkingDirectory(Path path) {
            workingDir = path;

        public Path getWorkingDirectory() {
            return workingDir;

        // Directories are implied by file paths; mkdirs does nothing and reports failure.
        public boolean mkdirs(Path path, FsPermission fsPermission) {
            return false;

        // Returns the cached status for the file, creating and caching one on first use
        // (non-directory, replication 1, the file's block size, zero timestamps).
        private FileStatus createStatus(MockFile file) {
            if (fileStatusMap.containsKey(file)) {
                return fileStatusMap.get(file);
            FileStatus fileStatus = new FileStatus(file.length, false, 1, file.blockSize, 0, 0,
                    FsPermission.createImmutable((short) 644), "owen", "group", file.path);
            fileStatusMap.put(file, fileStatus);
            return fileStatus;

        // NOTE(review): the constructor call below is not terminated in the visible code;
        // its final argument appears to be missing.
        private FileStatus createDirectory(Path dir) {
            return new FileStatus(0, true, 0, 0, 0, 0, FsPermission.createImmutable((short) 755), "owen", "group",

        // Pairs the file's status with its block locations; the lookup is done with
        // updateStats == false.
        private LocatedFileStatus createLocatedStatus(MockFile file) throws IOException {
            FileStatus fileStatus = createStatus(file);
            return new LocatedFileStatus(fileStatus,
                    getFileBlockLocationsImpl(fileStatus, 0, fileStatus.getLen(), false));

        private LocatedFileStatus createLocatedDirectory(Path dir) throws IOException {
            FileStatus fileStatus = createDirectory(dir);
            return new LocatedFileStatus(fileStatus,
                    getFileBlockLocationsImpl(fileStatus, 0, fileStatus.getLen(), false));

        // Returns the file's status when path names a file, a directory status when any
        // file (instance or global) lies underneath path, and otherwise throws
        // FileNotFoundException.
        public FileStatus getFileStatus(Path path) throws IOException {
            path = path.makeQualified(this);
            String pathnameAsDir = path.toString() + "/";
            MockFile file = findFile(path);
            if (file != null)
                return createStatus(file);
            for (MockFile dir : files) {
                if (dir.path.toString().startsWith(pathnameAsDir)) {
                    return createDirectory(path);
            for (MockFile dir : globalFiles) {
                if (dir.path.toString().startsWith(pathnameAsDir)) {
                    return createDirectory(path);
            throw new FileNotFoundException("File " + path + " does not exist");

        // Public entry point; delegates with updateStats == true.
        public BlockLocation[] getFileBlockLocations(FileStatus stat, long start, long len) throws IOException {
            return getFileBlockLocationsImpl(stat, start, len, true);

        // Builds a BlockLocation for every block of the file that overlaps [start, start+len)
        // according to OrcInputFormat.SplitGenerator.getOverlap; returns an empty array when
        // the path is not a known file.
        // NOTE(review): the body of the updateStats branch is not visible, and the
        // BlockLocation expression below has lost the start of its statement (presumably an
        // add to result) - confirm.
        private BlockLocation[] getFileBlockLocationsImpl(final FileStatus stat, final long start, final long len,
                final boolean updateStats) throws IOException {
            if (updateStats) {
            List<BlockLocation> result = new ArrayList<BlockLocation>();
            MockFile file = findFile(stat.getPath());
            if (file != null) {
                for (MockBlock block : file.blocks) {
                    if (OrcInputFormat.SplitGenerator.getOverlap(block.offset, block.length, start, len) > 0) {
                        String[] topology = new String[block.hosts.length];
                        for (int i = 0; i < topology.length; ++i) {
                            topology[i] = "/rack/ " + block.hosts[i];
                                new BlockLocation(block.hosts, block.hosts, topology, block.offset, block.length));
                return result.toArray(new BlockLocation[result.size()]);
            return new BlockLocation[0];

        // NOTE(review): only the separator is appended per file in the visible code; the
        // append of the file itself appears to be missing.
        public String toString() {
            StringBuilder buffer = new StringBuilder();
            for (int i = 0; i < files.size(); ++i) {
                if (i != 0) {
                    buffer.append(", ");
            return buffer.toString();

        // NOTE(review): no statements are visible in the two bodies below.
        public static void addGlobalFile(MockFile mockFile) {

        public static void clearGlobalFiles() {

    /**
     * Iterates {@code length} times, presumably writing one byte to {@code out} per iteration.
     * NOTE(review): the loop body is not visible in this copy of the file - confirm.
     *
     * @param out the buffer to write to
     * @param length the number of iterations
     * @throws IOException declared by the method; the throwing statement is not visible
     */
    static void fill(DataOutputBuffer out, long length) throws IOException {
        for (int i = 0; i < length; ++i) {

    /**
     * Create the binary contents of an ORC file that just has enough information
     * to test the getInputSplits.
     * @param stripeLengths the length of each stripe
     * @return the bytes of the file
     * @throws IOException
     */
    static byte[] createMockOrcFile(long... stripeLengths) throws IOException {
        // Builds the file in an in-memory buffer: stripes are laid out after a 3-byte header,
        // followed by the footer and postscript metadata.
        OrcProto.Footer.Builder footer = OrcProto.Footer.newBuilder();
        final long headerLen = 3;
        // Running position of the next stripe; starts right after the header.
        long offset = headerLen;
        DataOutputBuffer buffer = new DataOutputBuffer();
        for (long stripeLength : stripeLengths) {
            // NOTE(review): the start of this statement is not visible; the remaining part
            // describes a stripe with a 10-byte footer and 1000 rows.
                    .setDataLength(stripeLength - 10).setFooterLength(10).setNumberOfRows(1000));
            offset += stripeLength;
        // Pad the buffer up to the end of the last stripe.
        fill(buffer, offset);
        footer.setNumberOfRows(1000 * stripeLengths.length).setHeaderLength(headerLen)
                .setContentLength(offset - headerLen);
        // NOTE(review): the next three lines are fragments of statements whose beginnings
        // or ends are not visible (column statistics added to the footer) - confirm.
                OrcProto.ColumnStatistics.newBuilder().setNumberOfValues(1000 * stripeLengths.length).build());
        footer.addStatistics(OrcProto.ColumnStatistics.newBuilder().setNumberOfValues(1000 * stripeLengths.length)
                        .setSum(1000 * 3 * stripeLengths.length).build())
        int footerEnd = buffer.getLength();
        OrcProto.PostScript ps = OrcProto.PostScript.newBuilder().setCompression(OrcProto.CompressionKind.NONE)
                .setFooterLength(footerEnd - offset).setMagic("ORC").build();
        // Writes the number of bytes added since footerEnd as a single byte.
        // NOTE(review): no statement that writes the footer or ps into the buffer is
        // visible, so as shown this value would be 0 - confirm.
        buffer.write(buffer.getLength() - footerEnd);
        // Copy out exactly the bytes written; getData() may return a larger backing array.
        byte[] result = new byte[buffer.getLength()];
        System.arraycopy(buffer.getData(), 0, result, 0, buffer.getLength());
        return result;

    /**
     * Checks the start, length, path and host locations of splits produced by
     * {@code OrcInputFormat.SplitGenerator.createSplit} for a mock ORC file whose blocks each
     * list three hosts.
     */
    public void testAddSplit() throws Exception {
        // create a file with 5 blocks spread around the cluster
        MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/file", 500,
                createMockOrcFile(197, 300, 600, 200, 200, 100, 100, 100, 100, 100),
                new MockBlock("host1-1", "host1-2", "host1-3"), new MockBlock("host2-1", "host0", "host2-3"),
                new MockBlock("host0", "host3-2", "host3-3"), new MockBlock("host4-1", "host4-2", "host4-3"),
                new MockBlock("host5-1", "host5-2", "host5-3")));
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
        OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
                new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true,
                        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
                null, true);
        // A split of 200 bytes from offset 0 is expected to report the first block's hosts.
        OrcSplit result = splitter.createSplit(0, 200, null);
        assertEquals(0, result.getStart());
        assertEquals(200, result.getLength());
        assertEquals("mock:/a/file", result.getPath().toString());
        String[] locs = result.getLocations();
        assertEquals(3, locs.length);
        assertEquals("host1-1", locs[0]);
        assertEquals("host1-2", locs[1]);
        assertEquals("host1-3", locs[2]);
        // A split of 600 bytes from offset 500 is expected to report the second block's hosts.
        result = splitter.createSplit(500, 600, null);
        locs = result.getLocations();
        assertEquals(3, locs.length);
        assertEquals("host2-1", locs[0]);
        assertEquals("host0", locs[1]);
        assertEquals("host2-3", locs[2]);
        // A 2500-byte split from offset 0 is expected to report only host0, which is
        // listed for both the second and the third block.
        result = splitter.createSplit(0, 2500, null);
        locs = result.getLocations();
        assertEquals(1, locs.length);
        assertEquals("host0", locs[0]);

    /**
     * Checks the start and length of the splits generated for a mock ORC file with ten
     * stripes, first with a maximum split size of 300 and a minimum of 200, and then with both
     * limits set to 0.
     *
     * NOTE(review): both assignments to {@code results} have lost their right-hand side in
     * this copy of the file; presumably the splitter is invoked there - confirm.
     */
    public void testSplitGenerator() throws Exception {
        // create a file with 5 blocks spread around the cluster
        long[] stripeSizes = new long[] { 197, 300, 600, 200, 200, 100, 100, 100, 100, 100 };
        MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/file", 500,
                createMockOrcFile(stripeSizes), new MockBlock("host1-1", "host1-2", "host1-3"),
                new MockBlock("host2-1", "host0", "host2-3"), new MockBlock("host0", "host3-2", "host3-3"),
                new MockBlock("host4-1", "host4-2", "host4-3"), new MockBlock("host5-1", "host5-2", "host5-3")));
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300);
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200);
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
        OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
                new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true,
                        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
                null, true);
        List<OrcSplit> results =;
        // Expected (start, length) pairs: (3, 497), (500, 600), (1100, 400), (1500, 300),
        // (1800, 200). The first split starts at 3, right after the file header.
        OrcSplit result = results.get(0);
        assertEquals(3, result.getStart());
        assertEquals(497, result.getLength());
        result = results.get(1);
        assertEquals(500, result.getStart());
        assertEquals(600, result.getLength());
        result = results.get(2);
        assertEquals(1100, result.getStart());
        assertEquals(400, result.getLength());
        result = results.get(3);
        assertEquals(1500, result.getStart());
        assertEquals(300, result.getLength());
        result = results.get(4);
        assertEquals(1800, result.getStart());
        assertEquals(200, result.getLength());
        // test min = 0, max = 0 generates each stripe
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0);
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
        context = new OrcInputFormat.Context(conf);
        splitter = new OrcInputFormat.SplitGenerator(
                new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true,
                        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
                null, true);
        results =;
        // With both limits at 0, the i-th split is expected to have the i-th stripe's length.
        for (int i = 0; i < stripeSizes.length; ++i) {
            assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength());

    /**
     * Exercises split generation on a mock ORC file built from five stripe sizes
     * (200, 200, 200, 200, 100) with READ_ALL_COLUMNS disabled and the read
     * column ids set to "0", and checks each split's start, length, and
     * projected-column uncompressed size under three min/max split-size settings.
     */
    public void testProjectedColumnSize() throws Exception {
        // NOTE(review): in this copy of the file every `results =;` statement has lost its
        // right-hand side, and the closing braces of the loop, the if/else, and the method
        // are absent. The missing text must be restored before this method compiles.
        long[] stripeSizes = new long[] { 200, 200, 200, 200, 100 };
        MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/file", 500,
                createMockOrcFile(stripeSizes), new MockBlock("host1-1", "host1-2", "host1-3"),
                new MockBlock("host2-1", "host0", "host2-3"), new MockBlock("host0", "host3-2", "host3-3"),
                new MockBlock("host4-1", "host4-2", "host4-3"), new MockBlock("host5-1", "host5-2", "host5-3")));
        // max split size = 300, min split size = 200: three splits of length 400, 400, 100 are expected
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300);
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200);
        conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
        conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
        OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
        OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(
                new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true,
                        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
                null, true);
        List<OrcSplit> results =;
        OrcSplit result = results.get(0);
        assertEquals(3, results.size());
        assertEquals(3, result.getStart());
        assertEquals(400, result.getLength());
        assertEquals(167468, result.getProjectedColumnsUncompressedSize());
        result = results.get(1);
        assertEquals(403, result.getStart());
        assertEquals(400, result.getLength());
        assertEquals(167468, result.getProjectedColumnsUncompressedSize());
        result = results.get(2);
        assertEquals(803, result.getStart());
        assertEquals(100, result.getLength());
        assertEquals(41867, result.getProjectedColumnsUncompressedSize());

        // test min = 0, max = 0 generates each stripe
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0);
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
        context = new OrcInputFormat.Context(conf);
        splitter = new OrcInputFormat.SplitGenerator(
                new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true,
                        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
                null, true);
        results =;
        assertEquals(5, results.size());
        // one split per stripe: the last (100-sized) stripe reports 41867, the others 83734
        for (int i = 0; i < stripeSizes.length; ++i) {
            assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength());
            if (i == stripeSizes.length - 1) {
                assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize());
            } else {
                assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize());

        // single split
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 1000);
        HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 100000);
        context = new OrcInputFormat.Context(conf);
        splitter = new OrcInputFormat.SplitGenerator(
                new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true,
                        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null),
                null, true);
        results =;
        assertEquals(1, results.size());
        result = results.get(0);
        assertEquals(3, result.getStart());
        assertEquals(900, result.getLength());
        assertEquals(376804, result.getProjectedColumnsUncompressedSize());

    /**
     * Writes three MyRow records through OrcOutputFormat and OrcSerde, then reads
     * them back through OrcInputFormat three times: the whole file, only the first
     * column, and finally with the column selection described as "empty string".
     */
    public void testInOutFormat() throws Exception {
        // NOTE(review): this copy of the method is truncated in several places. The
        // synchronized block and the getReflectionObjectInspector(...) call are cut off,
        // each `while (, value))` has lost the call that produces its condition, some
        // assertEquals(...) heads are missing, and no closing braces are present.
        Properties properties = new Properties();
        properties.setProperty("columns", "x,y");
        properties.setProperty("columns.types", "int:int");
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        AbstractSerDe serde = new OrcSerde();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf,
                testFilePath, MyRow.class, true, properties, Reporter.NULL);
        writer.write(serde.serialize(new MyRow(1, 2), inspector));
        writer.write(serde.serialize(new MyRow(2, 2), inspector));
        writer.write(serde.serialize(new MyRow(3, 2), inspector));
        // a fresh serde is initialized from the table properties for the read side
        serde = new OrcSerde();
        SerDeUtils.initializeSerDe(serde, conf, properties, null);
        assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
        inspector = (StructObjectInspector) serde.getObjectInspector();
        assertEquals("struct<x:int,y:int>", inspector.getTypeName());
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);

        // test the validate input method
        // NOTE(review): fileList is empty and unchanged between the three calls below, yet they
        // expect false, true, false. Presumably the statements that populate fileList between
        // the calls were lost; confirm against the intended test.
        ArrayList<FileStatus> fileList = new ArrayList<FileStatus>();
        assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
        assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
        assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

        // read the whole file
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Writable value = (Writable) reader.createValue();
        int rowNum = 0;
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();

        // UNDONE: Don't know why HIVE-12894 causes this to return 0?
        // assertEquals(0.33, reader.getProgress(), 0.01);

        while (, value)) {
                    intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
                    intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
        assertEquals(3, rowNum);
        assertEquals(1.0, reader.getProgress(), 0.00001);

        // read just the first column
        ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(0));
        reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        key = reader.createKey();
        value = (Writable) reader.createValue();
        rowNum = 0;
        fields = inspector.getAllStructFieldRefs();
        while (, value)) {
            // column 0 carries the 1-based row number; the unselected column 1 must be null
            assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
            assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
        assertEquals(3, rowNum);

        // test the mapping of empty string to all columns
        reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        key = reader.createKey();
        value = (Writable) reader.createValue();
        rowNum = 0;
        fields = inspector.getAllStructFieldRefs();
        while (, value)) {
            assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
                    intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
        assertEquals(3, rowNum);

    static class SimpleRow implements Writable {
        Text z;

        public SimpleRow(Text t) {
            this.z = t;

        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("unsupported");

        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("unsupported");

    static class NestedRow implements Writable {
        int z;
        MyRow r;

        NestedRow(int x, int y, int z) {
            this.z = z;
            this.r = new MyRow(x, y);

        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("unsupported");

        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("unsupported");

    /**
     * Writes three NestedRow records through the mapred record writer of
     * OrcOutputFormat, then reads them back with only column 1 (the nested struct
     * {@code r}) appended to the read columns, expecting column 0 to be null and
     * the nested x/y values to match what was written.
     */
    public void testMROutput() throws Exception {
        // NOTE(review): this copy is truncated. The synchronized block and the
        // getReflectionObjectInspector(...) call are cut off, `while (, value))` has lost the
        // call that produces its condition, and no closing braces are present.
        Properties properties = new Properties();
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
        AbstractSerDe serde = new OrcSerde();
        OutputFormat<?, ?> outFormat = new OrcOutputFormat();
        RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
        // a fresh serde is initialized with the z,r schema for the read side
        serde = new OrcSerde();
        properties.setProperty("columns", "z,r");
        properties.setProperty("columns.types", "int:struct<x:int,y:int>");
        SerDeUtils.initializeSerDe(serde, conf, properties, null);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);
        // select only column index 1 (r) for reading
        ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(1));
        conf.set("columns", "z,r");
        conf.set("columns.types", "int:struct<x:int,y:int>");
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        int rowNum = 0;
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
        List<? extends StructField> inFields = inner.getAllStructFieldRefs();
        IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
        while (, value)) {
            assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
            Object sub = inspector.getStructFieldData(value, fields.get(1));
            // rows were written as (1,2,3), (4,5,6), (7,8,9), so x = 3*rowNum+1 and y = 3*rowNum+2
            assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
            assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
            rowNum += 1;
        assertEquals(3, rowNum);


    public void testEmptyFile() throws Exception {
        Properties properties = new Properties();
        properties.setProperty("columns", "x,y");
        properties.setProperty("columns.types", "int:int");
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf,
                testFilePath, MyRow.class, true, properties, Reporter.NULL);
        AbstractSerDe serde = new OrcSerde();
        SerDeUtils.initializeSerDe(serde, conf, properties, null);
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertTrue(0 == splits.length);
        assertEquals(null, serde.getSerDeStats());

    @Test(expected = RuntimeException.class)
    public void testSplitGenFailure() throws IOException {
        Properties properties = new Properties();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf,
                testFilePath, MyRow.class, true, properties, Reporter.NULL);
        writer.write(new OrcSerde().serialize(null, null));
        InputFormat<?, ?> in = new OrcInputFormat();
        fs.setPermission(testFilePath, FsPermission.createImmutable((short) 0333));
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        try {
            in.getSplits(conf, 1);
        } catch (RuntimeException e) {
            assertEquals(true, e.getMessage().contains("Permission denied"));
            throw e;

    static class StringRow implements Writable {
        String str;
        String str2;

        StringRow(String s) {
            str = s;
            str2 = s;

        public void write(DataOutput dataOutput) throws IOException {
            throw new UnsupportedOperationException("no write");

        public void readFields(DataInput dataInput) throws IOException {
            throw new UnsupportedOperationException("no read");

        static String getColumnNamesProperty() {
            return "str,str2";

        static String getColumnTypesProperty() {
            return "string:string";


    /**
     * Writes six StringRow records ("owen", "beth", "laurel", "hazen", "colin",
     * "miles") through OrcOutputFormat, checks the serde's inspector type name and
     * that one split is produced, then reads the rows back one at a time.
     */
    public void testDefaultTypes() throws Exception {
        // NOTE(review): this copy is truncated. The synchronized block and the
        // getReflectionObjectInspector(...) call are cut off, every `assertEquals(true,, value));`
        // has lost the call that should be its second argument, the assertEquals heads that
        // compare the field values are missing, and no closing braces are present.
        Properties properties = new Properties();
        properties.setProperty("columns", "str,str2");
        properties.setProperty("columns.types", "string:string");
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
        AbstractSerDe serde = new OrcSerde();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf,
                testFilePath, StringRow.class, true, properties, Reporter.NULL);
        writer.write(serde.serialize(new StringRow("owen"), inspector));
        writer.write(serde.serialize(new StringRow("beth"), inspector));
        writer.write(serde.serialize(new StringRow("laurel"), inspector));
        writer.write(serde.serialize(new StringRow("hazen"), inspector));
        writer.write(serde.serialize(new StringRow("colin"), inspector));
        writer.write(serde.serialize(new StringRow("miles"), inspector));
        // a fresh serde is initialized from the table properties for the read side
        serde = new OrcSerde();
        SerDeUtils.initializeSerDe(serde, conf, properties, null);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        assertEquals("struct<str:string,str2:string>", inspector.getTypeName());
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(1, splits.length);

        // read the whole file
        conf.set("columns", StringRow.getColumnNamesProperty());
        conf.set("columns.types", StringRow.getColumnTypesProperty());
        org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Writable value = (Writable) reader.createValue();
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
        // six checks expecting true (one per written row), then a final check expecting false
        assertEquals(true,, value));
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true,, value));
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true,, value));
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true,, value));
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true,, value));
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(true,, value));
                strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(false,, value));

    /**
     * Create a mock execution environment that has enough detail that
     * ORC, vectorization, HiveInputFormat, and CombineHiveInputFormat don't
     * explode.
     * @param workDir a local filesystem work directory
     * @param warehouseDir a mock filesystem warehouse directory
     * @param tableName the table name
     * @param objectInspector object inspector for the row
     * @param isVectorized should run vectorized
     * @param partitions the number of partition paths (p=0, p=1, ...) to generate
     * @return a JobConf that contains the necessary information
     * @throws IOException
     * @throws HiveException
     */
    JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, String tableName,
            ObjectInspector objectInspector, boolean isVectorized, int partitions)
            throws IOException, HiveException {
        // NOTE(review): this copy is truncated. The bodies of both `for` loops and of the
        // `if` blocks are cut off (nothing visible appends to buffer, columnIds, columnNames,
        // or columnTypes, and nothing visible configures mapWork), and no closing braces are
        // present. The missing statements must be restored before this compiles.
        JobConf conf = new JobConf();
        conf.set("hive.exec.plan", workDir.toString());
        conf.set("mapred.job.tracker", "local");
        // the same flag value drives all three vectorization-related settings
        String isVectorizedString = Boolean.toString(isVectorized);
        conf.set("hive.vectorized.execution.enabled", isVectorizedString);
        conf.set(Utilities.VECTOR_MODE, isVectorizedString);
        conf.set(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, isVectorizedString);
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        conf.set("mapred.mapper.class", ExecMapper.class.getName());
        Path root = new Path(warehouseDir, tableName);
        // clean out previous contents
        ((MockFileSystem) root.getFileSystem(conf)).clear();
        // build partition strings
        String[] partPath = new String[partitions];
        StringBuilder buffer = new StringBuilder();
        for (int p = 0; p < partitions; ++p) {
            partPath[p] = new Path(root, "p=" + p).toString();
            if (p != 0) {
        conf.set("mapred.input.dir", buffer.toString());
        StringBuilder columnIds = new StringBuilder();
        StringBuilder columnNames = new StringBuilder();
        StringBuilder columnTypes = new StringBuilder();
        StructObjectInspector structOI = (StructObjectInspector) objectInspector;
        List<? extends StructField> fields = structOI.getAllStructFieldRefs();
        int numCols = fields.size();
        for (int i = 0; i < numCols; ++i) {
            if (i != 0) {
        // NOTE(review): the configuration key below is an empty string; the property name for
        // the column id list appears to have been lost. Confirm the intended key.
        conf.set("", columnIds.toString());
        conf.set("partition_columns", "p");
        conf.set(serdeConstants.LIST_COLUMNS, columnNames.toString());
        conf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypes.toString());
        // NOTE(review): fs is not referenced again in the visible part of this method.
        MockFileSystem fs = (MockFileSystem) warehouseDir.getFileSystem(conf);

        // table descriptor: ORC serde and formats, with the column names/types built above
        Properties tblProps = new Properties();
        tblProps.put("name", tableName);
        tblProps.put("serialization.lib", OrcSerde.class.getName());
        tblProps.put("columns", columnNames.toString());
        tblProps.put("columns.types", columnTypes.toString());
        TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps);

        MapWork mapWork = new MapWork();
        if (isVectorized) {
            VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx();
            vectorizedRowBatchCtx.init(structOI, new String[0]);
        // every partition path maps to the same (empty) alias list and to its own PartitionDesc
        LinkedHashMap<Path, ArrayList<String>> aliasMap = new LinkedHashMap<>();
        ArrayList<String> aliases = new ArrayList<String>();
        LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>();
        for (int p = 0; p < partitions; ++p) {
            Path path = new Path(partPath[p]);
            aliasMap.put(path, aliases);
            LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>();
            PartitionDesc part = new PartitionDesc(tbl, partSpec);
            if (isVectorized) {
                        VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false));
            partMap.put(path, part);

        // write the plan out
        FileSystem localFs = FileSystem.getLocal(conf).getRaw();
        Path mapXml = new Path(workDir, "map.xml");
        localFs.delete(mapXml, true);
        FSDataOutputStream planStream = localFs.create(mapXml);
        SerializationUtilities.serializePlan(mapWork, planStream);
        // NOTE(review): planStream is not closed in the visible code; confirm whether a close
        // call was lost or whether serializePlan is expected to close the stream.
        conf.setBoolean(Utilities.HAS_MAP_WORK, true);
        return conf;

    /**
     * Set the mock blocks for a file after it has been written
     * @param path the path to modify
     * @param conf the configuration
     * @param blocks the blocks to use
     * @throws IOException
     */
    static void setBlocks(Path path, Configuration conf, MockBlock... blocks) throws IOException {
        // resolve the file system for the path and obtain its mock output stream
        FileSystem mockFs = path.getFileSystem(conf);
        MockOutputStream stream = (MockOutputStream) mockFs.create(path);
        // NOTE(review): the body appears truncated. Neither `blocks` nor `stream` is used
        // after this point and the closing brace is missing; presumably the statement that
        // hands `blocks` to `stream` was lost. Confirm against the intended implementation.

    static int getLength(Path path, Configuration conf) throws IOException {
        FileSystem mockFs = path.getFileSystem(conf);
        FileStatus stat = mockFs.getFileStatus(path);
        return (int) stat.getLen();

    /**
     * Test vectorization, non-acid, non-combine.
     * @throws Exception
     */
    public void testVectorization() throws Exception {
        // NOTE(review): this copy is truncated. The synchronized block, the
        // getReflectionObjectInspector(...) call and the OrcFile.createWriter(...) call are cut
        // off, each `assertEquals(<bool>,, value));` has lost the call that should be its second
        // argument, and no closing braces are present.
        // get the object inspector for MyRow
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        // vectorized environment with a single partition
        JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorization", inspector,
                true, 1);

        // write the orc file to the mock file system
        Path path = new Path(conf.get("mapred.input.dir") + "/0_0");
        Writer writer = OrcFile.createWriter(path,
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));
        setBlocks(path, conf, new MockBlock("host0", "host1"));

        // call getsplits
        HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>();
        InputSplit[] splits = inputFormat.getSplits(conf, 10);
        assertEquals(1, splits.length);

        org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader = inputFormat
                .getRecordReader(splits[0], conf, Reporter.NULL);
        NullWritable key = reader.createKey();
        VectorizedRowBatch value = reader.createValue();
        assertEquals(true,, value));
        // all ten rows are expected in one batch, with column 0 holding 0..9
        assertEquals(10, value.count());
        LongColumnVector col0 = (LongColumnVector) value.cols[0];
        for (int i = 0; i < 10; i++) {
            assertEquals("checking " + i, i, col0.vector[i]);
        assertEquals(false,, value));

    /**
     * Test vectorization with a bucket count configured, non-acid, non-combine.
     * @throws Exception
     */
    public void testVectorizationWithBuckets() throws Exception {
        // NOTE(review): this copy is truncated. The synchronized block, the
        // getReflectionObjectInspector(...) call and the OrcFile.createWriter(...) call are cut
        // off, each `assertEquals(<bool>,, value));` has lost the call that should be its second
        // argument, and no closing braces are present.
        // get the object inspector for MyRow
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        // vectorized environment with a single partition
        JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorBuckets", inspector,
                true, 1);

        // write the orc file to the mock file system
        Path path = new Path(conf.get("mapred.input.dir") + "/0_0");
        Writer writer = OrcFile.createWriter(path,
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));
        setBlocks(path, conf, new MockBlock("host0", "host1"));

        // call getsplits
        // a bucket count of 3 is set before computing splits; one split is still expected
        conf.setInt(hive_metastoreConstants.BUCKET_COUNT, 3);
        HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>();
        InputSplit[] splits = inputFormat.getSplits(conf, 10);
        assertEquals(1, splits.length);

        org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader = inputFormat
                .getRecordReader(splits[0], conf, Reporter.NULL);
        NullWritable key = reader.createKey();
        VectorizedRowBatch value = reader.createValue();
        assertEquals(true,, value));
        // all ten rows are expected in one batch, with column 0 holding 0..9
        assertEquals(10, value.count());
        LongColumnVector col0 = (LongColumnVector) value.cols[0];
        for (int i = 0; i < 10; i++) {
            assertEquals("checking " + i, i, col0.vector[i]);
        assertEquals(false,, value));

    // test acid with vectorization, no combine
    /**
     * Writes 100 BigRow records as an ACID base through OrcRecordUpdater, then
     * reads them back as one vectorized batch and checks every column vector
     * against the values derived from the row index.
     */
    public void testVectorizationWithAcid() throws Exception {
        // NOTE(review): this copy is truncated. The OrcRecordUpdater constructor call is cut
        // off, each `assertEquals(<bool>,, value));` has lost the call that should be its second
        // argument, and no closing braces are present.
        StructObjectInspector inspector = new BigRowInspector();
        JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorizationAcid", inspector,
                true, 1);

        // write the orc file to the mock file system
        Path partDir = new Path(conf.get("mapred.input.dir"));
        OrcRecordUpdater writer = new OrcRecordUpdater(partDir,
                new AcidOutputFormat.Options(conf).maximumTransactionId(10).writingBase(true).bucket(0)
        for (int i = 0; i < 100; ++i) {
            BigRow row = new BigRow(i);
            writer.insert(10, row);
        Path path = new Path("mock:/vectorizationAcid/p=0/base_0000010/bucket_00000");
        setBlocks(path, conf, new MockBlock("host0", "host1"));

        // call getsplits
        HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>();
        InputSplit[] splits = inputFormat.getSplits(conf, 10);
        assertEquals(1, splits.length);

        // schema-evolution columns and the transactional-scan flag are set before opening the reader
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
        HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);

        org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader = inputFormat
                .getRecordReader(splits[0], conf, Reporter.NULL);
        NullWritable key = reader.createKey();
        VectorizedRowBatch value = reader.createValue();
        assertEquals(true,, value));
        assertEquals(100, value.count());
        // columns 0-10 are cast to the vector type used for each BigRow field
        LongColumnVector booleanColumn = (LongColumnVector) value.cols[0];
        LongColumnVector byteColumn = (LongColumnVector) value.cols[1];
        LongColumnVector shortColumn = (LongColumnVector) value.cols[2];
        LongColumnVector intColumn = (LongColumnVector) value.cols[3];
        LongColumnVector longColumn = (LongColumnVector) value.cols[4];
        DoubleColumnVector floatColumn = (DoubleColumnVector) value.cols[5];
        DoubleColumnVector doubleCoulmn = (DoubleColumnVector) value.cols[6];
        BytesColumnVector stringColumn = (BytesColumnVector) value.cols[7];
        DecimalColumnVector decimalColumn = (DecimalColumnVector) value.cols[8];
        LongColumnVector dateColumn = (LongColumnVector) value.cols[9];
        TimestampColumnVector timestampColumn = (TimestampColumnVector) value.cols[10];
        for (int i = 0; i < 100; i++) {
            assertEquals("checking boolean " + i, i % 2 == 0 ? 1 : 0, booleanColumn.vector[i]);
            assertEquals("checking byte " + i, (byte) i, byteColumn.vector[i]);
            assertEquals("checking short " + i, (short) i, shortColumn.vector[i]);
            assertEquals("checking int " + i, i, intColumn.vector[i]);
            assertEquals("checking long " + i, i, longColumn.vector[i]);
            assertEquals("checking float " + i, i, floatColumn.vector[i], 0.0001);
            assertEquals("checking double " + i, i, doubleCoulmn.vector[i], 0.0001);
            // the string column is compared as Text against the hex form of the row index
            Text strValue = new Text();
            strValue.set(stringColumn.vector[i], stringColumn.start[i], stringColumn.length[i]);
            assertEquals("checking string " + i, new Text(Long.toHexString(i)), strValue);
            assertEquals("checking decimal " + i, HiveDecimal.create(i), decimalColumn.vector[i].getHiveDecimal());
            assertEquals("checking date " + i, i, dateColumn.vector[i]);
            // expected timestamp: i days in milliseconds, shifted by the local time zone offset
            long millis = (long) i * MILLIS_IN_DAY;
            millis -= LOCAL_TIMEZONE.getOffset(millis);
            assertEquals("checking timestamp " + i, millis, timestampColumn.getTime(i));
        assertEquals(false,, value));

    // test non-vectorized, non-acid, combine
    /**
     * Writes two ORC files (rows 0-9 and rows 10-19) into one partition
     * directory, expects CombineHiveInputFormat to return a single combined split
     * covering both paths, and reads 20 rows back from that split.
     */
    public void testCombinationInputFormat() throws Exception {
        // NOTE(review): this copy is truncated. The synchronized block, the
        // getReflectionObjectInspector(...) call and both OrcFile.createWriter(...) calls are
        // cut off, each `assertEquals(<bool>,, value));` has lost the call that should be its
        // second argument, and no closing braces are present.
        // get the object inspector for MyRow
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        // non-vectorized environment with a single partition
        JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "combination", inspector,
                false, 1);

        // write the orc file to the mock file system
        Path partDir = new Path(conf.get("mapred.input.dir"));
        Writer writer = OrcFile.createWriter(new Path(partDir, "0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));
        Path path = new Path("mock:/combination/p=0/0_0");
        setBlocks(path, conf, new MockBlock("host0", "host1"));
        // NOTE(review): mockFs is not referenced again in the visible part of this method.
        MockFileSystem mockFs = (MockFileSystem) partDir.getFileSystem(conf);
        int length0 = getLength(path, conf);
        writer = OrcFile.createWriter(new Path(partDir, "1_0"),
        for (int i = 10; i < 20; ++i) {
            writer.addRow(new MyRow(i, 2 * i));
        Path path1 = new Path("mock:/combination/p=0/1_0");
        setBlocks(path1, conf, new MockBlock("host1", "host2"));

        // call getsplits
        HiveInputFormat<?, ?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>();
        InputSplit[] splits = inputFormat.getSplits(conf, 1);
        assertEquals(1, splits.length);
        CombineHiveInputFormat.CombineHiveInputSplit split = (CombineHiveInputFormat.CombineHiveInputSplit) splits[0];

        // check split
        assertEquals(2, split.getNumPaths());
        assertEquals(partDir.toString() + "/0_0", split.getPath(0).toString());
        assertEquals(partDir.toString() + "/1_0", split.getPath(1).toString());
        assertEquals(length0, split.getLength(0));
        assertEquals(getLength(path1, conf), split.getLength(1));
        assertEquals(0, split.getOffset(0));
        assertEquals(0, split.getOffset(1));
        // hadoop-1 gets 3 and hadoop-2 gets 0. *sigh*
        // best answer would be 1.
        assertTrue(3 >= split.getLocations().length);

        // read split
        org.apache.hadoop.mapred.RecordReader<CombineHiveKey, OrcStruct> reader = inputFormat.getRecordReader(split,
                conf, Reporter.NULL);
        CombineHiveKey key = reader.createKey();
        OrcStruct value = reader.createValue();
        // field 0 of the i-th row read is expected to equal i across both files
        for (int i = 0; i < 20; i++) {
            assertEquals(true,, value));
            assertEquals(i, ((IntWritable) value.getFieldValue(0)).get());
        assertEquals(false,, value));

    /**
     * Exercises the non-vectorized, ACID, combine code path.
     * <p>
     * Builds a mock execution environment with {@code PARTITIONS} partitions, writes rows through
     * {@code OrcRecordUpdater} into partition 0 and {@code BUCKETS} single-row ORC files into
     * partition 1, then asserts on the three splits returned by {@code CombineHiveInputFormat}:
     * two {@code HiveInputSplit}s for the partition-0 bucket files and one
     * {@code CombineHiveInputSplit} covering all partition-1 files.
     * <p>
     * NOTE(review): several statements here look cut off in this view (unterminated calls,
     * unclosed loops) -- confirm against the complete file.
     */
    public void testCombinationInputFormatWithAcid() throws Exception {
        // get the object inspector for MyRow
        StructObjectInspector inspector;
        final int PARTITIONS = 2;
        final int BUCKETS = 3;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "combinationAcid", inspector,
                false, PARTITIONS);

        // write the orc file to the mock file system
        Path[] partDir = new Path[PARTITIONS];
        // one input directory per partition, as configured under "mapred.input.dir"
        String[] paths = conf.getStrings("mapred.input.dir");
        for (int p = 0; p < PARTITIONS; ++p) {
            partDir[p] = new Path(paths[p]);

        // write a base file in partition 0
        OrcRecordUpdater writer = new OrcRecordUpdater(partDir[0],
                new AcidOutputFormat.Options(conf).maximumTransactionId(10).writingBase(true).bucket(0)
        for (int i = 0; i < 10; ++i) {
            writer.insert(10, new MyRow(i, 2 * i));

        // base file
        Path base0 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00000");
        setBlocks(base0, conf, new MockBlock("host1", "host2"));

        // write a delta file in partition 0
        writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf).maximumTransactionId(10)
        for (int i = 10; i < 20; ++i) {
            writer.insert(10, new MyRow(i, 2 * i));
        Path base1 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00001");
        setBlocks(base1, conf, new MockBlock("host1", "host2"));

        // write three files in partition 1
        for (int bucket = 0; bucket < BUCKETS; ++bucket) {
            Path path = new Path(partDir[1], "00000" + bucket + "_0");
            Writer orc = OrcFile.createWriter(path,
            orc.addRow(new MyRow(1, 2));
            setBlocks(path, conf, new MockBlock("host3", "host4"));

        // call getsplits
        conf.setInt(hive_metastoreConstants.BUCKET_COUNT, BUCKETS);
        HiveInputFormat<?, ?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>();
        InputSplit[] splits = inputFormat.getSplits(conf, 1);
        assertEquals(3, splits.length);
        // splits[0] and splits[1]: the two partition-0 bucket files, each starting at offset 0
        HiveInputFormat.HiveInputSplit split = (HiveInputFormat.HiveInputSplit) splits[0];
        assertEquals("", split.inputFormatClassName());
        assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00000", split.getPath().toString());
        assertEquals(0, split.getStart());
        assertEquals(648, split.getLength());
        split = (HiveInputFormat.HiveInputSplit) splits[1];
        assertEquals("", split.inputFormatClassName());
        assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00001", split.getPath().toString());
        assertEquals(0, split.getStart());
        assertEquals(674, split.getLength());
        // splits[2]: a combined split holding one entry per partition-1 file
        CombineHiveInputFormat.CombineHiveInputSplit combineSplit = (CombineHiveInputFormat.CombineHiveInputSplit) splits[2];
        assertEquals(BUCKETS, combineSplit.getNumPaths());
        for (int bucket = 0; bucket < BUCKETS; ++bucket) {
            assertEquals("mock:/combinationAcid/p=1/00000" + bucket + "_0",
            assertEquals(0, combineSplit.getOffset(bucket));
            assertEquals(241, combineSplit.getLength(bucket));
        // the partition-1 files were all given blocks on host3/host4, hence two locations
        String[] hosts = combineSplit.getLocations();
        assertEquals(2, hosts.length);

    /**
     * Checks that {@code OrcInputFormat.setSearchArgument} fills in the column names and the
     * search argument on a {@code Reader.Options} instance.
     * <p>
     * The test pushes an {@code isNull("cost")} search argument and the read-column list
     * {@code "url,cost"} into the configuration, then expects the resulting column-name array to
     * be {@code [null, "url", null, "cost", null]} and the single predicate leaf to be
     * {@code IS_NULL} on {@code "cost"}.
     * <p>
     * NOTE(review): the type-builder chains below start mid-expression in this view -- the lines
     * that open them appear to be missing; confirm against the complete file.
     */
    public void testSetSearchArgument() throws Exception {
        Reader.Options options = new Reader.Options();
        List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
        OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
                .addAllFieldNames(Arrays.asList("op", "otid", "bucket", "rowid", "ctid", "row"))
                .addAllSubtypes(Arrays.asList(1, 2, 3, 4, 5, 6));
                .addAllFieldNames(Arrays.asList("url", "purchase", "cost", "store"))
                .addAllSubtypes(Arrays.asList(7, 8, 9, 10));
        // predicate under test: "cost IS NULL"
        SearchArgument isNull = SearchArgumentFactory.newBuilder().startAnd()
                .isNull("cost", PredicateLeaf.Type.LONG).end().build();
        conf.set(ConvertAstToSearchArg.SARG_PUSHDOWN, toKryo(isNull));
        conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "url,cost");
        options.include(new boolean[] { true, true, false, true, false });
        OrcInputFormat.setSearchArgument(options, types, conf, false);
        // only the slots for "url" and "cost" are expected to carry a name
        String[] colNames = options.getColumnNames();
        assertEquals(null, colNames[0]);
        assertEquals("url", colNames[1]);
        assertEquals(null, colNames[2]);
        assertEquals("cost", colNames[3]);
        assertEquals(null, colNames[4]);
        SearchArgument arg = options.getSearchArgument();
        List<PredicateLeaf> leaves = arg.getLeaves();
        assertEquals("cost", leaves.get(0).getColumnName());
        assertEquals(PredicateLeaf.Operator.IS_NULL, leaves.get(0).getOperator());

    /**
     * Checks that a pushed-down search argument which no row can satisfy eliminates every split.
     * <p>
     * Three {@code NestedRow}s with first-field values 1, 4 and 7 are written through
     * {@code OrcOutputFormat}; the predicate {@code z < 0} is then pushed down and
     * {@code OrcInputFormat.getSplits} is expected to return an empty array.
     * <p>
     * NOTE(review): the inspector initialisation appears truncated in this view -- confirm
     * against the complete file.
     */
    public void testSplitElimination() throws Exception {
        Properties properties = new Properties();
        properties.setProperty("columns", "z,r");
        properties.setProperty("columns.types", "int:struct<x:int,y:int>");
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
        AbstractSerDe serde = new OrcSerde();
        OutputFormat<?, ?> outFormat = new OrcOutputFormat();
        conf.setInt("mapred.max.split.size", 50);
        RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
        writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
        // fresh serde for the read side
        serde = new OrcSerde();
        // predicate "z < 0"
        SearchArgument sarg = SearchArgumentFactory.newBuilder().startAnd()
                .lessThan("z", PredicateLeaf.Type.LONG, new Long(0)).end().build();
        conf.set("sarg.pushdown", toKryo(sarg));
        // NOTE(review): the property key here is an empty string; presumably this was meant to be
        // the read-column-names key -- confirm.
        conf.set("", "z,r");
        SerDeUtils.initializeSerDe(serde, conf, properties, null);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(0, splits.length);

    /**
     * Checks split elimination when every written value is {@code null}.
     * <p>
     * Three {@code SimpleRow(null)} rows are written through {@code OrcOutputFormat}; the string
     * predicate {@code z < "foo"} is pushed down and {@code OrcInputFormat.getSplits} is expected
     * to return an empty array.
     */
    public void testSplitEliminationNullStats() throws Exception {
        Properties properties = new Properties();
        StructObjectInspector inspector = createSoi();
        AbstractSerDe serde = new OrcSerde();
        OutputFormat<?, ?> outFormat = new OrcOutputFormat();
        conf.setInt("mapred.max.split.size", 50);
        RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
        writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
        writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
        writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
        // fresh serde for the read side
        serde = new OrcSerde();
        // predicate "z < 'foo'"
        SearchArgument sarg = SearchArgumentFactory.newBuilder().startAnd()
                .lessThan("z", PredicateLeaf.Type.STRING, new String("foo")).end().build();
        conf.set("sarg.pushdown", toKryo(sarg));
        // NOTE(review): the property key here is an empty string; presumably this was meant to be
        // the read-column-names key -- confirm.
        conf.set("", "z");
        properties.setProperty("columns", "z");
        properties.setProperty("columns.types", "string");
        SerDeUtils.initializeSerDe(serde, conf, properties, null);
        inspector = (StructObjectInspector) serde.getObjectInspector();
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        InputSplit[] splits = in.getSplits(conf, 1);
        assertEquals(0, splits.length);

    /**
     * Checks split generation when invoked under a different {@code UserGroupInformation}.
     * <p>
     * With a single split-computation thread and the ETL strategy configured, the test first runs
     * {@code OrcInputFormat.generateSplitsInfo} inside {@code ugi.doAs} for a user whose name is
     * the current short user name plus {@code "-foo"}, and expects an exception whose cause chain
     * contains {@code MockFileSystem.MockAccessDenied}. It then generates splits for
     * {@code mock:/ugi/2} outside {@code doAs} and expects exactly one split. The thread-pool
     * size is asserted to be 0 before the first call and 1 after it.
     * <p>
     * NOTE(review): closing braces and the body of the {@code finally} block are not visible in
     * this view -- confirm against the complete file.
     */
    public void testDoAs() throws Exception {
        conf.setInt(ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS.varname, 1);
        conf.set(ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
        conf.setBoolean(ConfVars.HIVE_IN_TEST.varname, true);
        conf.setClass("fs.mock.impl", MockFileSystem.class, FileSystem.class);
        String badUser = UserGroupInformation.getCurrentUser().getShortUserName() + "-foo";
        OrcInputFormat.Context.resetThreadPool(); // We need the size above to take effect.
        try {
            // OrcInputFormat will get a mock fs from FileSystem.get; add global files.
            MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/1/file", 10000, createMockOrcFile(197, 300, 600),
                    new MockBlock("host1-1", "host1-2", "host1-3")));
            MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/2/file", 10000, createMockOrcFile(197, 300, 600),
                    new MockBlock("host1-1", "host1-2", "host1-3")));
            FileInputFormat.setInputPaths(conf, "mock:/ugi/1");
            UserGroupInformation ugi = UserGroupInformation.createUserForTesting(badUser, new String[0]);
            // the pool was just reset, so its size is expected to be 0 at this point
            assertEquals(0, OrcInputFormat.Context.getCurrentThreadPoolSize());
            try {
                ugi.doAs(new PrivilegedExceptionAction<Void>() {
                    public Void run() throws Exception {
                        OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null));
                        return null;
                fail("Didn't throw");
            } catch (Exception ex) {
                // walk the cause chain looking for the mock access-denied error
                Throwable cause = ex;
                boolean found = false;
                while (cause != null) {
                    if (cause instanceof MockFileSystem.MockAccessDenied) {
                        found = true; // Expected.
                    cause = cause.getCause();
                if (!found)
                    throw ex; // Unexpected.
            assertEquals(1, OrcInputFormat.Context.getCurrentThreadPoolSize());
            // second attempt: different input path, no doAs wrapper
            FileInputFormat.setInputPaths(conf, "mock:/ugi/2");
            List<OrcSplit> splits = OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null));
            assertEquals(1, splits.size());
        } finally {

    /**
     * Returns a reflection-based {@code StructObjectInspector} for {@code SimpleRow}, obtained
     * while holding the {@code TestOrcFile.class} monitor.
     * <p>
     * NOTE(review): the factory call's remaining arguments are not visible in this view.
     */
    private StructObjectInspector createSoi() {
        synchronized (TestOrcFile.class) {
            return (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(SimpleRow.class,

    /**
     * Counts the mock-filesystem read operations issued by one {@code getSplits} call over a
     * directory containing two ORC files ({@code 0_0} and {@code 0_1}, ten rows each).
     * <p>
     * Expects three read ops (one directory listing plus one open per file) and two splits.
     * <p>
     * NOTE(review): the writer-creation calls and loop bodies appear truncated in this view --
     * confirm against the complete file.
     */
    public void testSplitGenReadOps() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        conf.set("mapred.input.dir", "mock:///mocktable");
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        MockPath mockPath = new MockPath(fs, "mock:///mocktable");
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // snapshot the read-op counter of the "mock" scheme before generating splits
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        // read ops consumed by the getSplits call above
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktable
        // call-2: open - mock:/mocktable/0_0
        // call-3: open - mock:/mocktable/0_1
        assertEquals(3, readOpsDelta);

        assertEquals(2, splits.length);
        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Counts mock-filesystem read operations across four successive {@code getSplits} calls over
     * the same two ORC files while the cache size and split strategy settings change.
     * <p>
     * Expected read-op deltas, in order:
     * <ol>
     * <li>stripe-details cache memory size set to {@code "0"}: 3 (listing + two opens);</li>
     * <li>split strategy {@code "BI"}: 1 (listing only);</li>
     * <li>cache memory size {@code "10Mb"} with strategy {@code "HYBRID"}: 3;</li>
     * <li>same settings again: 1.</li>
     * </ol>
     * Every call is expected to yield two splits.
     * <p>
     * NOTE(review): the writer-creation calls and loop bodies appear truncated in this view --
     * confirm against the complete file.
     */
    public void testSplitGenReadOpsLocalCache() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        // creates the static cache
        MockPath mockPath = new MockPath(fs, "mock:///mocktbl");
        conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // round 1: cache memory size "0"
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl
        // call-2: open - mock:/mocktbl/0_0
        // call-3: open - mock:/mocktbl/0_1
        assertEquals(3, readOpsDelta);

        // force BI to avoid reading footers
        conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl
        assertEquals(1, readOpsDelta);

        // enable cache and use default strategy
        conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb");
        conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "HYBRID");
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl
        // call-2: open - mock:/mocktbl/0_0
        // call-3: open - mock:/mocktbl/0_1
        assertEquals(3, readOpsDelta);

        // same configuration again: only the directory listing is expected this time
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl
        assertEquals(1, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Counts mock-filesystem read operations across three {@code getSplits} calls when both ORC
     * files are rewritten with more rows between the first and second call.
     * <p>
     * Expected read-op deltas, in order: 3 for the initial ten-row files, 3 again after both
     * files are rewritten with 100 rows, then 1 when nothing has changed. Every call is expected
     * to yield two splits.
     * <p>
     * NOTE(review): the writer-creation calls and loop bodies appear truncated in this view --
     * confirm against the complete file.
     */
    public void testSplitGenReadOpsLocalCacheChangeFileLen() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        // creates the static cache
        MockPath mockPath = new MockPath(fs, "mock:///mocktbl1");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // round 1: initial ten-row files
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl1
        // call-2: open - mock:/mocktbl1/0_0
        // call-3: open - mock:/mocktbl1/0_1
        assertEquals(3, readOpsDelta);

        // change file length and look for cache misses


        writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 100; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 100; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // round 2: both files rewritten with 100 rows
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl1
        // call-2: open - mock:/mocktbl1/0_0
        // call-3: open - mock:/mocktbl1/0_1
        assertEquals(3, readOpsDelta);

        // round 3: nothing changed since the previous call
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl1
        assertEquals(1, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Counts mock-filesystem read operations across four {@code getSplits} calls when the two ORC
     * files are touched one at a time between calls.
     * <p>
     * Expected read-op deltas, in order: 3 initially, 2 after {@code 0_0} is touched, 2 after
     * {@code 0_1} is touched, then 1 when nothing has changed. Every call is expected to yield
     * two splits. The test also sets {@code hive.orc.cache.use.soft.references} to {@code true}.
     * <p>
     * NOTE(review): the writer-creation calls and loop bodies appear truncated in this view --
     * confirm against the complete file.
     */
    public void testSplitGenReadOpsLocalCacheChangeModificationTime() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        // creates the static cache
        MockPath mockPath = new MockPath(fs, "mock:///mocktbl2");
        conf.set("hive.orc.cache.use.soft.references", "true");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // round 1: initial state
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl2
        // call-2: open - mock:/mocktbl2/0_0
        // call-3: open - mock:/mocktbl2/0_1
        assertEquals(3, readOpsDelta);

        // change file modification time and look for cache misses
        FileSystem fs1 = FileSystem.get(conf);
        MockFile mockFile = ((MockFileSystem) fs1).findFile(new Path(mockPath + "/0_0"));
        ((MockFileSystem) fs1).touch(mockFile);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl2
        // call-2: open - mock:/mocktbl2/0_1
        // NOTE(review): the file touched above is 0_0, so the re-opened file is presumably 0_0 -- confirm
        assertEquals(2, readOpsDelta);

        // touch the next file
        fs1 = FileSystem.get(conf);
        mockFile = ((MockFileSystem) fs1).findFile(new Path(mockPath + "/0_1"));
        ((MockFileSystem) fs1).touch(mockFile);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl2
        // call-2: open - mock:/mocktbl2/0_0
        // NOTE(review): the file touched above is 0_1, so the re-opened file is presumably 0_1 -- confirm
        assertEquals(2, readOpsDelta);

        // final round: nothing touched since the previous call
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        orcInputFormat = new OrcInputFormat();
        splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: listLocatedStatus - mock:/mocktbl2
        assertEquals(1, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Counts the mock-filesystem read operations needed to open record readers when
     * {@code hive.orc.splits.include.file.footer} is {@code "false"}.
     * <p>
     * After generating two splits over two ten-row ORC files, the test asserts that each split is
     * an {@code OrcSplit} without a footer, opens a record reader for each using {@code conf},
     * and expects four read ops in total (two per split, per the call list below).
     * <p>
     * NOTE(review): the writer-creation calls and loop bodies appear truncated in this view --
     * confirm against the complete file.
     */
    public void testNonVectorReaderNoFooterSerialize() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable1");
        conf.set("hive.orc.splits.include.file.footer", "false");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        // the counter is sampled after getSplits, so only the reader opens below are measured
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            if (split instanceof OrcSplit) {
                assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in"
                        + " orc splits.", ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, conf, null);

        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read footer - split 1 => mock:/mocktable1/0_0
        // call-2: open to read data - split 1 => mock:/mocktable1/0_0
        // call-3: open to read footer - split 2 => mock:/mocktable1/0_1
        // call-4: open to read data - split 2 => mock:/mocktable1/0_1
        assertEquals(4, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Counts the mock-filesystem read operations needed to open record readers when
     * {@code hive.orc.splits.include.file.footer} is {@code "true"}.
     * <p>
     * After generating two splits over two ten-row ORC files, the test asserts that each split is
     * an {@code OrcSplit} carrying a footer, opens a record reader for each using {@code conf},
     * and expects two read ops in total (one per split, per the call list below).
     * <p>
     * NOTE(review): the writer-creation calls and loop bodies appear truncated in this view --
     * confirm against the complete file.
     */
    public void testNonVectorReaderFooterSerialize() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable2");
        conf.set("hive.orc.splits.include.file.footer", "true");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        // the counter is sampled after getSplits, so only the reader opens below are measured
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            if (split instanceof OrcSplit) {
                assertTrue("Footer serialize test for non-vector reader, hasFooter is expected in" + " orc splits.",
                        ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, conf, null);

        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read data - split 1 => mock:/mocktable2/0_0
        // call-2: open to read data - split 2 => mock:/mocktable2/0_1
        assertEquals(2, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Variant of the no-footer read-op test that opens record readers with a {@code JobConf}
     * obtained from {@code createMockExecutionEnvironment} (called with {@code true} as its
     * fifth argument) and {@code Reporter.NULL}.
     * <p>
     * With {@code hive.orc.splits.include.file.footer} set to {@code "false"}, two splits are
     * generated over two ten-row ORC files and four read ops are expected while opening the
     * readers (two per split, per the call list below).
     * <p>
     * NOTE(review): the writer-creation calls, loop bodies and the footer assertion inside the
     * split loop appear truncated in this view -- confirm against the complete file.
     */
    public void testVectorReaderNoFooterSerialize() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable3");
        conf.set("hive.orc.splits.include.file.footer", "false");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "mocktable3", inspector,
                true, 0);
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // splits are generated from conf; readers below are opened with jobConf
        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);

        // the counter is sampled after getSplits, so only the reader opens below are measured
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            if (split instanceof OrcSplit) {
                        "No footer serialize test for vector reader, hasFooter is not expected in" + " orc splits.",
                        ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL);

        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read footer - split 1 => mock:/mocktable3/0_0
        // call-2: open to read data - split 1 => mock:/mocktable3/0_0
        // call-3: open to read footer - split 2 => mock:/mocktable3/0_1
        // call-4: open to read data - split 2 => mock:/mocktable3/0_1
        assertEquals(4, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Variant of the footer-included read-op test that opens record readers with a
     * {@code JobConf} obtained from {@code createMockExecutionEnvironment} (called with
     * {@code true} as its fifth argument) and {@code Reporter.NULL}.
     * <p>
     * With {@code hive.orc.splits.include.file.footer} set to {@code "true"}, two splits are
     * generated over two ten-row ORC files, each is asserted to carry a footer, and two read ops
     * are expected while opening the readers (one per split, per the call list below).
     * <p>
     * NOTE(review): the writer-creation calls and loop bodies appear truncated in this view --
     * confirm against the complete file.
     */
    public void testVectorReaderFooterSerialize() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable4");
        conf.set("hive.orc.splits.include.file.footer", "true");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "mocktable4", inspector,
                true, 0);
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // splits are generated from conf; readers below are opened with jobConf
        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);

        // the counter is sampled after getSplits, so only the reader opens below are measured
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            if (split instanceof OrcSplit) {
                assertTrue("Footer serialize test for vector reader, hasFooter is expected in" + " orc splits.",
                        ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL);

        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read data - split 1 => mock:/mocktable4/0_0
        // call-2: open to read data - split 2 => mock:/mocktable4/0_1
        assertEquals(2, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Checks that, with {@code hive.transactional.table.scan} enabled and
     * {@code hive.orc.splits.include.file.footer} set to {@code "false"}, the two splits produced for
     * two ORC files on the mock file system do not report {@code hasFooter()}, and that opening a
     * record reader for each split costs exactly six read operations on that file system.
     *
     * <p>NOTE(review): several statements in this method look truncated in this copy (unterminated
     * calls, no closing braces) -- confirm against the upstream file before editing the code.
     *
     * @throws Exception if any file system, writer or reader call fails
     */
    public void testACIDReaderNoFooterSerialize() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable5");
        // Transactional scan with the schema-evolution columns taken from MyRow.
        conf.set("hive.transactional.table.scan", "true");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
        // Footers are NOT shipped in the splits for this variant.
        conf.set("hive.orc.splits.include.file.footer", "false");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // Second input file with the same ten rows.
        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        // Snapshot the mock file system's read-op counter before any record reader is opened;
        // -1 means no statistics entry with scheme "mock" was found.
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        // Unlike the vector-reader tests, the readers here are opened with conf itself.
        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            if (split instanceof OrcSplit) {
                assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in"
                        + " orc splits.", ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, conf, Reporter.NULL);

        // Read ops consumed by opening the record readers.
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read footer - split 1 => mock:/mocktable5/0_0
        // call-2: open to read data - split 1 => mock:/mocktable5/0_0
        // call-3: open to read footer - split 2 => mock:/mocktable5/0_1
        // call-4: open to read data - split 2 => mock:/mocktable5/0_1
        // call-5: AcidUtils.getAcidState - getLen() mock:/mocktable5/0_0
        // call-6: AcidUtils.getAcidState - getLen() mock:/mocktable5/0_1
        assertEquals(6, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Checks that, with {@code hive.transactional.table.scan} enabled and
     * {@code hive.orc.splits.include.file.footer} set to {@code "true"}, the two splits produced for
     * two ORC files on the mock file system report {@code hasFooter()}, and that opening a record
     * reader for each split costs exactly four read operations on that file system.
     *
     * <p>NOTE(review): several statements in this method look truncated in this copy (unterminated
     * calls, no closing braces) -- confirm against the upstream file before editing the code.
     *
     * @throws Exception if any file system, writer or reader call fails
     */
    public void testACIDReaderFooterSerialize() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable6");
        // Transactional scan with the schema-evolution columns taken from MyRow.
        conf.set("hive.transactional.table.scan", "true");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
        // Footers are shipped in the splits for this variant.
        conf.set("hive.orc.splits.include.file.footer", "true");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // Second input file with the same ten rows.
        writer = OrcFile.createWriter(new Path(mockPath + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        assertEquals(2, splits.length);
        // Snapshot the mock file system's read-op counter before any record reader is opened;
        // -1 means no statistics entry with scheme "mock" was found.
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        // Every split must carry the footer; the readers are opened with conf itself.
        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            if (split instanceof OrcSplit) {
                assertTrue("Footer serialize test for ACID reader, hasFooter is expected in" + " orc splits.",
                        ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, conf, Reporter.NULL);

        // Read ops consumed by opening the record readers.
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read data - split 1 => mock:/mocktable6/0_0
        // call-2: open to read data - split 2 => mock:/mocktable6/0_1
        // call-3: AcidUtils.getAcidState - getLen() mock:/mocktable6/0_0
        // call-4: AcidUtils.getAcidState - getLen() mock:/mocktable6/0_1
        assertEquals(4, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Same scenario as the no-footer ACID reader test, except that the second ORC file is written
     * under a {@code delta_001_002} sub-directory of the table path. Expects a single split that
     * does not report {@code hasFooter()}, and exactly five read operations on the mock file system
     * while opening its record reader.
     *
     * <p>NOTE(review): several statements in this method look truncated in this copy (unterminated
     * calls, no closing braces, and the assertion call that should open the statement ending at
     * {@code hasFooter());} is missing) -- confirm against the upstream file before editing the code.
     *
     * @throws Exception if any file system, writer or reader call fails
     */
    public void testACIDReaderNoFooterSerializeWithDeltas() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable7");
        // Transactional scan with the schema-evolution columns taken from MyRow.
        conf.set("hive.transactional.table.scan", "true");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
        // Footers are NOT shipped in the splits for this variant.
        conf.set("hive.orc.splits.include.file.footer", "false");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // The second file goes into a delta directory instead of next to the first file.
        writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        // Only one split is expected even though two files were written.
        assertEquals(1, splits.length);
        // Snapshot the mock file system's read-op counter before any record reader is opened;
        // -1 means no statistics entry with scheme "mock" was found.
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            // NOTE: don't be surprised if deltas value is different
            // in older release deltas=2 as min and max transaction are added separately to delta list.
            // in newer release since both of them are put together deltas=1
            if (split instanceof OrcSplit) {
                        "No footer serialize test for ACID reader, hasFooter is not expected in" + " orc splits.",
                        ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, conf, Reporter.NULL);

        // Read ops consumed by opening the record reader.
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read footer - split 1 => mock:/mocktable7/0_0
        // call-2: open to read data - split 1 => mock:/mocktable7/0_0
        // call-3: open side file (flush length) of delta directory
        // call-4: fs.exists() check for delta_xxx_xxx/bucket_00000 file
        // call-5: AcidUtils.getAcidState - getLen() mock:/mocktable7/0_0
        assertEquals(5, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Same scenario as the footer-serializing ACID reader test, except that the second ORC file is
     * written under a {@code delta_001_002} sub-directory of the table path. Expects a single split
     * that reports {@code hasFooter()}, and exactly four read operations on the mock file system
     * while opening its record reader.
     *
     * <p>NOTE(review): several statements in this method look truncated in this copy (unterminated
     * calls, no closing braces) -- confirm against the upstream file before editing the code.
     *
     * @throws Exception if any file system, writer or reader call fails
     */
    public void testACIDReaderFooterSerializeWithDeltas() throws Exception {
        MockFileSystem fs = new MockFileSystem(conf);
        MockPath mockPath = new MockPath(fs, "mock:///mocktable8");
        // Transactional scan with the schema-evolution columns taken from MyRow.
        conf.set("hive.transactional.table.scan", "true");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
        // Footers are shipped in the splits for this variant.
        conf.set("hive.orc.splits.include.file.footer", "true");
        conf.set("mapred.input.dir", mockPath.toString());
        conf.set("fs.defaultFS", "mock:///");
        conf.set("fs.mock.impl", MockFileSystem.class.getName());
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        // The second file goes into a delta directory instead of next to the first file.
        writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"),
        for (int i = 0; i < 10; ++i) {
            writer.addRow(new MyRow(i, 2 * i));

        OrcInputFormat orcInputFormat = new OrcInputFormat();
        InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
        // Only one split is expected even though two files were written.
        assertEquals(1, splits.length);
        // Snapshot the mock file system's read-op counter before any record reader is opened;
        // -1 means no statistics entry with scheme "mock" was found.
        int readOpsBefore = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsBefore = statistics.getReadOps();
        assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);

        for (InputSplit split : splits) {
            assertTrue("OrcSplit is expected", split instanceof OrcSplit);
            // ETL strategies will have start=3 (start of first stripe)
            // NOTE: don't be surprised if deltas value is different
            // in older release deltas=2 as min and max transaction are added separately to delta list.
            // in newer release since both of them are put together deltas=1
            if (split instanceof OrcSplit) {
                // NOTE(review): the message below says "not expected", yet assertTrue requires
                // hasFooter() to be true; the text looks copied from the no-footer variant.
                assertTrue("Footer serialize test for ACID reader, hasFooter is not expected in" + " orc splits.",
                        ((OrcSplit) split).hasFooter());
            orcInputFormat.getRecordReader(split, conf, Reporter.NULL);

        // Read ops consumed by opening the record reader.
        int readOpsDelta = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase("mock")) {
                readOpsDelta = statistics.getReadOps() - readOpsBefore;
        // call-1: open to read data - split 1 => mock:/mocktable8/0_0
        // call-2: open side file (flush length) of delta directory
        // call-3: fs.exists() check for delta_xxx_xxx/bucket_00000 file
        // call-4: AcidUtils.getAcidState - getLen() mock:/mocktable8/0_0
        assertEquals(4, readOpsDelta);

        // revert back to local fs
        conf.set("fs.defaultFS", "file:///");

    /**
     * Also see {@link TestOrcFile#testPredicatePushdown()}.
     * This tests that {@link RecordReader#getRowNumber()} works with multiple splits.
     * @throws Exception
     */
    public void testRowNumberUniquenessInDifferentSplits() throws Exception {
        // Writes one ORC file with a small stripe size and a tiny max split size, asks for 20 splits,
        // and checks that the row number reported by each split's reader continues from the previous
        // split instead of restarting at zero.
        // NOTE(review): several statements in this method look truncated in this copy (unterminated
        // calls, no closing braces, "Object row =;") -- confirm against the upstream file.
        Properties properties = new Properties();
        properties.setProperty("columns", "x,y");
        properties.setProperty("columns.types", "int:int");
        StructObjectInspector inspector;
        synchronized (TestOrcFile.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,

        // Save the conf variable values so that they can be restored later.
        // -1 is used as the "was not set" marker.
        long oldDefaultStripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), -1L);
        long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);

        // Set the conf variable values for this test.
        long newStripeSize = 10000L; // 10000 bytes per stripe
        long newMaxSplitSize = 100L; // 100 bytes per split
        conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), newStripeSize);
        conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, newMaxSplitSize);

        AbstractSerDe serde = new OrcSerde();
        HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
        org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf,
                testFilePath, MyRow.class, true, properties, Reporter.NULL);
        // The following loop should create 20 stripes in the orc file.
        // It writes newStripeSize * 10 rows of the form MyRow(i, i + 1).
        for (int i = 0; i < newStripeSize * 10; ++i) {
            writer.write(serde.serialize(new MyRow(i, i + 1), inspector));
        serde = new OrcSerde();
        SerDeUtils.initializeSerDe(serde, conf, properties, null);
        assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
        inspector = (StructObjectInspector) serde.getObjectInspector();
        assertEquals("struct<x:int,y:int>", inspector.getTypeName());
        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, testFilePath.toString());
        int numExpectedSplits = 20;
        InputSplit[] splits = in.getSplits(conf, numExpectedSplits);
        assertEquals(numExpectedSplits, splits.length);

        // Read every split through its own reader, restricted to that split's byte range.
        for (int i = 0; i < numExpectedSplits; ++i) {
            OrcSplit split = (OrcSplit) splits[i];
            Reader.Options orcReaderOptions = new Reader.Options();
            orcReaderOptions.range(split.getStart(), split.getLength());
            OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength());
            Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions);
            RecordReader recordReader = reader.rowsOptions(orcReaderOptions);
            for (int j = 0; recordReader.hasNext(); j++) {
                // Split i is expected to start at file row i * 5000; j counts rows within the split.
                long rowNum = (i * 5000) + j;
                long rowNumActual = recordReader.getRowNumber();
                assertEquals("rowNum=" + rowNum, rowNum, rowNumActual);
                Object row =;

        // Reset the conf variable values that we changed for this test.
        if (oldDefaultStripeSize != -1L) {
            conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), oldDefaultStripeSize);
        } else {
            // this means that nothing was set for default stripe size previously, so we should unset it.
        if (oldMaxSplitSize != -1L) {
            conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
        } else {
            // this means that nothing was set for max split size previously, so we should unset it.

    /**
     * Test schema evolution when using the reader directly.
     */
    public void testSchemaEvolution() throws Exception {
        // Writes 1000 rows with the file schema below, then reads them back twice through a different
        // reader schema: once in full and once with an explicit include vector. Fields that the
        // assertions expect to be absent must come back as repeating nulls.
        // NOTE(review): several statements in this method look truncated in this copy (the writer
        // options, the readerSchema definition, closing braces) -- confirm against the upstream file.
        TypeDescription fileSchema = TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
        Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).fileSystem(fs)
        VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
        batch.size = 1000;
        // lcv is the vector behind b.c (first field of the struct in column 1).
        LongColumnVector lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
        // Row r holds a = r * 42, b.c = r * 10001 and d = hex string of r.
        for (int r = 0; r < 1000; r++) {
            ((LongColumnVector) batch.cols[0]).vector[r] = r * 42;
            lcv.vector[r] = r * 10001;
            ((BytesColumnVector) batch.cols[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
        TypeDescription readerSchema = TypeDescription
        Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
        RecordReader rows = reader.rowsOptions(new Reader.Options().schema(readerSchema));
        batch = readerSchema.createRowBatch();
        lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
        // future1 is the second field of the struct in column 1; the file schema has only one field
        // there, and column index 3 does not exist in the file schema at all.
        LongColumnVector future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
        assertEquals(true, rows.nextBatch(batch));
        assertEquals(1000, batch.size);
        assertEquals(true, future1.isRepeating);
        assertEquals(true, future1.isNull[0]);
        assertEquals(true, batch.cols[3].isRepeating);
        assertEquals(true, batch.cols[3].isNull[0]);
        for (int r = 0; r < batch.size; ++r) {
            assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
            assertEquals("row " + r, r * 10001, lcv.vector[r]);
            // NOTE(review): the next assertion is an exact duplicate of the previous one.
            assertEquals("row " + r, r * 10001, lcv.vector[r]);
            assertEquals("row " + r, Integer.toHexString(r), ((BytesColumnVector) batch.cols[2]).toString(r));
        // All 1000 rows were delivered in the single batch above.
        assertEquals(false, rows.nextBatch(batch));

        // try it again with an include vector
        rows = reader.rowsOptions(new Reader.Options().schema(readerSchema)
                .include(new boolean[] { false, true, true, true, false, false, true }));
        batch = readerSchema.createRowBatch();
        lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
        future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
        assertEquals(true, rows.nextBatch(batch));
        assertEquals(1000, batch.size);
        assertEquals(true, future1.isRepeating);
        assertEquals(true, future1.isNull[0]);
        assertEquals(true, batch.cols[3].isRepeating);
        assertEquals(true, batch.cols[3].isNull[0]);
        // With this include vector column 2 is also expected to be a repeating null.
        assertEquals(true, batch.cols[2].isRepeating);
        assertEquals(true, batch.cols[2].isNull[0]);
        for (int r = 0; r < batch.size; ++r) {
            assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
            assertEquals("row " + r, r * 10001, lcv.vector[r]);
        assertEquals(false, rows.nextBatch(batch));

    /**
     * Test column projection when using ACID.
     */
    public void testColumnProjectionWithAcid() throws Exception {
        // Writes 1000 rows into base_00100/bucket_00000 using a schema of five bookkeeping columns
        // plus a nested "row" struct, then reads them back through OrcInputFormat.getReader twice:
        // first projecting columns 0 and 2 of the same schema, then with an evolved schema that
        // adds fields the file does not contain.
        // NOTE(review): several statements in this method look truncated in this copy (the writer
        // options, both "while (, struct))" loop headers, closing braces) -- confirm against the
        // upstream file before editing the code.
        Path baseDir = new Path(workDir, "base_00100");
        testFilePath = new Path(baseDir, "bucket_00000");
        fs.delete(testFilePath, true);
        TypeDescription fileSchema = TypeDescription
                .fromString("struct<operation:int," + "originalTransaction:bigint,bucket:int,rowId:bigint,"
                        + "currentTransaction:bigint," + "row:struct<a:int,b:struct<c:int>,d:string>>");
        Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).fileSystem(fs)
        VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
        batch.size = 1000;
        // scv is the vector behind the "row" struct (column 5 of the file schema).
        StructColumnVector scv = (StructColumnVector) batch.cols[5];
        // The four columns below hold one repeated value for the whole batch.
        // operation
        batch.cols[0].isRepeating = true;
        ((LongColumnVector) batch.cols[0]).vector[0] = 0;
        // original transaction
        batch.cols[1].isRepeating = true;
        ((LongColumnVector) batch.cols[1]).vector[0] = 1;
        // bucket
        batch.cols[2].isRepeating = true;
        ((LongColumnVector) batch.cols[2]).vector[0] = 0;
        // current transaction
        batch.cols[4].isRepeating = true;
        ((LongColumnVector) batch.cols[4]).vector[0] = 1;

        // lcv is the vector behind row.b.c.
        LongColumnVector lcv = (LongColumnVector) ((StructColumnVector) scv.fields[1]).fields[0];
        for (int r = 0; r < 1000; r++) {
            // row id
            ((LongColumnVector) batch.cols[3]).vector[r] = r;
            // a
            ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
            // b.c
            lcv.vector[r] = r * 10001;
            // d
            ((BytesColumnVector) scv.fields[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
        long fileLength = fs.getFileStatus(testFilePath).getLen();

        // test with same schema with include
        // Only column ids 0 and 2 (a and d) are requested, so b is asserted to be null below.
        conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
        conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
        conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
        // The split covers the whole file and carries an empty delta list.
        OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true,
                new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
        OrcInputFormat inputFormat = new OrcInputFormat();
        AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split,
                new AcidInputFormat.Options(conf));
        // record counts the rows seen and doubles as the expected row id.
        int record = 0;
        RecordIdentifier id = reader.createKey();
        OrcStruct struct = reader.createValue();
        while (, struct)) {
            assertEquals("id " + record, record, id.getRowId());
            assertEquals("bucket " + record, 0, id.getBucketProperty());
            assertEquals("trans " + record, 1, id.getTransactionId());
            assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
            assertEquals(null, struct.getFieldValue(1));
            assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
            record += 1;
        assertEquals(1000, record);

        // test with schema evolution and include
        // The reader schema adds b.e and f, neither of which is in the file schema; column ids
        // 0, 2 and 3 (a, d and f) are requested, and f is asserted to be null below.
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int,e:string>,string,int");
        conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
        conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
        split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true,
                new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
        inputFormat = new OrcInputFormat();
        reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
        record = 0;
        id = reader.createKey();
        struct = reader.createValue();
        while (, struct)) {
            assertEquals("id " + record, record, id.getRowId());
            assertEquals("bucket " + record, 0, id.getBucketProperty());
            assertEquals("trans " + record, 1, id.getTransactionId());
            assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
            assertEquals(null, struct.getFieldValue(1));
            assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
            assertEquals("f " + record, null, struct.getFieldValue(3));
            record += 1;
        assertEquals(1000, record);