com.hotels.corc.mapred.CorcInputFormat.java Source code

Introduction

Here is the source code for com.hotels.corc.mapred.CorcInputFormat.java, from the Corc library. The class wraps Hive's OrcInputFormat so that jobs receive Corc values instead of OrcStruct values, allowing ORC columns to be accessed by name rather than by position.

Source

/**
 * Copyright 2015 Expedia Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.hotels.corc.mapred;

import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcRecordUpdater;
import org.apache.hadoop.hive.ql.io.orc.OrcSplit;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hotels.corc.ConverterFactory;
import com.hotels.corc.Corc;

/**
 * A wrapper for {@link OrcInputFormat} to expose {@link Corc} as the value type instead of {@link OrcStruct}.
 * This enables column access by name instead of by position.
 */
public class CorcInputFormat implements InputFormat<NullWritable, Corc> {

    private static final Logger LOG = LoggerFactory.getLogger(CorcInputFormat.class);

    static final String CONVERTER_FACTORY = "com.hotels.corc.mapred.converter.factory";
    static final String SCHEMA_TYPE_INFO = "com.hotels.corc.mapred.schema.type.info";
    static final String INPUT_TYPE_INFO = "com.hotels.corc.mapred.input.type.info";
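    // This mirrors the "sarg.pushdown" property key from which Hive's OrcInputFormat
    // deserializes a Kryo-encoded SearchArgument for predicate pushdown.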
    static final String SEARCH_ARGUMENT = "sarg.pushdown";
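    // In ACID ORC files the user payload is nested inside a "row" struct field of the
    // wrapper event struct; these constants identify that field by position and name.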
    static final int ATOMIC_ROW_COLUMN_ID;
    static final String ATOMIC_ROW_COLUMN_NAME = "row";

    static {
        ATOMIC_ROW_COLUMN_ID = getOrcAtomicRowColumnId();
    }

    /**
     * Gets the ConverterFactory from the configuration
     */
    static ConverterFactory getConverterFactory(Configuration conf) {
        Class<? extends ConverterFactory> converterFactoryClass = conf.getClass(CONVERTER_FACTORY, null,
                ConverterFactory.class);
        if (converterFactoryClass == null) {
            throw new RuntimeException("ConverterFactory class was not set on the configuration");
        }
        LOG.debug("Got input ConverterFactory class from conf: {}", converterFactoryClass);
        return ReflectionUtils.newInstance(converterFactoryClass, conf);
    }

    /**
     * Sets the ConverterFactory class
     */
    public static void setConverterFactoryClass(Configuration conf,
            Class<? extends ConverterFactory> converterFactoryClass) {
        conf.setClass(CONVERTER_FACTORY, converterFactoryClass, ConverterFactory.class);
        LOG.debug("Set input ConverterFactory class on conf: {}", converterFactoryClass);
    }

    /**
     * Gets the StructTypeInfo that declares the columns to be read from the configuration
     */
    static StructTypeInfo getTypeInfo(Configuration conf) {
        String typeInfoString = conf.get(INPUT_TYPE_INFO);
        if (typeInfoString == null) {
            throw new RuntimeException("Input StructTypeInfo was not set on the configuration");
        }
        StructTypeInfo inputTypeInfo = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(typeInfoString);
        LOG.debug("Got input typeInfo from conf: {}", inputTypeInfo);
        return inputTypeInfo;
    }

    /**
     * Sets the StructTypeInfo that declares the columns to be read in the configuration
     */
    public static void setTypeInfo(Configuration conf, StructTypeInfo typeInfo) {
        conf.set(INPUT_TYPE_INFO, typeInfo.getTypeName());
        LOG.debug("Set input typeInfo on conf: {}", typeInfo);
    }
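
    // Example (sketch): declaring the columns to read, using Hive's type string
    // syntax. The column names and types here are illustrative only:
    //   StructTypeInfo typeInfo = (StructTypeInfo) TypeInfoUtils
    //       .getTypeInfoFromTypeString("struct<id:bigint,name:string>");
    //   CorcInputFormat.setTypeInfo(conf, typeInfo);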

    /**
     * Gets the StructTypeInfo that declares the total schema of the file from the configuration
     */
    static StructTypeInfo getSchemaTypeInfo(Configuration conf) {
        String schemaTypeInfo = conf.get(SCHEMA_TYPE_INFO);
        if (schemaTypeInfo != null && !schemaTypeInfo.isEmpty()) {
            LOG.debug("Got schema typeInfo from conf: {}", schemaTypeInfo);
            return (StructTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(schemaTypeInfo);
        }
        return null;
    }

    /**
     * Sets the StructTypeInfo that declares the total schema of the file in the configuration
     */
    public static void setSchemaTypeInfo(Configuration conf, StructTypeInfo schemaTypeInfo) {
        if (schemaTypeInfo != null) {
            conf.set(SCHEMA_TYPE_INFO, schemaTypeInfo.getTypeName());
            LOG.debug("Set schema typeInfo on conf: {}", schemaTypeInfo);
        }
    }

    /**
     * Sets the SearchArgument predicate pushdown in the configuration
     */
    public static void setSearchArgument(Configuration conf, SearchArgument searchArgument) {
        if (searchArgument != null) {
            setSearchArgumentKryo(conf, searchArgument.toKryo());
        }
    }

    /**
     * Sets the SearchArgument predicate pushdown in the configuration
     */
    public static void setSearchArgumentKryo(Configuration conf, String searchArgumentKryo) {
        if (searchArgumentKryo != null) {
            conf.set(SEARCH_ARGUMENT, searchArgumentKryo);
        }
    }
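
    // Example (sketch): building a SearchArgument with Hive 1.x's SearchArgumentFactory
    // (the builder signatures changed in later Hive versions); the column name and
    // literal are illustrative only:
    //   SearchArgument searchArgument = SearchArgumentFactory.newBuilder()
    //       .startAnd().equals("id", 7L).end().build();
    //   CorcInputFormat.setSearchArgument(conf, searchArgument);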

    /**
     * Resolves the declared read columns to their ordinal positions in the file schema, verifies that their types
     * match, and registers the resulting projection in the configuration
     */
    static void setReadColumns(Configuration conf, StructTypeInfo actualStructTypeInfo) {
        StructTypeInfo readStructTypeInfo = getTypeInfo(conf);

        List<Integer> ids = new ArrayList<>();
        List<String> names = new ArrayList<>();

        List<String> readNames = readStructTypeInfo.getAllStructFieldNames();
        List<String> actualNames = actualStructTypeInfo.getAllStructFieldNames();

        for (int i = 0; i < actualNames.size(); i++) {
            String actualName = actualNames.get(i);
            if (readNames.contains(actualName)) {
                // make sure they are the same type
                TypeInfo actualTypeInfo = actualStructTypeInfo.getStructFieldTypeInfo(actualName);
                TypeInfo readTypeInfo = readStructTypeInfo.getStructFieldTypeInfo(actualName);
                if (!actualTypeInfo.equals(readTypeInfo)) {
                    throw new IllegalStateException("readTypeInfo [" + readTypeInfo
                            + "] does not match actualTypeInfo [" + actualTypeInfo + "]");
                }
                // mark the column as to-be-read
                ids.add(i);
                names.add(actualName);
            }
            LOG.debug("Set column projection on columns: {} ({})", ids, readNames);
        }
        ColumnProjectionUtils.appendReadColumns(conf, ids, names);
    }

    private final OrcInputFormat orcInputFormat = new OrcInputFormat();

    @Override
    public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
        return orcInputFormat.getSplits(conf, numSplits);
    }

    @Override
    public RecordReader<NullWritable, Corc> getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter)
            throws IOException {
        StructTypeInfo typeInfo = getSchemaTypeInfo(conf);
        if (typeInfo == null) {
            typeInfo = readStructTypeInfoFromSplit(inputSplit, conf);
        }
        setReadColumns(conf, typeInfo);
        RecordReader<NullWritable, OrcStruct> reader = orcInputFormat.getRecordReader(inputSplit, conf, reporter);
        return new CorcRecordReader(typeInfo, reader, getConverterFactory(conf));
    }

    private StructTypeInfo readStructTypeInfoFromSplit(InputSplit inputSplit, JobConf conf) throws IOException {
        LOG.debug("Attempting to read schema typeInfo from split: {}", inputSplit);
        StructTypeInfo typeInfo;
        if (inputSplit instanceof FileSplit) {
            Path path = getSplitPath((FileSplit) inputSplit, conf);
            Reader orcReader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
            ObjectInspector inspector = orcReader.getObjectInspector();
            typeInfo = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(inspector);

            if (isAtomic(orcReader)) {
                LOG.warn(
                        "Split is atomic yet schema typeInfo was not provided via {}."
                                + " This is not recommended and will fail if you only have deltas!",
                        SCHEMA_TYPE_INFO);
                typeInfo = extractRowStruct(typeInfo);
            }
        } else {
            throw new IOException("Unsupported InputSplit " + inputSplit.getClass().getName());
        }
        LOG.debug("Read schema typeInfo from split: {}", typeInfo);
        return typeInfo;
    }

    /*
     * This is to work around an issue reading from ORC transactional data sets that contain only deltas. These contain
     * synthesised column names that are not usable to us.
     */
    private Path getSplitPath(FileSplit inputSplit, JobConf conf) throws IOException {
        Path path = inputSplit.getPath();
        if (inputSplit instanceof OrcSplit) {
            OrcSplit orcSplit = (OrcSplit) inputSplit;
            List<Long> deltas = orcSplit.getDeltas();
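            // getDeltas() encodes each delta directory as a (min,max) transaction id
            // pair, so a size of two or more means at least one delta is present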
            if (!orcSplit.hasBase() && deltas.size() >= 2) {
                throw new IOException("Cannot read valid StructTypeInfo from delta only file: " + path);
            }
        }
        LOG.debug("Input split path: {}", path);
        return path;
    }

    private boolean isAtomic(Reader orcReader) {
        // Mirrors org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.isOriginal(Reader) from hive-exec:1.1.0
        boolean atomic = orcReader.hasMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME);
        LOG.debug("Atomic ORCFile: {}", atomic);
        return atomic;
    }

    private StructTypeInfo extractRowStruct(StructTypeInfo typeInfo) {
        List<String> actualNames = typeInfo.getAllStructFieldNames();
        if (actualNames.size() < ATOMIC_ROW_COLUMN_ID + 1) {
            throw new IllegalArgumentException("Too few rows for a transactional table: " + actualNames);
        }
        String rowStructName = actualNames.get(ATOMIC_ROW_COLUMN_ID);
        if (!ATOMIC_ROW_COLUMN_NAME.equalsIgnoreCase(rowStructName)) {
            throw new IllegalArgumentException(
                    "Expected row column name '" + ATOMIC_ROW_COLUMN_NAME + "', found: " + rowStructName);
        }
        StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo.getStructFieldTypeInfo(rowStructName);
        LOG.debug("Row StructTypeInfo defined as: {}", structTypeInfo);
        return structTypeInfo;
    }

    /* This ugliness can go when the column id can be referenced from a public place. */
    private static int getOrcAtomicRowColumnId() {
        try {
            Field rowField = OrcRecordUpdater.class.getDeclaredField("ROW");
            rowField.setAccessible(true);
            int rowId = (int) rowField.get(null);
            rowField.setAccessible(false);
            return rowId;
        } catch (NoSuchFieldException | SecurityException | IllegalArgumentException | IllegalAccessException e) {
            throw new RuntimeException("Could not obtain OrcRecordUpdater.ROW value.", e);
        }
    }

}
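
Example Usage

Below is a minimal sketch of a mapred driver wiring the format together. MyConverterFactory stands in for a ConverterFactory implementation of your own, and the input path and column schema are illustrative only.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

import com.hotels.corc.mapred.CorcInputFormat;

public class CorcDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(CorcDriver.class);
        conf.setJobName("corc-example");

        // Read Corc values by name rather than OrcStruct values by position
        conf.setInputFormat(CorcInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path("/data/orc/my_table"));

        // Declare the columns the job should read, using Hive's type string syntax
        StructTypeInfo typeInfo = (StructTypeInfo) TypeInfoUtils
                .getTypeInfoFromTypeString("struct<id:bigint,name:string>");
        CorcInputFormat.setTypeInfo(conf, typeInfo);

        // MyConverterFactory is a hypothetical ConverterFactory implementation
        CorcInputFormat.setConverterFactoryClass(conf, MyConverterFactory.class);

        // ... configure the mapper, reducer, and output format, then:
        JobClient.runJob(conf);
    }
}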