org.apache.gobblin.hive.orc.HiveOrcSerDeManager.java Source code


Introduction

Here is the source code for org.apache.gobblin.hive.orc.HiveOrcSerDeManager.java, a HiveSerDeManager implementation from Apache Gobblin. It infers an ORC dataset's schema from the most recently modified data file under a path and writes that schema into the HiveRegistrationUnit's SerDe properties.
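
The manager is driven by the hiveOrcSerdeManager.* job properties defined in the source below. The following is a minimal sketch of wiring it up; the class name OrcSerDeManagerSetup and the dataset path are hypothetical, and the property values shown are just the defaults.

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.hive.orc.HiveOrcSerDeManager;

public class OrcSerDeManagerSetup {
    public static void main(String[] args) throws Exception {
        State props = new State();
        // Optional overrides; the defaults are the ".orc" extension, "_"/"." ignored
        // prefixes, and the ORC serde with OrcInputFormat/OrcOutputFormat
        props.setProp(HiveOrcSerDeManager.FILE_EXTENSIONS_KEY, ".orc");
        props.setProp(HiveOrcSerDeManager.IGNORED_FILE_PREFIXES_KEY, "_,.");

        HiveOrcSerDeManager manager = new HiveOrcSerDeManager(props);
        // For a directory of ORC files, addSerDeProperties reads the newest file's schema
        // and fills in the unit's SerDe properties, e.g.:
        // manager.addSerDeProperties(new Path("/data/my_table"), hiveUnit);
    }
}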

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.hive.orc;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

import com.codahale.metrics.Timer;
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.hive.HiveRegistrationUnit;
import org.apache.gobblin.hive.HiveSerDeManager;
import org.apache.gobblin.hive.HiveSerDeWrapper;
import org.apache.gobblin.instrumented.Instrumented;
import org.apache.gobblin.metrics.MetricContext;
import org.apache.gobblin.util.FileListUtils;
import org.apache.gobblin.util.HadoopUtils;

/**
 * A derived class of {@link org.apache.gobblin.hive.HiveSerDeManager} that is mainly responsible for adding schema
 * information into {@link HiveRegistrationUnit#serDeProps}, based on the format of the data.
 */
@Slf4j
public class HiveOrcSerDeManager extends HiveSerDeManager {
    // Schema literal, stored as the string form of the file's TypeInfo
    public static final String SCHEMA_LITERAL = "orc.schema.literal";

    // Extensions of files containing ORC data
    public static final String FILE_EXTENSIONS_KEY = "hiveOrcSerdeManager.fileExtensions";
    public static final String DEFAULT_FILE_EXTENSIONS = ".orc";

    // Files with these prefixes are ignored when finding the latest schema
    public static final String IGNORED_FILE_PREFIXES_KEY = "hiveOrcSerdeManager.ignoredPrefixes";
    public static final String DEFAULT_IGNORED_FILE_PREFIXES = "_,.";

    // The serde type
    public static final String SERDE_TYPE_KEY = "hiveOrcSerdeManager.serdeType";
    public static final String DEFAULT_SERDE_TYPE = "ORC";
    public static final String INPUT_FORMAT_CLASS_KEY = "hiveOrcSerdeManager.inputFormatClass";
    public static final String DEFAULT_INPUT_FORMAT_CLASS = OrcInputFormat.class.getName();

    public static final String OUTPUT_FORMAT_CLASS_KEY = "hiveOrcSerdeManager.outputFormatClass";
    public static final String DEFAULT_OUTPUT_FORMAT_CLASS = OrcOutputFormat.class.getName();

    public static final String HIVE_SPEC_SCHEMA_READING_TIMER = "hiveOrcSerdeManager.schemaReadTimer";

    // When sniffing for the ORC magic bytes, read up to the last 16 KB of the file,
    // which is expected to contain the ORC PostScript
    private static final int EXPECTED_FOOTER_SIZE = 16 * 1024;
    private static final String ORC_FORMAT = "ORC";
    // ORC files end (or, for ORC 0.11, begin) with the magic bytes "ORC"
    private static final ByteBuffer MAGIC_BUFFER = ByteBuffer.wrap(ORC_FORMAT.getBytes(Charsets.UTF_8));

    private final FileSystem fs;
    private final HiveSerDeWrapper serDeWrapper;
    private final List<String> fileExtensions;
    private final List<String> ignoredFilePrefixes;
    private final MetricContext metricContext;

    public HiveOrcSerDeManager(State props) throws IOException {
        super(props);
        this.fs = FileSystem.get(HadoopUtils.getConfFromState(props));

        List<String> extensions = props.getPropAsList(FILE_EXTENSIONS_KEY, DEFAULT_FILE_EXTENSIONS);
        this.fileExtensions = extensions.isEmpty() ? ImmutableList.of("") : extensions;

        this.ignoredFilePrefixes = props.getPropAsList(IGNORED_FILE_PREFIXES_KEY, DEFAULT_IGNORED_FILE_PREFIXES);
        this.metricContext = Instrumented.getMetricContext(props, HiveOrcSerDeManager.class);
        this.serDeWrapper = HiveSerDeWrapper.get(props.getProp(SERDE_TYPE_KEY, DEFAULT_SERDE_TYPE),
                Optional.of(props.getProp(INPUT_FORMAT_CLASS_KEY, DEFAULT_INPUT_FORMAT_CLASS)),
                Optional.of(props.getProp(OUTPUT_FORMAT_CLASS_KEY, DEFAULT_OUTPUT_FORMAT_CLASS)));
    }

    @Override
    public boolean haveSameSchema(HiveRegistrationUnit unit1, HiveRegistrationUnit unit2) throws IOException {
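        // Compare schemas only through the stored schema literal; if either unit lacks
        // SCHEMA_LITERAL, conservatively report that the schemas differ.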
        if (unit1.getSerDeProps().contains(SCHEMA_LITERAL) && unit2.getSerDeProps().contains(SCHEMA_LITERAL)) {
            return unit1.getSerDeProps().getProp(SCHEMA_LITERAL)
                    .equals(unit2.getSerDeProps().getProp(SCHEMA_LITERAL));
        } else {
            return false;
        }
    }

    /**
     * Add the ORC SerDe attributes into the given {@link HiveRegistrationUnit}.
     *
     * @param path the directory containing the ORC data files
     * @param hiveUnit the registration unit to populate
     * @throws IOException if the schema cannot be read from the files under {@code path}
     */
    @Override
    public void addSerDeProperties(Path path, HiveRegistrationUnit hiveUnit) throws IOException {
        hiveUnit.setSerDeType(this.serDeWrapper.getSerDe().getClass().getName());
        hiveUnit.setInputFormat(this.serDeWrapper.getInputFormatClassName());
        hiveUnit.setOutputFormat(this.serDeWrapper.getOutputFormatClassName());

        addSchemaProperties(path, hiveUnit);
    }

    @Override
    public void addSerDeProperties(HiveRegistrationUnit source, HiveRegistrationUnit target) throws IOException {
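        // Copy the SerDe type, input/output formats and schema literal (when present)
        // from source to target; no file system access is needed here.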
        if (source.getSerDeType().isPresent()) {
            target.setSerDeType(source.getSerDeType().get());
        }
        if (source.getInputFormat().isPresent()) {
            target.setInputFormat(source.getInputFormat().get());
        }
        if (source.getOutputFormat().isPresent()) {
            target.setOutputFormat(source.getOutputFormat().get());
        }
        if (source.getSerDeProps().contains(SCHEMA_LITERAL)) {
            target.setSerDeProp(SCHEMA_LITERAL, source.getSerDeProps().getProp(SCHEMA_LITERAL));
        }
    }

    @Override
    public void updateSchema(HiveRegistrationUnit existingUnit, HiveRegistrationUnit newUnit) throws IOException {
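        // The new unit must already carry a schema literal; replace the existing one outright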
        Preconditions.checkArgument(newUnit.getSerDeProps().contains(SCHEMA_LITERAL));

        existingUnit.setSerDeProp(SCHEMA_LITERAL, newUnit.getSerDeProps().getProp(SCHEMA_LITERAL));
    }

    /**
     * Get the schema as a {@link TypeInfo} object.
     *
     * @param path a file, or a directory that contains ORC files
     * @param fs the {@link FileSystem} holding {@code path}
     * @return a {@link TypeInfo} with the schema information
     * @throws IOException if no suitable file is found or the schema cannot be read
     */
    public TypeInfo getSchemaFromLatestFile(Path path, FileSystem fs) throws IOException {
        if (fs.isDirectory(path)) {
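            // List candidate data files: skip ignored prefixes (e.g. "_", "."), require a
            // configured extension, and verify the ORC magic before trusting a file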
            List<FileStatus> files = Arrays.asList(fs.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    try {
                        return ignoredFilePrefixes.stream().noneMatch(e -> path.getName().startsWith(e))
                                && fileExtensions.stream().anyMatch(e -> path.getName().endsWith(e))
                                && isORC(path, fs);
                    } catch (IOException e) {
                        log.error("Error checking file for schema retrieval", e);
                        return false;
                    }
                }
            }));

            if (files.isEmpty()) {
                throw new FileNotFoundException("No files in dataset: " + path + " found for schema retrieval");
            }
            // Use the most recently modified file as the source of truth for the schema
            Collections.sort(files, FileListUtils.LATEST_MOD_TIME_ORDER);
            return getSchemaFromLatestFile(files.get(0).getPath(), fs);
        } else {
            return TypeInfoUtils
                    .getTypeInfoFromObjectInspector(OrcFile.createReader(fs, path).getObjectInspector());
        }
    }

    /**
     * Determine whether a file is in ORC format by checking for the ORC magic bytes.
     * Ideas and code adapted from Presto's OrcReader, under Apache License 2.0.
     */
    private static boolean isORC(Path file, FileSystem fs) throws IOException {
        int magicLen = MAGIC_BUFFER.remaining();
        try (FSDataInputStream inputStream = fs.open(file)) {
            long size = fs.getFileStatus(file).getLen();
            // A valid ORC file must at least hold the magic plus the one-byte PostScript length
            if (size < magicLen + 1) {
                return false;
            }
            byte[] buffer = new byte[Math.toIntExact(Math.min(size, EXPECTED_FOOTER_SIZE))];
            inputStream.readFully(size - buffer.length, buffer);

            // Get the length of the PostScript - the last byte of the file
            int postScriptSize = buffer[buffer.length - 1] & 0xff;

            if (postScriptSize < magicLen + 1 || postScriptSize >= buffer.length) {
                return false;
            }

            // The magic immediately precedes the PostScript-length byte at the tail
            if (!MAGIC_BUFFER.equals(ByteBuffer.wrap(buffer, buffer.length - 1 - magicLen, magicLen))) {
                // Old versions of ORC (0.11) wrote the magic to the head of the file
                byte[] headerMagic = new byte[magicLen];
                inputStream.readFully(0, headerMagic);

                // If it isn't there either, this isn't an ORC file
                if (!MAGIC_BUFFER.equals(ByteBuffer.wrap(headerMagic))) {
                    return false;
                }
            }

            return true;
        } catch (Exception e) {
            throw new IOException("Error occurred when checking the type of file: " + file, e);
        }
    }

    private void addSchemaProperties(Path path, HiveRegistrationUnit hiveUnit) throws IOException {
        Preconditions.checkArgument(this.fs.getFileStatus(path).isDirectory(), path + " is not a directory.");
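        // Time the schema read so its latency is reported under HIVE_SPEC_SCHEMA_READING_TIMER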
        try (Timer.Context context = metricContext.timer(HIVE_SPEC_SCHEMA_READING_TIMER).time()) {
            addSchemaPropertiesHelper(path, hiveUnit);
        }
    }

    /**
     * Override this method if there is another source of truth for the schema besides
     * the files on HDFS.
     *
     * Initializing an {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde} object requires both
     * org.apache.hadoop.hive.serde.serdeConstants#LIST_COLUMNS and
     * org.apache.hadoop.hive.serde.serdeConstants#LIST_COLUMN_TYPES.
     *
     * Keeping {@link #SCHEMA_LITERAL} is a nice-to-have but not strictly necessary for functionality.
     */
    protected void addSchemaPropertiesHelper(Path path, HiveRegistrationUnit hiveUnit) throws IOException {
        TypeInfo schema = getSchemaFromLatestFile(path, this.fs);
        if (schema instanceof StructTypeInfo) {
            StructTypeInfo structTypeInfo = (StructTypeInfo) schema;

            hiveUnit.setSerDeProp(SCHEMA_LITERAL, schema);
            hiveUnit.setSerDeProp(serdeConstants.LIST_COLUMNS,
                    Joiner.on(",").join(structTypeInfo.getAllStructFieldNames()));
            hiveUnit.setSerDeProp(serdeConstants.LIST_COLUMN_TYPES, Joiner.on(",").join(structTypeInfo
                    .getAllStructFieldTypeInfos().stream().map(TypeInfo::getTypeName).collect(Collectors.toList())));
        } else {
            // Hive always uses a struct, with one field per top-level column, as the root
            // object type, so to-be-registered ORC files are expected to follow this pattern.
            throw new IllegalStateException(
                    "A valid ORC schema should be an instance of struct, but found: " + schema.getTypeName());
        }
    }
}
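
Example

The following is a minimal, self-contained sketch of the two SerDe properties that addSchemaPropertiesHelper derives from a struct schema. The type string and the class name OrcSchemaPropsDemo are hypothetical; in the real flow the TypeInfo comes from OrcFile.createReader(fs, path).getObjectInspector() on the latest ORC file.

import java.util.stream.Collectors;

import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

import com.google.common.base.Joiner;

public class OrcSchemaPropsDemo {
    public static void main(String[] args) {
        // A hypothetical top-level ORC schema; real schemas are read from the data files
        StructTypeInfo schema = (StructTypeInfo) TypeInfoUtils
                .getTypeInfoFromTypeString("struct<id:bigint,name:string,scores:array<double>>");

        // Mirrors what addSchemaPropertiesHelper stores under serdeConstants.LIST_COLUMNS
        String columns = Joiner.on(",").join(schema.getAllStructFieldNames());

        // Mirrors what it stores under serdeConstants.LIST_COLUMN_TYPES
        String columnTypes = schema.getAllStructFieldTypeInfos().stream()
                .map(TypeInfo::getTypeName)
                .collect(Collectors.joining(","));

        System.out.println("columns       = " + columns);     // id,name,scores
        System.out.println("columns.types = " + columnTypes); // bigint,string,array<double>
    }
}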