org.apache.impala.analysis.ParquetHelper.java Source code

Introduction

Here is the source code for org.apache.impala.analysis.ParquetHelper.java

Source

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.impala.analysis;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;

import org.apache.impala.catalog.ArrayType;
import org.apache.impala.catalog.MapType;
import org.apache.impala.catalog.ScalarType;
import org.apache.impala.catalog.StructField;
import org.apache.impala.catalog.StructType;
import org.apache.impala.catalog.Type;
import org.apache.impala.common.AnalysisException;
import org.apache.impala.common.FileSystemUtil;

/**
 * Provides extractParquetSchema() to extract a schema
 * from a Parquet file.
 *
 * Because Parquet's Java package changed between Parquet 1.5
 * and 1.9, a second copy of this file, with "parquet." replaced
 * with "org.apache.parquet.", is generated by the build system.
 */
class ParquetHelper {
    private final static String ERROR_MSG = "Failed to convert Parquet type\n%s\nto an Impala %s type:\n%s\n";

    /**
     * Reads the first block from the given HDFS file and returns the Parquet schema.
     * Throws an AnalysisException for any failure, such as failing to read the file
     * or failing to parse the contents.
     */
    private static org.apache.parquet.schema.MessageType loadParquetSchema(Path pathToFile)
            throws AnalysisException {
        try {
            FileSystem fs = pathToFile.getFileSystem(FileSystemUtil.getConfiguration());
            if (!fs.isFile(pathToFile)) {
                throw new AnalysisException("Cannot infer schema, path is not a file: " + pathToFile);
            }
        } catch (IOException e) {
            throw new AnalysisException("Failed to connect to filesystem:" + e);
        } catch (IllegalArgumentException e) {
            throw new AnalysisException(e.getMessage());
        }
        ParquetMetadata readFooter = null;
        try {
            readFooter = ParquetFileReader.readFooter(FileSystemUtil.getConfiguration(), pathToFile);
        } catch (FileNotFoundException e) {
            throw new AnalysisException("File not found: " + e);
        } catch (IOException e) {
            throw new AnalysisException("Failed to open file as a parquet file: " + e);
        } catch (RuntimeException e) {
            // Parquet throws a generic RuntimeException when reading a non-parquet file
            if (e.toString().contains("is not a Parquet file")) {
                throw new AnalysisException("File is not a parquet file: " + pathToFile);
            }
            // otherwise, who knows what we caught, throw it back up
            throw e;
        }
        return readFooter.getFileMetaData().getSchema();
    }

    /**
     * Converts a "primitive" Parquet type to an Impala type.
     * A primitive type is a non-nested type with no annotations.
     */
    private static Type convertPrimitiveParquetType(org.apache.parquet.schema.Type parquetType)
            throws AnalysisException {
        Preconditions.checkState(parquetType.isPrimitive());
        PrimitiveType prim = parquetType.asPrimitiveType();
        switch (prim.getPrimitiveTypeName()) {
        case BINARY:
            return Type.STRING;
        case BOOLEAN:
            return Type.BOOLEAN;
        case DOUBLE:
            return Type.DOUBLE;
        case FIXED_LEN_BYTE_ARRAY:
            throw new AnalysisException(
                    "Unsupported parquet type FIXED_LEN_BYTE_ARRAY for field " + parquetType.getName());
        case FLOAT:
            return Type.FLOAT;
        case INT32:
            return Type.INT;
        case INT64:
            return Type.BIGINT;
        case INT96:
            return Type.TIMESTAMP;
        default:
            Preconditions.checkState(false, "Unexpected parquet primitive type: " + prim.getPrimitiveTypeName());
            return null;
        }
    }
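
    // Illustrative example (not part of the upstream source): given a Parquet schema such as
    //   message example { required int32 id; optional double price; optional int96 created; }
    // the method above infers id -> INT, price -> DOUBLE, created -> TIMESTAMP.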

    /**
     * Converts a Parquet group type to an Impala map Type. We support both standard
     * Parquet map representations, as well as legacy. Legacy representations are handled
     * according to this specification:
     * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules-1
     *
     * Standard representation of a map in Parquet:
     * <optional | required> group <name> (MAP) { <-- outerGroup is pointing at this
     *   repeated group key_value {
     *     required <key-type> key;
     *     <optional | required> <value-type> value;
     *   }
     * }
     */
    private static MapType convertMap(org.apache.parquet.schema.GroupType outerGroup) throws AnalysisException {
        if (outerGroup.getFieldCount() != 1) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "MAP",
                    "The logical MAP type must have exactly 1 inner field."));
        }

        org.apache.parquet.schema.Type innerField = outerGroup.getType(0);
        if (!innerField.isRepetition(org.apache.parquet.schema.Type.Repetition.REPEATED)) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "MAP",
                    "The logical MAP type must have a repeated inner field."));
        }
        if (innerField.isPrimitive()) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "MAP",
                    "The inner field of the logical MAP type must be a group."));
        }

        org.apache.parquet.schema.GroupType innerGroup = innerField.asGroupType();
        // It does not matter whether innerGroup has an annotation (for example, it may
        // be annotated with MAP_KEY_VALUE). We treat a group that has an annotation and
        // a group that does not have one the same way.
        if (innerGroup.getFieldCount() != 2) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "MAP",
                    "The inner field of the logical MAP type must have exactly 2 fields."));
        }

        org.apache.parquet.schema.Type key = innerGroup.getType(0);
        if (!key.getName().equals("key")) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "MAP",
                    "The name of the first field of the inner field of the logical MAP " + "type must be 'key'"));
        }
        if (!key.isPrimitive()) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "MAP",
                    "The key type of the logical MAP type must be primitive."));
        }
        org.apache.parquet.schema.Type value = innerGroup.getType(1);
        if (!value.getName().equals("value")) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "MAP",
                    "The name of the second field of the inner field of the logical MAP "
                            + "type must be 'value'"));
        }

        return new MapType(convertParquetType(key), convertParquetType(value));
    }
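
    // Illustrative example (not part of the upstream source): a column using the standard
    // representation above, e.g.
    //   optional group attrs (MAP) {
    //     repeated group key_value { required binary key (UTF8); optional int32 value; }
    //   }
    // is converted by the method above to the Impala type MAP<STRING,INT>.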

    /**
     * Converts a Parquet group type to an Impala struct Type.
     */
    private static StructType convertStruct(org.apache.parquet.schema.GroupType outerGroup)
            throws AnalysisException {
        ArrayList<StructField> structFields = new ArrayList<StructField>();
        for (org.apache.parquet.schema.Type field : outerGroup.getFields()) {
            StructField f = new StructField(field.getName(), convertParquetType(field));
            structFields.add(f);
        }
        return new StructType(structFields);
    }

    /**
     * Converts a Parquet group type to an Impala array Type. We can handle the standard
     * representation, but also legacy representations for backwards compatibility.
     * Legacy representations are handled according to this specification:
     * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
     *
     * Standard representation of an array in Parquet:
     * <optional | required> group <name> (LIST) { <-- outerGroup is pointing at this
     *   repeated group list {
     *     <optional | required> <element-type> element;
     *   }
     * }
     */
    private static ArrayType convertArray(org.apache.parquet.schema.GroupType outerGroup) throws AnalysisException {
        if (outerGroup.getFieldCount() != 1) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "LIST",
                    "The logical LIST type must have exactly 1 inner field."));
        }

        org.apache.parquet.schema.Type innerField = outerGroup.getType(0);
        if (!innerField.isRepetition(org.apache.parquet.schema.Type.Repetition.REPEATED)) {
            throw new AnalysisException(String.format(ERROR_MSG, outerGroup.toString(), "LIST",
                    "The inner field of the logical LIST type must be repeated."));
        }
        if (innerField.isPrimitive() || innerField.getOriginalType() != null) {
            // From the Parquet Spec:
            // 1. If the repeated field is not a group then its type is the element type.
            //
            // If innerField is a group, but originalType is not null, the element type is
            // based on the logical type.
            return new ArrayType(convertParquetType(innerField));
        }

        org.apache.parquet.schema.GroupType innerGroup = innerField.asGroupType();
        if (innerGroup.getFieldCount() != 1) {
            // From the Parquet Spec:
            // 2. If the repeated field is a group with multiple fields, then its type is a
            //    struct.
            return new ArrayType(convertStruct(innerGroup));
        }

        return new ArrayType(convertParquetType(innerGroup.getType(0)));
    }
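
    // Illustrative example (not part of the upstream source): the standard representation
    //   optional group scores (LIST) {
    //     repeated group list { required int64 element; }
    //   }
    // is converted by the method above to the Impala type ARRAY<BIGINT>.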

    /**
     * Converts a "logical" Parquet type to an Impala column type.
     * A Parquet type is considered logical when it has an annotation. The annotation is
     * stored as an "OriginalType". The Parquet documentation refers to these as logical
     * types, so we use that terminology here.
     */
    private static Type convertLogicalParquetType(org.apache.parquet.schema.Type parquetType)
            throws AnalysisException {
        OriginalType orig = parquetType.getOriginalType();
        if (orig == OriginalType.LIST) {
            return convertArray(parquetType.asGroupType());
        }
        if (orig == OriginalType.MAP || orig == OriginalType.MAP_KEY_VALUE) {
            // MAP_KEY_VALUE annotation should not be used any more. However, according to the
            // Parquet spec, some existing data incorrectly uses MAP_KEY_VALUE in place of MAP.
            // For backward-compatibility, a group annotated with MAP_KEY_VALUE that is not
            // contained by a MAP-annotated group should be handled as a MAP-annotated group.
            return convertMap(parquetType.asGroupType());
        }

        PrimitiveType prim = parquetType.asPrimitiveType();
        if (prim.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY
                && (orig == OriginalType.UTF8 || orig == OriginalType.ENUM)) {
            // UTF8 is the type annotation Parquet uses for strings
            // ENUM is the type annotation Parquet uses to indicate that
            // the original data type, before conversion to Parquet, was an enum.
            // Applications which do not have enumerated types (e.g. Impala)
            // should interpret it as a string.
            // We check to make sure it applies to BINARY to avoid errors if there is a bad
            // annotation.
            return Type.STRING;
        }

        if (prim.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT32
                || prim.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT64) {
            // Map signed integer types to a supported Impala column type
            switch (orig) {
            case INT_8:
                return Type.TINYINT;
            case INT_16:
                return Type.SMALLINT;
            case INT_32:
                return Type.INT;
            case INT_64:
                return Type.BIGINT;
            }
        }

        if (orig == OriginalType.DECIMAL) {
            return ScalarType.createDecimalType(prim.getDecimalMetadata().getPrecision(),
                    prim.getDecimalMetadata().getScale());
        }

        throw new AnalysisException("Unsupported logical parquet type " + orig + " (primitive type is "
                + prim.getPrimitiveTypeName().name() + ") for field " + parquetType.getName());
    }
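
    // Illustrative examples (not part of the upstream source) of annotated primitives
    // handled above:
    //   optional binary name (UTF8)           -> STRING
    //   optional int32 age (INT_8)            -> TINYINT
    //   optional int64 cents (DECIMAL(12,2))  -> DECIMAL(12,2)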

    /**
     * Converts a Parquet type into an Impala type.
     */
    private static Type convertParquetType(org.apache.parquet.schema.Type field) throws AnalysisException {
        Type type = null;
        // TODO for 2.3: If a field is not annotated with LIST, it can sometimes still be
        // interpreted as an array. The following two examples should be interpreted as an
        // array of integers, but this is currently not done.
        // 1. repeated int int_col;
        // 2. required group int_arr {
        //      repeated group list {
        //        required int element;
        //      }
        //    }
        if (field.getOriginalType() != null) {
            type = convertLogicalParquetType(field);
        } else if (field.isPrimitive()) {
            type = convertPrimitiveParquetType(field);
        } else {
            // If field is not primitive, it must be a struct.
            type = convertStruct(field.asGroupType());
        }
        return type;
    }

    /**
     * Parses a Parquet file stored in HDFS and returns the corresponding Impala schema.
     * This fails with an analysis exception if any errors occur reading the file,
     * parsing the Parquet schema, or if the Parquet types cannot be represented in Impala.
     */
    static List<ColumnDef> extractParquetSchema(HdfsUri location) throws AnalysisException {
        org.apache.parquet.schema.MessageType parquetSchema = loadParquetSchema(location.getPath());
        List<org.apache.parquet.schema.Type> fields = parquetSchema.getFields();
        List<ColumnDef> schema = new ArrayList<ColumnDef>();

        for (org.apache.parquet.schema.Type field : fields) {
            Type type = convertParquetType(field);
            Preconditions.checkNotNull(type);
            String colName = field.getName();
            Map<ColumnDef.Option, Object> option = Maps.newHashMap();
            option.put(ColumnDef.Option.COMMENT, "Inferred from Parquet file.");
            schema.add(new ColumnDef(colName, new TypeDef(type), option));
        }
        return schema;
    }
}
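
Example

The conversion rules above consume ordinary parquet-mr schema objects, so they can be illustrated without Impala at all. The sketch below is illustrative only and is not part of the Impala sources: it uses parquet-mr's MessageTypeParser to build the same kind of MessageType that loadParquetSchema() returns from a file footer, and records in comments the Impala column types the helper would infer for each field.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetSchemaSketch {
    public static void main(String[] args) {
        // Schema text equivalent to what loadParquetSchema() reads from a Parquet footer.
        MessageType schema = MessageTypeParser.parseMessageType(
            "message example {\n"
            + "  required int32 id;\n"              // convertPrimitiveParquetType -> INT
            + "  optional binary name (UTF8);\n"    // convertLogicalParquetType -> STRING
            + "  optional group scores (LIST) {\n"  // convertArray -> ARRAY<BIGINT>
            + "    repeated group list {\n"
            + "      required int64 element;\n"
            + "    }\n"
            + "  }\n"
            + "  optional group attrs (MAP) {\n"    // convertMap -> MAP<STRING,INT>
            + "    repeated group key_value {\n"
            + "      required binary key (UTF8);\n"
            + "      optional int32 value;\n"
            + "    }\n"
            + "  }\n"
            + "}");
        System.out.println(schema);
    }
}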