org.apache.hadoop.hive.serde2.avro.AvroSerializer.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.serde2.avro.AvroSerializer.java
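
AvroSerializer itself is package-private, so Hive code normally reaches it through the public AvroSerDe, which resolves the Avro schema from the table properties and hands each row to the serializer shown below. The following sketch is only an illustration of that entry point, not part of the class: the AvroSerializeExample class name, the inline User schema, and the sample row are assumptions, and it is written against Hive 1.x-era SerDe APIs.

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Writable;

// Illustrative example only; the schema, field names, and values are made up.
public class AvroSerializeExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Table property the Avro SerDe reads its writer schema from.
        props.setProperty("avro.schema.literal",
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"int\"},"
                + "{\"name\":\"name\",\"type\":\"string\"}]}");

        AvroSerDe serDe = new AvroSerDe();
        serDe.initialize(new Configuration(), props);

        // A standard struct object inspector whose layout matches the schema.
        ObjectInspector rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
                Arrays.asList("id", "name"),
                Arrays.<ObjectInspector>asList(
                        PrimitiveObjectInspectorFactory.javaIntObjectInspector,
                        PrimitiveObjectInspectorFactory.javaStringObjectInspector));

        List<Object> row = Arrays.<Object>asList(42, "alice");

        // Delegates to AvroSerializer.serialize shown below.
        Writable serialized = serDe.serialize(row, rowOI);
        System.out.println(serialized);
    }
}

Under the hood, serDe.serialize delegates to AvroSerializer.serialize, which fills a GenericData.Record field by field and returns it wrapped in the reused AvroGenericRecordWritable.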

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.avro;

import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Fixed;
import org.apache.avro.generic.GenericEnumSymbol;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.Writable;

class AvroSerializer {
    private static final Log LOG = LogFactory.getLog(AvroSerializer.class);

    /**
     * The Schema to use when serializing Map keys.
     * Since we're sharing this across Serializer instances, it must be immutable;
     * any properties need to be added in a static initializer.
     */
    private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING);
    AvroGenericRecordWritable cache = new AvroGenericRecordWritable();

    // Hive is pretty simple (read: stupid) in writing out values via the serializer.
    // We're just going to go through, matching indices.  Hive formats normally
    // handle mismatches with null.  We don't have that option, so instead we'll
    // end up throwing an exception for invalid records.
    public Writable serialize(Object o, ObjectInspector objectInspector, List<String> columnNames,
            List<TypeInfo> columnTypes, Schema schema) throws AvroSerdeException {
        StructObjectInspector soi = (StructObjectInspector) objectInspector;
        GenericData.Record record = new GenericData.Record(schema);

        List<? extends StructField> outputFieldRefs = soi.getAllStructFieldRefs();
        if (outputFieldRefs.size() != columnNames.size()) {
            throw new AvroSerdeException("Number of input columns was different than output columns (in = "
                    + columnNames.size() + " vs out = " + outputFieldRefs.size());
        }

        int size = schema.getFields().size();
        if (outputFieldRefs.size() != size) {
            throw new AvroSerdeException(
                    "Hive passed in a different number of fields than the schema expected: (Hive wanted "
                            + outputFieldRefs.size() + ", Avro expected " + schema.getFields().size() + ")");
        }

        List<? extends StructField> allStructFieldRefs = soi.getAllStructFieldRefs();
        List<Object> structFieldsDataAsList = soi.getStructFieldsDataAsList(o);

        for (int i = 0; i < size; i++) {
            Field field = schema.getFields().get(i);
            TypeInfo typeInfo = columnTypes.get(i);
            StructField structFieldRef = allStructFieldRefs.get(i);
            Object structFieldData = structFieldsDataAsList.get(i);
            ObjectInspector fieldOI = structFieldRef.getFieldObjectInspector();

            Object val = serialize(typeInfo, fieldOI, structFieldData, field.schema());
            record.put(field.name(), val);
        }

        if (!GenericData.get().validate(schema, record)) {
            throw new SerializeToAvroException(schema, record);
        }

        cache.setRecord(record);

        return cache;
    }

    private Object serialize(TypeInfo typeInfo, ObjectInspector fieldOI, Object structFieldData, Schema schema)
            throws AvroSerdeException {
        if (null == structFieldData) {
            return null;
        }

        if (AvroSerdeUtils.isNullableType(schema)) {
            schema = AvroSerdeUtils.getOtherTypeFromNullableType(schema);
        }
        /* Because we use Hive's 'string' type when Avro calls for enum, we have to expressly check for enum-ness */
        if (Schema.Type.ENUM.equals(schema.getType())) {
            assert fieldOI instanceof PrimitiveObjectInspector;
            return serializeEnum(typeInfo, (PrimitiveObjectInspector) fieldOI, structFieldData, schema);
        }
        switch (typeInfo.getCategory()) {
        case PRIMITIVE:
            assert fieldOI instanceof PrimitiveObjectInspector;
            return serializePrimitive(typeInfo, (PrimitiveObjectInspector) fieldOI, structFieldData, schema);
        case MAP:
            assert fieldOI instanceof MapObjectInspector;
            assert typeInfo instanceof MapTypeInfo;
            return serializeMap((MapTypeInfo) typeInfo, (MapObjectInspector) fieldOI, structFieldData, schema);
        case LIST:
            assert fieldOI instanceof ListObjectInspector;
            assert typeInfo instanceof ListTypeInfo;
            return serializeList((ListTypeInfo) typeInfo, (ListObjectInspector) fieldOI, structFieldData, schema);
        case UNION:
            assert fieldOI instanceof UnionObjectInspector;
            assert typeInfo instanceof UnionTypeInfo;
            return serializeUnion((UnionTypeInfo) typeInfo, (UnionObjectInspector) fieldOI, structFieldData,
                    schema);
        case STRUCT:
            assert fieldOI instanceof StructObjectInspector;
            assert typeInfo instanceof StructTypeInfo;
            return serializeStruct((StructTypeInfo) typeInfo, (StructObjectInspector) fieldOI, structFieldData,
                    schema);
        default:
            throw new AvroSerdeException("Ran out of TypeInfo Categories: " + typeInfo.getCategory());
        }
    }

    /** Private cache to avoid lots of EnumSymbol creation while serializing.
     *  Two levels because the enum symbol is specific to a schema.
     *  Object because we want to avoid the overhead of repeated toString calls while maintaining compatibility.
     *  Provided there are few enum types per record, and few symbols per enum, memory use should be moderate.
     *  e.g. 20 types with 50 symbols each as length-10 Strings should be on the order of 100KB per AvroSerializer.
     */
    final InstanceCache<Schema, InstanceCache<Object, GenericEnumSymbol>> enums = new InstanceCache<Schema, InstanceCache<Object, GenericEnumSymbol>>() {
        @Override
        protected InstanceCache<Object, GenericEnumSymbol> makeInstance(final Schema schema,
                Set<Schema> seenSchemas) {
            return new InstanceCache<Object, GenericEnumSymbol>() {
                @Override
                protected GenericEnumSymbol makeInstance(Object seed, Set<Object> seenSchemas) {
                    return new GenericData.EnumSymbol(schema, seed.toString());
                }
            };
        }
    };

    private Object serializeEnum(TypeInfo typeInfo, PrimitiveObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws AvroSerdeException {
        return enums.retrieve(schema).retrieve(serializePrimitive(typeInfo, fieldOI, structFieldData, schema));
    }

    private Object serializeStruct(StructTypeInfo typeInfo, StructObjectInspector ssoi, Object o, Schema schema)
            throws AvroSerdeException {
        int size = schema.getFields().size();
        List<? extends StructField> allStructFieldRefs = ssoi.getAllStructFieldRefs();
        List<Object> structFieldsDataAsList = ssoi.getStructFieldsDataAsList(o);
        GenericData.Record record = new GenericData.Record(schema);
        ArrayList<TypeInfo> allStructFieldTypeInfos = typeInfo.getAllStructFieldTypeInfos();

        for (int i = 0; i < size; i++) {
            Field field = schema.getFields().get(i);
            TypeInfo colTypeInfo = allStructFieldTypeInfos.get(i);
            StructField structFieldRef = allStructFieldRefs.get(i);
            Object structFieldData = structFieldsDataAsList.get(i);
            ObjectInspector fieldOI = structFieldRef.getFieldObjectInspector();

            Object val = serialize(colTypeInfo, fieldOI, structFieldData, field.schema());
            record.put(field.name(), val);
        }
        return record;
    }

    private Object serializePrimitive(TypeInfo typeInfo, PrimitiveObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws AvroSerdeException {
        switch (fieldOI.getPrimitiveCategory()) {
        case BINARY:
            if (schema.getType() == Type.BYTES) {
                return AvroSerdeUtils.getBufferFromBytes((byte[]) fieldOI.getPrimitiveJavaObject(structFieldData));
            } else if (schema.getType() == Type.FIXED) {
                Fixed fixed = new GenericData.Fixed(schema,
                        (byte[]) fieldOI.getPrimitiveJavaObject(structFieldData));
                return fixed;
            } else {
                throw new AvroSerdeException("Unexpected Avro schema for Binary TypeInfo: " + schema.getType());
            }
        case DECIMAL:
            HiveDecimal dec = (HiveDecimal) fieldOI.getPrimitiveJavaObject(structFieldData);
            return AvroSerdeUtils.getBufferFromDecimal(dec, ((DecimalTypeInfo) typeInfo).scale());
        case CHAR:
            HiveChar ch = (HiveChar) fieldOI.getPrimitiveJavaObject(structFieldData);
            return ch.getStrippedValue();
        case VARCHAR:
            HiveVarchar vc = (HiveVarchar) fieldOI.getPrimitiveJavaObject(structFieldData);
            return vc.getValue();
        case DATE:
            Date date = ((DateObjectInspector) fieldOI).getPrimitiveJavaObject(structFieldData);
            return DateWritable.dateToDays(date);
        case TIMESTAMP:
            Timestamp timestamp = ((TimestampObjectInspector) fieldOI).getPrimitiveJavaObject(structFieldData);
            return timestamp.getTime();
        case UNKNOWN:
            throw new AvroSerdeException("Received UNKNOWN primitive category.");
        case VOID:
            return null;
        default: // All other primitive types are simple
            return fieldOI.getPrimitiveJavaObject(structFieldData);
        }
    }

    private Object serializeUnion(UnionTypeInfo typeInfo, UnionObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws AvroSerdeException {
        byte tag = fieldOI.getTag(structFieldData);

        // Invariant that Avro's tag ordering must match Hive's.
        return serialize(typeInfo.getAllUnionObjectTypeInfos().get(tag), fieldOI.getObjectInspectors().get(tag),
                fieldOI.getField(structFieldData), schema.getTypes().get(tag));
    }

    private Object serializeList(ListTypeInfo typeInfo, ListObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws AvroSerdeException {
        List<?> list = fieldOI.getList(structFieldData);
        List<Object> deserialized = new GenericData.Array<Object>(list.size(), schema);

        TypeInfo listElementTypeInfo = typeInfo.getListElementTypeInfo();
        ObjectInspector listElementObjectInspector = fieldOI.getListElementObjectInspector();
        Schema elementType = schema.getElementType();

        for (int i = 0; i < list.size(); i++) {
            deserialized.add(i,
                    serialize(listElementTypeInfo, listElementObjectInspector, list.get(i), elementType));
        }

        return deserialized;
    }

    private Object serializeMap(MapTypeInfo typeInfo, MapObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws AvroSerdeException {
        // Avro only allows maps with string keys
        if (!mapHasStringKey(fieldOI.getMapKeyObjectInspector())) {
            throw new AvroSerdeException(
                    "Avro only supports maps with keys as Strings.  Current Map is: " + typeInfo.toString());
        }

        ObjectInspector mapKeyObjectInspector = fieldOI.getMapKeyObjectInspector();
        ObjectInspector mapValueObjectInspector = fieldOI.getMapValueObjectInspector();
        TypeInfo mapKeyTypeInfo = typeInfo.getMapKeyTypeInfo();
        TypeInfo mapValueTypeInfo = typeInfo.getMapValueTypeInfo();
        Map<?, ?> map = fieldOI.getMap(structFieldData);
        Schema valueType = schema.getValueType();

        Map<Object, Object> deserialized = new HashMap<Object, Object>(fieldOI.getMapSize(structFieldData));

        for (Map.Entry<?, ?> entry : map.entrySet()) {
            deserialized.put(serialize(mapKeyTypeInfo, mapKeyObjectInspector, entry.getKey(), STRING_SCHEMA),
                    serialize(mapValueTypeInfo, mapValueObjectInspector, entry.getValue(), valueType));
        }

        return deserialized;
    }

    private boolean mapHasStringKey(ObjectInspector mapKeyObjectInspector) {
        return mapKeyObjectInspector instanceof PrimitiveObjectInspector
                && ((PrimitiveObjectInspector) mapKeyObjectInspector).getPrimitiveCategory()
                        .equals(PrimitiveObjectInspector.PrimitiveCategory.STRING);
    }

    /**
     * Thrown when, during serialization of a Hive row to an Avro record, Avro
     * cannot validate the converted row against the record's schema.
     */
    public static class SerializeToAvroException extends AvroSerdeException {
        final private Schema schema;
        final private GenericData.Record record;

        public SerializeToAvroException(Schema schema, GenericData.Record record) {
            this.schema = schema;
            this.record = record;
        }

        @Override
        public String toString() {
            return "Avro could not validate record against schema (record = " + record + ") (schema = "
                    + schema.toString(false) + ")";
        }
    }
}
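
Note the GenericData.get().validate(schema, record) call at the end of the top-level serialize method: if the assembled record does not conform to the schema, the serializer throws the SerializeToAvroException defined above rather than handing back a bad record. Below is a minimal, standalone sketch of that Avro-side check, reusing the same illustrative User schema as the example in the introduction.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;

// Standalone illustration of the schema validation the serializer performs.
public class ValidateRecordExample {
    public static void main(String[] args) {
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"int\"},"
                + "{\"name\":\"name\",\"type\":\"string\"}]}");

        GenericData.Record record = new GenericData.Record(schema);
        record.put("id", 42);
        record.put("name", "alice");

        // Same check AvroSerializer runs before wrapping the record in the Writable.
        System.out.println(GenericData.get().validate(schema, record)); // prints true

        // Leaving a non-nullable field unset makes validation fail.
        GenericData.Record bad = new GenericData.Record(schema);
        bad.put("id", 7);
        System.out.println(GenericData.get().validate(schema, bad)); // prints false
    }
}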