com.linkedin.haivvreo.AvroSerializer.java Source code

Introduction

Here is the source code for com.linkedin.haivvreo.AvroSerializer.java, the Haivvreo class that turns a Hive row into an Avro GenericData.Record and returns it wrapped in an AvroGenericRecordWritable.
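
Before the listing, here is a minimal, hypothetical sketch of how the serializer might be exercised from a test in the same package (the class is package-private). Only AvroSerializer and its serialize() signature come from the source below; the demo class name, the schema, and the column values are illustrative, and the factory calls are stock Hive serde2 / Avro APIs.

package com.linkedin.haivvreo;

import org.apache.avro.Schema;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.Writable;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class AvroSerializerDemo { // hypothetical driver, not part of Haivvreo
    public static void main(String[] args) throws Exception {
        // An Avro record schema whose fields line up, by position, with the Hive columns.
        Schema schema = Schema.parse(
            "{\"type\":\"record\",\"name\":\"Person\",\"fields\":["
            + "{\"name\":\"name\",\"type\":\"string\"},"
            + "{\"name\":\"age\",\"type\":\"int\"}]}");

        List<String> columnNames = Arrays.asList("name", "age");

        List<TypeInfo> columnTypes = new ArrayList<TypeInfo>();
        columnTypes.add(TypeInfoFactory.stringTypeInfo);
        columnTypes.add(TypeInfoFactory.intTypeInfo);

        // Hive hands each row to the serializer as a struct: a list of column values
        // described by a StandardStructObjectInspector built from per-column inspectors.
        List<ObjectInspector> columnInspectors = new ArrayList<ObjectInspector>();
        columnInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        columnInspectors.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
        ObjectInspector rowInspector =
            ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnInspectors);

        List<Object> row = new ArrayList<Object>(Arrays.asList((Object) "Ada", Integer.valueOf(36)));

        Writable avroWritable =
            new AvroSerializer().serialize(row, rowInspector, columnNames, columnTypes, schema);
        System.out.println("Serialized one row into " + avroWritable.getClass().getSimpleName());
    }
}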

Source

/*
 * Copyright 2011 LinkedIn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.haivvreo;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.hadoop.io.Writable;
import static org.apache.avro.Schema.Type.*;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;

// Converts a Hive row, described by ObjectInspectors and TypeInfos, into an Avro
// GenericData.Record and wraps it in an AvroGenericRecordWritable for the output side.
class AvroSerializer {
    private static final Log LOG = LogFactory.getLog(AvroSerializer.class);

    // Reuse one Writable across serialize() calls (standard Hadoop practice) rather
    // than allocating a new one per row.
    AvroGenericRecordWritable cache = new AvroGenericRecordWritable();

    // Hive is pretty simple (read: stupid) in writing out values via the serializer.
    // We're just going to go through, matching indices.  Hive formats normally
    // handle mismatches with null.  We don't have that option, so instead we'll
    // end up throwing an exception for invalid records.
    public Writable serialize(Object o, ObjectInspector objectInspector, List<String> columnNames,
            List<TypeInfo> columnTypes, Schema schema) throws HaivvreoException {
        StandardStructObjectInspector ssoi = (StandardStructObjectInspector) objectInspector;
        GenericData.Record record = new GenericData.Record(schema);

        List<? extends StructField> outputFieldRefs = ssoi.getAllStructFieldRefs();
        if (outputFieldRefs.size() != columnNames.size())
            throw new HaivvreoException("Number of input columns was different than output columns (in = "
                    + columnNames.size() + " vs out = " + outputFieldRefs.size());

        int size = schema.getFields().size();
        if (outputFieldRefs.size() != size) // Hive does this check for us, so we should be ok.
            throw new HaivvreoException(
                    "Hive passed in a different number of fields than the schema expected: (Hive wanted "
                            + outputFieldRefs.size() + ", Avro expected " + schema.getFields().size());

        List<? extends StructField> allStructFieldRefs = ssoi.getAllStructFieldRefs();
        List<Object> structFieldsDataAsList = ssoi.getStructFieldsDataAsList(o);

        for (int i = 0; i < size; i++) {
            Field field = schema.getFields().get(i);
            TypeInfo typeInfo = columnTypes.get(i);
            StructField structFieldRef = allStructFieldRefs.get(i);
            Object structFieldData = structFieldsDataAsList.get(i);
            ObjectInspector fieldOI = structFieldRef.getFieldObjectInspector();

            Object val = serialize(typeInfo, fieldOI, structFieldData, field.schema());
            record.put(field.name(), val);
        }

        if (!GenericData.get().validate(schema, record))
            throw new SerializeToAvroException(schema, record);

        cache.setRecord(record);

        return cache;
    }

    // Dispatch on the Hive type category and delegate to the matching type-specific
    // serializer below.
    private Object serialize(TypeInfo typeInfo, ObjectInspector fieldOI, Object structFieldData, Schema schema)
            throws HaivvreoException {
        switch (typeInfo.getCategory()) {
        case PRIMITIVE:
            assert fieldOI instanceof PrimitiveObjectInspector;
            return serializePrimitive(typeInfo, (PrimitiveObjectInspector) fieldOI, structFieldData);
        case MAP:
            assert fieldOI instanceof MapObjectInspector;
            assert typeInfo instanceof MapTypeInfo;
            return serializeMap((MapTypeInfo) typeInfo, (MapObjectInspector) fieldOI, structFieldData, schema);
        case LIST:
            assert fieldOI instanceof ListObjectInspector;
            assert typeInfo instanceof ListTypeInfo;
            return serializeList((ListTypeInfo) typeInfo, (ListObjectInspector) fieldOI, structFieldData, schema);
        case UNION:
            assert fieldOI instanceof UnionObjectInspector;
            assert typeInfo instanceof UnionTypeInfo;
            return serializeUnion((UnionTypeInfo) typeInfo, (UnionObjectInspector) fieldOI, structFieldData,
                    schema);
        case STRUCT:
            assert fieldOI instanceof StructObjectInspector;
            assert typeInfo instanceof StructTypeInfo;
            return serializeStruct((StructTypeInfo) typeInfo, (StructObjectInspector) fieldOI, structFieldData,
                    schema);
        default:
            throw new HaivvreoException("Ran out of TypeInfo Categories: " + typeInfo.getCategory());
        }
    }

    // Nested structs are handled just like the top-level row: field by field, by
    // position, into a new GenericData.Record.
    private Object serializeStruct(StructTypeInfo typeInfo, StructObjectInspector ssoi, Object o, Schema schema)
            throws HaivvreoException {
        int size = schema.getFields().size();
        List<? extends StructField> allStructFieldRefs = ssoi.getAllStructFieldRefs();
        List<Object> structFieldsDataAsList = ssoi.getStructFieldsDataAsList(o);
        GenericData.Record record = new GenericData.Record(schema);
        ArrayList<TypeInfo> allStructFieldTypeInfos = typeInfo.getAllStructFieldTypeInfos();

        for (int i = 0; i < size; i++) {
            Field field = schema.getFields().get(i);
            TypeInfo colTypeInfo = allStructFieldTypeInfos.get(i);
            StructField structFieldRef = allStructFieldRefs.get(i);
            Object structFieldData = structFieldsDataAsList.get(i);
            ObjectInspector fieldOI = structFieldRef.getFieldObjectInspector();

            Object val = serialize(colTypeInfo, fieldOI, structFieldData, field.schema());
            record.put(field.name(), val);
        }
        return record;
    }

    private Object serializePrimitive(TypeInfo typeInfo, PrimitiveObjectInspector fieldOI, Object structFieldData)
            throws HaivvreoException {
        switch (fieldOI.getPrimitiveCategory()) {
        case UNKNOWN:
            throw new HaivvreoException("Received UNKNOWN primitive category.");
        case VOID:
            return null;
        default: // All other primitive types are simple
            return fieldOI.getPrimitiveJavaObject(structFieldData);
        }
    }

    private Object serializeUnion(UnionTypeInfo typeInfo, UnionObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws HaivvreoException {
        byte tag = fieldOI.getTag(structFieldData);

        // Invariant that Avro's tag ordering must match Hive's.
        return serialize(typeInfo.getAllUnionObjectTypeInfos().get(tag), fieldOI.getObjectInspectors().get(tag),
                fieldOI.getField(structFieldData), schema.getTypes().get(tag));
    }

    // Haivvreo treats FIXED and BYTES as arrays of tinyints within Hive.  Check
    // if we're dealing with either of these types and thus need to serialize
    // them as their Avro types.  (A short standalone illustration follows this listing.)
    private boolean isTransformedType(Schema schema) {
        return schema.getType().equals(FIXED) || schema.getType().equals(BYTES);
    }

    private Object serializeTransformedType(ListTypeInfo typeInfo, ListObjectInspector fieldOI,
            Object structFieldData, Schema schema) throws HaivvreoException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Beginning to transform " + typeInfo + " with Avro schema " + schema.toString(false));
        }
        if (schema.getType().equals(FIXED))
            return serializedAvroFixed(typeInfo, fieldOI, structFieldData, schema);
        else
            return serializeAvroBytes(typeInfo, fieldOI, structFieldData, schema);

    }

    private Object serializeAvroBytes(ListTypeInfo typeInfo, ListObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws HaivvreoException {
        // Avro expects BYTES as a ByteBuffer; wrap() already leaves the position at 0,
        // so the rewind is just defensive.
        ByteBuffer bb = ByteBuffer.wrap(extraByteArray(fieldOI, structFieldData));
        return bb.rewind();
    }

    private Object serializedAvroFixed(ListTypeInfo typeInfo, ListObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws HaivvreoException {
        return new GenericData.Fixed(extraByteArray(fieldOI, structFieldData));
    }

    // For transforming to BYTES and FIXED, pull out the byte array Avro will want
    private byte[] extraByteArray(ListObjectInspector fieldOI, Object structFieldData) throws HaivvreoException {
        // Grab a book.  This is going to be slow.
        int listLength = fieldOI.getListLength(structFieldData);
        byte[] bytes = new byte[listLength];
        assert fieldOI.getListElementObjectInspector() instanceof PrimitiveObjectInspector;
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) fieldOI.getListElementObjectInspector();
        List<?> list = fieldOI.getList(structFieldData);

        for (int i = 0; i < listLength; i++) {
            Object b = poi.getPrimitiveJavaObject(list.get(i));
            if (!(b instanceof Byte))
                throw new HaivvreoException("Attempting to transform to bytes, element was not byte but "
                        + b.getClass().getCanonicalName());
            bytes[i] = (Byte) b;
        }
        return bytes;
    }

    // Hive lists normally become Avro arrays, unless the schema says this "list" is
    // really a byte-backed FIXED or BYTES field.
    private Object serializeList(ListTypeInfo typeInfo, ListObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws HaivvreoException {
        if (isTransformedType(schema))
            return serializeTransformedType(typeInfo, fieldOI, structFieldData, schema);

        List<?> list = fieldOI.getList(structFieldData);
        List<Object> deserialized = new ArrayList<Object>(list.size());

        TypeInfo listElementTypeInfo = typeInfo.getListElementTypeInfo();
        ObjectInspector listElementObjectInspector = fieldOI.getListElementObjectInspector();
        Schema elementType = schema.getElementType();

        for (int i = 0; i < list.size(); i++) {
            deserialized.add(i,
                    serialize(listElementTypeInfo, listElementObjectInspector, list.get(i), elementType));
        }

        return deserialized;
    }

    private Object serializeMap(MapTypeInfo typeInfo, MapObjectInspector fieldOI, Object structFieldData,
            Schema schema) throws HaivvreoException {
        // Avro only allows maps with string keys
        if (!mapHasStringKey(fieldOI.getMapKeyObjectInspector()))
            throw new HaivvreoException(
                    "Avro only supports maps with keys as Strings.  Current Map is: " + typeInfo.toString());

        ObjectInspector mapKeyObjectInspector = fieldOI.getMapKeyObjectInspector();
        ObjectInspector mapValueObjectInspector = fieldOI.getMapValueObjectInspector();
        TypeInfo mapKeyTypeInfo = typeInfo.getMapKeyTypeInfo();
        TypeInfo mapValueTypeInfo = typeInfo.getMapValueTypeInfo();
        Map<?, ?> map = fieldOI.getMap(structFieldData);
        Schema valueType = schema.getValueType();

        Map<Object, Object> deserialized = new Hashtable<Object, Object>(fieldOI.getMapSize(structFieldData));

        for (Map.Entry<?, ?> entry : map.entrySet()) {
            deserialized.put(serialize(mapKeyTypeInfo, mapKeyObjectInspector, entry.getKey(), null), // This works, but is a bit fragile.  Construct a single String schema?
                    serialize(mapValueTypeInfo, mapValueObjectInspector, entry.getValue(), valueType));
        }

        return deserialized;
    }

    private boolean mapHasStringKey(ObjectInspector mapKeyObjectInspector) {
        return mapKeyObjectInspector instanceof PrimitiveObjectInspector
                && ((PrimitiveObjectInspector) mapKeyObjectInspector).getPrimitiveCategory()
                        .equals(PrimitiveObjectInspector.PrimitiveCategory.STRING);
    }

    /**
     * Thrown when, during serialization of a Hive row to an Avro record, Avro
     * cannot validate the converted record against its schema.
     */
    public static class SerializeToAvroException extends HaivvreoException {
        final private Schema schema;
        final private GenericData.Record record;

        public SerializeToAvroException(Schema schema, GenericData.Record record) {
            this.schema = schema;
            this.record = record;
        }

        @Override
        public String toString() {
            return "Avro could not validate record against schema (record = " + record + ") (schema = "
                    + schema.toString(false) + ")";
        }
    }
}
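
As noted in the comment above isTransformedType, Haivvreo surfaces Avro FIXED and BYTES fields to Hive as arrays of tinyints, and serializeList collapses them back into their byte-oriented Avro form. Here is a tiny, self-contained illustration of the BYTES case; the demo class is hypothetical and uses only java.nio and java.util.

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;

public class BytesTransformExample { // illustrative only, not part of Haivvreo
    public static void main(String[] args) {
        // A Hive array<tinyint> column value backing an Avro "bytes" field.
        List<Byte> hiveTinyints = Arrays.asList((byte) 104, (byte) 105); // "hi"

        // What extraByteArray() reconstructs: a plain byte[] in list order.
        byte[] raw = new byte[hiveTinyints.size()];
        for (int i = 0; i < raw.length; i++)
            raw[i] = hiveTinyints.get(i);

        // What serializeAvroBytes() then hands to Avro: a ByteBuffer over those bytes.
        ByteBuffer forAvroBytes = ByteBuffer.wrap(raw);
        System.out.println(forAvroBytes.remaining() + " bytes ready for an Avro BYTES field");
    }
}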