tap.formats.avro.TapAvroSerialization.java Source code

Java tutorial

Introduction

Here is the source code for tap.formats.avro.TapAvroSerialization.java

Source

/*
 * Licensed to Think Big Analytics, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Think Big Analytics, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Copyright 2010 Think Big Analytics. All Rights Reserved.
 */
package tap.formats.avro;

import java.io.*;

import org.apache.avro.Schema;
import org.apache.avro.io.*;
import org.apache.avro.mapred.*;
import org.apache.avro.protobuf.ProtobufDatumReader;
import org.apache.avro.protobuf.ProtobufDatumWriter;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.reflect.ReflectDatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.serializer.*;

import com.google.protobuf.Message;

import tap.Phase;
import tap.core.io.BinaryKey;
import tap.core.io.avro.BinaryKeyDatumReader;
import tap.core.io.avro.BinaryKeyDatumWriter;
import tap.core.io.avro.BinaryKeyEncoder;

/**
 * Avro serialization format used by Tap. Derived from the Avro map/reduce framework format.
 * The {@link Serialization} used by jobs configured with {@link Phase}.
 *
 * <p>Keys ({@link AvroKey}) are written as the raw bytes of a {@link BinaryKey} and read back
 * through a {@link BinaryKeyDatumReader}. Values are written and read with Avro's direct binary
 * encoding, using protobuf datum readers/writers when the configured map output class is a
 * protobuf {@link Message} and reflect datum readers/writers otherwise.
 *
 * @param <T> type of the datum carried inside the {@link AvroWrapper}
 */
public class TapAvroSerialization<T> extends Configured implements Serialization<AvroWrapper<T>> {

    /** Shared factory for the decoders handed to deserializers. */
    private static final DecoderFactory DECODER_FACTORY = new DecoderFactory();

    /** Shared factory for the encoders handed to value serializers. */
    private static final EncoderFactory ENCODER_FACTORY = new EncoderFactory();

    /**
     * Tells whether this serialization handles the given class.
     *
     * @param c candidate class
     * @return {@code true} if {@code c} is {@link AvroWrapper} or one of its subclasses
     */
    @Override
    public boolean accept(Class<?> c) {
        return AvroWrapper.class.isAssignableFrom(c);
    }

    /**
     * Returns the map output deserializer for the given wrapper class.
     *
     * <p>For {@link AvroKey} classes a {@link BinaryKeyDatumReader} is used and no schema is
     * consulted. For any other wrapper the schema stored under
     * {@link Phase#MAP_OUT_VALUE_SCHEMA} is parsed and the reader is chosen from the class
     * stored under {@link Phase#MAP_OUT_CLASS}.
     *
     * @param c wrapper class to deserialize
     * @return a deserializer producing {@link AvroKey} or {@link AvroValue} wrappers
     * @throws IllegalStateException if a required configuration property is not set
     * @throws RuntimeException if the configured map output class cannot be loaded
     */
    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public Deserializer<AvroWrapper<T>> getDeserializer(Class<AvroWrapper<T>> c) {
        // We need not rely on mapred.task.is.map here to determine whether map
        // output or final output is desired, since the mapreduce framework never
        // creates a deserializer for final output, only for map output.
        boolean isKey = AvroKey.class.isAssignableFrom(c);

        // Kept raw on purpose: the key reader is not parameterized on T.
        DatumReader reader;
        if (isKey) {
            // Keys are opaque binary; the key schema is not needed to read them,
            // so it is deliberately not parsed here.
            reader = new BinaryKeyDatumReader();
        } else {
            Configuration conf = getConf();
            Schema schema = Schema.parse(requireSetting(conf, Phase.MAP_OUT_VALUE_SCHEMA));
            if (Message.class.isAssignableFrom(getMapOutClass(conf))) {
                reader = new ProtobufDatumReader<T>(schema);
            } else {
                reader = new ReflectDatumReader<T>(schema);
            }
        }

        return new AvroWrapperDeserializer(reader, isKey);
    }

    /**
     * Loads the class named by {@link Phase#MAP_OUT_CLASS}.
     *
     * @param conf job configuration
     * @return the configured map output class
     * @throws IllegalStateException if the property is not set
     * @throws RuntimeException if the named class cannot be found
     */
    private static Class<?> getMapOutClass(Configuration conf) {
        String className = requireSetting(conf, Phase.MAP_OUT_CLASS);
        try {
            return conf.getClassByName(className);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Map output class not found: " + className, e);
        }
    }

    /**
     * Reads a configuration property that must be present.
     *
     * @param conf job configuration
     * @param key property name
     * @return the property value, never {@code null}
     * @throws IllegalStateException if the property is not set
     */
    private static String requireSetting(Configuration conf, String key) {
        String value = conf.get(key);
        if (value == null) {
            // Fail with the property name instead of an NPE from deep inside a parser.
            throw new IllegalStateException("Missing required job configuration property: " + key);
        }
        return value;
    }

    /** Deserializer that reads one datum per call from a direct binary decoder. */
    private class AvroWrapperDeserializer implements Deserializer<AvroWrapper<T>> {

        private final DatumReader<T> reader;
        private final boolean isKey;
        private InputStream in;
        private Decoder decoder;

        /**
         * @param reader datum reader used for every record
         * @param isKey whether new wrappers should be {@link AvroKey} (else {@link AvroValue})
         */
        public AvroWrapperDeserializer(DatumReader<T> reader, boolean isKey) {
            this.reader = reader;
            this.isKey = isKey;
        }

        /** Binds this deserializer to {@code in} and creates a fresh decoder over it. */
        @Override
        public void open(InputStream in) {
            this.in = in;
            this.decoder = DECODER_FACTORY.directBinaryDecoder(in, null);
        }

        /**
         * Reads the next datum.
         *
         * @param wrapper wrapper to reuse, or {@code null} to allocate a new one
         * @return {@code wrapper} (or a new wrapper) holding the datum just read
         * @throws IOException if the underlying read fails
         */
        @Override
        public AvroWrapper<T> deserialize(AvroWrapper<T> wrapper) throws IOException {
            // Offer the previous datum to the reader so it may be reused.
            T datum = reader.read(wrapper == null ? null : wrapper.datum(), decoder);
            if (wrapper == null) {
                wrapper = isKey ? new AvroKey<T>(datum) : new AvroValue<T>(datum);
            } else {
                wrapper.datum(datum);
            }
            return wrapper;
        }

        /** Closes the stream behind the decoder, or the raw input stream if there is none. */
        @Override
        public void close() throws IOException {
            if (decoder instanceof BinaryDecoder) {
                ((BinaryDecoder) decoder).inputStream().close();
            } else {
                in.close();
            }
        }
    }

    /**
     * Returns the output serializer for the given wrapper class.
     *
     * <p>{@link AvroKey} classes get a serializer that writes the raw {@link BinaryKey} bytes.
     * Other wrappers are written with the map output value schema when
     * {@code mapred.task.is.map} is {@code true}, and with the job output schema otherwise.
     *
     * @param c wrapper class to serialize
     * @return a serializer for {@code c}
     * @throws IllegalStateException if a required configuration property is not set
     * @throws RuntimeException if the configured map output class cannot be loaded
     */
    @Override
    public Serializer<AvroWrapper<T>> getSerializer(Class<AvroWrapper<T>> c) {
        if (AvroKey.class.isAssignableFrom(c)) {
            return new BinaryKeySerializer();
        }

        Configuration conf = getConf();
        // Here we must rely on mapred.task.is.map to tell whether the map output
        // or final output is needed.
        boolean isMap = conf.getBoolean("mapred.task.is.map", false);

        Schema schema;
        if (isMap) {
            schema = Schema.parse(requireSetting(conf, Phase.MAP_OUT_VALUE_SCHEMA));
        } else {
            schema = AvroJob.getOutputSchema(conf);
        }

        // NOTE(review): the protobuf check uses the *map* output class even when the final
        // output schema was selected above - confirm this is intended for reduce output.
        boolean isProtobuf = Message.class.isAssignableFrom(getMapOutClass(conf));

        DatumWriter<T> writer;
        if (isProtobuf) {
            writer = new ProtobufDatumWriter<T>(schema);
        } else {
            writer = new ReflectDatumWriter<T>(schema);
        }

        return new AvroValueSerializer(writer);
    }

    /** Serializer that copies the bytes of a {@link BinaryKey} datum straight to the output. */
    private class BinaryKeySerializer implements Serializer<AvroWrapper<T>> {

        private OutputStream out;

        @Override
        public void open(OutputStream out) throws IOException {
            this.out = out;
        }

        /**
         * Writes the first {@code getLength()} bytes of the key's buffer.
         *
         * @throws ClassCastException if the wrapped datum is not a {@link BinaryKey}
         */
        @Override
        public void serialize(AvroWrapper<T> t) throws IOException {
            BinaryKey key = (BinaryKey) t.datum();
            out.write(key.getBuffer(), 0, key.getLength());
        }

        @Override
        public void close() throws IOException {
            out.close();
        }
    }

    /** Serializer that writes each wrapped datum through a direct binary encoder. */
    private class AvroValueSerializer implements Serializer<AvroWrapper<T>> {

        private final DatumWriter<T> writer;
        private OutputStream out;
        private Encoder encoder;

        /** @param writer datum writer used for every record */
        public AvroValueSerializer(DatumWriter<T> writer) {
            this.writer = writer;
        }

        /** Binds this serializer to {@code out} and creates a fresh encoder over it. */
        @Override
        public void open(OutputStream out) {
            this.out = out;
            this.encoder = ENCODER_FACTORY.directBinaryEncoder(out, null);
        }

        @Override
        public void serialize(AvroWrapper<T> wrapper) throws IOException {
            writer.write(wrapper.datum(), encoder);
        }

        @Override
        public void close() throws IOException {
            out.close();
        }

    }
}