colossal.pipe.ColAvroSerialization.java Source code

Java tutorial

Introduction

Here is the source code for colossal.pipe.ColAvroSerialization.java

Source

/*
 * Licensed to Think Big Analytics, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Think Big Analytics, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Copyright 2010 Think Big Analytics. All Rights Reserved.
 */
package colossal.pipe;

import java.io.*;

import org.apache.avro.Schema;
import org.apache.avro.io.*;
import org.apache.avro.mapred.*;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.serializer.*;

/**
 * Avro serialization format used by Colossal pipe. Derived from the Avro map/reduce framework format.
 * The {@link Serialization} used by jobs configured with {@link ColPhase}.
 *
 * <p>Map output keys and values are read and written with the schemas stored in the job configuration under
 * {@code ColPhase.MAP_OUT_KEY_SCHEMA} and {@code ColPhase.MAP_OUT_VALUE_SCHEMA}; output of non-map tasks is
 * written with the schema returned by {@code AvroJob.getOutputSchema(conf)}.
 */
public class ColAvroSerialization<T> extends Configured implements Serialization<AvroWrapper<T>> {

    private static final DecoderFactory FACTORY = new DecoderFactory();
    static {
        FACTORY.configureDirectDecoder(true);
    }

    /** Accepts {@link AvroWrapper} and any of its subclasses. */
    public boolean accept(Class<?> c) {
        return AvroWrapper.class.isAssignableFrom(c);
    }

    /**
     * Returns the map output deserializer for the given wrapper class. The key schema is used when {@code c} is an
     * {@link AvroKey} (or subclass), the value schema otherwise.
     *
     * @throws IllegalStateException if the required map output schema is not present in the configuration
     */
    public Deserializer<AvroWrapper<T>> getDeserializer(Class<AvroWrapper<T>> c) {
        // We need not rely on mapred.task.is.map here to determine whether map
        // output or final output is desired, since the mapreduce framework never
        // creates a deserializer for final output, only for map output.
        boolean isKey = AvroKey.class.isAssignableFrom(c);
        Schema schema = mapOutputSchema(getConf(), isKey);
        return new AvroWrapperDeserializer<T>(new SpecificDatumReader<T>(schema), isKey);
    }

    /**
     * Returns the specified output serializer. Inside a map task the map output key or value schema is used
     * (chosen by whether {@code c} is an {@link AvroKey}); otherwise the job's output schema is used.
     *
     * @throws IllegalStateException if a map output schema is required but not present in the configuration
     */
    public Serializer<AvroWrapper<T>> getSerializer(Class<AvroWrapper<T>> c) {
        Configuration conf = getConf();
        // Here we must rely on mapred.task.is.map to tell whether the map output
        // or final output is needed.
        boolean isMap = conf.getBoolean("mapred.task.is.map", false);
        Schema schema;
        if (isMap) {
            schema = mapOutputSchema(conf, AvroKey.class.isAssignableFrom(c));
        } else {
            schema = AvroJob.getOutputSchema(conf);
        }
        return new AvroWrapperSerializer<T>(new SpecificDatumWriter<T>(schema));
    }

    /**
     * Looks up and parses the map output key or value schema from the configuration.
     *
     * @param conf  job configuration holding the schema JSON
     * @param isKey {@code true} for the key schema, {@code false} for the value schema
     * @return the parsed schema
     * @throws IllegalStateException if the corresponding property is not set
     */
    private static Schema mapOutputSchema(Configuration conf, boolean isKey) {
        String property = isKey ? ColPhase.MAP_OUT_KEY_SCHEMA : ColPhase.MAP_OUT_VALUE_SCHEMA;
        String json = conf.get(property);
        if (json == null) {
            // Fail with the property name instead of the bare NullPointerException Schema.parse(null) would raise.
            throw new IllegalStateException("No map output " + (isKey ? "key" : "value")
                    + " schema configured under '" + property + "'");
        }
        return Schema.parse(json);
    }

    /** Deserializer that reads Avro datums from a stream and hands them back inside an {@link AvroWrapper}. */
    private static final class AvroWrapperDeserializer<D> implements Deserializer<AvroWrapper<D>> {

        private final DatumReader<D> reader;
        private final boolean isKey;
        private BinaryDecoder decoder;

        AvroWrapperDeserializer(DatumReader<D> reader, boolean isKey) {
            this.reader = reader;
            this.isKey = isKey;
        }

        public void open(InputStream in) {
            // The previous decoder (null on first open) is handed back to the factory, presumably for reuse.
            this.decoder = FACTORY.createBinaryDecoder(in, decoder);
        }

        /**
         * Reads the next datum. When {@code wrapper} is non-null its current datum is offered to the reader for
         * reuse and the wrapper is updated in place; otherwise a new {@link AvroKey} or {@link AvroValue} is created.
         */
        public AvroWrapper<D> deserialize(AvroWrapper<D> wrapper) throws IOException {
            D reuse = (wrapper == null) ? null : wrapper.datum();
            D datum = reader.read(reuse, decoder);
            if (wrapper != null) {
                wrapper.datum(datum);
                return wrapper;
            }
            if (isKey) {
                return new AvroKey<D>(datum);
            }
            return new AvroValue<D>(datum);
        }

        public void close() throws IOException {
            // Nothing to close if open() was never called.
            if (decoder != null) {
                decoder.inputStream().close();
            }
        }

    }

    /** Serializer that writes the datum held by an {@link AvroWrapper} to a stream in Avro binary form. */
    private static final class AvroWrapperSerializer<D> implements Serializer<AvroWrapper<D>> {

        private final DatumWriter<D> writer;
        private OutputStream out;
        private BinaryEncoder encoder;

        AvroWrapperSerializer(DatumWriter<D> writer) {
            this.writer = writer;
        }

        public void open(OutputStream out) {
            this.out = out;
            this.encoder = new BinaryEncoder(out);
        }

        public void serialize(AvroWrapper<D> wrapper) throws IOException {
            writer.write(wrapper.datum(), encoder);
        }

        public void close() throws IOException {
            // Nothing to close if open() was never called.
            if (out != null) {
                out.close();
            }
        }

    }

}