cascading.tuple.hadoop.TupleSerialization.java Source code

Java tutorial

Introduction

Here is the source code for cascading.tuple.hadoop.TupleSerialization.java

Source

/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading.  If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tuple.hadoop;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

import cascading.CascadingException;
import cascading.tuple.Comparison;
import cascading.tuple.IndexTuple;
import cascading.tuple.Tuple;
import cascading.tuple.TupleException;
import cascading.tuple.TuplePair;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

/**
 * Class TupleSerialization is an implementation of Hadoop's {@link Serialization} interface.
 * <p/>
 * Typically developers will not use this implementation directly as it is automatically added
 * to any relevant MapReduce jobs via the {@link JobConf}.
 * <p/>
 * By default, all primitive types are natively handled, and {@link org.apache.hadoop.io.BytesWritable}
 * has a pre-configured serialization token since byte arrays are not handled natively by {@link Tuple}.
 */
@SerializationToken(tokens = { 127 }, classNames = { "org.apache.hadoop.io.BytesWritable" })
public class TupleSerialization extends Configured implements Serialization {
    /** Field LOG */
    private static final Logger LOG = Logger.getLogger(TupleSerialization.class);

    /** Field classCache */
    private Map<String, Class> classCache = new HashMap<String, Class>();
    /** Field serializationFactory */
    private SerializationFactory serializationFactory;

    /** Field tokenClassesMap */
    private HashMap<Integer, String> tokenClassesMap;
    /** Field classesTokensMap */
    private HashMap<String, Integer> classesTokensMap;
    /** Field tokenMapSize */
    private long tokensSize = 0;

    /**
     * Adds the given token and className pair as a serialization token property. During object serialization and deserialization,
     * the given token will be used instead of the className when an instance of the className is encountered.
     *
     * @param properties of type Map
     * @param token      of type int
     * @param className  of type String
     */
    public static void addSerializationToken(Map<Object, Object> properties, int token, String className) {
        String tokens = getSerializationTokens(properties);

        properties.put("cascading.serialization.tokens",
                Util.join(",", Util.removeNulls(tokens, token + "=" + className)));
    }

    /**
     * Returns the serialization tokens property.
     *
     * @param properties of type Map
     * @return returns a String
     */
    public static String getSerializationTokens(Map<Object, Object> properties) {
        return (String) properties.get("cascading.serialization.tokens");
    }

    static String getSerializationTokens(JobConf jobConf) {
        return jobConf.get("cascading.serialization.tokens");
    }

    /**
     * Adds the given className as a Hadoop IO serialization class.
     *
     * @param properties of type Map
     * @param className  of type String
     */
    public static void addSerialization(Map<Object, Object> properties, String className) {
        String serializations = (String) properties.get("io.serializations");

        properties.put("io.serializations", Util.join(",", Util.removeNulls(serializations, className)));
    }

    /**
     * Adds this class as a Hadoop Serialization class. This method is safe to call redundantly.
     *
     * @param jobConf of type JobConf
     */
    public static void setSerializations(JobConf jobConf) {
        String serializations = getSerializations(jobConf);

        if (serializations.contains(TupleSerialization.class.getName()))
            return;

        jobConf.set("io.serializations",
                Util.join(",", Util.removeNulls(serializations, TupleSerialization.class.getName())));
    }

    static String getSerializations(JobConf jobConf) {
        return jobConf.get("io.serializations", "");
    }

    /** Constructor TupleSerialization creates a new TupleSerialization instance. */
    public TupleSerialization() {
    }

    /**
     * Constructor TupleSerialization creates a new TupleSerialization instance.
     *
     * @param conf of type Configuration
     */
    public TupleSerialization(Configuration conf) {
        super(conf);
    }

    @Override
    public Configuration getConf() {
        if (super.getConf() == null)
            setConf(new JobConf());

        return super.getConf();
    }

    SerializationFactory getSerializationFactory() {
        if (serializationFactory == null)
            serializationFactory = new SerializationFactory(getConf());

        return serializationFactory;
    }

    /** Must be called before {@link #getClassNameFor(int)} and {@link #getTokenFor(String)} methods. */
    void initTokenMaps() {
        if (tokenClassesMap != null)
            return;

        tokenClassesMap = new HashMap<Integer, String>();
        classesTokensMap = new HashMap<String, Integer>();

        String tokenProperty = getSerializationTokens((JobConf) getConf());

        if (tokenProperty != null) {
            tokenProperty = tokenProperty.replaceAll("\\s", ""); // allow for whitespace in token set

            for (String pair : tokenProperty.split(",")) {
                String[] elements = pair.split("=");
                addToken(null, Integer.parseInt(elements[0]), elements[1]);
            }
        }

        String serializationsString = getSerializations((JobConf) getConf());

        if (serializationsString == null)
            return;

        String[] serializations = serializationsString.split(",");

        for (String serializationName : serializations) {
            try {
                Class type = getConf().getClassByName(serializationName);

                SerializationToken tokenAnnotation = (SerializationToken) type
                        .getAnnotation(SerializationToken.class);

                if (tokenAnnotation == null)
                    continue;

                if (tokenAnnotation.tokens().length != tokenAnnotation.classNames().length)
                    throw new CascadingException(
                            "serialization annotation tokens and classNames must be the same length");

                int[] tokens = tokenAnnotation.tokens();

                for (int i = 0; i < tokens.length; i++)
                    addToken(type, tokens[i], tokenAnnotation.classNames()[i]);
            } catch (ClassNotFoundException exception) {
                LOG.warn("unable to load serialization class: " + serializationName, exception);
            }
        }

        tokensSize = tokenClassesMap.size();

        return;
    }

    private void addToken(Class type, int token, String className) {
        if (type != null && !type.getName().startsWith("cascading.") && token < 128)
            throw new CascadingException("serialization annotation tokens may not be less than 128, was: " + token);

        if (tokenClassesMap.containsKey(token)) {
            if (type == null)
                throw new IllegalStateException("duplicate serialization token: " + token + " for class: "
                        + className + " found in properties");

            throw new IllegalStateException("duplicate serialization token: " + token + " for class: " + className
                    + " on serialization: " + type.getName());
        }

        if (classesTokensMap.containsKey(className)) {
            if (type == null)
                throw new IllegalStateException("duplicate serialization classname: " + className + " for token: "
                        + token + " found in properties ");

            throw new IllegalStateException("duplicate serialization classname: " + className + " for token: "
                    + token + " on serialization: " + type.getName());
        }

        tokenClassesMap.put(token, className);
        classesTokensMap.put(className, token);
    }

    /**
     * Returns the className for the given token.
     *
     * @param token of type int
     * @return a String
     */
    final String getClassNameFor(int token) {
        if (tokensSize == 0)
            return null;

        return tokenClassesMap.get(token);
    }

    /**
     * Returns the token for the given className.
     *
     * @param className of type String
     * @return an Integer
     */
    final Integer getTokenFor(String className) {
        if (tokensSize == 0)
            return null;

        return classesTokensMap.get(className);
    }

    public Comparator getComparator(Class type) {
        Serialization serialization = getSerialization(type);

        if (serialization instanceof Comparison)
            return ((Comparison) serialization).getComparator(type);

        return null;
    }

    Serialization getSerialization(String className) {
        return getSerialization(getClass(className));
    }

    Serialization getSerialization(Class type) {
        return getSerializationFactory().getSerialization(type);
    }

    Serializer getNewSerializer(Class type) {
        try {
            return getSerializationFactory().getSerializer(type);
        } catch (NullPointerException exception) {
            throw new CascadingException("unable to load serializer for: " + type.getName() + " from: "
                    + getSerializationFactory().getClass().getName());
        }
    }

    Deserializer getNewDeserializer(String className) {
        try {
            return getSerializationFactory().getDeserializer(getClass(className));
        } catch (NullPointerException exception) {
            throw new CascadingException("unable to load deserializer for: " + className + " from: "
                    + getSerializationFactory().getClass().getName());
        }
    }

    TuplePairDeserializer getTuplePairDeserializer() {
        return new TuplePairDeserializer(getElementReader());
    }

    /**
     * Method getElementReader returns the elementReader of this TupleSerialization object.
     *
     * @return the elementReader (type SerializationElementReader) of this TupleSerialization object.
     */
    public SerializationElementReader getElementReader() {
        return new SerializationElementReader(this);
    }

    TupleDeserializer getTupleDeserializer() {
        return new TupleDeserializer(getElementReader());
    }

    private TuplePairSerializer getTuplePairSerializer() {
        return new TuplePairSerializer(getElementWriter());
    }

    IndexTupleDeserializer getIndexTupleDeserializer() {
        return new IndexTupleDeserializer(getElementReader());
    }

    /**
     * Method getElementWriter returns the elementWriter of this TupleSerialization object.
     *
     * @return the elementWriter (type SerializationElementWriter) of this TupleSerialization object.
     */
    public SerializationElementWriter getElementWriter() {
        return new SerializationElementWriter(this);
    }

    private TupleSerializer getTupleSerializer() {
        return new TupleSerializer(getElementWriter());
    }

    private IndexTupleSerializer getIndexTupleSerializer() {
        return new IndexTupleSerializer(getElementWriter());
    }

    /**
     * Method accept implements {@link Serialization#accept(Class)}.
     *
     * @param c of type Class
     * @return boolean
     */
    public boolean accept(Class c) {
        return Tuple.class == c || TuplePair.class == c || IndexTuple.class == c;
    }

    /**
     * Method getDeserializer implements {@link Serialization#getDeserializer(Class)}.
     *
     * @param c of type Class
     * @return Deserializer
     */
    public Deserializer getDeserializer(Class c) {
        if (c == Tuple.class)
            return getTupleDeserializer();
        else if (c == TuplePair.class)
            return getTuplePairDeserializer();
        else if (c == IndexTuple.class)
            return getIndexTupleDeserializer();

        throw new IllegalArgumentException("unknown class, cannot deserialize: " + c.getName());
    }

    /**
     * Method getSerializer implements {@link Serialization#getSerializer(Class)}.
     *
     * @param c of type Class
     * @return Serializer
     */
    public Serializer getSerializer(Class c) {
        if (c == Tuple.class)
            return getTupleSerializer();
        else if (c == TuplePair.class)
            return getTuplePairSerializer();
        else if (c == IndexTuple.class)
            return getIndexTupleSerializer();

        throw new IllegalArgumentException("unknown class, cannot serialize: " + c.getName());
    }

    public Class getClass(String className) {
        Class type = classCache.get(className);

        if (type != null)
            return type;

        try {
            if (className.charAt(0) == '[')
                type = Class.forName(className, true, Thread.currentThread().getContextClassLoader());
            else
                type = Thread.currentThread().getContextClassLoader().loadClass(className);
        } catch (ClassNotFoundException exception) {
            throw new TupleException("unable to load class named: " + className, exception);
        }

        classCache.put(className, type);

        return type;
    }
}