Java tutorial
/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tuple.hadoop;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

import cascading.CascadingException;
import cascading.tuple.Comparison;
import cascading.tuple.IndexTuple;
import cascading.tuple.Tuple;
import cascading.tuple.TupleException;
import cascading.tuple.TuplePair;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

/**
 * Class TupleSerialization is an implementation of Hadoop's {@link Serialization} interface.
 * <p/>
 * Typically developers will not use this implementation directly as it is automatically added
 * to any relevant MapReduce jobs via the {@link JobConf}.
 * <p/>
 * By default, all primitive types are natively handled, and {@link org.apache.hadoop.io.BytesWritable}
 * has a pre-configured serialization token since byte arrays are not handled natively by {@link Tuple}.
 */
@SerializationToken(tokens = {127}, classNames = {"org.apache.hadoop.io.BytesWritable"})
public class TupleSerialization extends Configured implements Serialization {
  /** Property holding the comma separated {@code token=className} pairs. */
  private static final String SERIALIZATION_TOKENS_PROPERTY = "cascading.serialization.tokens";

  /** Property holding the comma separated list of Hadoop serialization class names. */
  private static final String IO_SERIALIZATIONS_PROPERTY = "io.serializations";

  /** Field LOG */
  private static final Logger LOG = Logger.getLogger(TupleSerialization.class);

  /** Field classCache, classes previously resolved by {@link #getClass(String)}, keyed by name */
  private Map<String, Class> classCache = new HashMap<String, Class>();
  /** Field serializationFactory, lazily created by {@link #getSerializationFactory()} */
  private SerializationFactory serializationFactory;
  /** Field tokenClassesMap, remains null until {@link #initTokenMaps()} is called */
  private HashMap<Integer, String> tokenClassesMap;
  /** Field classesTokensMap, remains null until {@link #initTokenMaps()} is called */
  private HashMap<String, Integer> classesTokensMap;
  /** Field tokensSize, number of registered tokens; zero short-circuits the token lookups */
  private long tokensSize = 0;

  /**
   * Adds the given token and className pair as a serialization token property. During object
   * serialization and deserialization, the given token will be used instead of the className
   * when an instance of the className is encountered.
   *
   * @param properties of type Map
   * @param token      of type int
   * @param className  of type String
   */
  public static void addSerializationToken(Map<Object, Object> properties, int token, String className) {
    String tokens = getSerializationTokens(properties);

    properties.put(SERIALIZATION_TOKENS_PROPERTY, Util.join(",", Util.removeNulls(tokens, token + "=" + className)));
  }

  /**
   * Returns the serialization tokens property.
   *
   * @param properties of type Map
   * @return returns a String, or null if the property is not set
   */
  public static String getSerializationTokens(Map<Object, Object> properties) {
    return (String) properties.get(SERIALIZATION_TOKENS_PROPERTY);
  }

  /**
   * Returns the serialization tokens property of the given job configuration.
   *
   * @param jobConf of type JobConf
   * @return returns a String, as returned by {@code jobConf.get(..)} for the tokens property
   */
  static String getSerializationTokens(JobConf jobConf) {
    return jobConf.get(SERIALIZATION_TOKENS_PROPERTY);
  }

  /**
   * Adds the given className as a Hadoop IO serialization class.
   *
   * @param properties of type Map
   * @param className  of type String
   */
  public static void addSerialization(Map<Object, Object> properties, String className) {
    String serializations = (String) properties.get(IO_SERIALIZATIONS_PROPERTY);

    properties.put(IO_SERIALIZATIONS_PROPERTY, Util.join(",", Util.removeNulls(serializations, className)));
  }

  /**
   * Adds this class as a Hadoop Serialization class. This method is safe to call redundantly.
   *
   * @param jobConf of type JobConf
   */
  public static void setSerializations(JobConf jobConf) {
    String serializations = getSerializations(jobConf);
    String className = TupleSerialization.class.getName();

    // compare whole entries: a substring test would also match any class whose name
    // merely contains this class name, and this serialization would then never be added
    if (containsClassName(serializations, className)) {
      return;
    }

    jobConf.set(IO_SERIALIZATIONS_PROPERTY, Util.join(",", Util.removeNulls(serializations, className)));
  }

  /** Returns true if the comma separated classNames list has an entry equal to className. */
  private static boolean containsClassName(String classNames, String className) {
    for (String candidate : classNames.split(",")) {
      if (candidate.trim().equals(className)) {
        return true;
      }
    }

    return false;
  }

  /**
   * Returns the Hadoop IO serializations property of the given job configuration.
   *
   * @param jobConf of type JobConf
   * @return returns a String, the empty String is the default handed to {@code jobConf.get(..)}
   */
  static String getSerializations(JobConf jobConf) {
    return jobConf.get(IO_SERIALIZATIONS_PROPERTY, "");
  }

  /** Constructor TupleSerialization creates a new TupleSerialization instance. */
  public TupleSerialization() {
  }

  /**
   * Constructor TupleSerialization creates a new TupleSerialization instance.
   *
   * @param conf of type Configuration
   */
  public TupleSerialization(Configuration conf) {
    super(conf);
  }

  /**
   * Returns the current configuration, installing a new {@link JobConf} first if none was set.
   *
   * @return the Configuration of this instance
   */
  @Override
  public Configuration getConf() {
    if (super.getConf() == null) {
      setConf(new JobConf());
    }

    return super.getConf();
  }

  /** Returns the SerializationFactory for the current configuration, creating it on first use. */
  SerializationFactory getSerializationFactory() {
    if (serializationFactory == null) {
      serializationFactory = new SerializationFactory(getConf());
    }

    return serializationFactory;
  }

  /**
   * Must be called before {@link #getClassNameFor(int)} and {@link #getTokenFor(String)} methods.
   * <p/>
   * Tokens are collected from the serialization tokens property first, then from the
   * {@link SerializationToken} annotation of every class listed in the io.serializations
   * property. Calls made after the token maps have been created return immediately.
   *
   * @throws CascadingException    if a token entry or a token annotation is invalid
   * @throws IllegalStateException if a token or a class name is registered twice
   */
  void initTokenMaps() {
    if (tokenClassesMap != null) {
      return;
    }

    tokenClassesMap = new HashMap<Integer, String>();
    classesTokensMap = new HashMap<String, Integer>();

    // read through the Configuration API: the configuration handed to this instance is
    // only guaranteed to be a Configuration, so casting it to JobConf could fail
    Configuration conf = getConf();

    loadPropertyTokens(conf.get(SERIALIZATION_TOKENS_PROPERTY));
    loadAnnotationTokens(conf, conf.get(IO_SERIALIZATIONS_PROPERTY, ""));

    tokensSize = tokenClassesMap.size();
  }

  /** Registers every {@code token=className} pair found in the given property value, if any. */
  private void loadPropertyTokens(String tokenProperty) {
    if (tokenProperty == null) {
      return;
    }

    String compacted = tokenProperty.replaceAll("\\s", ""); // allow for whitespace in token set

    for (String pair : compacted.split(",")) {
      // an empty property or a stray comma yields an empty entry, there is nothing to register
      if (pair.length() == 0) {
        continue;
      }

      String[] elements = pair.split("=");

      if (elements.length < 2) {
        throw new CascadingException("invalid serialization token, expected token=className, was: " + pair);
      }

      int token;

      try {
        token = Integer.parseInt(elements[0]);
      } catch (NumberFormatException exception) {
        throw new CascadingException("invalid serialization token, token is not an integer: " + pair, exception);
      }

      addToken(null, token, elements[1]);
    }
  }

  /** Registers the tokens declared via {@link SerializationToken} on each listed serialization class. */
  private void loadAnnotationTokens(Configuration conf, String serializationsString) {
    if (serializationsString == null) {
      return;
    }

    for (String listedName : serializationsString.split(",")) {
      String serializationName = listedName.trim();

      // an unset property yields a single empty name, skip it instead of logging a failed lookup
      if (serializationName.length() == 0) {
        continue;
      }

      try {
        Class type = conf.getClassByName(serializationName);
        SerializationToken tokenAnnotation = (SerializationToken) type.getAnnotation(SerializationToken.class);

        if (tokenAnnotation == null) {
          continue;
        }

        int[] tokens = tokenAnnotation.tokens();
        String[] classNames = tokenAnnotation.classNames();

        if (tokens.length != classNames.length) {
          throw new CascadingException("serialization annotation tokens and classNames must be the same length");
        }

        for (int i = 0; i < tokens.length; i++) {
          addToken(type, tokens[i], classNames[i]);
        }
      } catch (ClassNotFoundException exception) {
        // a serialization that cannot be loaded is reported and skipped
        LOG.warn("unable to load serialization class: " + serializationName, exception);
      }
    }
  }

  /**
   * Registers the given token and className pair in both lookup maps.
   *
   * @param type      the serialization class declaring the pair, or null if it came from properties
   * @param token     of type int
   * @param className of type String
   */
  private void addToken(Class type, int token, String className) {
    // tokens below 128 may only be declared by serializations in a cascading. package
    if (type != null && !type.getName().startsWith("cascading.") && token < 128) {
      throw new CascadingException("serialization annotation tokens may not be less than 128, was: " + token);
    }

    if (tokenClassesMap.containsKey(token)) {
      if (type == null) {
        throw new IllegalStateException("duplicate serialization token: " + token + " for class: " + className + " found in properties");
      }

      throw new IllegalStateException("duplicate serialization token: " + token + " for class: " + className + " on serialization: " + type.getName());
    }

    if (classesTokensMap.containsKey(className)) {
      if (type == null) {
        throw new IllegalStateException("duplicate serialization classname: " + className + " for token: " + token + " found in properties ");
      }

      throw new IllegalStateException("duplicate serialization classname: " + className + " for token: " + token + " on serialization: " + type.getName());
    }

    tokenClassesMap.put(token, className);
    classesTokensMap.put(className, token);
  }

  /**
   * Returns the className for the given token.
   *
   * @param token of type int
   * @return a String, or null if no tokens are registered or the token is unknown
   */
  final String getClassNameFor(int token) {
    if (tokensSize == 0) {
      return null;
    }

    return tokenClassesMap.get(token);
  }

  /**
   * Returns the token for the given className.
   *
   * @param className of type String
   * @return an Integer, or null if no tokens are registered or the className is unknown
   */
  final Integer getTokenFor(String className) {
    if (tokensSize == 0) {
      return null;
    }

    return classesTokensMap.get(className);
  }

  /**
   * Returns the Comparator provided by the serialization registered for the given type.
   *
   * @param type of type Class
   * @return a Comparator, or null if that serialization does not implement {@link Comparison}
   */
  public Comparator getComparator(Class type) {
    Serialization serialization = getSerialization(type);

    if (serialization instanceof Comparison) {
      return ((Comparison) serialization).getComparator(type);
    }

    return null;
  }

  /** Returns the serialization registered for the class with the given name. */
  Serialization getSerialization(String className) {
    return getSerialization(getClass(className));
  }

  /** Returns the serialization the factory provides for the given type. */
  Serialization getSerialization(Class type) {
    return getSerializationFactory().getSerialization(type);
  }

  /**
   * Returns a new Serializer for the given type.
   *
   * @param type of type Class
   * @return a Serializer
   * @throws CascadingException if the lookup fails with a NullPointerException
   */
  Serializer getNewSerializer(Class type) {
    try {
      return getSerializationFactory().getSerializer(type);
    } catch (NullPointerException exception) {
      // keep the original exception as the cause so the failing frame is not lost
      throw new CascadingException("unable to load serializer for: " + type.getName() + " from: " + getSerializationFactory().getClass().getName(), exception);
    }
  }

  /**
   * Returns a new Deserializer for the class with the given name.
   *
   * @param className of type String
   * @return a Deserializer
   * @throws CascadingException if the lookup fails with a NullPointerException
   */
  Deserializer getNewDeserializer(String className) {
    try {
      return getSerializationFactory().getDeserializer(getClass(className));
    } catch (NullPointerException exception) {
      // keep the original exception as the cause so the failing frame is not lost
      throw new CascadingException("unable to load deserializer for: " + className + " from: " + getSerializationFactory().getClass().getName(), exception);
    }
  }

  /** Returns a new TuplePairDeserializer backed by a new element reader. */
  TuplePairDeserializer getTuplePairDeserializer() {
    return new TuplePairDeserializer(getElementReader());
  }

  /**
   * Method getElementReader returns the elementReader of this TupleSerialization object.
   *
   * @return the elementReader (type SerializationElementReader) of this TupleSerialization object.
   */
  public SerializationElementReader getElementReader() {
    return new SerializationElementReader(this);
  }

  /** Returns a new TupleDeserializer backed by a new element reader. */
  TupleDeserializer getTupleDeserializer() {
    return new TupleDeserializer(getElementReader());
  }

  /** Returns a new TuplePairSerializer backed by a new element writer. */
  private TuplePairSerializer getTuplePairSerializer() {
    return new TuplePairSerializer(getElementWriter());
  }

  /** Returns a new IndexTupleDeserializer backed by a new element reader. */
  IndexTupleDeserializer getIndexTupleDeserializer() {
    return new IndexTupleDeserializer(getElementReader());
  }

  /**
   * Method getElementWriter returns the elementWriter of this TupleSerialization object.
   *
   * @return the elementWriter (type SerializationElementWriter) of this TupleSerialization object.
   */
  public SerializationElementWriter getElementWriter() {
    return new SerializationElementWriter(this);
  }

  /** Returns a new TupleSerializer backed by a new element writer. */
  private TupleSerializer getTupleSerializer() {
    return new TupleSerializer(getElementWriter());
  }

  /** Returns a new IndexTupleSerializer backed by a new element writer. */
  private IndexTupleSerializer getIndexTupleSerializer() {
    return new IndexTupleSerializer(getElementWriter());
  }

  /**
   * Method accept implements {@link Serialization#accept(Class)}.
   *
   * @param c of type Class
   * @return boolean, true only for exactly Tuple, TuplePair or IndexTuple
   */
  public boolean accept(Class c) {
    return Tuple.class == c || TuplePair.class == c || IndexTuple.class == c;
  }

  /**
   * Method getDeserializer implements {@link Serialization#getDeserializer(Class)}.
   *
   * @param c of type Class
   * @return Deserializer
   * @throws IllegalArgumentException if c is not Tuple, TuplePair or IndexTuple
   */
  public Deserializer getDeserializer(Class c) {
    if (c == Tuple.class) {
      return getTupleDeserializer();
    } else if (c == TuplePair.class) {
      return getTuplePairDeserializer();
    } else if (c == IndexTuple.class) {
      return getIndexTupleDeserializer();
    }

    throw new IllegalArgumentException("unknown class, cannot deserialize: " + c.getName());
  }

  /**
   * Method getSerializer implements {@link Serialization#getSerializer(Class)}.
   *
   * @param c of type Class
   * @return Serializer
   * @throws IllegalArgumentException if c is not Tuple, TuplePair or IndexTuple
   */
  public Serializer getSerializer(Class c) {
    if (c == Tuple.class) {
      return getTupleSerializer();
    } else if (c == TuplePair.class) {
      return getTuplePairSerializer();
    } else if (c == IndexTuple.class) {
      return getIndexTupleSerializer();
    }

    throw new IllegalArgumentException("unknown class, cannot serialize: " + c.getName());
  }

  /**
   * Returns the class with the given name, loaded through the current thread's context class
   * loader. Successfully resolved classes are cached by name.
   *
   * @param className of type String
   * @return a Class
   * @throws TupleException if the class cannot be found
   */
  public Class getClass(String className) {
    Class type = classCache.get(className);

    if (type != null) {
      return type;
    }

    try {
      // names starting with '[' denote array types and are resolved via Class.forName
      if (className.charAt(0) == '[') {
        type = Class.forName(className, true, Thread.currentThread().getContextClassLoader());
      } else {
        type = Thread.currentThread().getContextClassLoader().loadClass(className);
      }
    } catch (ClassNotFoundException exception) {
      throw new TupleException("unable to load class named: " + className, exception);
    }

    classCache.put(className, type);

    return type;
  }
}