/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PigLogger;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PigProgressable;
import org.apache.pig.builtin.OutputSchema;
import org.apache.pig.classification.InterfaceAudience;
import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;

import java.io.IOException;
import java.lang.reflect.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class is used to implement functions to be applied to
 * fields in a dataset. The function is applied to each Tuple in the set.
 * The programmer should not make assumptions about state maintained
 * between invocations of the exec() method, since the Pig runtime
 * will schedule and localize invocations based on information provided
 * at runtime. The programmer also should not make assumptions about when or
 * how many times the class will be instantiated, since it may be instantiated
 * multiple times in both the front end and back end.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class EvalFunc<T> {

    /**
     * Reporter to send heartbeats to Hadoop. If exec will take more than
     * a few seconds, {@link PigProgressable#progress} should be called
     * occasionally to avoid timeouts. The default Hadoop timeout is 600 seconds.
     */
    protected PigProgressable reporter;

    /**
     * Logging object. Log calls made on the front end will be sent to
     * Pig's log on the client. Log calls made on the back end will be
     * sent to stdout and can be seen in the Hadoop logs.
     */
    protected Log log = LogFactory.getLog(getClass());

    /**
     * Logger for aggregating warnings. Any warnings to be sent to the user
     * should be logged to this via {@link PigLogger#warn}.
     */
    protected PigLogger pigLogger;

    private static int nextSchemaId; // for assigning unique ids to UDF columns

    protected String getSchemaName(String name, Schema input) {
        String alias = name + "_";
        if (input != null && input.getAliases().size() > 0) {
            alias += input.getAliases().iterator().next() + "_";
        }
        alias += ++nextSchemaId;
        return alias;
    }

    /**
     * Return type of this instance of EvalFunc.
     */
    protected Type returnType;
    /**
     * EvalFunc's schema type.
     * @see EvalFunc#getSchemaType()
     */
    public static enum SchemaType {
        NORMAL, // default field type
        VARARG  // if the last field of the (udf) schema is of type vararg
    };

    public EvalFunc() {
        // Resolve the concrete type for T of EvalFunc<T>:
        // 1. Build a map from type parameter to type for the class hierarchy
        //    from the current class up to EvalFunc.
        Map<TypeVariable<?>, Type> typesByTypeVariable = new HashMap<TypeVariable<?>, Type>();
        Class<?> cls = getClass();
        Type type = cls.getGenericSuperclass();
        cls = cls.getSuperclass();
        while (EvalFunc.class.isAssignableFrom(cls)) {
            TypeVariable<? extends Class<?>>[] typeParams = cls.getTypeParameters();
            if (type instanceof ParameterizedType) {
                ParameterizedType pType = (ParameterizedType) type;
                Type[] typeArgs = pType.getActualTypeArguments();
                for (int i = 0; i < typeParams.length; i++) {
                    typesByTypeVariable.put(typeParams[i], typeArgs[i]);
                }
            }
            type = cls.getGenericSuperclass();
            cls = cls.getSuperclass();
        }

        // 2. Use the map to determine the concrete type for T of EvalFunc<T>.
        Type targetType = EvalFunc.class.getTypeParameters()[0];
        while (targetType != null && targetType instanceof TypeVariable) {
            targetType = typesByTypeVariable.get(targetType);
        }
        if (targetType == null
                || targetType instanceof GenericArrayType
                || targetType instanceof WildcardType) {
            throw new RuntimeException(String.format(
                    "Failed to determine concrete type for type parameter T of EvalFunc<T> for derived class '%s'",
                    getClass().getName()));
        }
        returnType = targetType;

        // Type-check the initial, intermediate, and final functions.
        if (this instanceof Algebraic) {
            Algebraic a = (Algebraic) this;
            String errMsg = "function of " + getClass().getName() + " is not of the expected type.";
            if (getReturnTypeFromSpec(new FuncSpec(a.getInitial())) != Tuple.class)
                throw new RuntimeException("Initial " + errMsg);
            if (getReturnTypeFromSpec(new FuncSpec(a.getIntermed())) != Tuple.class)
                throw new RuntimeException("Intermediate " + errMsg);
            if (!getReturnTypeFromSpec(new FuncSpec(a.getFinal())).equals(returnType))
                throw new RuntimeException("Final " + errMsg);
        }
    }

    private Type getReturnTypeFromSpec(FuncSpec funcSpec) {
        try {
            return ((EvalFunc<?>) PigContext.instantiateFuncFromSpec(funcSpec)).getReturnType();
        } catch (ClassCastException e) {
            throw new RuntimeException(funcSpec + " does not specify an eval func", e);
        }
    }

    /**
     * Get the Type that this EvalFunc returns.
     * @return Type
     */
    public Type getReturnType() {
        return returnType;
    }

    // Report that progress is being made (otherwise Hadoop times out after
    // 600 seconds working on one outer tuple).
    /**
     * Utility method to allow a UDF to report progress. If exec will take more
     * than a few seconds, {@link PigProgressable#progress} should be called
     * occasionally to avoid timeouts. The default Hadoop timeout is 600 seconds.
     */
    public final void progress() {
        if (reporter != null) {
            reporter.progress();
        } else {
            warn("No reporter object provided to UDF.", PigWarning.PROGRESS_REPORTER_NOT_PROVIDED);
        }
    }

    /**
     * Issue a warning. Warning messages are aggregated and reported to
     * the user.
     * @param msg String message of the warning
     * @param warningEnum type of warning
     */
    public final void warn(String msg, Enum warningEnum) {
        if (pigLogger != null) {
            pigLogger.warn(this, msg, warningEnum);
        } else {
            log.warn("No logger object provided to UDF: " + this.getClass().getName() + ". " + msg);
        }
    }
    /**
     * Placeholder for cleanup to be performed at the end. User defined
     * functions can override. The default implementation is a no-op.
     */
    public void finish() {
    }

    /**
     * This callback method must be implemented by all subclasses. This
     * is the method that will be invoked on every Tuple of a given dataset.
     * Since the dataset may be divided up in a variety of ways, the programmer
     * should not make assumptions about state that is maintained between
     * invocations of this method.
     *
     * @param input the Tuple to be processed.
     * @return result, of type T.
     * @throws IOException
     */
    public abstract T exec(Tuple input) throws IOException;

    /**
     * Report the schema of the output of this UDF. Pig will make use of
     * this in error checking, optimization, and planning. The schema
     * of input data to this UDF is provided.
     * <p>
     * The default implementation interprets the {@link OutputSchema} annotation,
     * if one is present. Otherwise, it returns <code>null</code> (no known output schema).
     *
     * @param input Schema of the input
     * @return Schema of the output
     */
    public Schema outputSchema(Schema input) {
        OutputSchema schema = this.getClass().getAnnotation(OutputSchema.class);
        try {
            return (schema == null) ? null : Utils.getSchemaFromString(schema.value());
        } catch (ParserException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * This function should be overridden to return true for functions that
     * return their values asynchronously. Currently Pig never attempts to
     * execute a function asynchronously.
     * @return true if the function can be executed asynchronously.
     */
    @Deprecated
    public boolean isAsynchronous() {
        return false;
    }

    public PigProgressable getReporter() {
        return reporter;
    }

    /**
     * Set the reporter. Called by Pig to provide a reference of
     * the reporter to the UDF.
     * @param reporter Hadoop reporter
     */
    public final void setReporter(PigProgressable reporter) {
        this.reporter = reporter;
    }

    /**
     * Allow a UDF to specify type-specific implementations of itself. For example,
     * an implementation of arithmetic sum might have int and float implementations,
     * since integer arithmetic performs much better than floating point arithmetic. Pig's
     * typechecker will call this method and, using the returned list plus the schema
     * of the function's input data, decide which implementation of the UDF to use.
     * @return A List containing FuncSpec objects representing the EvalFunc class
     * which can handle the inputs corresponding to the schema in the objects. Each
     * FuncSpec should be constructed with a schema that describes the input for that
     * implementation. For example, the sum function above would return two elements in its
     * list:
     * <ol>
     * <li>FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.DOUBLE)))
     * <li>FuncSpec(IntSum.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.INTEGER)))
     * </ol>
     * This would indicate that the main implementation is used for doubles, and the special
     * implementation IntSum is used for ints.
     */
    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
        return null;
    }

    /**
     * Allow a UDF to specify a list of HDFS files it would like placed in the distributed
     * cache. These files will be put in the cache for every job the UDF is used in.
     * The default implementation returns null.
     * @return A list of files
     */
    public List<String> getCacheFiles() {
        return null;
    }

    /**
     * Allow a UDF to specify a list of local files it would like placed in the distributed
     * cache. These files will be put in the cache for every job the UDF is used in. See
     * {@link FuncUtils} for a utility function to facilitate this.
     * The default implementation returns null.
     * @return A list of files
     */
    public List<String> getShipFiles() {
        return null;
    }

    public PigLogger getPigLogger() {
        return pigLogger;
    }

    /**
     * Set the PigLogger object. Called by Pig to provide a reference
     * to the UDF.
     * @param pigLogger PigLogger object.
     */
    public final void setPigLogger(PigLogger pigLogger) {
        this.pigLogger = pigLogger;
    }

    public Log getLogger() {
        return log;
    }

    private Schema inputSchemaInternal = null;

    /**
     * This method will be called by Pig both in the front end and back end to
     * pass a unique signature to the {@link EvalFunc}. The signature can be used
     * to store into the {@link UDFContext} any information which the
     * {@link EvalFunc} needs to store between various method invocations in the
     * front end and back end.
     * @param signature a unique signature to identify this EvalFunc
     */
    public void setUDFContextSignature(String signature) {
    }

    /**
     * This method is for internal use. It is called by Pig core in both the
     * front end and back end to set up the right input schema for the EvalFunc.
     */
    public void setInputSchema(Schema input) {
        this.inputSchemaInternal = input;
    }

    /**
     * This method is intended to be called by the user in an {@link EvalFunc} to
     * get the input schema of the EvalFunc.
     */
    public Schema getInputSchema() {
        return this.inputSchemaInternal;
    }

    /**
     * Returns the {@link SchemaType} of the EvalFunc. User defined functions can override
     * this method to return {@link SchemaType#VARARG}. In this case the last FieldSchema
     * added to the Schema in {@link #getArgToFuncMapping()} will be considered a vararg field.
     *
     * @return the schema type of the UDF
     */
    public SchemaType getSchemaType() {
        return SchemaType.NORMAL;
    }

    /**
     * Whether the UDF should be evaluated at compile time if all inputs are constant.
     * This is applicable for most UDFs; however, if a UDF will access an HDFS file that
     * is not available at compile time, this must return false.
     * @return whether or not compile-time calculation is allowed; defaults to false
     * to ensure legacy UDFs get the right behavior
     */
    public boolean allowCompileTimeCalculation() {
        return false;
    }

    public boolean needEndOfAllInputProcessing() {
        return false;
    }

    public void setEndOfAllInput(boolean endOfAllInput) {
    }
}
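
// ---------------------------------------------------------------------------
// The classes below are illustrative sketches, not part of the original
// EvalFunc source. Class names, field assumptions, and workloads are
// hypothetical; they only demonstrate the contracts documented above.
// ---------------------------------------------------------------------------

// A minimal concrete EvalFunc that upper-cases its first field, showing the
// exec()/outputSchema() contract from the class javadoc. The assumption that
// field 0 holds a chararray is for illustration only.
class ExampleUpper extends EvalFunc<String> {
    @Override
    public String exec(Tuple input) throws IOException {
        // exec() runs once per Tuple; keep it stateless, since the class
        // javadoc warns that no state is guaranteed between invocations.
        if (input == null || input.size() == 0 || input.get(0) == null) {
            return null;
        }
        try {
            return ((String) input.get(0)).toUpperCase();
        } catch (ClassCastException e) {
            // Aggregated warning instead of a job-killing exception.
            warn("Input is not a chararray; returning null.", PigWarning.UDF_WARNING_1);
            return null;
        }
    }

    @Override
    public Schema outputSchema(Schema input) {
        // Declare a single chararray output column, using the inherited
        // helper to build a unique alias.
        return new Schema(new Schema.FieldSchema(
                getSchemaName("example_upper", input),
                org.apache.pig.data.DataType.CHARARRAY));
    }
}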
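
// A sketch of the @OutputSchema annotation that the default outputSchema()
// implementation above interprets; the class name and schema string are
// assumptions for illustration.
@OutputSchema("length:int")
class ExampleLength extends EvalFunc<Integer> {
    @Override
    public Integer exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0 || input.get(0) == null) {
            return null;
        }
        // The annotated schema "length:int" matches the Integer return type.
        return ((String) input.get(0)).length();
    }
}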
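
// A sketch of getArgToFuncMapping(), mirroring the sum example in the javadoc
// above: the double implementation registers itself for DOUBLE input and a
// companion class for INTEGER input. Both classes are hypothetical.
class ExampleSum extends EvalFunc<Double> {
    @Override
    public Double exec(Tuple input) throws IOException {
        double sum = 0.0;
        for (Object field : input.getAll()) {
            if (field != null) {
                sum += ((Number) field).doubleValue();
            }
        }
        return sum;
    }

    @Override
    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
        List<FuncSpec> funcList = new java.util.ArrayList<FuncSpec>();
        // The main implementation handles doubles...
        funcList.add(new FuncSpec(this.getClass().getName(),
                new Schema(new Schema.FieldSchema(null, org.apache.pig.data.DataType.DOUBLE))));
        // ...and the int-specific companion handles ints.
        funcList.add(new FuncSpec(ExampleIntSum.class.getName(),
                new Schema(new Schema.FieldSchema(null, org.apache.pig.data.DataType.INTEGER))));
        return funcList;
    }
}

class ExampleIntSum extends EvalFunc<Integer> {
    @Override
    public Integer exec(Tuple input) throws IOException {
        int sum = 0;
        for (Object field : input.getAll()) {
            if (field != null) {
                sum += ((Number) field).intValue();
            }
        }
        return sum;
    }
}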
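
// A sketch of the heartbeat pattern the reporter/progress() javadoc
// recommends: a long-running exec() calls progress() periodically so Hadoop
// does not kill the task after the 600-second timeout. The loop is a
// hypothetical stand-in for expensive per-tuple work.
class ExampleLongRunning extends EvalFunc<Long> {
    @Override
    public Long exec(Tuple input) throws IOException {
        long n = ((Number) input.get(0)).longValue();
        long acc = 0;
        for (long i = 0; i < n; i++) {
            acc += i;
            if (i % 1000000L == 0) {
                progress(); // heartbeat so the task is not timed out
            }
        }
        return acc;
    }
}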