Java tutorial
/* * Copyright 2012 Hiromasa Horiguchi ( The University of Tokyo ) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package jp.ac.u.tokyo.m.pig.udf.eval.sequence; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import jp.ac.u.tokyo.m.log.LogUtil; import jp.ac.u.tokyo.m.pig.udf.AliasConstants; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pig.EvalFunc; import org.apache.pig.FuncSpec; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.DefaultBagFactory; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; public class PickupSequenceValues extends EvalFunc<DataBag> { // ----------------------------------------------------------------------------------------------------------------- private static Log mLog = LogFactory.getLog(PickupSequenceValues.class); // ----------------------------------------------------------------------------------------------------------------- private OutputMode mOutputMode; private enum OutputMode { ALL_VALUE_PACKET_BAG, ALL_VALUE_FLAT, FIRST_VALUE_FLAT, } // ----------------------------------------------------------------------------------------------------------------- public PickupSequenceValues() { this(null); } public PickupSequenceValues(String aOutputModeString) { selectOutputMode(aOutputModeString, OutputMode.ALL_VALUE_PACKET_BAG); } // ----------------------------------------------------------------------------------------------------------------- private void selectOutputMode(String aOutputModeString, OutputMode aDefaultMode) { if (aOutputModeString == null) { mOutputMode = aDefaultMode; return; } try { mOutputMode = OutputMode.valueOf(aOutputModeString.toUpperCase()); } catch (RuntimeException e) { LogUtil.errorIllegalModeName(mLog, OutputMode.values(), aOutputModeString, e); } } // ----------------------------------------------------------------------------------------------------------------- @Override public DataBag exec(Tuple aInput) throws IOException { // invalid value | if (aInput == null) return DefaultBagFactory.getInstance().newDefaultBag(); // processing target | ? DataBag tTargetBag = DataType.toBag(aInput.get(0)); DataBag tValueBag = DataType.toBag(aInput.get(1)); Long tStartValue = DataType.toLong(aInput.get(2)); return DefaultBagFactory.getInstance().newDefaultBag(composeProtoBag(tTargetBag, tValueBag, tStartValue)); } private ArrayList<Tuple> composeProtoBag(DataBag aTargetBag, DataBag aValueBag, Long aStartValue) throws ExecException { if (aStartValue == null) return new ArrayList<Tuple>(); HashMap<Long, ArrayList<Tuple>> tTargetBagMap = createValueBaseBagMap(aTargetBag, aValueBag); switch (mOutputMode) { case ALL_VALUE_PACKET_BAG: default: // sequence_values : Bag{ sequence_member_bag : Bag { Tuple( <InputTuple> ) } } return composeProtoBagModeAllValuePacketBag(tTargetBagMap, aStartValue); case ALL_VALUE_FLAT: // sequence_values : Bag{ Tuple( <InputTuple> ) } return composeProtoBagModeAllValueFlat(tTargetBagMap, aStartValue); case FIRST_VALUE_FLAT: // sequence_values : Bag{ Tuple( <InputTuple> ) } return composeProtoBagModeFirstValueFlat(tTargetBagMap, aStartValue); } } private HashMap<Long, ArrayList<Tuple>> createValueBaseBagMap(DataBag aTargetBag, DataBag aValueBag) throws ExecException { Iterator<Tuple> tTargetBagIterator = aTargetBag.iterator(); Iterator<Tuple> tDateBagIterator = aValueBag.iterator(); HashMap<Long, ArrayList<Tuple>> tTargetBagMap = new HashMap<Long, ArrayList<Tuple>>(); while (tDateBagIterator.hasNext()) { Long tKey = DataType.toLong(tDateBagIterator.next().get(0)); Tuple tValue = tTargetBagIterator.next(); if (tKey == null) continue; ArrayList<Tuple> tList = tTargetBagMap.get(tKey); if (tList == null) { tList = new ArrayList<Tuple>(); tList.add(tValue); tTargetBagMap.put(tKey, tList); } else { tList.add(tValue); } } return tTargetBagMap; } private ArrayList<Tuple> composeProtoBagModeAllValuePacketBag(HashMap<Long, ArrayList<Tuple>> aTargetBagMap, Long aStartValue) throws ExecException { ArrayList<Tuple> tProtoBag = new ArrayList<Tuple>(); TupleFactory tTupleFactory = TupleFactory.getInstance(); while (true) { ArrayList<Tuple> tCurrentProtoBag = aTargetBagMap.get(aStartValue++); if (tCurrentProtoBag == null) break; Tuple tCurrentTuple = tTupleFactory .newTuple(DefaultBagFactory.getInstance().newDefaultBag(tCurrentProtoBag)); tProtoBag.add(tCurrentTuple); } return tProtoBag; } private ArrayList<Tuple> composeProtoBagModeAllValueFlat(HashMap<Long, ArrayList<Tuple>> aTargetBagMap, Long aStartValue) throws ExecException { ArrayList<Tuple> tProtoBag = new ArrayList<Tuple>(); while (true) { ArrayList<Tuple> tCurrentProtoBag = aTargetBagMap.get(aStartValue++); if (tCurrentProtoBag == null) break; tProtoBag.addAll(tCurrentProtoBag); } return tProtoBag; } private ArrayList<Tuple> composeProtoBagModeFirstValueFlat(HashMap<Long, ArrayList<Tuple>> aTargetBagMap, Long aStartValue) throws ExecException { ArrayList<Tuple> tProtoBag = new ArrayList<Tuple>(); while (true) { ArrayList<Tuple> tCurrentProtoBag = aTargetBagMap.get(aStartValue++); if (tCurrentProtoBag == null) break; tProtoBag.add(tCurrentProtoBag.get(0)); } return tProtoBag; } // ----------------------------------------------------------------------------------------------------------------- @Override public List<FuncSpec> getArgToFuncMapping() throws FrontendException { List<FuncSpec> tFuncList = new ArrayList<FuncSpec>(); addFuncSpec(tFuncList, DataType.LONG); addFuncSpec(tFuncList, DataType.INTEGER); return tFuncList; } private void addFuncSpec(List<FuncSpec> aFuncList, byte aArg3Type) { Schema tSchema = new Schema(); tSchema = new Schema(); tSchema.add(new Schema.FieldSchema(null, DataType.BAG)); tSchema.add(new Schema.FieldSchema(null, DataType.BAG)); tSchema.add(new Schema.FieldSchema(null, aArg3Type)); aFuncList.add(new FuncSpec(this.getClass().getName(), tSchema)); } // ----------------------------------------------------------------------------------------------------------------- @Override public Schema outputSchema(Schema aInput) { List<FieldSchema> tInputFields = aInput.getFields(); FieldSchema tInputTarget = tInputFields.get(0); Schema tBagSchema = new Schema(); try { Schema tInputTargetSchema = tInputTarget.schema.getFields().get(0).schema; switch (mOutputMode) { case ALL_VALUE_PACKET_BAG: default: // sequence_values : Bag{ sequence_member_bag : Bag { Tuple( <InputTuple> ) } } Schema tInnerBagSchema = new Schema(); tInnerBagSchema.add(new FieldSchema(AliasConstants.SEQUENCE_VALUES_OUT_ALIAS_INNER, tInputTargetSchema, DataType.BAG)); tBagSchema.add(new FieldSchema(AliasConstants.SEQUENCE_VALUES_OUT_ALIAS_TOP, tInnerBagSchema, DataType.BAG)); break; case ALL_VALUE_FLAT: case FIRST_VALUE_FLAT: // sequence_values : Bag{ Tuple( <InputTuple> ) } tBagSchema.add(new FieldSchema(AliasConstants.SEQUENCE_VALUES_OUT_ALIAS_TOP, tInputTargetSchema, DataType.BAG)); break; } } catch (FrontendException e) { throw new RuntimeException(e); } return tBagSchema; } // ----------------------------------------------------------------------------------------------------------------- }