Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.backend.hadoop.executionengine.mapreduceExec; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.Datum; import org.apache.pig.data.IndexedTuple; import org.apache.pig.data.Tuple; import org.apache.pig.impl.PigContext; import org.apache.pig.impl.eval.EvalSpec; import org.apache.pig.impl.eval.collector.DataCollector; import org.apache.pig.impl.io.FileSpec; import org.apache.pig.impl.util.ObjectSerializer; public class PigCombine implements Reducer<Tuple, IndexedTuple, Tuple, IndexedTuple> { private final Log log = LogFactory.getLog(getClass()); private JobConf job; private CombineDataOutputCollector finalout; private DataCollector evalPipe; private int index; private int inputCount; private DataBag bags[]; private PigContext pigContext; private EvalSpec esp; public void reduce(Tuple key, Iterator<IndexedTuple> values, OutputCollector<Tuple, IndexedTuple> output, Reporter reporter) throws IOException { try { if (evalPipe == null) { finalout = new CombineDataOutputCollector(output); evalPipe = esp.setupPipe(null, finalout); //throw new RuntimeException("combine spec: " + evalSpec + " combine pipe: " + esp.toString()); bags = new DataBag[inputCount]; for (int i = 0; i < inputCount; i++) { bags[i] = BagFactory.getInstance().newDefaultBag(); } } if (PigInputFormat.getActiveSplit() == null) { } else { index = PigInputFormat.getActiveSplit().getIndex(); } Datum groupName = key.getField(0); finalout.group = key; finalout.index = index; Tuple t = new Tuple(1 + inputCount); t.setField(0, groupName); for (int i = 1; i < 1 + inputCount; i++) { bags[i - 1].clear(); t.setField(i, bags[i - 1]); } while (values.hasNext()) { IndexedTuple it = values.next(); t.getBagField(it.index + 1).add(it.toTuple()); } for (int i = 0; i < inputCount; i++) { // XXX: shouldn't we only do this if INNER flag is set? if (t.getBagField(1 + i).size() == 0) return; } // throw new RuntimeException("combine input: " + t.toString()); evalPipe.add(t); // evalPipe.add(null); // EOF marker } catch (Throwable tr) { log.error(tr); RuntimeException exp = new RuntimeException(tr.getMessage()); exp.setStackTrace(tr.getStackTrace()); throw exp; } } /** * Just save off the PigJobConf for later use. */ public void configure(JobConf job) { this.job = job; try { this.pigContext = (PigContext) ObjectSerializer.deserialize(job.get("pig.pigContext")); this.inputCount = ((ArrayList<FileSpec>) ObjectSerializer.deserialize(job.get("pig.inputs"))).size(); String evalSpec = job.get("pig.combineFunc", ""); this.esp = (EvalSpec) ObjectSerializer.deserialize(evalSpec); if (esp != null) { esp.instantiateFunc(pigContext); } } catch (IOException e) { throw new RuntimeException("unable to deserialize data", e); } } /** * Nothing happens here. */ public void close() throws IOException { } private static class CombineDataOutputCollector extends DataCollector { OutputCollector<Tuple, IndexedTuple> oc = null; Tuple group = null; int index = -1; public CombineDataOutputCollector(OutputCollector oc) { super(null); this.oc = oc; } @Override public void add(Datum d) { if (d == null) return; // EOF marker from eval pipeline; ignore try { // oc.collect(group, new IndexedTuple(((Tuple)d).getTupleField(0),index)); oc.collect(group, new IndexedTuple(((Tuple) d), index)); } catch (IOException e) { throw new RuntimeException(e); } } } }