Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.pen; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PORead; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.Tuple; import org.apache.pig.impl.PigContext; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.LOCogroup; import org.apache.pig.impl.logicalLayer.LOCross; import org.apache.pig.impl.logicalLayer.LODistinct; import org.apache.pig.impl.logicalLayer.LOFilter; import org.apache.pig.impl.logicalLayer.LOForEach; import org.apache.pig.impl.logicalLayer.LOLimit; import org.apache.pig.impl.logicalLayer.LOLoad; import org.apache.pig.impl.logicalLayer.LOSort; import org.apache.pig.impl.logicalLayer.LOSplit; import org.apache.pig.impl.logicalLayer.LOStore; import org.apache.pig.impl.logicalLayer.LOUnion; import org.apache.pig.impl.logicalLayer.LOVisitor; import org.apache.pig.impl.logicalLayer.LogicalOperator; import org.apache.pig.impl.logicalLayer.LogicalPlan; import org.apache.pig.impl.logicalLayer.PlanSetter; import org.apache.pig.impl.logicalLayer.validators.LogicalPlanValidationExecutor; import org.apache.pig.impl.plan.CompilationMessageCollector; import org.apache.pig.impl.plan.DependencyOrderWalker; import org.apache.pig.impl.plan.OperatorKey; import org.apache.pig.impl.plan.PlanException; import org.apache.pig.impl.plan.VisitorException; import org.apache.pig.impl.util.IdentityHashSet; import org.apache.pig.pen.util.DependencyOrderLimitedWalker; import org.apache.pig.pen.util.LineageTracer; //This class is used to pass data through the entire plan and save the intermediates results. public class DerivedDataVisitor extends LOVisitor { Map<LogicalOperator, DataBag> derivedData = new HashMap<LogicalOperator, DataBag>(); PhysicalPlan physPlan = null; Map<LOLoad, DataBag> baseData = null; Map<LogicalOperator, PhysicalOperator> LogToPhyMap = null; Log log = LogFactory.getLog(getClass()); Map<LogicalOperator, Collection<IdentityHashSet<Tuple>>> OpToEqClasses = null; Collection<IdentityHashSet<Tuple>> EqClasses = null; LineageTracer lineage = new LineageTracer(); public DerivedDataVisitor(LogicalPlan plan, PigContext pc, Map<LOLoad, DataBag> baseData, Map<LogicalOperator, PhysicalOperator> logToPhyMap, PhysicalPlan physPlan) { super(plan, new DependencyOrderWalker<LogicalOperator, LogicalPlan>(plan)); this.baseData = baseData; OpToEqClasses = new HashMap<LogicalOperator, Collection<IdentityHashSet<Tuple>>>(); EqClasses = new LinkedList<IdentityHashSet<Tuple>>(); LogToPhyMap = logToPhyMap; this.physPlan = physPlan; // if(logToPhyMap == null) // compilePlan(plan); // else // LogToPhyMap = logToPhyMap; } public DerivedDataVisitor(LogicalOperator op, PigContext pc, Map<LOLoad, DataBag> baseData, Map<LogicalOperator, PhysicalOperator> logToPhyMap, PhysicalPlan physPlan) { super(op.getPlan(), new DependencyOrderLimitedWalker<LogicalOperator, LogicalPlan>(op, op.getPlan())); this.baseData = baseData; OpToEqClasses = new HashMap<LogicalOperator, Collection<IdentityHashSet<Tuple>>>(); EqClasses = new LinkedList<IdentityHashSet<Tuple>>(); LogToPhyMap = logToPhyMap; this.physPlan = physPlan; // if(logToPhyMap == null) // compilePlan(op.getPlan()); // else // LogToPhyMap = logToPhyMap; } public void setOperatorToEvaluate(LogicalOperator op) { mCurrentWalker = new DependencyOrderLimitedWalker<LogicalOperator, LogicalPlan>(op, op.getPlan()); } @Override protected void visit(LOCogroup cg) throws VisitorException { // evaluateOperator(cg); // there is a slightly different code path for cogroup because of the // local rearranges PhysicalOperator physOp = LogToPhyMap.get(cg); Random r = new Random(); // get the list of original inputs // List<PhysicalOperator> inputs = physOp.getInputs(); List<PhysicalOperator> inputs = new ArrayList<PhysicalOperator>(); PhysicalPlan phy = new PhysicalPlan(); phy.add(physOp); // for(PhysicalOperator input : physOp.getInputs()) { for (PhysicalOperator input : physPlan.getPredecessors(physOp)) { inputs.add(input.getInputs().get(0)); // input.setInputs(null); phy.add(input); try { phy.connect(input, physOp); } catch (PlanException e) { // TODO Auto-generated catch block e.printStackTrace(); log.error("Error connecting " + input.name() + " to " + physOp.name()); } } physOp.setLineageTracer(lineage); // replace the original inputs by POReads for (int i = 0; i < inputs.size(); i++) { DataBag bag = derivedData.get(cg.getInputs().get(i)); PORead por = new PORead(new OperatorKey("", r.nextLong()), bag); phy.add(por); try { phy.connect(por, physOp.getInputs().get(i)); } catch (PlanException e) { // TODO Auto-generated catch block e.printStackTrace(); log.error("Error connecting " + por.name() + " to " + physOp.name()); } } DataBag output = BagFactory.getInstance().newDefaultBag(); Tuple t = null; try { for (Result res = physOp.getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = physOp.getNext(t)) { output.add((Tuple) res.result); } } catch (ExecException e) { log.error("Error evaluating operator : " + physOp.name()); } derivedData.put(cg, output); try { Collection<IdentityHashSet<Tuple>> eq = EquivalenceClasses.getEquivalenceClasses(cg, derivedData); EqClasses.addAll(eq); OpToEqClasses.put(cg, eq); } catch (ExecException e) { // TODO Auto-generated catch block e.printStackTrace(); log.error("Error updating equivalence classes while evaluating operators. \n" + e.getMessage()); } // re-attach the original operators // for(int i = 0; i < inputs.size(); i++) { // try { // physPlan.connect(inputs.get(i), physOp.getInputs().get(i)); // // } catch (PlanException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // log.error("Error connecting " + inputs.get(i).name() + " to " + // physOp.getInputs().get(i).name()); // } // } physOp.setLineageTracer(null); } @Override protected void visit(LOCross cs) throws VisitorException { evaluateOperator(cs); } @Override protected void visit(LODistinct dt) throws VisitorException { evaluateOperator(dt); } @Override protected void visit(LOFilter filter) throws VisitorException { evaluateOperator(filter); } @Override protected void visit(LOForEach forEach) throws VisitorException { evaluateOperator(forEach); } @Override protected void visit(LOLoad load) throws VisitorException { derivedData.put(load, baseData.get(load)); Collection<IdentityHashSet<Tuple>> eq = EquivalenceClasses.getEquivalenceClasses(load, derivedData); EqClasses.addAll(eq); OpToEqClasses.put(load, eq); for (Iterator<Tuple> it = derivedData.get(load).iterator(); it.hasNext();) { lineage.insert(it.next()); } } @Override protected void visit(LOSplit split) throws VisitorException { evaluateOperator(split); } @Override protected void visit(LOStore store) throws VisitorException { derivedData.put(store, derivedData.get(store.getPlan().getPredecessors(store).get(0))); } @Override protected void visit(LOUnion u) throws VisitorException { evaluateOperator(u); } @Override protected void visit(LOLimit l) throws VisitorException { evaluateOperator(l); } @Override protected void visit(LOSort sort) throws VisitorException { evaluateOperator(sort); } // private void compilePlan(LogicalPlan plan) { // // plan = refineLogicalPlan(plan); // // LocalLogToPhyTranslationVisitor visitor = new // LocalLogToPhyTranslationVisitor(plan); // visitor.setPigContext(pc); // try { // visitor.visit(); // } catch (VisitorException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // log.error("Error visiting the logical plan in ExampleGenerator"); // } // physPlan = visitor.getPhysicalPlan(); // LogToPhyMap = visitor.getLogToPhyMap(); // } // // private LogicalPlan refineLogicalPlan(LogicalPlan plan) { // PlanSetter ps = new PlanSetter(plan); // try { // ps.visit(); // // } catch (VisitorException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } // // // run through validator // CompilationMessageCollector collector = new CompilationMessageCollector() // ; // FrontendException caught = null; // try { // LogicalPlanValidationExecutor validator = // new LogicalPlanValidationExecutor(plan, pc); // validator.validate(plan, collector); // } catch (FrontendException fe) { // // Need to go through and see what the collector has in it. But // // remember what we've caught so we can wrap it into what we // // throw. // caught = fe; // } // // // return plan; // // } private void evaluateOperator(LogicalOperator op) { PhysicalOperator physOp = LogToPhyMap.get(op); Random r = new Random(); // get the list of original inputs List<PhysicalOperator> inputs = physOp.getInputs(); physOp.setInputs(null); physOp.setLineageTracer(lineage); PhysicalPlan phy = new PhysicalPlan(); phy.add(physOp); // replace the original inputs by POReads for (LogicalOperator l : op.getPlan().getPredecessors(op)) { DataBag bag = derivedData.get(l); PORead por = new PORead(new OperatorKey("", r.nextLong()), bag); phy.add(por); try { phy.connect(por, physOp); } catch (PlanException e) { // TODO Auto-generated catch block e.printStackTrace(); log.error("Error connecting " + por.name() + " to " + physOp.name()); } } DataBag output = BagFactory.getInstance().newDefaultBag(); Tuple t = null; try { for (Result res = physOp.getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = physOp.getNext(t)) { output.add((Tuple) res.result); } } catch (ExecException e) { log.error("Error evaluating operator : " + physOp.name()); } derivedData.put(op, output); try { Collection<IdentityHashSet<Tuple>> eq = EquivalenceClasses.getEquivalenceClasses(op, derivedData); EqClasses.addAll(eq); OpToEqClasses.put(op, eq); } catch (ExecException e) { // TODO Auto-generated catch block e.printStackTrace(); log.error("Error updating equivalence classes while evaluating operators. \n" + e.getMessage()); } // re-attach the original operators physOp.setInputs(inputs); physOp.setLineageTracer(null); } public DataBag evaluateIsolatedOperator(LOCogroup op, List<DataBag> inputBags) { if (op.getPlan().getPredecessors(op).size() > inputBags.size()) return null; int count = 0; for (LogicalOperator inputs : op.getPlan().getPredecessors(op)) { derivedData.put(inputs, inputBags.get(count++)); } return evaluateIsolatedOperator(op); } public DataBag evaluateIsolatedOperator(LOCogroup op) { // return null if the inputs are not already evaluated for (LogicalOperator in : op.getPlan().getPredecessors(op)) { if (derivedData.get(in) == null) return null; } LineageTracer oldLineage = this.lineage; this.lineage = new LineageTracer(); PhysicalOperator physOp = LogToPhyMap.get(op); Random r = new Random(); // get the list of original inputs // List<PhysicalOperator> inputs = physOp.getInputs(); List<PhysicalOperator> inputs = new ArrayList<PhysicalOperator>(); PhysicalPlan phy = new PhysicalPlan(); phy.add(physOp); for (PhysicalOperator input : physOp.getInputs()) { inputs.add(input.getInputs().get(0)); input.setInputs(null); phy.add(input); try { phy.connect(input, physOp); } catch (PlanException e) { // TODO Auto-generated catch block e.printStackTrace(); log.error("Error connecting " + input.name() + " to " + physOp.name()); } } physOp.setLineageTracer(lineage); physOp.setLineageTracer(null); // replace the original inputs by POReads for (int i = 0; i < inputs.size(); i++) { DataBag bag = derivedData.get(op.getInputs().get(i)); PORead por = new PORead(new OperatorKey("", r.nextLong()), bag); phy.add(por); try { phy.connect(por, physOp.getInputs().get(i)); } catch (PlanException e) { // TODO Auto-generated catch block e.printStackTrace(); log.error("Error connecting " + por.name() + " to " + physOp.name()); } } // replace the original inputs by POReads // for(LogicalOperator l : op.getPlan().getPredecessors(op)) { // DataBag bag = derivedData.get(l); // PORead por = new PORead(new OperatorKey("", r.nextLong()), bag); // phy.add(por); // try { // phy.connect(por, physOp); // } catch (PlanException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // log.error("Error connecting " + por.name() + " to " + physOp.name()); // } // } DataBag output = BagFactory.getInstance().newDefaultBag(); Tuple t = null; try { for (Result res = physOp.getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = physOp.getNext(t)) { output.add((Tuple) res.result); } } catch (ExecException e) { log.error("Error evaluating operator : " + physOp.name()); } this.lineage = oldLineage; physOp.setInputs(inputs); physOp.setLineageTracer(null); return output; } }