org.apache.pig.pen.DerivedDataVisitor.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.pen.DerivedDataVisitor.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.pen;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PORead;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.LOCogroup;
import org.apache.pig.impl.logicalLayer.LOCross;
import org.apache.pig.impl.logicalLayer.LODistinct;
import org.apache.pig.impl.logicalLayer.LOFilter;
import org.apache.pig.impl.logicalLayer.LOForEach;
import org.apache.pig.impl.logicalLayer.LOLimit;
import org.apache.pig.impl.logicalLayer.LOLoad;
import org.apache.pig.impl.logicalLayer.LOSort;
import org.apache.pig.impl.logicalLayer.LOSplit;
import org.apache.pig.impl.logicalLayer.LOStore;
import org.apache.pig.impl.logicalLayer.LOUnion;
import org.apache.pig.impl.logicalLayer.LOVisitor;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.logicalLayer.PlanSetter;
import org.apache.pig.impl.logicalLayer.validators.LogicalPlanValidationExecutor;
import org.apache.pig.impl.plan.CompilationMessageCollector;
import org.apache.pig.impl.plan.DependencyOrderWalker;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.IdentityHashSet;
import org.apache.pig.pen.util.DependencyOrderLimitedWalker;
import org.apache.pig.pen.util.LineageTracer;

// This class passes data through the entire plan and saves the intermediate results.
public class DerivedDataVisitor extends LOVisitor {

    // Output bag recorded for every logical operator that has been visited/evaluated.
    Map<LogicalOperator, DataBag> derivedData = new HashMap<LogicalOperator, DataBag>();
    // Physical plan handed in by the constructors; consulted in visit(LOCogroup)
    // to find the predecessors of the cogroup's physical operator.
    PhysicalPlan physPlan = null;
    // Input bag for each load operator, handed in by the constructors.
    Map<LOLoad, DataBag> baseData = null;

    // Logical-to-physical operator mapping, handed in by the constructors.
    Map<LogicalOperator, PhysicalOperator> LogToPhyMap = null;
    Log log = LogFactory.getLog(getClass());

    // Equivalence classes computed per operator, keyed by that operator.
    Map<LogicalOperator, Collection<IdentityHashSet<Tuple>>> OpToEqClasses = null;
    // Every equivalence class computed so far, across all visited operators.
    Collection<IdentityHashSet<Tuple>> EqClasses = null;

    // Tracer attached to physical operators while they are evaluated; the base
    // tuples of every load are inserted into it in visit(LOLoad).
    LineageTracer lineage = new LineageTracer();

    /**
     * Creates a visitor over a whole logical plan, walked with a
     * {@link DependencyOrderWalker}.
     *
     * @param plan        logical plan to walk
     * @param pc          accepted for signature compatibility; not used by this constructor
     * @param baseData    input bag for each load operator
     * @param logToPhyMap mapping from each logical operator to its physical operator
     * @param physPlan    physical plan the mapped physical operators belong to
     */
    public DerivedDataVisitor(LogicalPlan plan, PigContext pc, Map<LOLoad, DataBag> baseData,
            Map<LogicalOperator, PhysicalOperator> logToPhyMap, PhysicalPlan physPlan) {
        super(plan, new DependencyOrderWalker<LogicalOperator, LogicalPlan>(plan));

        // Collaborators supplied by the caller.
        this.physPlan = physPlan;
        this.LogToPhyMap = logToPhyMap;
        this.baseData = baseData;

        // Fresh, empty bookkeeping for equivalence classes.
        this.EqClasses = new LinkedList<IdentityHashSet<Tuple>>();
        this.OpToEqClasses = new HashMap<LogicalOperator, Collection<IdentityHashSet<Tuple>>>();
    }

    /**
     * Creates a visitor over the plan that owns {@code op}, walked with a
     * {@link DependencyOrderLimitedWalker} constructed for {@code op}.
     *
     * @param op          operator the limited walker is built around
     * @param pc          accepted for signature compatibility; not used by this constructor
     * @param baseData    input bag for each load operator
     * @param logToPhyMap mapping from each logical operator to its physical operator
     * @param physPlan    physical plan the mapped physical operators belong to
     */
    public DerivedDataVisitor(LogicalOperator op, PigContext pc, Map<LOLoad, DataBag> baseData,
            Map<LogicalOperator, PhysicalOperator> logToPhyMap, PhysicalPlan physPlan) {
        super(op.getPlan(), new DependencyOrderLimitedWalker<LogicalOperator, LogicalPlan>(op, op.getPlan()));

        // Collaborators supplied by the caller.
        this.physPlan = physPlan;
        this.LogToPhyMap = logToPhyMap;
        this.baseData = baseData;

        // Fresh, empty bookkeeping for equivalence classes.
        this.EqClasses = new LinkedList<IdentityHashSet<Tuple>>();
        this.OpToEqClasses = new HashMap<LogicalOperator, Collection<IdentityHashSet<Tuple>>>();
    }

    /**
     * Replaces the current walker with a {@link DependencyOrderLimitedWalker}
     * built for {@code op} and the plan that owns it.
     *
     * @param op operator the new walker is built around
     */
    public void setOperatorToEvaluate(LogicalOperator op) {
        LogicalPlan owningPlan = op.getPlan();
        mCurrentWalker = new DependencyOrderLimitedWalker<LogicalOperator, LogicalPlan>(op, owningPlan);
    }

    /**
     * Evaluates a cogroup over the data already derived for its inputs and
     * records the result.
     *
     * <p>Cogroup takes a slightly different path from the other operators
     * because of its local rearranges: the physical predecessors of the
     * cogroup are copied into a scratch plan together with the cogroup itself,
     * and each predecessor is then fed by a {@link PORead} over the bag stored
     * in {@code derivedData} for the corresponding logical input.
     *
     * <p>Failures while wiring the scratch plan or evaluating it are logged
     * (with their cause) and evaluation continues best-effort, so the bag
     * stored for {@code cg} may be partial or empty after an error.
     *
     * @param cg the cogroup to evaluate
     * @throws VisitorException declared by the visitor contract
     */
    @Override
    protected void visit(LOCogroup cg) throws VisitorException {
        PhysicalOperator physOp = LogToPhyMap.get(cg);
        Random r = new Random();

        // Upstream operator of every physical predecessor; only its size is
        // used below, to know how many POReads have to be attached.
        List<PhysicalOperator> inputs = new ArrayList<PhysicalOperator>();
        PhysicalPlan phy = new PhysicalPlan();
        phy.add(physOp);

        for (PhysicalOperator input : physPlan.getPredecessors(physOp)) {
            inputs.add(input.getInputs().get(0));
            phy.add(input);
            try {
                phy.connect(input, physOp);
            } catch (PlanException e) {
                log.error("Error connecting " + input.name() + " to " + physOp.name(), e);
            }
        }

        physOp.setLineageTracer(lineage);
        try {
            // Replace the original inputs by POReads over the derived data.
            for (int i = 0; i < inputs.size(); i++) {
                DataBag bag = derivedData.get(cg.getInputs().get(i));
                PORead por = new PORead(new OperatorKey("", r.nextLong()), bag);
                phy.add(por);
                PhysicalOperator target = physOp.getInputs().get(i);
                try {
                    phy.connect(por, target);
                } catch (PlanException e) {
                    // Name the operator the connection was actually attempted on.
                    log.error("Error connecting " + por.name() + " to " + target.name(), e);
                }
            }

            // Drain the cogroup until it reports end of processing.
            DataBag output = BagFactory.getInstance().newDefaultBag();
            Tuple t = null;
            try {
                for (Result res = physOp.getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = physOp
                        .getNext(t)) {
                    output.add((Tuple) res.result);
                }
            } catch (ExecException e) {
                log.error("Error evaluating operator : " + physOp.name(), e);
            }
            derivedData.put(cg, output);

            try {
                Collection<IdentityHashSet<Tuple>> eq = EquivalenceClasses.getEquivalenceClasses(cg, derivedData);
                EqClasses.addAll(eq);
                OpToEqClasses.put(cg, eq);
            } catch (ExecException e) {
                log.error("Error updating equivalence classes while evaluating operators. \n" + e.getMessage(), e);
            }
        } finally {
            // Always detach the tracer, even if evaluation failed unexpectedly.
            physOp.setLineageTracer(null);
        }
    }

    /**
     * Evaluates the cross operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LOCross cs) throws VisitorException {
        evaluateOperator(cs);
    }

    /**
     * Evaluates the distinct operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LODistinct dt) throws VisitorException {
        evaluateOperator(dt);
    }

    /**
     * Evaluates the filter operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LOFilter filter) throws VisitorException {
        evaluateOperator(filter);
    }

    /**
     * Evaluates the foreach operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LOForEach forEach) throws VisitorException {
        evaluateOperator(forEach);
    }

    /**
     * Seeds the visitor with the base data of a load: the bag registered for
     * {@code load} in {@code baseData} becomes its derived data, its
     * equivalence classes are recorded, and every tuple of the bag is inserted
     * into the lineage tracer.
     *
     * @param load the load operator being visited
     * @throws VisitorException declared by the visitor contract
     */
    @Override
    protected void visit(LOLoad load) throws VisitorException {
        DataBag loadedBag = baseData.get(load);
        derivedData.put(load, loadedBag);

        Collection<IdentityHashSet<Tuple>> loadClasses = EquivalenceClasses.getEquivalenceClasses(load,
                derivedData);
        EqClasses.addAll(loadClasses);
        OpToEqClasses.put(load, loadClasses);

        // Register every base tuple with the lineage tracer.
        Iterator<Tuple> tuples = loadedBag.iterator();
        while (tuples.hasNext()) {
            lineage.insert(tuples.next());
        }
    }

    /**
     * Evaluates the split operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LOSplit split) throws VisitorException {
        evaluateOperator(split);
    }

    /**
     * A store produces no data of its own: it is recorded with the same bag
     * that was derived for its first predecessor in the logical plan.
     *
     * @param store the store operator being visited
     * @throws VisitorException declared by the visitor contract
     */
    @Override
    protected void visit(LOStore store) throws VisitorException {
        LogicalOperator upstream = store.getPlan().getPredecessors(store).get(0);
        DataBag upstreamData = derivedData.get(upstream);
        derivedData.put(store, upstreamData);
    }

    /**
     * Evaluates the union operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LOUnion u) throws VisitorException {
        evaluateOperator(u);
    }

    /**
     * Evaluates the limit operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LOLimit l) throws VisitorException {
        evaluateOperator(l);
    }

    /**
     * Evaluates the sort operator by delegating to
     * {@code evaluateOperator}, which stores its output in {@code derivedData}.
     */
    @Override
    protected void visit(LOSort sort) throws VisitorException {
        evaluateOperator(sort);
    }

    // private void compilePlan(LogicalPlan plan) {
    //   
    // plan = refineLogicalPlan(plan);
    //   
    // LocalLogToPhyTranslationVisitor visitor = new
    // LocalLogToPhyTranslationVisitor(plan);
    // visitor.setPigContext(pc);
    // try {
    // visitor.visit();
    // } catch (VisitorException e) {
    // // TODO Auto-generated catch block
    // e.printStackTrace();
    // log.error("Error visiting the logical plan in ExampleGenerator");
    // }
    // physPlan = visitor.getPhysicalPlan();
    // LogToPhyMap = visitor.getLogToPhyMap();
    // }
    //    
    // private LogicalPlan refineLogicalPlan(LogicalPlan plan) {
    // PlanSetter ps = new PlanSetter(plan);
    // try {
    // ps.visit();
    //       
    // } catch (VisitorException e) {
    // // TODO Auto-generated catch block
    // e.printStackTrace();
    // }
    //        
    // // run through validator
    // CompilationMessageCollector collector = new CompilationMessageCollector()
    // ;
    // FrontendException caught = null;
    // try {
    // LogicalPlanValidationExecutor validator =
    // new LogicalPlanValidationExecutor(plan, pc);
    // validator.validate(plan, collector);
    // } catch (FrontendException fe) {
    // // Need to go through and see what the collector has in it. But
    // // remember what we've caught so we can wrap it into what we
    // // throw.
    // caught = fe;
    // }
    //        
    //        
    // return plan;
    //
    // }

    /**
     * Runs the physical operator mapped to {@code op} over the data already
     * derived for its logical predecessors, then records the output bag and
     * the operator's equivalence classes.
     *
     * <p>The physical operator is temporarily detached from its real inputs
     * and fed by {@link PORead}s inside a scratch plan. Its original inputs
     * and a {@code null} lineage tracer are restored in a {@code finally}
     * block, so the shared operator is not left rewired if evaluation fails
     * unexpectedly.
     *
     * <p>Plan-wiring and execution errors are logged with their cause and
     * evaluation continues best-effort; the bag stored for {@code op} may then
     * be partial or empty.
     *
     * @param op logical operator to evaluate; the derived data of its
     *           predecessors is read from {@code derivedData}
     */
    private void evaluateOperator(LogicalOperator op) {
        PhysicalOperator physOp = LogToPhyMap.get(op);
        Random r = new Random();

        // Remember the real inputs so they can be re-attached afterwards.
        List<PhysicalOperator> inputs = physOp.getInputs();
        physOp.setInputs(null);
        physOp.setLineageTracer(lineage);
        try {
            PhysicalPlan phy = new PhysicalPlan();
            phy.add(physOp);

            // Replace the original inputs by POReads over the derived data.
            for (LogicalOperator l : op.getPlan().getPredecessors(op)) {
                DataBag bag = derivedData.get(l);
                PORead por = new PORead(new OperatorKey("", r.nextLong()), bag);
                phy.add(por);
                try {
                    phy.connect(por, physOp);
                } catch (PlanException e) {
                    log.error("Error connecting " + por.name() + " to " + physOp.name(), e);
                }
            }

            // Drain the operator until it reports end of processing.
            DataBag output = BagFactory.getInstance().newDefaultBag();
            Tuple t = null;
            try {
                for (Result res = physOp.getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = physOp
                        .getNext(t)) {
                    output.add((Tuple) res.result);
                }
            } catch (ExecException e) {
                log.error("Error evaluating operator : " + physOp.name(), e);
            }
            derivedData.put(op, output);

            try {
                Collection<IdentityHashSet<Tuple>> eq = EquivalenceClasses.getEquivalenceClasses(op, derivedData);
                EqClasses.addAll(eq);
                OpToEqClasses.put(op, eq);
            } catch (ExecException e) {
                log.error("Error updating equivalence classes while evaluating operators. \n" + e.getMessage(), e);
            }
        } finally {
            // Re-attach the original operators and detach the tracer.
            physOp.setInputs(inputs);
            physOp.setLineageTracer(null);
        }
    }

    /**
     * Evaluates a cogroup in isolation over caller-supplied input bags.
     *
     * <p>The bags are registered in {@code derivedData} for the cogroup's
     * logical predecessors, in predecessor order, before delegating to
     * {@link #evaluateIsolatedOperator(LOCogroup)}.
     *
     * @param op        cogroup to evaluate
     * @param inputBags one bag per predecessor, in predecessor order; extra
     *                  bags beyond the number of predecessors are ignored
     * @return the cogroup's output, or {@code null} when fewer bags than
     *         predecessors were supplied
     */
    public DataBag evaluateIsolatedOperator(LOCogroup op, List<DataBag> inputBags) {
        if (op.getPlan().getPredecessors(op).size() > inputBags.size()) {
            return null;
        }

        // Pair each predecessor with the bag at the same position.
        Iterator<DataBag> suppliedBags = inputBags.iterator();
        for (LogicalOperator predecessor : op.getPlan().getPredecessors(op)) {
            derivedData.put(predecessor, suppliedBags.next());
        }

        return evaluateIsolatedOperator(op);
    }

    /**
     * Evaluates a cogroup in isolation over the bags currently stored in
     * {@code derivedData} for its predecessors, without recording the result
     * or touching the visitor's lineage.
     *
     * <p>The cogroup's physical operator and its current physical inputs are
     * placed in a scratch plan, and each of those inputs is fed by a
     * {@link PORead} over the derived data of the corresponding logical input.
     * The operator runs with no lineage tracer attached. When evaluation ends,
     * normally or not, every physical input gets back the input list it had on
     * entry and the cogroup gets back its original input list, so the method
     * can be called repeatedly on the same operator.
     *
     * <p>Plan-wiring and execution errors are logged with their cause and
     * evaluation continues best-effort; the returned bag may then be partial
     * or empty.
     *
     * @param op cogroup to evaluate
     * @return the cogroup's output bag, or {@code null} if any predecessor has
     *         no derived data yet
     */
    public DataBag evaluateIsolatedOperator(LOCogroup op) {
        // Return null if the inputs are not already evaluated.
        for (LogicalOperator in : op.getPlan().getPredecessors(op)) {
            if (derivedData.get(in) == null) {
                return null;
            }
        }

        PhysicalOperator physOp = LogToPhyMap.get(op);
        Random r = new Random();

        // Snapshot the wiring that is about to be replaced: the cogroup's own
        // inputs and, for each of them, its input list.
        List<PhysicalOperator> cogroupInputs = physOp.getInputs();
        List<PhysicalOperator> rearranges = new ArrayList<PhysicalOperator>(cogroupInputs);
        List<List<PhysicalOperator>> rearrangeInputs = new ArrayList<List<PhysicalOperator>>();
        for (PhysicalOperator input : rearranges) {
            rearrangeInputs.add(input.getInputs());
        }

        LineageTracer oldLineage = this.lineage;
        this.lineage = new LineageTracer();

        DataBag output = BagFactory.getInstance().newDefaultBag();
        try {
            PhysicalPlan phy = new PhysicalPlan();
            phy.add(physOp);

            for (PhysicalOperator input : rearranges) {
                input.setInputs(null);
                phy.add(input);
                try {
                    phy.connect(input, physOp);
                } catch (PlanException e) {
                    log.error("Error connecting " + input.name() + " to " + physOp.name(), e);
                }
            }

            // Isolated evaluation runs without a tracer attached.
            physOp.setLineageTracer(null);

            // Replace the original inputs by POReads over the derived data.
            for (int i = 0; i < rearranges.size(); i++) {
                DataBag bag = derivedData.get(op.getInputs().get(i));
                PORead por = new PORead(new OperatorKey("", r.nextLong()), bag);
                phy.add(por);
                PhysicalOperator target = rearranges.get(i);
                try {
                    phy.connect(por, target);
                } catch (PlanException e) {
                    // Name the operator the connection was actually attempted on.
                    log.error("Error connecting " + por.name() + " to " + target.name(), e);
                }
            }

            // Drain the cogroup until it reports end of processing.
            Tuple t = null;
            try {
                for (Result res = physOp.getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = physOp
                        .getNext(t)) {
                    output.add((Tuple) res.result);
                }
            } catch (ExecException e) {
                log.error("Error evaluating operator : " + physOp.name(), e);
            }
        } finally {
            // Undo the rewiring level by level. Handing the cogroup the
            // upstream operators of its inputs instead would make it bypass
            // those inputs and leave them pointing at the throwaway POReads.
            for (int i = 0; i < rearranges.size(); i++) {
                rearranges.get(i).setInputs(rearrangeInputs.get(i));
            }
            physOp.setInputs(cogroupInputs);
            physOp.setLineageTracer(null);
            this.lineage = oldLineage;
        }

        return output;
    }

}