org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCache.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCache.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PlanPrinter;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.pen.util.ExampleTuple;

/**
 * Single-input, single-output physical operator that hands on whatever
 * {@link #processInput()} yields and that can derive a cache key describing
 * the chain of operators feeding it (see {@link #computeCacheKey()}).
 */
public class POCache extends PhysicalOperator {

    private static final Log LOG = LogFactory.getLog(POCache.class);
    private static final long serialVersionUID = 1L;

    // The expression plan. Declared transient, so it is not part of the
    // serialized form and may be null on a deserialized instance.
    transient PhysicalPlan plan;
    // Lazily computed cache key; stays null while no key can be computed.
    String key;

    /**
     * @param k    operator key passed to the superclass
     * @param plan plan stored on this operator (cloned by {@link #clone()})
     */
    public POCache(OperatorKey k, PhysicalPlan plan) {
        super(k);
        this.plan = plan;
    }

    /**
     * Returns the result of {@link #processInput()} unchanged; this operator
     * applies no transformation of its own to the tuple stream.
     */
    @Override
    public Result getNextTuple() throws ExecException {
        return processInput();
    }

    /**
     * @return the alias string followed by {@code "Cache - "} and the operator key
     */
    @Override
    public String name() {
        return getAliasString() + "Cache - " + mKey.toString();
    }

    @Override
    public boolean supportsMultipleInputs() {
        return false;
    }

    @Override
    public boolean supportsMultipleOutputs() {
        return false;
    }

    @Override
    public void visit(PhyPlanVisitor v) throws VisitorException {
        v.visitCache(this);
    }

    /**
     * Creates a new POCache with a freshly generated operator key in the same
     * scope, a clone of {@link #plan} and the same inputs list as this operator.
     * The computed cache key is not copied; the clone recomputes it on demand.
     */
    @Override
    public POCache clone() throws CloneNotSupportedException {
        OperatorKey cloneKey = new OperatorKey(this.mKey.scope,
                NodeIdGenerator.getGenerator().getNextNodeId(this.mKey.scope));
        // plan is transient and may therefore be null; do not NPE in that case.
        PhysicalPlan clonedPlan = (this.plan == null) ? null : this.plan.clone();
        POCache newCache = new POCache(cloneKey, clonedPlan);
        newCache.setInputs(inputs);
        return newCache;
    }

    /**
     * When an illustrator is attached, records the input tuple in the
     * equivalence class at {@code eqClassIndex} and in the illustrator's data.
     *
     * @param in           the input tuple; must be an {@link ExampleTuple} when an illustrator is set
     * @param out          unused
     * @param eqClassIndex index of the equivalence class to add the tuple to
     * @return {@code in}, cast to {@link Tuple}
     */
    @Override
    public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) {
        if (illustrator != null) {
            ExampleTuple tIn = (ExampleTuple) in;
            illustrator.getEquivalenceClasses().get(eqClassIndex).add(tIn);
            illustrator.addData((Tuple) in);
        }
        return (Tuple) in;
    }

    /**
     * Get a cache key for the operators feeding this one, or null if we don't know how to handle
     * the type of one of them (or of one of their predecessors) and want to not cache this
     * subplan at all.
     *
     * Handled operator types are load, foreach, filter, local rearrange, global rearrange and
     * package. Unless we figure out a nice way to turn the PO plan into a string or compare two
     * PO plans, we'll probably have to handle each type of physical operator recursively to
     * generate a cache key.
     *
     * The key is the name-based UUID of the UTF-8 bytes of the raw key. A non-null key is
     * remembered and returned by later calls; a null result is recomputed on each call.
     *
     * @return the cache key, or null if this subplan should not be cached
     * @throws IOException if an inner plan cannot be printed or the key cannot be encoded
     */
    public String computeCacheKey() throws IOException {
        if (key == null) {
            String rawKey = computeRawCacheKey(inputs);
            if (rawKey != null) {
                // TODO deal with collisions!!
                // Explicit charset so that the same plan hashes to the same key on every JVM,
                // regardless of the platform default encoding.
                key = UUID.nameUUIDFromBytes(rawKey.getBytes("UTF-8")).toString();
            }
        }
        return key;
    }

    /**
     * Builds the un-hashed key for the given operators and, recursively, everything upstream of
     * them. Each operator contributes its own signature followed by the raw key of its inputs,
     * which is appended exactly once.
     *
     * @param preds operators to describe; null is treated as "no inputs"
     * @return the raw key ("" for null preds), or null as soon as any operator in the chain
     *         is of a type we cannot generate a key for
     * @throws IOException if an inner plan cannot be printed
     */
    private String computeRawCacheKey(List<PhysicalOperator> preds) throws IOException {
        if (preds == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (PhysicalOperator operator : preds) {
            if (operator instanceof POLoad) {
                // Load operators are equivalent if the file is the same
                // and the loader is the same
                // Potential problems down the line:
                // * not checking LoadFunc arguments
                POLoad load = (POLoad) operator;
                sb.append("LOAD: " + load.getLFile().getFileName()
                        + load.getLoadFunc().getClass().getName());
            } else if (operator instanceof POForEach) {
                // We consider ForEach operators to be equivalent if their inner plans
                // have the same explain plan after dropping scope markers.
                // Potential problems downstream:
                // * not checking for Nondeterministic UDFs
                // * jars / class defs changing under us
                for (PhysicalPlan innerPlan : ((POForEach) operator).getInputPlans()) {
                    sb.append(innerPlanKey(innerPlan));
                }
            } else if (operator instanceof POFilter) {
                // Similar to foreach.
                sb.append(innerPlanKey(((POFilter) operator).getPlan()));
            } else if (operator instanceof POLocalRearrange) {
                POLocalRearrange localRearrange = (POLocalRearrange) operator;
                sb.append("LocRearrange");
                sb.append("ProjCol");
                for (Map.Entry<Integer, Integer> entry : localRearrange.getProjectedColsMap().entrySet()) {
                    sb.append(entry.getKey() + "+" + entry.getValue());
                }
                sb.append("SecProjCol");
                for (Map.Entry<Integer, Integer> entry : localRearrange.getSecondaryProjectedColsMap().entrySet()) {
                    sb.append(entry.getKey() + "+" + entry.getValue());
                }
                sb.append(localRearrange.getIndex());
                sb.append(localRearrange.getKeyType());
                for (PhysicalPlan keyPlan : localRearrange.getPlans()) {
                    sb.append(innerPlanKey(keyPlan));
                }
            } else if (operator instanceof POGlobalRearrange) {
                sb.append("POGLOBALREARRANGE");
            } else if (operator instanceof POPackage) {
                POPackage pkg = (POPackage) operator;
                sb.append("POPakage");
                for (Map.Entry<Integer, Pair<Boolean, Map<Integer, Integer>>> entry : pkg.getKeyInfo().entrySet()) {
                    sb.append(entry.getKey()).append("-").append(entry.getValue().first);
                    sb.append("->");
                    for (Map.Entry<Integer, Integer> valentry : entry.getValue().second.entrySet()) {
                        sb.append(valentry.getKey()).append("-").append(valentry.getValue());
                    }
                    sb.append(".");
                }
            } else {
                LOG.info("Don't know how to generate cache key for " + operator.getClass() + "; not caching");
                return null;
            }

            // Recurse into this operator's inputs once, for every operator type. A null result
            // means something upstream is not cacheable and must propagate as null; appending
            // it to the builder would silently bake the text "null" into the key instead.
            String inputKey = computeRawCacheKey(operator.getInputs());
            if (inputKey == null) {
                return null;
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Input key: " + inputKey);
            }
            sb.append(inputKey);
        }
        return sb.toString();
    }

    /**
     * Renders an inner plan with {@link PlanPrinter} and strips every
     * {@code scope-<digits>} marker from the printed text.
     *
     * @param plan the inner plan to describe
     * @return the printed plan without scope markers
     */
    private String innerPlanKey(PhysicalPlan plan) throws VisitorException, IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PlanPrinter<PhysicalOperator, PhysicalPlan> pp = new PlanPrinter<PhysicalOperator, PhysicalPlan>(plan);
        pp.print(baos);
        // NOTE(review): decodes with the platform default charset; presumably this mirrors how
        // PlanPrinter encodes its output - confirm before switching to an explicit charset.
        String explained = baos.toString();

        // get rid of scope numbers in these inner plans.
        return explained.replaceAll("scope-\\d+", "");
    }
}