org.apache.accumulo.storagehandler.CustomIndexPredicateAnalyzer.java Source code

Introduction

Here is the source code for org.apache.accumulo.storagehandler.CustomIndexPredicateAnalyzer.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.storagehandler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * CustomIndexPredicateAnalyzer decomposes predicates, separating the parts
 * which can be satisfied by an index from the parts which cannot.
 * Currently, it only supports pure conjunctions over binary expressions
 * comparing a column reference with a constant value.  It is assumed
 * that all column aliases encountered refer to the same table.
 */
public class CustomIndexPredicateAnalyzer extends IndexPredicateAnalyzer {
    private static final String BOOLEAN_AND = "AND";
    private static final String BOOLEAN_OR = "OR";
    private static final Log LOG = LogFactory.getLog(CustomIndexPredicateAnalyzer.class.getName());
    private Set<String> udfNames;

    private Set<String> allowedColumnNames;
    private boolean orFirstLevel = false;
    private boolean orOtherLevel = false;

    /**
     * Creates an analyzer with no registered comparison operators.
     */
    public CustomIndexPredicateAnalyzer() {
        this.udfNames = new HashSet<String>();
    }

    /**
     * Marks a comparison operator as eligible for index push-down.
     * Operators that were never registered here are always left in the
     * residual predicate by analyzePredicate.
     *
     * @param udfName operator name: the value of
     * {@link org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge#getUdfName}
     * for bridged (simple) UDFs, or the implementing class name
     * ({@code udf.getClass().getName()}) for generic UDFs.
     */
    public void addComparisonOp(String udfName) {
        this.udfNames.add(udfName);
    }

    /**
     * Replaces the allowed-column set with a new, empty one.  While the set
     * has never been created every column name is accepted; once it exists
     * only the names added to it are.
     */
    public void clearAllowedColumnNames() {
        this.allowedColumnNames = new HashSet<String>();
    }

    /**
     * Permits comparisons on the given column, creating the allowed-column
     * set on first use.
     *
     * @param columnName name of column to be allowed
     */
    public void allowColumnName(String columnName) {
        // The set is created lazily; until then no column restriction applies.
        if (this.allowedColumnNames == null) {
            clearAllowedColumnNames();
        }
        this.allowedColumnNames.add(columnName);
    }

    /**
     * Analyzes a predicate, splitting it into conditions that can be pushed
     * down and a residual that still has to be evaluated by the caller.
     *
     * <p>Only comparisons reachable from the root through AND nodes alone are
     * converted.  The collected conditions are recombined as a conjunction by
     * {@link #translateSearchConditions}, so a comparison sitting beneath an
     * OR (or any other non-AND function) must stay in the residual; pushing
     * it would turn {@code a OR b} into {@code a AND b}.  The expression tree
     * is walked exactly once, so every pushable comparison is appended to
     * {@code searchConditions} a single time.
     *
     * <p>As a side effect the {@code orFirstLevel} / {@code orOtherLevel}
     * flags are refreshed: they record whether an OR was seen at the root of
     * the predicate or somewhere below it.
     *
     * @param predicate predicate to be analyzed; {@code null} yields
     * {@code null}
     *
     * @param searchConditions receives conditions produced by analysis
     *
     * @return residual predicate which could not be translated to
     * searchConditions, or {@code null} if nothing is left over
     *
     * @throws RuntimeException wrapping the SemanticException raised by the
     * graph walk
     */
    public ExprNodeDesc analyzePredicate(ExprNodeDesc predicate,
            final List<IndexSearchCondition> searchConditions) {

        orFirstLevel = false;
        orOtherLevel = false;
        if (predicate == null) {
            // Nothing to analyze, hence nothing to push and no residual.
            return null;
        }

        // Captured by the anonymous processor below to recognise the root node.
        final ExprNodeDesc root = predicate;

        NodeProcessor nodeProcessor = new NodeProcessor() {
            @Override
            public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
                    throws SemanticException {

                // Record where OR operators occur.  This only inspects the node;
                // it must never touch searchConditions.
                if ((nd instanceof ExprNodeDesc) && FunctionRegistry.isOpOr((ExprNodeDesc) nd)) {
                    if (nd == root) {
                        orFirstLevel = true;
                    } else {
                        orOtherLevel = true;
                    }
                }

                // We can only push down stuff which appears as part of
                // a pure conjunction:  reject OR, CASE, etc.
                // The stack holds the path from the root down to nd itself.
                for (Node ancestor : stack) {
                    if (nd == ancestor) {
                        break;
                    }
                    if (!FunctionRegistry.isOpAnd((ExprNodeDesc) ancestor)) {
                        // Leave the node untouched so it stays in the residual.
                        return nd;
                    }
                }

                return analyzeExpr((ExprNodeDesc) nd, searchConditions, nodeOutputs);
            }
        };

        // No rules are registered, so every node goes to the default processor.
        Dispatcher disp = new DefaultRuleDispatcher(nodeProcessor,
                new LinkedHashMap<Rule, NodeProcessor>(), null);
        GraphWalker ogw = new DefaultGraphWalker(disp);
        List<Node> topNodes = new ArrayList<Node>();
        topNodes.add(predicate);
        HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
        try {
            ogw.startWalking(topNodes, nodeOutput);
        } catch (SemanticException ex) {
            throw new RuntimeException(ex);
        }

        // The output recorded for the root is the residual of the whole predicate.
        return (ExprNodeDesc) nodeOutput.get(predicate);
    }

    /**
     * Processes one expression node after its children have been processed.
     *
     * <ul>
     * <li>Anything that is not a generic function call is returned as is.</li>
     * <li>An AND node is rebuilt from the non-null residuals of its children:
     * {@code null} when none remain, the single survivor when one remains,
     * otherwise a new AND over all of them.</li>
     * <li>A registered binary comparison between a column and a constant (in
     * either order, after constant folding) on an allowed column is appended
     * to {@code searchConditions} and {@code null} is returned.</li>
     * <li>Every other function is returned unchanged.</li>
     * </ul>
     *
     * @param expr the node being processed
     * @param searchConditions receives the condition when expr is pushable
     * @param nodeOutputs results already produced for the children of expr,
     * in child order; a {@code null} entry means that child was fully
     * converted into search conditions
     * @return the residual for this node, or {@code null} when nothing of it
     * remains to be evaluated
     */
    private ExprNodeDesc analyzeExpr(ExprNodeDesc expr, List<IndexSearchCondition> searchConditions,
            Object... nodeOutputs) {

        if (!(expr instanceof ExprNodeGenericFuncDesc)) {
            return expr;
        }
        if (FunctionRegistry.isOpAnd(expr)) {
            // Keep every child residual that survived.  Looking at all outputs
            // (rather than only the first two) means an AND with more than two
            // operands cannot silently lose part of its residual.
            List<ExprNodeDesc> residuals = new ArrayList<ExprNodeDesc>();
            if (nodeOutputs != null) {
                for (Object childResidual : nodeOutputs) {
                    if (childResidual != null) {
                        residuals.add((ExprNodeDesc) childResidual);
                    }
                }
            }
            if (residuals.isEmpty()) {
                return null;
            }
            if (residuals.size() == 1) {
                return residuals.get(0);
            }
            // Design notes kept from the original implementation (describing the
            // intended policy; this method itself does not inspect OR nodes):
            //if there is any OR we don't push anything to the server unless it's the rowid, and then only if there are only AND operations at the first level of the expression
            //e.g. rowid > 'aaa' AND (a = b OR c = d) is ok, we push the rowid only
            //     rowid > 'aaa' OR (a = b AND c = d) is not and we push nothing at all
            // we also need to support where there are more than one rowid predicates, but they will all be together in one expression
            //e.g. rowid > 'aaa' AND rowid < 'ccc' AND a = b
            //ideally we would even support
            //     ((rowid > 'aaa' AND rowid < 'ccc') OR (rowid > 'ddd' AND rowid < 'fff')) AND a = b - this one is a stretch target however
            return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
                    FunctionRegistry.getGenericUDFForAnd(), residuals);
        }

        // Resolve the operator name the same way addComparisonOp expects it.
        String udfName;
        ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) expr;
        if (funcDesc.getGenericUDF() instanceof GenericUDFBridge) {
            GenericUDFBridge func = (GenericUDFBridge) funcDesc.getGenericUDF();
            udfName = func.getUdfName();
        } else {
            udfName = funcDesc.getGenericUDF().getClass().getName();
        }
        if (!udfNames.contains(udfName)) {
            return expr;
        }

        // Only binary comparisons can become a search condition; anything with
        // a different operand count stays in the residual.
        if (nodeOutputs == null || nodeOutputs.length != 2) {
            return expr;
        }

        ExprNodeDesc child1 = extractConstant((ExprNodeDesc) nodeOutputs[0]);
        ExprNodeDesc child2 = extractConstant((ExprNodeDesc) nodeOutputs[1]);
        ExprNodeColumnDesc columnDesc = null;
        ExprNodeConstantDesc constantDesc = null;
        if ((child1 instanceof ExprNodeColumnDesc) && (child2 instanceof ExprNodeConstantDesc)) {
            // COL <op> CONSTANT
            columnDesc = (ExprNodeColumnDesc) child1;
            constantDesc = (ExprNodeConstantDesc) child2;
        } else if ((child2 instanceof ExprNodeColumnDesc) && (child1 instanceof ExprNodeConstantDesc)) {
            // CONSTANT <op> COL
            columnDesc = (ExprNodeColumnDesc) child2;
            constantDesc = (ExprNodeConstantDesc) child1;
        }
        if (columnDesc == null) {
            return expr;
        }
        // A null set means no column restriction has been configured.
        if (allowedColumnNames != null && !allowedColumnNames.contains(columnDesc.getColumn())) {
            return expr;
        }
        searchConditions.add(new IndexSearchCondition(columnDesc, udfName, constantDesc, expr));

        // we converted the expression to a search condition, so
        // remove it from the residual predicate
        return null;
    }

    /**
     * Returns the constant that a function expression folds to, when it can
     * be folded; otherwise returns the expression itself.  Expressions that
     * are not generic function calls are returned untouched.
     *
     * @param expr expression to inspect
     * @return the folded constant, or expr when no folding took place
     */
    private ExprNodeDesc extractConstant(ExprNodeDesc expr) {
        if (expr instanceof ExprNodeGenericFuncDesc) {
            ExprNodeConstantDesc constant = foldConstant((ExprNodeGenericFuncDesc) expr);
            if (constant != null) {
                return constant;
            }
        }
        return expr;
    }

    /**
     * Tries to evaluate a function call to a constant.
     *
     * <p>Folding is attempted only when the function is deterministic and
     * not stateful, reports no required files or jars, and every argument is
     * a constant or is itself foldable.  Any exception raised while checking
     * or evaluating is treated as "cannot fold".
     *
     * @param func function call to fold
     * @return the resulting constant, or {@code null} when func cannot be
     * folded
     */
    private ExprNodeConstantDesc foldConstant(ExprNodeGenericFuncDesc func) {
        GenericUDF genericUdf = func.getGenericUDF();
        boolean foldable = FunctionRegistry.isDeterministic(genericUdf)
                && !FunctionRegistry.isStateful(genericUdf);
        if (!foldable) {
            return null;
        }
        try {
            // A UDF that depends on external resources cannot be folded: those
            // resources may not be available at compile time.
            boolean needsResources;
            if (genericUdf instanceof GenericUDFBridge) {
                // For a bridged UDF, ask an instance of the wrapped class.
                UDF wrapped = ReflectionUtils.newInstance(((GenericUDFBridge) genericUdf).getUdfClass(), null);
                needsResources = wrapped.getRequiredFiles() != null || wrapped.getRequiredJars() != null;
            } else {
                needsResources = genericUdf.getRequiredFiles() != null || genericUdf.getRequiredJars() != null;
            }
            if (needsResources) {
                return null;
            }

            // Every operand has to be a literal or a foldable function call.
            for (ExprNodeDesc operand : func.getChildExprs()) {
                boolean constantOperand = (operand instanceof ExprNodeConstantDesc)
                        || ((operand instanceof ExprNodeGenericFuncDesc)
                                && foldConstant((ExprNodeGenericFuncDesc) operand) != null);
                if (!constantOperand) {
                    return null;
                }
            }

            // Evaluate without an input row and copy the result into a standard
            // Java object for the constant descriptor.
            ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(func);
            ObjectInspector resultInspector = evaluator.initialize(null);
            Object rawValue = evaluator.evaluate(null);
            Object javaValue = ObjectInspectorUtils.copyToStandardJavaObject(rawValue, resultInspector);

            return new ExprNodeConstantDesc(javaValue);
        } catch (Exception ignored) {
            // Best effort: a failure here just means the expression is not folded.
            return null;
        }
    }

    /**
     * Translates search conditions back to ExprNodeDesc form (as
     * a left-deep conjunction).
     *
     * @param searchConditions (typically produced by analyzePredicate)
     *
     * @return ExprNodeDesc form of search conditions
     */
    //  public ExprNodeDesc translateSearchConditions(
    //    List<IndexSearchCondition> searchConditions) {
    //
    //    List<ExprNodeDesc> exprs = new ArrayList<ExprNodeDesc>();
    //    for (IndexSearchCondition searchCondition : searchConditions) {
    //      if (BOOLEAN_AND.equals(searchCondition.getComparisonOp())) {
    //          List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
    //          children.addAll(exprs);
    //          exprs.clear();
    //          exprs.add(new ExprNodeGenericFuncDesc(
    //                  TypeInfoFactory.booleanTypeInfo,
    //                  FunctionRegistry.getGenericUDFForAnd(),
    //                  children));
    //      } else if (BOOLEAN_OR.equals(searchCondition.getComparisonOp())) {
    //          List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
    //          children.addAll(exprs);
    //          exprs.clear();
    //          exprs.add(new ExprNodeGenericFuncDesc(
    //                  TypeInfoFactory.booleanTypeInfo,
    //                  FunctionRegistry.getFunctionInfo("or").getGenericUDF(),
    //                  children));
    //      } else {
    //          exprs.add(searchCondition.getComparisonExpr());
    //      }
    //    }
    //    return exprs.get(0);
    //  }

    public ExprNodeDesc translateSearchConditions(List<IndexSearchCondition> searchConditions) {

        // Accumulates the conjunction built so far; stays null for an empty list.
        ExprNodeDesc conjunction = null;
        for (IndexSearchCondition condition : searchConditions) {
            ExprNodeDesc comparison = condition.getComparisonExpr();
            if (conjunction == null) {
                // The first comparison starts the chain on its own.
                conjunction = comparison;
            } else {
                // Each further comparison becomes the right operand of a new AND
                // whose left operand is everything collected before it.
                List<ExprNodeDesc> operands = new ArrayList<ExprNodeDesc>(2);
                operands.add(conjunction);
                operands.add(comparison);
                conjunction = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
                        FunctionRegistry.getGenericUDFForAnd(), operands);
            }
        }
        return conjunction;
    }
}