org.apache.lens.cube.parse.CubeQueryRewriter.java Source code

Introduction

Here is the source code for org.apache.lens.cube.parse.CubeQueryRewriter.java, the Apache Lens class that rewrites a cube query into HQL against simple storage tables.
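
Before diving into the listing, here is a sketch of how the class is typically driven. This is an illustrative example, not part of the Lens sources: the cube, measure and time-dimension names are made up, and it assumes a Configuration and HiveConf already populated with cube metastore settings, with CubeQueryContext.toHQL() used to render the final query.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.lens.cube.parse.CubeQueryContext;
import org.apache.lens.cube.parse.CubeQueryRewriter;
import org.apache.lens.server.api.error.LensException;

public class RewriteExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        HiveConf hconf = new HiveConf();
        CubeQueryRewriter rewriter = new CubeQueryRewriter(conf, hconf);
        try {
            // Runs the full rewriter chain described in the source below.
            CubeQueryContext ctx = rewriter.rewrite(
                "select msr1 from testcube where time_range_in(dt, '2015-01-01', '2015-01-02')");
            // Render the rewritten query against the chosen storage tables.
            System.out.println(ctx.toHQL());
        } catch (LensException e) {
            // Parse and resolution failures surface as LensException.
            e.printStackTrace();
        } finally {
            rewriter.clear();
        }
    }
}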

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lens.cube.parse;

import static org.apache.lens.cube.error.LensCubeErrorCode.SYNTAX_ERROR;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lens.server.api.error.LensException;
import org.apache.lens.server.api.metrics.MethodMetricsContext;
import org.apache.lens.server.api.metrics.MethodMetricsFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.parse.*;

import lombok.extern.slf4j.Slf4j;

/**
 * Rewrites the given cube query into simple storage-table HQL.
 */
@Slf4j
public class CubeQueryRewriter {
    private final Configuration conf;
    private final List<ContextRewriter> rewriters = new ArrayList<>();
    private final HiveConf hconf;
    private Context qlCtx = null;
    private boolean lightFactFirst;

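    /**
     * Creates a rewriter: builds the Hive query Context, reads the
     * "lightest fact first" flag from the configuration and sets up the
     * rewriter chain.
     */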
    public CubeQueryRewriter(Configuration conf, HiveConf hconf) {
        this.conf = conf;
        this.hconf = hconf;
        try {
            qlCtx = new Context(conf);
        } catch (IOException e) {
            // Context creation failure is treated as non-fatal here; log it
            // instead of swallowing it silently.
            log.warn("Could not create Hive query Context", e);
        }
        lightFactFirst = conf.getBoolean(CubeQueryConfUtil.LIGHTEST_FACT_FIRST,
                CubeQueryConfUtil.DEFAULT_LIGHTEST_FACT_FIRST);
        setupRewriters();
    }

    /*
     * Here is the rewriter flow.
     *
     * ExpressionResolver: replaces the queried expression columns with their
     * expression ASTs in the query AST.
     *
     * ColumnResolver: finds all the columns in the query AST.
     *
     * AliasReplacer: - Maps each queried column to its table alias. - Finds
     * the queried dim attributes and queried measures. - Validates the queried
     * fields with respect to derived cubes, if all fields of the queried cube
     * cannot be queried together. - Replaces all columns in all expressions
     * with tablealias.column.
     *
     * DenormalizationResolver: Phase 1: finds all the queried column
     * references, if any.
     *
     * CandidateTableResolver: Phase 1: - Prunes candidate fact tables in which
     * the queried dim attributes are not present. - Also figures out whether a
     * queried column that is not part of a candidate table is a denormalized
     * field reachable through a reference. - Finds all the candidate fact sets
     * containing the queried measures, and prunes facts which do not contain
     * any of the queried measures.
     *
     * JoinResolver: - Finds all the join chains between the queried tables.
     *
     * TimerangeResolver: - Finds all time ranges in the query and validates
     * the queried range against each queried field's life.
     *
     * CandidateTableResolver: Phase 2: prunes candidate tables if - the
     * required join columns are not part of the candidate tables, - the
     * required source columns (join columns) for reaching a denormalized field
     * are not part of the candidate tables, or - the required denormalized
     * fields are not part of the referred tables, thereby pruning all
     * candidates that use those denormalized fields.
     *
     * AggregateResolver: - If a non-default aggregate, or no aggregate at all,
     * is queried for a measure, prunes all aggregated facts from the
     * candidates. - Replaces measures with their default aggregates, if
     * enabled.
     *
     * GroupbyResolver: - Promotes select expressions to groupby and groupby
     * expressions to select, if enabled.
     *
     * StorageTableResolver: - Resolves storages and partitions for all
     * candidate tables, and prunes candidates for which no storages are
     * available.
     *
     * Whenever a candidate fact is pruned (because of no storages, no default
     * aggregate, etc.), the sets containing that fact are also pruned.
     *
     * LeastPartitionResolver and LightestFactResolver work on candidate fact
     * sets, keeping the sets that require the least number of partitions and
     * the lightest facts, respectively.
     *
     * If the "lightest fact first" flag is enabled, LightestFactResolver is
     * applied before StorageTableResolver.
     *
     * MaxCoveringFactResolver runs just after all candidate facts' partitions
     * are resolved. It computes how much of the time range each fact set is
     * able to cover, finds the maximum coverable range, and prunes all fact
     * sets covering less than that range. If "fail on partial data" is true,
     * then by the time this resolver runs all candidate fact sets already
     * cover the full range, so the resolver is a no-op; the same holds when
     * "fail on partial data" is false and no fact set has any data. It is most
     * useful when facts actually have partial data: there it ensures that the
     * facts covering the maximum time range are picked.
     *
     * Once all rewriters are done, one of the remaining candidate sets is
     * picked to answer the query. If the final candidate fact set has a single
     * element, the query is written using the cube query ASTs directly. If it
     * has more than one fact, the query is rewritten with MultifactHQLContext,
     * which writes a join query over the individual fact queries. Each fact
     * query contains only the fields queried from that fact; its ASTs are
     * copied from the original query, with the expressions missing from that
     * fact removed.
     */
    private void setupRewriters() {
        // Resolve columns - the column alias and table alias
        rewriters.add(new ColumnResolver());
        // Rewrite base trees (groupby, having, orderby, limit) using aliases
        rewriters.add(new AliasReplacer());
        ExpressionResolver exprResolver = new ExpressionResolver();
        DenormalizationResolver denormResolver = new DenormalizationResolver(conf);
        CandidateTableResolver candidateTblResolver = new CandidateTableResolver();
        StorageTableResolver storageTableResolver = new StorageTableResolver(conf);
        rewriters.add(exprResolver);
        // De-normalized columns resolved
        rewriters.add(denormResolver);
        // Resolve time ranges
        rewriters.add(new TimerangeResolver());
        // Resolve candidate fact tables and dimension tables for columns queried
        rewriters.add(candidateTblResolver);
        // Resolve aggregations and generate base select tree
        rewriters.add(new AggregateResolver());
        rewriters.add(new GroupbyResolver(conf));
        rewriters.add(new FieldValidator());
        // Resolve joins and generate base join tree
        rewriters.add(new JoinResolver());
        // Do col life validation
        rewriters.add(new TimeRangeChecker(conf));
        // Resolve candidate fact tables and dimension tables for columns included
        // in join and denorm resolvers
        rewriters.add(candidateTblResolver);

        // Phase 1: resolve fact tables.
        rewriters.add(storageTableResolver);
        if (lightFactFirst) {
            // Prune candidate tables for which denorm column references do not exist
            rewriters.add(denormResolver);
            // Prune candidate facts without any valid expressions
            rewriters.add(exprResolver);
            rewriters.add(new LightestFactResolver());
        }
        // Phase 2: resolve fact table partitions.
        rewriters.add(storageTableResolver);
        rewriters.add(new MaxCoveringFactResolver(conf));
        // Phase 3: resolve dimension tables and partitions.
        rewriters.add(storageTableResolver);
        // Prune candidate tables for which denorm column references do not exist
        rewriters.add(denormResolver);
        // Prune candidate facts without any valid expressions
        rewriters.add(exprResolver);
        // Having LightestFactResolver before LeastPartitionResolver means: if two
        // facts have the same (least) weight, the fact requiring the least number
        // of queried time partitions is picked. This is useful if users did not
        // set fact weights.
        if (!lightFactFirst) {
            rewriters.add(new LightestFactResolver());
        }
        rewriters.add(new LeastPartitionResolver());
        rewriters.add(new LightestDimensionResolver());
    }

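    /**
     * Rewrites an already parsed query AST: runs semantic analysis with
     * CubeSemanticAnalyzer, builds a CubeQueryContext from the result and
     * passes it through the rewriter chain.
     */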
    public CubeQueryContext rewrite(ASTNode astnode) throws LensException {
        CubeSemanticAnalyzer analyzer;
        try {
            analyzer = new CubeSemanticAnalyzer(conf, hconf);
            analyzer.analyze(astnode, qlCtx);
        } catch (SemanticException e) {
            throw new LensException(SYNTAX_ERROR.getLensErrorInfo(), e, e.getMessage());
        }
        CubeQueryContext ctx = new CubeQueryContext(astnode, analyzer.getCubeQB(), conf, hconf);
        rewrite(rewriters, ctx);
        return ctx;
    }

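    /**
     * Rewrites a query given as a command string: strips newlines, parses the
     * string into an AST with Hive's ParseDriver and delegates to the AST
     * overload.
     */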
    public CubeQueryContext rewrite(String command) throws LensException {
        if (command != null) {
            command = command.replace("\n", "");
        }
        ASTNode tree;
        try {
            ParseDriver pd = new ParseDriver();
            tree = pd.parse(command, qlCtx, false);
            tree = ParseUtils.findRootNonNullToken(tree);
        } catch (ParseException e) {
            throw new LensException(SYNTAX_ERROR.getLensErrorInfo(), e, e.getMessage());
        }
        return rewrite(tree);
    }

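    /** Separator between the rewriter class name and the iteration number in gauge names. */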
    private static final String ITER_STR = "-ITER-";

    private void rewrite(List<ContextRewriter> rewriters, CubeQueryContext ctx) throws LensException {
        int i = 0;
        for (ContextRewriter rewriter : rewriters) {
            /*
             * The iteration number is added to the gauge name because some rewriters
             * run in more than one phase; it tells which run of the rewriter a gauge
             * corresponds to.
             */
            MethodMetricsContext mgauge = MethodMetricsFactory.createMethodGauge(ctx.getConf(), true,
                    rewriter.getClass().getCanonicalName() + ITER_STR + i);
            rewriter.rewriteContext(ctx);
            mgauge.markSuccess();
            i++;
        }
    }

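    /** Returns the Hive query Context used for parsing and semantic analysis. */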
    public Context getQLContext() {
        return qlCtx;
    }

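    /**
     * Clears the underlying Hive query Context; an IOException during cleanup
     * is logged and ignored.
     */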
    public void clear() {
        try {
            if (qlCtx != null) {
                qlCtx.clear();
            }
        } catch (IOException e) {
            log.info("Ignoring exception in clearing qlCtx:", e);
            // ignoring exception in clear
        }
    }
}
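
The setupRewriters / rewrite pair above is a straightforward pipeline: an ordered list of rewriters that all mutate one shared context, with the same instance allowed to appear at several positions so that it runs in several phases. The following self-contained sketch shows the same pattern with hypothetical names; it illustrates the design and is not Lens code.

import java.util.ArrayList;
import java.util.List;

public class RewriterChainSketch {

    /** Shared mutable state threaded through the chain, in the role of CubeQueryContext. */
    static class QueryContext {
        final StringBuilder trace = new StringBuilder();
    }

    /** One pass over the context, in the role of ContextRewriter.rewriteContext. */
    interface Rewriter {
        void rewrite(QueryContext ctx);
    }

    public static void main(String[] args) {
        List<Rewriter> chain = new ArrayList<>();
        // The same instance can be added more than once, giving it multiple
        // phases - just as storageTableResolver is added three times above.
        Rewriter multiPhase = ctx -> ctx.trace.append("[storage]");
        chain.add(ctx -> ctx.trace.append("[columns]"));
        chain.add(multiPhase);
        chain.add(ctx -> ctx.trace.append("[joins]"));
        chain.add(multiPhase);

        QueryContext ctx = new QueryContext();
        int iteration = 0;
        for (Rewriter r : chain) {
            // The iteration counter plays the role of ITER_STR + i in the
            // metric gauge names above.
            r.rewrite(ctx);
            iteration++;
        }
        System.out.println(ctx.trace + " after " + iteration + " passes");
    }
}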