co.cask.cdap.hive.stream.HiveStreamInputFormat.java Source code


Introduction

Here is the source code for co.cask.cdap.hive.stream.HiveStreamInputFormat.java

Source

/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.hive.stream;

import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data.stream.StreamInputSplitFactory;
import co.cask.cdap.data.stream.StreamInputSplitFinder;
import co.cask.cdap.data.stream.StreamUtils;
import co.cask.cdap.data2.transaction.stream.StreamConfig;
import co.cask.cdap.hive.context.ContextManager;
import co.cask.cdap.proto.Id;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.twill.filesystem.Location;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import javax.annotation.Nullable;

/**
 * Stream input format for use in Hive queries, and only Hive queries. It will not work outside of Hive.
 */
public class HiveStreamInputFormat implements InputFormat<Void, ObjectWritable> {
    private static final Logger LOG = LoggerFactory.getLogger(HiveStreamInputFormat.class);

    @Override
    public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
        // Right before this method is called by Hive, Hive copies everything that the storage handler put into the
        // properties map in its configureTableJobProperties method into the job conf. We put the stream name in there
        // so that we can derive the stream path and properties from it.
        // This MUST be done in the input format and not in StreamSerDe's initialize method, because we have no control
        // over when initialize is called. If we set job conf settings there, the settings for one stream get clobbered
        // by the settings for another stream if a join over streams is being performed.
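        // For illustration only (hypothetical stream name and namespace), the relevant entries Hive copies into
        // this JobConf look like:
        //   conf.get(Constants.Explore.STREAM_NAME)      -> "purchases"
        //   conf.get(Constants.Explore.STREAM_NAMESPACE) -> "default"
        // getSplitFinder(conf) below reads these two keys to locate the stream.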
        StreamInputSplitFinder<InputSplit> splitFinder = getSplitFinder(conf);
        List<InputSplit> splits = splitFinder.getSplits(conf);
        return splits.toArray(new InputSplit[splits.size()]);
    }

    @Override
    public RecordReader<Void, ObjectWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
            throws IOException {
        return new StreamRecordReader(split, conf);
    }

    private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
        // first get the context we are in
        ContextManager.Context context = ContextManager.getContext(conf);

        String streamName = conf.get(Constants.Explore.STREAM_NAME);
        String streamNamespace = conf.get(Constants.Explore.STREAM_NAMESPACE);
        Id.Stream streamId = Id.Stream.from(streamNamespace, streamName);
        StreamConfig streamConfig = context.getStreamConfig(streamId);
        // make sure we get the current generation so we don't read events that occurred before a truncate.
        Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(),
                StreamUtils.getGeneration(streamConfig));

        StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());

        // Get the Hive table path for the InputSplits created. It is just to satisfy Hive; the InputFormat never uses it.
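        // (Illustrative: for a managed table this is typically its warehouse directory, e.g. something like
        // /user/hive/warehouse/<db>.db/<table>; only tablePaths[0] is handed to each StreamInputSplit below.)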
        JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
        final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

        return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
            @Override
            public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime, long start,
                    long length, @Nullable String[] locations) {
                return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime, start, length,
                        locations);
            }
        });
    }

    /**
     * Sets up the given {@link StreamInputSplitFinder.Builder} by analyzing the query.
     */
    private StreamInputSplitFinder.Builder setupBuilder(Configuration conf, StreamConfig streamConfig,
            StreamInputSplitFinder.Builder builder) {
        // the conf contains a 'hive.io.filter.expr.serialized' key which contains the serialized form of ExprNodeDesc
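        // For example (illustrative table and values), a query such as
        //   SELECT * FROM stream_purchases WHERE ts > 1000 AND ts <= 2000
        // arrives here as a serialized expression tree. The analysis below turns the "ts" predicates into a
        // narrower scan window, here [1001, 2001) since stream start times are inclusive and end times are
        // exclusive, instead of scanning the whole TTL range.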
        long startTime = Math.max(0L, System.currentTimeMillis() - streamConfig.getTTL());
        long endTime = System.currentTimeMillis();

        String serializedExpr = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
        if (serializedExpr == null) {
            return builder.setStartTime(startTime).setEndTime(endTime);
        }

        try {
            ExprNodeGenericFuncDesc expr;
            // Hack to deal with the fact that older versions of Hive use
            // Utilities.deserializeExpression(String, Configuration),
            // whereas newer versions use Utilities.deserializeExpression(String).
            try {
                expr = Utilities.deserializeExpression(serializedExpr);
            } catch (NoSuchMethodError e) {
                expr = (ExprNodeGenericFuncDesc) Utilities.class
                        .getMethod("deserializeExpression", String.class, Configuration.class)
                        .invoke(null, serializedExpr, conf);
            }

            // Analyze the query to extract predicates that can be used for indexing (i.e. setting start/end time)
            IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
            for (CompareOp op : CompareOp.values()) {
                analyzer.addComparisonOp(op.getOpClassName());
            }

            // Stream can only be indexed by timestamp
            analyzer.clearAllowedColumnNames();
            analyzer.allowColumnName("ts");

            List<IndexSearchCondition> conditions = Lists.newArrayList();
            analyzer.analyzePredicate(expr, conditions);

            for (IndexSearchCondition condition : conditions) {
                CompareOp op = CompareOp.from(condition.getComparisonOp());
                if (op == null) {
                    // Not a supported operation
                    continue;
                }
                ExprNodeConstantDesc value = condition.getConstantDesc();
                if (value == null || !(value.getValue() instanceof Long)) {
                    // Not a supported value
                    continue;
                }

                long timestamp = (Long) value.getValue();
                // If there is an equals predicate, set both the start and end time; no need to inspect further
                if (op == CompareOp.EQUAL) {
                    startTime = timestamp;
                    endTime = (timestamp < Long.MAX_VALUE) ? timestamp + 1L : timestamp;
                    break;
                }
                if (op == CompareOp.GREATER || op == CompareOp.EQUAL_OR_GREATER) {
                    // Plus 1 for the start time if the op is GREATER, since the start time is inclusive in the stream
                    startTime = Math.max(startTime,
                            timestamp + (timestamp < Long.MAX_VALUE && op == CompareOp.GREATER ? 1L : 0L));
                } else {
                    // Plus 1 for the end time if the op is EQUAL_OR_LESS, since the end time is exclusive in the stream
                    endTime = Math.min(endTime,
                            timestamp + (timestamp < Long.MAX_VALUE && op == CompareOp.EQUAL_OR_LESS ? 1L : 0L));
                }
            }
        } catch (Throwable t) {
            LOG.warn("Exception analyzing query predicate. A full table scan will be performed.", t);
        }

        return builder.setStartTime(startTime).setEndTime(endTime);
    }

    private enum CompareOp {
        EQUAL(GenericUDFOPEqual.class.getName()),
        EQUAL_OR_GREATER(GenericUDFOPEqualOrGreaterThan.class.getName()),
        EQUAL_OR_LESS(GenericUDFOPEqualOrLessThan.class.getName()),
        GREATER(GenericUDFOPGreaterThan.class.getName()),
        LESS(GenericUDFOPLessThan.class.getName());

        private final String opClassName;

        CompareOp(String opClassName) {
            this.opClassName = opClassName;
        }

        public String getOpClassName() {
            return opClassName;
        }

        /**
         * Returns a {@link CompareOp} by matching the given class name or {@code null} if there is none matching.
         */
        @Nullable
        public static CompareOp from(String opClassName) {
            for (CompareOp op : values()) {
                if (op.getOpClassName().equals(opClassName)) {
                    return op;
                }
            }
            return null;
        }
    }
}
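
Example usage

The snippet below is not part of the source above; it is a rough sketch of the call sequence Hive performs against this InputFormat (getSplits, then getRecordReader per split). The stream name "purchases", namespace "default", and table path are hypothetical, and ContextManager.getContext(conf) only works inside CDAP's Explore/Hive runtime, so read this as an illustration of the contract rather than a standalone program.

// Rough sketch: hypothetical stream, namespace, and table path; requires the CDAP Explore/Hive runtime.
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.hive.stream.HiveStreamInputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class HiveStreamInputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // The two keys getSplits()/getSplitFinder() read; the values are made up for the example.
        conf.set(Constants.Explore.STREAM_NAME, "purchases");
        conf.set(Constants.Explore.STREAM_NAMESPACE, "default");
        // Hive normally supplies the table path; it is only needed to satisfy FileInputFormat.getInputPaths.
        org.apache.hadoop.mapred.FileInputFormat.setInputPaths(conf, new Path("/user/hive/warehouse/stream_purchases"));

        InputFormat<Void, ObjectWritable> inputFormat = new HiveStreamInputFormat();
        InputSplit[] splits = inputFormat.getSplits(conf, 1);
        for (InputSplit split : splits) {
            RecordReader<Void, ObjectWritable> reader = inputFormat.getRecordReader(split, conf, Reporter.NULL);
            Void key = reader.createKey();
            ObjectWritable value = reader.createValue();
            while (reader.next(key, value)) {
                // each value wraps one stream event
            }
            reader.close();
        }
    }
}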