com.ask.hive.hbase.HiveHBaseTimeTableInputFormat.java Source code

Introduction

Here is the source code for com.ask.hive.hbase.HiveHBaseTimeTableInputFormat.java. The class extends Hive's HiveHBaseTableInputFormat and, in addition to the usual column pruning and row-key filter pushdown, restricts the underlying HBase Scan to a time range read from the hbase.mintime and hbase.maxtime job properties.
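
The following is a minimal, hypothetical sketch of how the input format could be driven directly through the old mapred API, outside of Hive. The table name "events", the column mapping ":key,cf:payload", the input path, and the one-day time window are placeholders; the sketch assumes the HBaseSerDe.HBASE_TABLE_NAME and HBASE_COLUMNS_MAPPING properties are set the same way Hive's HBase storage handler would set them, that the time bounds are epoch milliseconds, and that a reachable HBase cluster is configured on the classpath.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import com.ask.hive.hbase.HiveHBaseTimeTableInputFormat;

public class TimeRangeScanExample {

    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();

        // Properties that Hive's HBase storage handler would normally supply
        // (placeholder table name and column mapping).
        jobConf.set(HBaseSerDe.HBASE_TABLE_NAME, "events");
        jobConf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,cf:payload");

        // getSplits() reads the input path only to tag the resulting HBaseSplits,
        // so a placeholder path is enough here.
        FileInputFormat.setInputPaths(jobConf, new Path("/tmp/events"));

        // Time bounds consumed by setTime(); assumed to be epoch milliseconds,
        // matching default HBase cell timestamps. Here: the last 24 hours.
        long now = System.currentTimeMillis();
        jobConf.set("hbase.mintime", String.valueOf(now - 24L * 60 * 60 * 1000));
        jobConf.set("hbase.maxtime", String.valueOf(now));

        HiveHBaseTimeTableInputFormat inputFormat = new HiveHBaseTimeTableInputFormat();
        for (InputSplit split : inputFormat.getSplits(jobConf, 1)) {
            RecordReader<ImmutableBytesWritable, Result> reader =
                    inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
            ImmutableBytesWritable key = reader.createKey();
            Result value = reader.createValue();
            while (reader.next(key, value)) {
                System.out.println(Bytes.toString(key.get()));
            }
            reader.close();
        }
    }
}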

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ask.hive.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.WhileMatchFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormatBase;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.hive.ql.exec.ExprNodeConstantEvaluator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.hbase.HBaseSplit;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.hive.hbase.HiveHBaseTableInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;

/**
 * HiveHBaseTimeTableInputFormat extends Hive's HiveHBaseTableInputFormat, which
 * implements InputFormat for HBase storage handler tables and decorates an
 * underlying HBase TableInputFormat with extra Hive logic such as column pruning
 * and filter pushdown. This subclass additionally restricts the HBase scan to a
 * time range taken from the hbase.mintime and hbase.maxtime job properties.
 */
public class HiveHBaseTimeTableInputFormat extends HiveHBaseTableInputFormat
        implements InputFormat<ImmutableBytesWritable, Result> {

    static final Log LOG = LogFactory.getLog(HiveHBaseTimeTableInputFormat.class);

    //@Override
    public RecordReader<ImmutableBytesWritable, Result> getRecordReader(InputSplit split, JobConf jobConf,
            final Reporter reporter) throws IOException {

        HBaseSplit hbaseSplit = (HBaseSplit) split;
        TableSplit tableSplit = hbaseSplit.getSplit();
        String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
        setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
        String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
        List<String> hbaseColumnFamilies = new ArrayList<String>();
        List<String> hbaseColumnQualifiers = new ArrayList<String>();
        List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
        List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

        int iKey;
        try {
            iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                    hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
        } catch (SerDeException se) {
            throw new IOException(se);
        }
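        // Column pruning: Hive records the IDs of the columns the query actually reads.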
        List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

        if (hbaseColumnFamilies.size() < readColIDs.size()) {
            throw new IOException("Cannot read more columns than the given table contains.");
        }

        boolean addAll = (readColIDs.size() == 0);
        Scan scan = new Scan();
        boolean empty = true;

        if (!addAll) {
            for (int i : readColIDs) {
                if (i == iKey) {
                    continue;
                }

                if (hbaseColumnQualifiers.get(i) == null) {
                    scan.addFamily(hbaseColumnFamiliesBytes.get(i));
                } else {
                    scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
                }

                empty = false;
            }
        }

        // The HBase table's row key maps to a Hive table column. In the corner case when only the
        // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
        // column qualifier will have been added to the scan. We arbitrarily add at least one column
        // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
        // table's column projection.
        if (empty) {
            for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
                if (i == iKey) {
                    continue;
                }

                if (hbaseColumnQualifiers.get(i) == null) {
                    scan.addFamily(hbaseColumnFamiliesBytes.get(i));
                } else {
                    scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
                }

                if (!addAll) {
                    break;
                }
            }
        }

        // Apply the optional time range (hbase.mintime / hbase.maxtime) to the scan.
        setTime(jobConf, scan);
        // If Hive's optimizer gave us a filter to process, convert it to the
        // HBase scan form now.
        tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);

        setScan(scan);

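        // HBase's TableInputFormatBase works against the new mapreduce API, so wrap the
        // JobConf in a TaskAttemptContext and forward progress() calls to the old-API Reporter.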
        Job job = new Job(jobConf);
        TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {

            @Override
            public void progress() {
                reporter.progress();
            }
        };

        final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader = createRecordReader(
                tableSplit, tac);

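        // Adapt the new-API reader back to the old-API RecordReader interface that Hive consumes.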
        return new RecordReader<ImmutableBytesWritable, Result>() {

            //@Override
            public void close() throws IOException {
                recordReader.close();
            }

            // @Override
            public ImmutableBytesWritable createKey() {
                return new ImmutableBytesWritable();
            }

            // @Override
            public Result createValue() {
                return new Result();
            }

            // @Override
            public long getPos() throws IOException {
                return 0;
            }

            // @Override
            public float getProgress() throws IOException {
                float progress = 0.0F;

                try {
                    progress = recordReader.getProgress();
                } catch (InterruptedException e) {
                    throw new IOException(e);
                }

                return progress;
            }

            // @Override
            public boolean next(ImmutableBytesWritable rowKey, Result value) throws IOException {

                boolean next = false;

                try {
                    next = recordReader.nextKeyValue();

                    if (next) {
                        rowKey.set(recordReader.getCurrentValue().getRow());
                        Writables.copyWritable(recordReader.getCurrentValue(), value);
                    }
                } catch (InterruptedException e) {
                    throw new IOException(e);
                }

                return next;
            }
        };
    }

    /**
     * Applies the optional time range to the scan. The minimum time must be less
     * than or equal to the maximum time; otherwise the time-range filter is skipped.
     *
     * @param jobConf job configuration carrying hbase.mintime and hbase.maxtime
     * @param scan    the HBase scan to restrict
     * @throws IOException if the time range cannot be set on the scan
     */
    private void setTime(JobConf jobConf, Scan scan) throws IOException {
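        // The time bounds are parsed as longs; they are assumed to be in the same
        // units as HBase cell timestamps (milliseconds by default).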
        long min = 0L;
        String mintime = jobConf.get("hbase.mintime");
        if (StringUtils.isNotEmpty(mintime)) {
            min = Long.parseLong(mintime);
        }
        String maxtime = jobConf.get("hbase.maxtime");
        if (StringUtils.isNotEmpty(maxtime)) {
            long l = Long.parseLong(maxtime);
            if (min <= l)
                scan.setTimeRange(min, l);
        } else if (min > 0) {
            long l = System.currentTimeMillis();
            if (min <= l)
                scan.setTimeRange(min, l);
        }
    }

    /**
     * Converts a filter (which has been pushed down from Hive's optimizer)
     * into corresponding restrictions on the HBase scan.  The
     * filter should already be in a form which can be fully converted.
     *
     * @param jobConf    configuration for the scan
     * @param scan       the HBase scan object to restrict
     * @param tableSplit the HBase table split to restrict, or null
     *                   if calculating splits
     * @param iKey       0-based offset of key column within Hive table
     * @return converted table split if any
     */
    private TableSplit convertFilter(JobConf jobConf, Scan scan, TableSplit tableSplit, int iKey)
            throws IOException {

        String filterExprSerialized = jobConf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
        if (filterExprSerialized == null) {
            return tableSplit;
        }
        ExprNodeDesc filterExpr = Utilities.deserializeExpression(filterExprSerialized, jobConf);

        String columnNameProperty = jobConf.get(Constants.LIST_COLUMNS);
        List<String> columnNames = Arrays.asList(columnNameProperty.split(","));

        IndexPredicateAnalyzer analyzer = newIndexPredicateAnalyzer(columnNames.get(iKey));

        List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
        ExprNodeDesc residualPredicate = analyzer.analyzePredicate(filterExpr, searchConditions);

        // There should be no residual since we already negotiated
        // that earlier in HBaseStorageHandler.decomposePredicate.
        if (residualPredicate != null) {
            throw new RuntimeException("Unexpected residual predicate " + residualPredicate.getExprString());
        }

        // There should be exactly one predicate since we already
        // negotiated that also.
        if (searchConditions.size() != 1) {
            throw new RuntimeException("Exactly one search condition expected in push down");
        }

        // Convert the search condition into a restriction on the HBase scan
        IndexSearchCondition sc = searchConditions.get(0);
        ExprNodeConstantEvaluator eval = new ExprNodeConstantEvaluator(sc.getConstantDesc());
        byte[] startRow;
        try {
            ObjectInspector objInspector = eval.initialize(null);
            Object writable = eval.evaluate(null);
            ByteStream.Output serializeStream = new ByteStream.Output();
            LazyUtils.writePrimitiveUTF8(serializeStream, writable, (PrimitiveObjectInspector) objInspector, false,
                    (byte) 0, null);
            startRow = new byte[serializeStream.getCount()];
            System.arraycopy(serializeStream.getData(), 0, startRow, 0, serializeStream.getCount());
        } catch (HiveException ex) {
            throw new IOException(ex);
        }

        // stopRow is exclusive, so pad it with a trailing 0 byte to
        // make it compare as the very next value after startRow
        byte[] stopRow = new byte[startRow.length + 1];
        System.arraycopy(startRow, 0, stopRow, 0, startRow.length);

        if (tableSplit != null) {
            tableSplit = new TableSplit(tableSplit.getTableName(), startRow, stopRow,
                    tableSplit.getRegionLocation());
        }
        scan.setStartRow(startRow);
        scan.setStopRow(stopRow);
        // Add a WhileMatchFilter to make the scan terminate as soon
        // as we see a non-matching key.  This is probably redundant
        // since the stopRow above should already take care of it for us.

        scan.setFilter(
                new WhileMatchFilter(new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(startRow))));
        return tableSplit;
    }

    /**
     * Instantiates a new predicate analyzer suitable for
     * determining how to push a filter down into the HBase scan,
     * based on the rules for what kinds of pushdown we currently support.
     *
     * @param keyColumnName name of the Hive column mapped to the HBase row key
     * @return preconfigured predicate analyzer
     */
    static IndexPredicateAnalyzer newIndexPredicateAnalyzer(String keyColumnName) {

        IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();

        // for now, we only support equality comparisons
        analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual");

        // and only on the key column
        analyzer.clearAllowedColumnNames();
        analyzer.allowColumnName(keyColumnName);

        return analyzer;
    }

    //@Override
    public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {

        String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
        setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
        String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

        if (hbaseColumnsMapping == null) {
            throw new IOException("hbase.columns.mapping required for HBase Table.");
        }

        List<String> hbaseColumnFamilies = new ArrayList<String>();
        List<String> hbaseColumnQualifiers = new ArrayList<String>();
        List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
        List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

        int iKey;
        try {
            iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                    hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
        } catch (SerDeException se) {
            throw new IOException(se);
        }

        Scan scan = new Scan();

        // Take filter pushdown into account while calculating splits; this
        // allows us to prune off regions immediately.  Note that although
        // the Javadoc for the superclass getSplits says that it returns one
        // split per region, the implementation actually takes the scan
        // definition into account and excludes regions which don't satisfy
        // the start/stop row conditions (HBASE-1829).
        convertFilter(jobConf, scan, null, iKey);

        // REVIEW:  are we supposed to be applying the getReadColumnIDs
        // same as in getRecordReader?
        for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
            if (i == iKey) {
                continue;
            }

            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }
        }

        setScan(scan);
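        // The superclass getSplits works against the new mapreduce API, so wrap the
        // JobConf in a JobContext before delegating.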
        Job job = new Job(jobConf);
        JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
        Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

        List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
        InputSplit[] results = new InputSplit[splits.size()];

        for (int i = 0; i < splits.size(); i++) {
            results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
        }

        return results;
    }
}