Java tutorial: a custom HiveHBaseTextTableInputFormat for reading HBase tables from Hive
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ask.hive.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hbase.KeyValue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.WhileMatchFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormatBase;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.ql.exec.ExprNodeConstantEvaluator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.hbase.HBaseSplit;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;

/**
 * HiveHBaseTextTableInputFormat implements InputFormat for HBase storage handler
 * tables, decorating an underlying HBase TableInputFormat with extra Hive logic
 * such as column pruning and filter pushdown.
 */
public class HiveHBaseTextTableInputFormat extends TableInputFormatBase
    implements InputFormat<Text, Text> {

  static final Log LOG = LogFactory.getLog(HiveHBaseTextTableInputFormat.class);

  public static final String HBASE_KEY_COL = ":key";

  //@Override
  public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf jobConf,
      final Reporter reporter) throws IOException {

    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getSplit();
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
      iKey = parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
          hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (Exception se) {
      throw new IOException(se);
    }

    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

    if (hbaseColumnFamilies.size() < readColIDs.size()) {
      throw new IOException("Cannot read more columns than the given table contains.");
    }

    boolean addAll = (readColIDs.size() == 0);
    Scan scan = new Scan();
    boolean empty = true;

    if (!addAll) {
      for (int i : readColIDs) {
        if (i == iKey) {
          continue;
        }
        scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        empty = false;
      }
    }

    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty, i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // table's column projection.
    if (empty) {
      for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (i == iKey) {
          continue;
        }
        if (hbaseColumnQualifiers.get(i) == null) {
          scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
          scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }
        if (!addAll) {
          break;
        }
      }
    }

    // Apply the optional time range and column value filters configured for the scan.
    setTime(jobConf, scan);

    // If Hive's optimizer gave us a filter to process, convert it to the
    // HBase scan form now.
    tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);
    setScan(scan);

    Job job = new Job(jobConf);
    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
      @Override
      public void progress() {
        reporter.progress();
      }
    };

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader =
        createRecordReader(tableSplit, tac);

    return new RecordReader<Text, Text>() {

      //@Override
      public void close() throws IOException {
        recordReader.close();
      }

      //@Override
      public Text createKey() {
        return new Text();
      }

      //@Override
      public Text createValue() {
        return new Text();
      }

      //@Override
      public long getPos() throws IOException {
        return 0;
      }

      //@Override
      public float getProgress() throws IOException {
        float progress = 0.0F;
        try {
          progress = recordReader.getProgress();
        } catch (InterruptedException e) {
          throw new IOException(e);
        }
        return progress;
      }

      //@Override
      public boolean next(Text rowKey, Text value) throws IOException {
        boolean next = false;
        try {
          next = recordReader.nextKeyValue();
          // Build the value text: each cell becomes "<first char of qualifier>_<value>";
          // cells from the same qualifier are comma-separated, different qualifiers
          // are tab-separated.
          if (next) {
            rowKey.set(Bytes.toString(recordReader.getCurrentValue().getRow()));
            StringBuilder val = new StringBuilder();
            String prev = "";
            for (KeyValue kv : recordReader.getCurrentValue().raw()) {
              String current = new String(kv.getQualifier());
              char[] col = current.toCharArray();
              if (val.length() > 0) {
                if (prev.equals(current)) {
                  val.append(",");
                } else {
                  val.append("\t");
                }
              }
              prev = current;
              val.append(col[0]).append("_");
              val.append(Bytes.toString(kv.getValue()));
            }
            value.set(val.toString());
            // rowKey.set(Bytes.toString(recordReader.getCurrentValue().getRow()));
            // value.set(Bytes.toString(recordReader.getCurrentValue().value()));
          }
        } catch (InterruptedException e) {
          throw new IOException(e);
        }
        return next;
      }
    };
  }

  /**
   * Applies the optional time range and column value filters to the scan.
   * The minimum time must be less than or equal to the maximum time,
   * otherwise the time range filter is skipped.
   *
   * @param jobConf job configuration carrying the hbase.* scan properties
   * @param scan the HBase scan to restrict
   * @throws java.io.IOException
   */
  private void setTime(JobConf jobConf, Scan scan) throws IOException {
    long min = 0L;
    String mintime = jobConf.get("hbase.mintime");
    if (StringUtils.isNotEmpty(mintime)) {
      min = Long.parseLong(mintime);
    }
    String maxtime = jobConf.get("hbase.maxtime");
    if (StringUtils.isNotEmpty(maxtime)) {
      long max = Long.parseLong(maxtime);
      if (min <= max) {
        scan.setTimeRange(min, max);
      }
    }

    FilterList list = new FilterList(FilterList.Operator.MUST_PASS_ALL);

    boolean isInmissing = true;
    String missing = jobConf.get("hbase.include.missing");
    if (StringUtils.isNotEmpty(missing)) {
      isInmissing = Boolean.valueOf(missing);
    }
    String hvalue = jobConf.get("hbase.include.filter.value");
    if (StringUtils.isNotEmpty(hvalue)) {
      String[] columns = hvalue.split(",");
      if (columns.length > 0) {
        for (String column : columns) {
          String[] fv = column.split(":");
          SingleColumnValueFilter rowfilter = new SingleColumnValueFilter(
              Bytes.toBytes(fv[0]), Bytes.toBytes(fv[1]),
              CompareOp.EQUAL, Bytes.toBytes(fv[2]));
          rowfilter.setFilterIfMissing(isInmissing);
          list.addFilter(rowfilter);
        }
      }
    }

    boolean isExmissing = false;
    String exMissing = jobConf.get("hbase.exclude.missing");
    if (StringUtils.isNotEmpty(exMissing)) {
      isExmissing = Boolean.valueOf(exMissing);
    }
    String hexvalue = jobConf.get("hbase.exclude.filter.value");
    if (StringUtils.isNotEmpty(hexvalue)) {
      String[] columns = hexvalue.split(",");
      if (columns.length > 0) {
        for (String column : columns) {
          String[] fv = column.split(":");
          SingleColumnValueFilter rowfilter = new SingleColumnValueFilter(
              Bytes.toBytes(fv[0]), Bytes.toBytes(fv[1]),
              CompareOp.NOT_EQUAL, Bytes.toBytes(fv[2]));
          rowfilter.setFilterIfMissing(isExmissing);
          list.addFilter(rowfilter);
        }
      }
    }

    String hmax = jobConf.get("hbase.max.version");
    if (StringUtils.isNotEmpty(hmax)) {
      scan.setMaxVersions(Integer.parseInt(hmax));
    }
    scan.setFilter(list);
  }

  /**
   * Converts a filter (which has been pushed down from Hive's optimizer)
   * into corresponding restrictions on the HBase scan. The
   * filter should already be in a form which can be fully converted.
   *
   * @param jobConf configuration for the scan
   * @param scan the HBase scan object to restrict
   * @param tableSplit the HBase table split to restrict, or null if calculating splits
   * @param iKey 0-based offset of key column within Hive table
   * @return converted table split if any
   */
  private TableSplit convertFilter(JobConf jobConf, Scan scan, TableSplit tableSplit, int iKey)
      throws IOException {

    String filterExprSerialized = jobConf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
    if (filterExprSerialized == null) {
      return tableSplit;
    }
    ExprNodeDesc filterExpr = Utilities.deserializeExpression(filterExprSerialized, jobConf);

    String columnNameProperty = jobConf.get(Constants.LIST_COLUMNS);
    List<String> columnNames = Arrays.asList(columnNameProperty.split(","));

    IndexPredicateAnalyzer analyzer = newIndexPredicateAnalyzer(columnNames.get(iKey));
    List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
    ExprNodeDesc residualPredicate = analyzer.analyzePredicate(filterExpr, searchConditions);

    // There should be no residual since we already negotiated
    // that earlier in HBaseStorageHandler.decomposePredicate.
    if (residualPredicate != null) {
      throw new RuntimeException("Unexpected residual predicate " +
          residualPredicate.getExprString());
    }

    // There should be exactly one predicate since we already
    // negotiated that also.
    if (searchConditions.size() != 1) {
      throw new RuntimeException("Exactly one search condition expected in push down");
    }

    // Convert the search condition into a restriction on the HBase scan.
    IndexSearchCondition sc = searchConditions.get(0);
    ExprNodeConstantEvaluator eval = new ExprNodeConstantEvaluator(sc.getConstantDesc());
    byte[] startRow;
    try {
      ObjectInspector objInspector = eval.initialize(null);
      Object writable = eval.evaluate(null);
      ByteStream.Output serializeStream = new ByteStream.Output();
      LazyUtils.writePrimitiveUTF8(serializeStream, writable,
          (PrimitiveObjectInspector) objInspector, false, (byte) 0, null);
      startRow = new byte[serializeStream.getCount()];
      System.arraycopy(serializeStream.getData(), 0, startRow, 0, serializeStream.getCount());
    } catch (HiveException ex) {
      throw new IOException(ex);
    }

    // stopRow is exclusive, so pad it with a trailing 0 byte to
    // make it compare as the very next value after startRow.
    byte[] stopRow = new byte[startRow.length + 1];
    System.arraycopy(startRow, 0, stopRow, 0, startRow.length);

    if (tableSplit != null) {
      tableSplit = new TableSplit(tableSplit.getTableName(), startRow, stopRow,
          tableSplit.getRegionLocation());
    }

    scan.setStartRow(startRow);
    scan.setStopRow(stopRow);

    // Add a WhileMatchFilter to make the scan terminate as soon
    // as we see a non-matching key. This is probably redundant
    // since the stopRow above should already take care of it for us.
    scan.setFilter(new WhileMatchFilter(
        new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(startRow))));

    return tableSplit;
  }

  /**
   * Instantiates a new predicate analyzer suitable for
   * determining how to push a filter down into the HBase scan,
   * based on the rules for what kinds of pushdown we currently support.
   *
   * @param keyColumnName name of the Hive column mapped to the HBase row key
   * @return preconfigured predicate analyzer
   */
  static IndexPredicateAnalyzer newIndexPredicateAnalyzer(String keyColumnName) {
    IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();

    // For now, we only support equality comparisons...
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual");

    // ...and only on the key column.
    analyzer.clearAllowedColumnNames();
    analyzer.allowColumnName(keyColumnName);

    return analyzer;
  }

  //@Override
  public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    if (hbaseColumnsMapping == null) {
      throw new IOException("hbase.columns.mapping required for HBase Table.");
    }

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
      iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
          hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
      throw new IOException(se);
    }

    Scan scan = new Scan();

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    convertFilter(jobConf, scan, null, iKey);

    // REVIEW: are we supposed to be applying getReadColumnIDs here,
    // same as in getRecordReader?
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
      if (i == iKey) {
        continue;
      }
      if (hbaseColumnQualifiers.get(i) == null) {
        scan.addFamily(hbaseColumnFamiliesBytes.get(i));
      } else {
        scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
      }
    }
    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
      results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
  }

  /**
   * Parses the HBase columns mapping to identify the column families, qualifiers
   * and also caches the byte arrays corresponding to them. One of the Hive table
   * columns maps to the HBase row key, by default the first column.
   *
   * @param columnMapping - the column mapping specification to be parsed
   * @param colFamilies - the list of HBase column family names
   * @param colFamiliesBytes - the corresponding byte arrays
   * @param colQualifiers - the list of HBase column qualifier names
   * @param colQualifiersBytes - the corresponding byte arrays
   * @return the row key index in the column names list
   * @throws IOException
   */
  public static int parseColumnMapping(String columnMapping, List<String> colFamilies,
      List<byte[]> colFamiliesBytes, List<String> colQualifiers,
      List<byte[]> colQualifiersBytes) throws IOException {

    int rowKeyIndex = -1;

    if (colFamilies == null || colQualifiers == null) {
      throw new IOException("Error: caller must pass in lists for the column families " +
          "and qualifiers.");
    }

    colFamilies.clear();
    colQualifiers.clear();

    if (columnMapping == null) {
      throw new IOException("Error: hbase.columns.mapping missing for this HBase table.");
    }

    if (columnMapping.equals("") || columnMapping.equals(HBASE_KEY_COL)) {
      throw new IOException("Error: hbase.columns.mapping specifies only the HBase table" +
          " row key. A valid Hive-HBase table must specify at least one additional column.");
    }

    String[] mapping = columnMapping.split(",");

    for (int i = 0; i < mapping.length; i++) {
      String elem = mapping[i];
      int idxFirst = elem.indexOf(":");
      int idxLast = elem.lastIndexOf(":");

      if (idxFirst < 0 || !(idxFirst == idxLast)) {
        throw new IOException("Error: the HBase columns mapping contains a badly formed " +
            "column family, column qualifier specification.");
      }

      if (elem.equals(HBASE_KEY_COL)) {
        rowKeyIndex = i;
        colFamilies.add(elem);
        colQualifiers.add(null);
      } else {
        String[] parts = elem.split(":");
        assert (parts.length > 0 && parts.length <= 2);
        colFamilies.add(parts[0]);
        if (parts.length == 2) {
          colQualifiers.add(parts[1]);
        } else {
          colQualifiers.add(null);
        }
      }
    }

    if (rowKeyIndex == -1) {
      colFamilies.add(0, HBASE_KEY_COL);
      colQualifiers.add(0, null);
      rowKeyIndex = 0;
    }

    if (colFamilies.size() != colQualifiers.size()) {
      throw new IOException("Error in parsing the hbase columns mapping.");
    }

    // Populate the corresponding byte[] lists if the client has passed in non-null lists.
    if (colFamiliesBytes != null) {
      colFamiliesBytes.clear();
      for (String fam : colFamilies) {
        colFamiliesBytes.add(Bytes.toBytes(fam));
      }
    }

    if (colQualifiersBytes != null) {
      colQualifiersBytes.clear();
      for (String qual : colQualifiers) {
        if (qual == null) {
          colQualifiersBytes.add(null);
        } else {
          colQualifiersBytes.add(Bytes.toBytes(qual));
        }
      }
    }

    if (colFamiliesBytes != null && colQualifiersBytes != null) {
      if (colFamiliesBytes.size() != colQualifiersBytes.size()) {
        throw new IOException("Error in caching the bytes for the hbase column families " +
            "and qualifiers.");
      }
    }

    return rowKeyIndex;
  }
}
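The setTime() helper above drives the time range and value filters entirely from job properties (hbase.mintime, hbase.maxtime, hbase.include.filter.value, hbase.include.missing, hbase.exclude.filter.value, hbase.exclude.missing, hbase.max.version). Below is a minimal sketch of how a driver might populate those properties before handing the JobConf to a job that uses HiveHBaseTextTableInputFormat. The table name "web_logs", the "d" column family, and all values are placeholders chosen for illustration, not part of the class above.

import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.mapred.JobConf;

public class ScanTuningExample {
  public static void main(String[] args) {
    JobConf conf = new JobConf();

    // Table and column mapping, read by getRecordReader()/getSplits().
    // ":key,d:" maps the row key plus the whole "d" family (placeholder names).
    conf.set(HBaseSerDe.HBASE_TABLE_NAME, "web_logs");
    conf.set(HBaseSerDe.HBASE_COLUMNS_MAPPING, ":key,d:");

    // Optional time range on cell timestamps; applied only when mintime <= maxtime.
    conf.set("hbase.mintime", "1262304000000");
    conf.set("hbase.maxtime", "1264982400000");

    // Keep only rows where d:status == "200"; with hbase.include.missing=true the
    // filter calls setFilterIfMissing(true), so rows lacking d:status are skipped.
    conf.set("hbase.include.filter.value", "d:status:200");
    conf.set("hbase.include.missing", "true");

    // Drop rows where d:agent == "bot"; with hbase.exclude.missing=false rows
    // lacking d:agent are kept.
    conf.set("hbase.exclude.filter.value", "d:agent:bot");
    conf.set("hbase.exclude.missing", "false");

    // Read at most one version per cell.
    conf.set("hbase.max.version", "1");
  }
}

Each entry in hbase.include.filter.value and hbase.exclude.filter.value follows the family:qualifier:value format that setTime() splits on, and multiple entries can be comma-separated; all resulting filters are combined with FilterList.Operator.MUST_PASS_ALL.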