package org.apache.hawq.pxf.plugins.hive;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.ByteArrayOutputStream;
import java.util.List;
import java.util.ListIterator;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hawq.pxf.api.FilterParser;
import org.apache.hawq.pxf.api.Fragment;
import org.apache.hawq.pxf.api.Fragmenter;
import org.apache.hawq.pxf.api.FragmentsStats;
import org.apache.hawq.pxf.api.Metadata;
import org.apache.hawq.pxf.api.utilities.ColumnDescriptor;
import org.apache.hawq.pxf.api.utilities.InputData;
import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;
import org.apache.hawq.pxf.plugins.hive.utilities.HiveUtilities;

/**
 * Fragmenter class for HIVE tables. <br>
 * Given a Hive table and its partitions, divide the data into fragments (here a
 * data fragment is actually an HDFS file block) and return a list of them. Each
 * data fragment will contain the following information:
 * <ol>
 * <li>sourceName: full HDFS path to the data file that this data fragment is
 * part of</li>
 * <li>hosts: a list of the datanode machines that hold a replica of this block</li>
 * <li>userData:
 * file_input_format_name_DELIM_serde_name_DELIM_serialization_properties</li>
 * </ol>
 */
public class HiveDataFragmenter extends Fragmenter {
    private static final Log LOG = LogFactory.getLog(HiveDataFragmenter.class);
    private static final short ALL_PARTS = -1;

    static final String HIVE_UD_DELIM = "!HUDD!";
    static final String HIVE_1_PART_DELIM = "!H1PD!";
    static final String HIVE_PARTITIONS_DELIM = "!HPAD!";
    static final String HIVE_NO_PART_TBL = "!HNPT!";

    static final String HIVE_API_EQ = " = ";
    static final String HIVE_API_DQUOTE = "\"";
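    // Note on the constants above: HIVE_UD_DELIM separates the fields of the
    // userData blob assembled in makeUserData(); HIVE_1_PART_DELIM and
    // HIVE_PARTITIONS_DELIM structure the serialized partition keys (see
    // serializePartitionKeys()), with HIVE_NO_PART_TBL standing in for an
    // unpartitioned table; HIVE_API_EQ and HIVE_API_DQUOTE are used when
    // building the metastore partition-filter string (see buildSingleFilter()).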
    private JobConf jobConf;
    private HiveMetaStoreClient client;

    protected boolean filterInFragmenter = false;

    // Data structure to hold the hive partition names, if they exist, to be
    // used by partition filtering
    private Set<String> setPartitions = new TreeSet<String>(
            String.CASE_INSENSITIVE_ORDER);

    /**
     * A Hive table unit - a subset of the HIVE table in which all files share
     * the same InputFormat and Serde. For a partitioned table the
     * HiveTableUnit will be one partition, and for an unpartitioned table the
     * HiveTableUnit will be the whole table.
     */
    class HiveTablePartition {
        StorageDescriptor storageDesc;
        Properties properties;
        Partition partition;
        List<FieldSchema> partitionKeys;
        String tableName;

        HiveTablePartition(StorageDescriptor storageDesc, Properties properties,
                           Partition partition, List<FieldSchema> partitionKeys,
                           String tableName) {
            this.storageDesc = storageDesc;
            this.properties = properties;
            this.partition = partition;
            this.partitionKeys = partitionKeys;
            this.tableName = tableName;
        }

        @Override
        public String toString() {
            return "table - " + tableName
                    + ((partition == null) ? "" : ", partition - " + partition);
        }
    }

    /**
     * Constructs a HiveDataFragmenter object.
     *
     * @param inputData all input parameters coming from the client
     */
    public HiveDataFragmenter(InputData inputData) {
        this(inputData, HiveDataFragmenter.class);
    }

    /**
     * Constructs a HiveDataFragmenter object.
     *
     * @param inputData all input parameters coming from the client
     * @param clazz Class for JobConf
     */
    public HiveDataFragmenter(InputData inputData, Class<?> clazz) {
        super(inputData);
        jobConf = new JobConf(new Configuration(), clazz);
        client = HiveUtilities.initHiveClient();
    }

    @Override
    public List<Fragment> getFragments() throws Exception {
        Metadata.Table tblDesc = HiveUtilities.parseTableQualifiedName(inputData.getDataSource());

        fetchTableMetaData(tblDesc);

        return fragments;
    }

    /**
     * Creates the partition InputFormat.
     *
     * @param inputFormatName input format class name
     * @param jobConf configuration data for the Hadoop framework
     * @return a {@link org.apache.hadoop.mapred.InputFormat} derived object
     * @throws Exception if failed to create input format
     */
    public static InputFormat<?, ?> makeInputFormat(String inputFormatName,
                                                    JobConf jobConf)
            throws Exception {
        Class<?> c = Class.forName(inputFormatName, true,
                JavaUtils.getClassLoader());
        InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) c.newInstance();

        if ("org.apache.hadoop.mapred.TextInputFormat".equals(inputFormatName)) {
            // TextInputFormat needs a special configuration
            ((TextInputFormat) inputFormat).configure(jobConf);
        }

        return inputFormat;
    }
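    // Illustrative usage (a sketch, not part of the original source): the
    // input format name typically comes from a partition's StorageDescriptor,
    // e.g.
    //
    //   InputFormat<?, ?> format = HiveDataFragmenter.makeInputFormat(
    //           "org.apache.hadoop.mapred.TextInputFormat", jobConf);
    //
    // Loading the class reflectively lets the fragmenter handle any
    // InputFormat implementation on the classpath, not only TextInputFormat.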
    /*
     * Goes over the table partitions metadata and extracts the splits and the
     * InputFormat and Serde per split.
     */
    private void fetchTableMetaData(Metadata.Table tblDesc) throws Exception {
        Table tbl = HiveUtilities.getHiveTable(client, tblDesc);

        verifySchema(tbl);

        List<Partition> partitions = null;
        String filterStringForHive = "";

        // If the query has a filter and the hive table has partitions, prepare
        // the filter string for the hive metastore and retrieve only the
        // matched partitions
        if (inputData.hasFilter() && tbl.getPartitionKeysSize() > 0) {

            // Save all hive partition names in a set for later filter match
            for (FieldSchema fs : tbl.getPartitionKeys()) {
                setPartitions.add(fs.getName());
            }

            LOG.debug("setPartitions :" + setPartitions);

            // Generate the filter string used to match the pxf filter against
            // the hive partition names
            filterStringForHive = buildFilterStringForHive();
        }

        if (!filterStringForHive.isEmpty()) {

            LOG.debug("Filter String for Hive partition retrieval : "
                    + filterStringForHive);

            filterInFragmenter = true;

            // API call to the Hive Metastore; will return a List of all the
            // partitions for this table that match the partition filters
            // defined in filterStringForHive.
            partitions = client.listPartitionsByFilter(tblDesc.getDbName(),
                    tblDesc.getTableName(), filterStringForHive, ALL_PARTS);

            // No matched partitions for the filter, no fragments to return.
            if (partitions == null || partitions.isEmpty()) {

                if (LOG.isDebugEnabled()) {
                    LOG.debug("Table - " + tblDesc.getDbName() + "."
                            + tblDesc.getTableName()
                            + " has no matched partitions for the filter : "
                            + filterStringForHive);
                }
                return;
            }

            if (LOG.isDebugEnabled()) {
                LOG.debug("Table - " + tblDesc.getDbName() + "."
                        + tblDesc.getTableName()
                        + " matched partitions list size: " + partitions.size());
            }

        } else {
            // API call to the Hive Metastore; will return a List of all the
            // partitions for this table (no filtering)
            partitions = client.listPartitions(tblDesc.getDbName(),
                    tblDesc.getTableName(), ALL_PARTS);
        }

        StorageDescriptor descTable = tbl.getSd();
        Properties props;

        if (partitions.isEmpty()) {
            props = getSchema(tbl);
            fetchMetaDataForSimpleTable(descTable, props);
        } else {
            List<FieldSchema> partitionKeys = tbl.getPartitionKeys();

            for (Partition partition : partitions) {
                StorageDescriptor descPartition = partition.getSd();
                props = MetaStoreUtils.getSchema(descPartition, descTable,
                        null, // Map<string, string> parameters - can be empty
                        tblDesc.getDbName(), tblDesc.getTableName(), // table name
                        partitionKeys);
                fetchMetaDataForPartitionedTable(descPartition, props,
                        partition, partitionKeys, tblDesc.getTableName());
            }
        }
    }
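    // Worked example (an assumed table layout, not from the original source):
    // for a table partitioned by a string column part1, a query with
    // WHERE part1 = 'AAAA' produces the metastore filter part1 = "AAAA"
    // (see buildFilterStringForHive() below), so listPartitionsByFilter()
    // returns only the matching partitions and all other partitions are
    // pruned before any splits are computed.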
    void verifySchema(Table tbl) throws Exception {
        /* nothing to verify here */
    }

    private static Properties getSchema(Table table) {
        return MetaStoreUtils.getSchema(table.getSd(), table.getSd(),
                table.getParameters(), table.getDbName(), table.getTableName(),
                table.getPartitionKeys());
    }

    private void fetchMetaDataForSimpleTable(StorageDescriptor stdsc,
                                             Properties props) throws Exception {
        fetchMetaDataForSimpleTable(stdsc, props, null);
    }

    private void fetchMetaDataForSimpleTable(StorageDescriptor stdsc,
                                             Properties props, String tableName)
            throws Exception {
        fetchMetaData(new HiveTablePartition(stdsc, props, null, null,
                tableName));
    }

    private void fetchMetaDataForPartitionedTable(StorageDescriptor stdsc,
                                                  Properties props,
                                                  Partition partition,
                                                  List<FieldSchema> partitionKeys,
                                                  String tableName)
            throws Exception {
        fetchMetaData(new HiveTablePartition(stdsc, props, partition,
                partitionKeys, tableName));
    }

    /* Fills a table partition */
    private void fetchMetaData(HiveTablePartition tablePartition)
            throws Exception {
        InputFormat<?, ?> fformat = makeInputFormat(
                tablePartition.storageDesc.getInputFormat(), jobConf);
        FileInputFormat.setInputPaths(jobConf, new Path(
                tablePartition.storageDesc.getLocation()));

        InputSplit[] splits = null;
        try {
            splits = fformat.getSplits(jobConf, 1);
        } catch (org.apache.hadoop.mapred.InvalidInputException e) {
            LOG.debug("getSplits failed on " + e.getMessage());
            return;
        }

        for (InputSplit split : splits) {
            FileSplit fsp = (FileSplit) split;
            String[] hosts = fsp.getLocations();
            String filepath = fsp.getPath().toUri().getPath();

            byte[] locationInfo = HdfsUtilities.prepareFragmentMetadata(fsp);
            Fragment fragment = new Fragment(filepath, hosts, locationInfo,
                    makeUserData(tablePartition));
            fragments.add(fragment);
        }
    }

    /* Turns a Properties class into a string */
    private String serializeProperties(Properties props) throws Exception {
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        props.store(outStream, ""/* comments */);
        return outStream.toString();
    }

    /* Turns the partition keys into a string */
    String serializePartitionKeys(HiveTablePartition partData) throws Exception {
        if (partData.partition == null) {
            /* this is a simple hive table - there are no partitions */
            return HIVE_NO_PART_TBL;
        }

        StringBuilder partitionKeys = new StringBuilder();
        String prefix = "";
        ListIterator<String> valsIter = partData.partition.getValues().listIterator();
        ListIterator<FieldSchema> keysIter = partData.partitionKeys.listIterator();
        while (valsIter.hasNext() && keysIter.hasNext()) {
            FieldSchema key = keysIter.next();
            String name = key.getName();
            String type = key.getType();
            String val = valsIter.next();
            String oneLevel = prefix + name + HIVE_1_PART_DELIM + type
                    + HIVE_1_PART_DELIM + val;
            partitionKeys.append(oneLevel);
            prefix = HIVE_PARTITIONS_DELIM;
        }

        return partitionKeys.toString();
    }
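    // Worked example (a hypothetical partition, not from the original source):
    // for a partition date=2015-01-01/region=us with string partition keys,
    // serializePartitionKeys() returns
    //   date!H1PD!string!H1PD!2015-01-01!HPAD!region!H1PD!string!H1PD!us
    // i.e. name/type/value triplets joined by HIVE_1_PART_DELIM, with
    // HIVE_PARTITIONS_DELIM between partition levels.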
    byte[] makeUserData(HiveTablePartition partData) throws Exception {
        String inputFormatName = partData.storageDesc.getInputFormat();
        String serdeName = partData.storageDesc.getSerdeInfo().getSerializationLib();
        String propertiesString = serializeProperties(partData.properties);
        String partitionKeys = serializePartitionKeys(partData);

        String userData = inputFormatName + HIVE_UD_DELIM + serdeName
                + HIVE_UD_DELIM + propertiesString + HIVE_UD_DELIM
                + partitionKeys + HIVE_UD_DELIM + filterInFragmenter;

        return userData.getBytes();
    }
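    // Worked example (hypothetical values, not from the original source): for
    // an unpartitioned text table, makeUserData() yields a string such as
    //   org.apache.hadoop.mapred.TextInputFormat!HUDD!
    //   org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe!HUDD!
    //   <serialized table properties>!HUDD!!HNPT!!HUDD!false
    // (wrapped here for readability; the actual value is one string). The
    // blob is presumably split back on HIVE_UD_DELIM by the Hive plugin
    // classes that consume each fragment.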
    /*
     * Builds the filter string for the
     * HiveMetaStoreClient.listPartitionsByFilter API method.
     *
     * The filter string parameter for
     * HiveMetaStoreClient.listPartitionsByFilter is created from the incoming
     * getFragments filter string parameter. It is in the format:
     * [PARTITION1 NAME] = \"[PARTITION1 VALUE]\" AND [PARTITION2 NAME] =
     * \"[PARTITION2 VALUE]\" ... Filtering can be done only on string
     * partition keys and with AND operators.
     *
     * For example, for the query: SELECT * FROM TABLE1 WHERE part1 = 'AAAA'
     * AND part2 = '1111', the incoming HAWQ filter string will be mapped into
     * the following filter string for
     * HiveMetaStoreClient.listPartitionsByFilter:
     * "part1 = \"AAAA\" and part2 = \"1111\""
     */
    private String buildFilterStringForHive() throws Exception {
        StringBuilder filtersString = new StringBuilder();
        String filterInput = inputData.getFilterString();

        if (LOG.isDebugEnabled()) {
            for (ColumnDescriptor cd : inputData.getTupleDescription()) {
                LOG.debug("ColumnDescriptor : " + cd);
            }
            LOG.debug("Filter string input : " + inputData.getFilterString());
        }

        HiveFilterBuilder eval = new HiveFilterBuilder(inputData);
        Object filter = eval.getFilterObject(filterInput);
        String prefix = "";

        if (filter instanceof List) {
            for (Object f : (List<?>) filter) {
                if (buildSingleFilter(f, filtersString, prefix)) {
                    // Set 'and' operator between each matched partition filter.
                    prefix = " and ";
                }
            }
        } else {
            buildSingleFilter(filter, filtersString, prefix);
        }

        return filtersString.toString();
    }

    /*
     * Builds a filter string for a single filter and appends it to the filters
     * string. The filter string is added only if the filter's column name
     * matches a hive partition name. A single filter is in the format:
     * [PARTITION NAME] = \"[PARTITION VALUE]\"
     */
    private boolean buildSingleFilter(Object filter,
                                      StringBuilder filtersString,
                                      String prefix) throws Exception {

        // Let's look first at the filter
        FilterParser.BasicFilter bFilter = (FilterParser.BasicFilter) filter;

        // If this is not an "equality filter", we ignore it (it is not added
        // to the filter list)
        if (!(bFilter.getOperation() == FilterParser.Operation.HDOP_EQ)) {
            LOG.debug("Filter operator is not EQ, ignore this filter for hive : "
                    + filter);
            return false;
        }

        // Extract column name and value
        int filterColumnIndex = bFilter.getColumn().index();
        String filterValue = bFilter.getConstant().constant().toString();
        ColumnDescriptor filterColumn = inputData.getColumn(filterColumnIndex);
        String filterColumnName = filterColumn.columnName();

        // If this filter is not on a partition column, we ignore it (it is not
        // added to the filter list)
        if (!setPartitions.contains(filterColumnName)) {
            LOG.debug("Filter name is not a partition, ignore this filter for hive: "
                    + filter);
            return false;
        }

        filtersString.append(prefix);
        filtersString.append(filterColumnName);
        filtersString.append(HIVE_API_EQ);
        filtersString.append(HIVE_API_DQUOTE);
        filtersString.append(filterValue);
        filtersString.append(HIVE_API_DQUOTE);

        return true;
    }

    /**
     * Returns statistics for the Hive table. Currently not implemented.
     */
    @Override
    public FragmentsStats getFragmentsStats() throws Exception {
        throw new UnsupportedOperationException(
                "ANALYZE for Hive plugin is not supported");
    }
}
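// Illustrative usage (a sketch, not part of the original source): the PXF
// service normally instantiates fragmenters reflectively from the request's
// FRAGMENTER parameter, but conceptually the flow is:
//
//   InputData inputData = ...; // request parameters, supplied by PXF
//   Fragmenter fragmenter = new HiveDataFragmenter(inputData);
//   List<Fragment> fragments = fragmenter.getFragments();
//
// Each returned Fragment carries the file path, the replica hosts, and the
// userData blob described above.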