com.cloudera.recordservice.pig.PigHCatUtil.java Source code

Introduction

Here is the source code for com.cloudera.recordservice.pig.PigHCatUtil.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.cloudera.recordservice.pig;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TimeZone;

import com.cloudera.recordservice.core.Decimal;
import com.cloudera.recordservice.core.TimestampNanos;
import com.cloudera.recordservice.hcatalog.common.HCatRSUtil;
import com.cloudera.recordservice.mr.DecimalWritable;
import com.cloudera.recordservice.mr.RecordServiceRecord;
import com.cloudera.recordservice.mr.TimestampNanosWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ShortWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.Pair;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.pig.LoadPushDown.RequiredField;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;

import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * This class was copied from the HCatalog-Pig-Adapter project.
 * Original name: PigHCatUtil
 * Changes: the transformToTuple method now converts RecordServiceRecord
 * instead of HCatRecord.
 */
class PigHCatUtil {

    private static final Logger LOG = LoggerFactory.getLogger(PigHCatUtil.class);

    // http://wiki.apache.org/pig/PigErrorHandlingFunctionalSpecification#Error_codes
    static final int PIG_EXCEPTION_CODE = 1115;
    private static final String DEFAULT_DB = MetaStoreUtils.DEFAULT_DATABASE_NAME;

    private final Map<Pair<String, String>, Table> hcatTableCache = new HashMap<Pair<String, String>, Table>();

    private static final TupleFactory tupFac = TupleFactory.getInstance();

    private static boolean pigHasBooleanSupport = false;

    /**
     * Determine if the current Pig version supports boolean columns. This works around a
     * dependency conflict preventing HCatalog from requiring a version of Pig with boolean
     * field support and should be removed once HCATALOG-466 has been resolved.
     */
    static {
        // DETAILS:
        //
        // PIG-1429 added support for boolean fields, which shipped in 0.10.0;
        // this version of Pig depends on antlr 3.4.
        //
        // HCatalog depends heavily on Hive, which at this time uses antlr 3.0.1.
        //
        // antlr 3.0.1 and 3.4 are incompatible, so Pig 0.10.0 and Hive cannot be depended on
        // in the same project. Pig 0.8.0 did not use antlr for its parser and can coexist
        // with Hive, so that Pig version is depended on by HCatalog at this time.
        try {
            Schema schema = Utils.getSchemaFromString("myBooleanField: boolean");
            pigHasBooleanSupport = (schema.getField("myBooleanField").type == DataType.BOOLEAN);
        } catch (Throwable e) {
            // pass
        }

        if (!pigHasBooleanSupport) {
            LOG.info("This version of Pig does not support boolean fields. To enable "
                    + "boolean-to-integer conversion, set the " + HCatConstants.HCAT_DATA_CONVERT_BOOLEAN_TO_INTEGER
                    + "=true configuration parameter.");
        }
    }

    static public boolean pigHasBooleanSupport() {
        return pigHasBooleanSupport;
    }

    static public Pair<String, String> getDBTableNames(String location) throws IOException {
        // the location string will be of the form:
        // <database name>.<table name> - parse it and
        // communicate the information to HCatInputFormat

        try {
            return HCatUtil.getDbAndTableName(location);
        } catch (IOException e) {
            String locationErrMsg = "The input location in the load statement should be of the form "
                    + "<database name>.<table name> or <table name>. Got " + location;
            throw new PigException(locationErrMsg, PIG_EXCEPTION_CODE);
        }
    }
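
    // An illustrative sketch of how the location string is resolved (the database and
    // table names below are hypothetical): a load location of "sales.orders" yields the
    // pair ("sales", "orders"), while an unqualified "orders" falls back to the default
    // database.
    //
    //   Pair<String, String> p = PigHCatUtil.getDBTableNames("sales.orders");
    //   // p.first  -> "sales"
    //   // p.second -> "orders"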

    static public String getHCatServerUri(Job job) {

        return job.getConfiguration().get(HiveConf.ConfVars.METASTOREURIS.varname);
    }

    static public String getHCatServerPrincipal(Job job) {

        return job.getConfiguration().get(HCatConstants.HCAT_METASTORE_PRINCIPAL);
    }

    private static HiveMetaStoreClient getHiveMetaClient(String serverUri, String serverKerberosPrincipal,
            Class<?> clazz, Job job) throws Exception {

        // The job configuration is passed in so the configuration will be cloned
        // from the pig job configuration. This is necessary for overriding
        // metastore configuration arguments like the metastore jdbc connection string
        // and password, in the case of an embedded metastore, which you get when
        // hive.metastore.uris = "".
        HiveConf hiveConf = new HiveConf(job.getConfiguration(), clazz);

        if (serverUri != null) {
            hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, serverUri.trim());
        }

        if (serverKerberosPrincipal != null) {
            hiveConf.setBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL, true);
            hiveConf.setVar(HiveConf.ConfVars.METASTORE_KERBEROS_PRINCIPAL, serverKerberosPrincipal);
        }

        try {
            return HCatUtil.getHiveClient(hiveConf);
        } catch (Exception e) {
            throw new Exception(
                    "Could not instantiate a HiveMetaStoreClient connecting to server uri:[" + serverUri + "]", e);
        }
    }
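
    // A minimal sketch of the override described above (property values are hypothetical):
    // with hive.metastore.uris left empty, an embedded metastore is used, and its JDBC
    // settings can be supplied through the job configuration, which getHiveMetaClient()
    // copies into the HiveConf.
    //
    //   Job job = Job.getInstance();
    //   job.getConfiguration().set("hive.metastore.uris", "");
    //   job.getConfiguration().set("javax.jdo.option.ConnectionURL",
    //       "jdbc:derby:;databaseName=/tmp/metastore_db;create=true");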

    HCatSchema getHCatSchema(List<RequiredField> fields, String signature, Class<?> classForUDFCLookup)
            throws IOException {
        if (fields == null) {
            return null;
        }

        Properties props = UDFContext.getUDFContext().getUDFProperties(classForUDFCLookup,
                new String[] { signature });
        HCatSchema hcatTableSchema = (HCatSchema) props.get(HCatConstants.HCAT_TABLE_SCHEMA);

        ArrayList<HCatFieldSchema> fcols = new ArrayList<HCatFieldSchema>();
        for (RequiredField rf : fields) {
            fcols.add(hcatTableSchema.getFields().get(rf.getIndex()));
        }
        return new HCatSchema(fcols);
    }

    /*
     * The job argument is passed so that configuration overrides can be used to initialize
     * the metastore configuration in the special case of an embedded metastore
     * (hive.metastore.uris = "").
     */
    public Table getTable(String location, String hcatServerUri, String hcatServerPrincipal, Job job)
            throws IOException {
        Pair<String, String> loc_server = new Pair<String, String>(location, hcatServerUri);
        Table hcatTable = hcatTableCache.get(loc_server);
        if (hcatTable != null) {
            return hcatTable;
        }

        Pair<String, String> dbTablePair = PigHCatUtil.getDBTableNames(location);
        dbTablePair = HCatRSUtil.cleanQueryPair(dbTablePair);
        String dbName = dbTablePair.first;
        String tableName = dbTablePair.second;
        Table table = null;
        HiveMetaStoreClient client = null;
        try {
            client = getHiveMetaClient(hcatServerUri, hcatServerPrincipal, PigHCatUtil.class, job);
            table = HCatUtil.getTable(client, dbName, tableName);
        } catch (NoSuchObjectException nsoe) {
            throw new PigException("Table not found : " + nsoe.getMessage(), PIG_EXCEPTION_CODE); // prettier error messages to frontend
        } catch (Exception e) {
            throw new IOException(e);
        } finally {
            HCatUtil.closeHiveClientQuietly(client);
        }
        hcatTableCache.put(loc_server, table);
        return table;
    }

    public static ResourceSchema getResourceSchema(HCatSchema hcatSchema) throws IOException {
        List<ResourceFieldSchema> rfSchemaList = new ArrayList<ResourceFieldSchema>();
        for (HCatFieldSchema hfs : hcatSchema.getFields()) {
            ResourceFieldSchema rfSchema;
            rfSchema = getResourceSchemaFromFieldSchema(hfs);
            rfSchemaList.add(rfSchema);
        }
        ResourceSchema rSchema = new ResourceSchema();
        rSchema.setFields(rfSchemaList.toArray(new ResourceFieldSchema[rfSchemaList.size()]));
        return rSchema;

    }

    private static ResourceFieldSchema getResourceSchemaFromFieldSchema(HCatFieldSchema hfs) throws IOException {
        ResourceFieldSchema rfSchema;
        // if we are dealing with a bag or tuple column - need to worry about subschema
        if (hfs.getType() == Type.STRUCT) {
            rfSchema = new ResourceFieldSchema().setName(hfs.getName()).setDescription(hfs.getComment())
                    .setType(getPigType(hfs)).setSchema(getTupleSubSchema(hfs));
        } else if (hfs.getType() == Type.ARRAY) {
            rfSchema = new ResourceFieldSchema().setName(hfs.getName()).setDescription(hfs.getComment())
                    .setType(getPigType(hfs)).setSchema(getBagSubSchema(hfs));
        } else {
            rfSchema = new ResourceFieldSchema().setName(hfs.getName()).setDescription(hfs.getComment())
                    .setType(getPigType(hfs)).setSchema(null); // no munging inner-schemas
        }
        return rfSchema;
    }

    protected static ResourceSchema getBagSubSchema(HCatFieldSchema hfs) throws IOException {
        // there are two cases - array<Type> and array<struct<...>>
        // in either case the element type of the array is represented in a
        // tuple field schema in the bag's field schema - the second case (struct)
        // more naturally translates to the tuple - in the first case (array<Type>)
        // we simulate the tuple by putting the single field in a tuple

        Properties props = UDFContext.getUDFContext().getClientSystemProps();
        String innerTupleName = HCatConstants.HCAT_PIG_INNER_TUPLE_NAME_DEFAULT;
        if (props != null && props.containsKey(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME)) {
            innerTupleName = props.getProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME).replaceAll("FIELDNAME",
                    hfs.getName());
        }
        String innerFieldName = HCatConstants.HCAT_PIG_INNER_FIELD_NAME_DEFAULT;
        if (props != null && props.containsKey(HCatConstants.HCAT_PIG_INNER_FIELD_NAME)) {
            innerFieldName = props.getProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME).replaceAll("FIELDNAME",
                    hfs.getName());
        }

        ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
        bagSubFieldSchemas[0] = new ResourceFieldSchema().setName(innerTupleName)
                .setDescription("The tuple in the bag").setType(DataType.TUPLE);
        HCatFieldSchema arrayElementFieldSchema = hfs.getArrayElementSchema().get(0);
        if (arrayElementFieldSchema.getType() == Type.STRUCT) {
            bagSubFieldSchemas[0].setSchema(getTupleSubSchema(arrayElementFieldSchema));
        } else if (arrayElementFieldSchema.getType() == Type.ARRAY) {
            ResourceSchema s = new ResourceSchema();
            List<ResourceFieldSchema> lrfs = Arrays
                    .asList(getResourceSchemaFromFieldSchema(arrayElementFieldSchema));
            s.setFields(lrfs.toArray(new ResourceFieldSchema[lrfs.size()]));
            bagSubFieldSchemas[0].setSchema(s);
        } else {
            ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
            innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName(innerFieldName)
                    .setDescription("The inner field in the tuple in the bag")
                    .setType(getPigType(arrayElementFieldSchema)).setSchema(null); // the element type is not a tuple - so no subschema
            bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
        }
        return new ResourceSchema().setFields(bagSubFieldSchemas);

    }
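
    // Illustrative mappings produced by getBagSubSchema() (column names are hypothetical,
    // and the inner names assume the HCatalog defaults rather than any
    // hcat.pig.inner.* overrides):
    //
    //   array<int>                    -> bag{ tuple( int ) }
    //   array<struct<a:int,b:string>> -> bag{ tuple( a:int, b:string ) }
    //
    // In the first case the single element is wrapped in a synthetic tuple; in the second
    // the struct's fields become the tuple's fields via getTupleSubSchema().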

    private static ResourceSchema getTupleSubSchema(HCatFieldSchema hfs) throws IOException {
        // for each struct subfield, create equivalent ResourceFieldSchema
        ResourceSchema s = new ResourceSchema();
        List<ResourceFieldSchema> lrfs = new ArrayList<ResourceFieldSchema>();
        for (HCatFieldSchema subField : hfs.getStructSubSchema().getFields()) {
            lrfs.add(getResourceSchemaFromFieldSchema(subField));
        }
        s.setFields(lrfs.toArray(new ResourceFieldSchema[lrfs.size()]));
        return s;
    }

    /**
     * @param hfs the field schema of the column
     * @return corresponding pig type
     * @throws IOException
     */
    static public byte getPigType(HCatFieldSchema hfs) throws IOException {
        return getPigType(hfs.getType());
    }

    /**
     * Defines a mapping of HCatalog type to Pig type; not every mapping is exact, 
     * see {@link #extractPigObject(Object,
     * com.cloudera.recordservice.core.Schema.TypeDesc)}
     * See http://pig.apache.org/docs/r0.12.0/basic.html#data-types
     * See {@link org.apache.hive.hcatalog.pig.HCatBaseStorer#validateSchema(
     * Schema.FieldSchema, HCatFieldSchema, Schema, HCatSchema, int)}
     * for Pig->Hive type mapping.
     */
    static public byte getPigType(Type type) throws IOException {
        if (type == Type.STRING || type == Type.CHAR || type == Type.VARCHAR) {
            //CHARARRAY is unbounded so Hive->Pig is lossless
            return DataType.CHARARRAY;
        }

        if ((type == Type.INT) || (type == Type.SMALLINT) || (type == Type.TINYINT)) {
            return DataType.INTEGER;
        }

        if (type == Type.ARRAY) {
            return DataType.BAG;
        }

        if (type == Type.STRUCT) {
            return DataType.TUPLE;
        }

        if (type == Type.MAP) {
            return DataType.MAP;
        }

        if (type == Type.BIGINT) {
            return DataType.LONG;
        }

        if (type == Type.FLOAT) {
            return DataType.FLOAT;
        }

        if (type == Type.DOUBLE) {
            return DataType.DOUBLE;
        }

        if (type == Type.BINARY) {
            return DataType.BYTEARRAY;
        }

        if (type == Type.BOOLEAN && pigHasBooleanSupport) {
            return DataType.BOOLEAN;
        }
        if (type == Type.DECIMAL) {
            //Hive is more restrictive, so Hive->Pig works
            return DataType.BIGDECIMAL;
        }
        if (type == Type.DATE || type == Type.TIMESTAMP) {
            //Hive Date is representable as Pig DATETIME
            return DataType.DATETIME;
        }

        throw new PigException(
                "HCatalog column type '" + type.toString() + "' is not supported in Pig as a column type",
                PIG_EXCEPTION_CODE);
    }

    public static Tuple transformToTuple(RecordServiceRecord record) throws Exception {
        if (record == null) {
            return null;
        }
        com.cloudera.recordservice.mr.Schema schema = record.getSchema();
        List<Object> objList = new ArrayList<Object>();
        for (int i = 0; i < schema.getNumColumns(); ++i) {
            objList.add(record.getColumnValue(i));
        }
        return transformToTuple(objList, schema);
    }
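
    // A sketch of how a Pig loader might use this method (the reader variable is
    // hypothetical; any RecordReader producing RecordServiceRecord values would do):
    //
    //   RecordServiceRecord record = reader.getCurrentValue();
    //   Tuple t = PigHCatUtil.transformToTuple(record);
    //   // t has one field per RecordService column, each converted by extractPigObject().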

    /**
     * Converts an object from Hive's value system to Pig's value system;
     * see HCatBaseStorer#getJavaObj() for the Pig->Hive conversion.
     * @param o object from the Hive value system
     * @param itemType the RecordService type descriptor for the value
     * @return object in the Pig value system
     */
    public static Object extractPigObject(Object o, com.cloudera.recordservice.core.Schema.TypeDesc itemType)
            throws Exception {
        // Note that HCatRecordSerDe.serializePrimitiveField() will be called before this,
        // thus some type promotion/conversion may occur: e.g. Short to Integer. We should
        // refactor this so that it happens in one place per module/product that we are
        // integrating with. All Pig conversion should be done here, etc.
        if (o == null) {
            return null;
        }
        Object result;
        switch (itemType.typeId) {
        case BOOLEAN:
            result = ((BooleanWritable) o).get();
            break;
        case TINYINT:
            result = ((ByteWritable) o).get();
            break;
        case SMALLINT:
            result = (int) ((ShortWritable) o).get();
            break;
        case INT:
            result = ((IntWritable) o).get();
            break;
        case BIGINT:
            result = ((LongWritable) o).get();
            break;
        case FLOAT:
            result = ((FloatWritable) o).get();
            break;
        case DOUBLE:
            result = ((DoubleWritable) o).get();
            break;
        case STRING:
        case VARCHAR:
        case CHAR:
            result = o.toString();
            break;
        case TIMESTAMP_NANOS:
            TimestampNanos timestampNanos = ((TimestampNanosWritable) o).get();
            // TODO: make sure this is correct
            result = new DateTime(timestampNanos.toTimeStamp(),
                    DateTimeZone.forTimeZone(TimeZone.getTimeZone("GMT")));
            break;
        case DECIMAL:
            Decimal decimal = ((DecimalWritable) o).get();
            result = decimal.toBigDecimal();
            break;
        default:
            result = o;
            break;
        }
        return result;
    }
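
    // A few example conversions performed by the switch above (values are hypothetical):
    //
    //   SMALLINT        ShortWritable(7)         -> Integer 7 (widened for Pig)
    //   STRING/CHAR     string-valued writable   -> String via toString()
    //   TIMESTAMP_NANOS TimestampNanosWritable   -> org.joda.time.DateTime in GMT
    //   DECIMAL         DecimalWritable          -> java.math.BigDecimal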

    // TODO: Re-implement when RecordService supports complex types.
    /*private static Tuple transformToTuple(List<?> objList, HCatFieldSchema hfs) throws Exception {
      try {
        return transformToTuple(objList, hfs.getStructSubSchema());
      } catch (Exception e) {
        if (hfs.getType() != Type.STRUCT) {
          throw new Exception("Expected Struct type, got " + hfs.getType(), e);
        } else {
          throw e;
        }
      }
    }*/

    private static Tuple transformToTuple(List<?> objList, com.cloudera.recordservice.mr.Schema schema)
            throws Exception {
        if (objList == null) {
            return null;
        }
        Tuple t = tupFac.newTuple(objList.size());
        for (int i = 0; i < schema.getNumColumns(); i++) {
            t.set(i, extractPigObject(objList.get(i), schema.getColumnInfo(i).type));
        }
        return t;
    }

    private static void validateHCatSchemaFollowsPigRules(HCatSchema tblSchema) throws PigException {
        for (HCatFieldSchema hcatField : tblSchema.getFields()) {
            validateHcatFieldFollowsPigRules(hcatField);
        }
    }

    private static void validateHcatFieldFollowsPigRules(HCatFieldSchema hcatField) throws PigException {
        try {
            Type hType = hcatField.getType();
            switch (hType) {
            case BOOLEAN:
                if (!pigHasBooleanSupport) {
                    throw new PigException("Incompatible type found in HCat table schema: " + hcatField,
                            PigHCatUtil.PIG_EXCEPTION_CODE);
                }
                break;
            case ARRAY:
                validateHCatSchemaFollowsPigRules(hcatField.getArrayElementSchema());
                break;
            case STRUCT:
                validateHCatSchemaFollowsPigRules(hcatField.getStructSubSchema());
                break;
            case MAP:
                // Pig only supports string map keys; non-String keys are converted to String
                if (hcatField.getMapKeyType() != Type.STRING) {
                    LOG.info("Converting non-String key of map " + hcatField.getName() + " from "
                            + hcatField.getMapKeyType() + " to String.");
                }
                validateHCatSchemaFollowsPigRules(hcatField.getMapValueSchema());
                break;
            }
        } catch (HCatException e) {
            throw new PigException("Incompatible type found in hcat table schema: " + hcatField,
                    PigHCatUtil.PIG_EXCEPTION_CODE, e);
        }
    }

    public static void validateHCatTableSchemaFollowsPigRules(HCatSchema hcatTableSchema) throws IOException {
        validateHCatSchemaFollowsPigRules(hcatTableSchema);
    }

    public static void getConfigFromUDFProperties(Properties p, Configuration config, String propName) {
        if (p.getProperty(propName) != null) {
            config.set(propName, p.getProperty(propName));
        }
    }

    public static void saveConfigIntoUDFProperties(Properties p, Configuration config, String propName) {
        if (config.get(propName) != null) {
            p.setProperty(propName, config.get(propName));
        }
    }
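
    // Illustrative round trip using the two helpers above (the property name is
    // hypothetical): a value saved from the front-end configuration can later be
    // restored into the back-end configuration.
    //
    //   saveConfigIntoUDFProperties(props, frontendConf, "some.example.property");
    //   ...
    //   getConfigFromUDFProperties(props, backendConf, "some.example.property");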

}