com.github.dryangkun.hbase.tidx.hive.HBaseSerDeParameters.java Source code

Java tutorial

Introduction

Here is the source code for com.github.dryangkun.hbase.tidx.hive.HBaseSerDeParameters.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.dryangkun.hbase.tidx.hive;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.reflect.ReflectData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.JavaUtils;
import com.github.dryangkun.hbase.tidx.hive.ColumnMappings.ColumnMapping;
import com.github.dryangkun.hbase.tidx.hive.struct.AvroHBaseValueFactory;
import com.github.dryangkun.hbase.tidx.hive.struct.DefaultHBaseValueFactory;
import com.github.dryangkun.hbase.tidx.hive.struct.HBaseValueFactory;
import com.github.dryangkun.hbase.tidx.hive.struct.StructHBaseValueFactory;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.util.ReflectionUtils;

import javax.annotation.Nullable;

/**
 * HBaseSerDeParameters encapsulates SerDeParameters and additional configurations that are specific for
 * HBaseSerDe.
 *
 * <p>An instance is built from the job {@link Configuration} and the table {@link Properties}. During
 * construction it parses the HBase column mapping, fills in the Hive column names/types when they are
 * not supplied, and creates the key factory plus one value factory per mapped column.
 */
public class HBaseSerDeParameters {

    public static final String AVRO_SERIALIZATION_TYPE = "avro";
    public static final String STRUCT_SERIALIZATION_TYPE = "struct";

    private final LazySerDeParameters serdeParams;

    private final Configuration job;

    private final String columnMappingString;
    private final ColumnMappings columnMappings;
    private final boolean doColumnRegexMatching;

    private final long putTimestamp;
    private final HBaseKeyFactory keyFactory;
    private final List<HBaseValueFactory> valueFactories;

    /**
     * Builds the parameters from the table properties.
     *
     * <p>Note that {@code tbl} may be modified: when the column types (and possibly the column names)
     * are not supplied, they are derived from the column mapping and written back into {@code tbl}.
     *
     * @param job       the job configuration; also used to load factory and serializer classes
     * @param tbl       the table properties
     * @param serdeName the name of the SerDe, forwarded to {@link LazySerDeParameters}
     * @throws SerDeException           if the key or value factories cannot be created or initialized
     * @throws IllegalArgumentException if neither the columns nor the autogenerate property is set
     * @throws NumberFormatException    if the put-timestamp property is not a valid long
     */
    HBaseSerDeParameters(Configuration job, Properties tbl, String serdeName) throws SerDeException {
        this.job = job;

        // Read configuration parameters; regex matching is enabled unless explicitly turned off.
        columnMappingString = tbl.getProperty(HBaseSerDe.HBASE_COLUMNS_MAPPING);
        doColumnRegexMatching =
                Boolean.parseBoolean(tbl.getProperty(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, "true"));
        // Parse and initialize the HBase columns mapping
        columnMappings = HBaseSerDe.parseColumnsMapping(columnMappingString, doColumnRegexMatching);

        // Build the type property string if not supplied
        String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
        String autogenerate = tbl.getProperty(HBaseSerDe.HBASE_AUTOGENERATE_STRUCT);

        if (columnTypeProperty == null || columnTypeProperty.isEmpty()) {
            String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
            if (columnNameProperty == null || columnNameProperty.isEmpty()) {
                if (autogenerate == null || autogenerate.isEmpty()) {
                    throw new IllegalArgumentException("Either the columns must be specified or the "
                            + HBaseSerDe.HBASE_AUTOGENERATE_STRUCT + " property must be set to true.");
                }

                tbl.setProperty(serdeConstants.LIST_COLUMNS, columnMappings.toNamesString(tbl, autogenerate));
            }

            tbl.setProperty(serdeConstants.LIST_COLUMN_TYPES, columnMappings.toTypesString(tbl, job, autogenerate));
        }

        // Must come after the column names/types have been written into tbl above.
        this.serdeParams = new LazySerDeParameters(job, tbl, serdeName);
        // -1 is the value used when no put timestamp is configured.
        this.putTimestamp = Long.parseLong(tbl.getProperty(HBaseSerDe.HBASE_PUT_TIMESTAMP, "-1"));

        columnMappings.setHiveColumnDescription(serdeName, serdeParams.getColumnNames(),
                serdeParams.getColumnTypes());

        // Precondition: make sure this is done after the rest of the SerDe initialization is done.
        String hbaseTableStorageType = tbl.getProperty(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE);
        columnMappings.parseColumnStorageTypes(hbaseTableStorageType);

        // The factories receive "this" in their init methods, so they are created last, once every
        // other field has been assigned.
        this.keyFactory = initKeyFactory(job, tbl);
        this.valueFactories = initValueFactories(job, tbl);
    }

    /** @return the Hive column names held by the underlying {@link LazySerDeParameters} */
    public List<String> getColumnNames() {
        return serdeParams.getColumnNames();
    }

    /** @return the Hive column types held by the underlying {@link LazySerDeParameters} */
    public List<TypeInfo> getColumnTypes() {
        return serdeParams.getColumnTypes();
    }

    /** @return the wrapped {@link LazySerDeParameters} */
    public LazySerDeParameters getSerdeParams() {
        return serdeParams;
    }

    /** @return the configured put timestamp, or {@code -1} when the property is not set */
    public long getPutTimestamp() {
        return putTimestamp;
    }

    /** @return the key index reported by the column mappings */
    public int getKeyIndex() {
        return columnMappings.getKeyIndex();
    }

    /** @return the key mapping reported by the column mappings */
    public ColumnMapping getKeyColumnMapping() {
        return columnMappings.getKeyMapping();
    }

    /** @return the timestamp index reported by the column mappings */
    public int getTimestampIndex() {
        return columnMappings.getTimestampIndex();
    }

    /** @return the timestamp mapping reported by the column mappings */
    public ColumnMapping getTimestampColumnMapping() {
        return columnMappings.getTimestampMapping();
    }

    /** @return the parsed HBase column mappings */
    public ColumnMappings getColumnMappings() {
        return columnMappings;
    }

    /** @return the key factory created during construction */
    public HBaseKeyFactory getKeyFactory() {
        return keyFactory;
    }

    /** @return the value factories created during construction, one per mapped column */
    public List<HBaseValueFactory> getValueFactories() {
        return valueFactories;
    }

    /** @return the job configuration this instance was created with */
    public Configuration getBaseConfiguration() {
        return job;
    }

    /**
     * Looks up the Hive type of the column with the given name.
     *
     * @param columnName the Hive column name to look up
     * @return the type at the same position as the first matching column name
     * @throws IllegalArgumentException if no column has that name
     */
    public TypeInfo getTypeForName(String columnName) {
        List<String> columnNames = serdeParams.getColumnNames();
        List<TypeInfo> columnTypes = serdeParams.getColumnTypes();
        for (int i = 0; i < columnNames.size(); i++) {
            if (columnName.equals(columnNames.get(i))) {
                return columnTypes.get(i);
            }
        }
        throw new IllegalArgumentException("Invalid column name " + columnName);
    }

    @Override
    public String toString() {
        return "[" + columnMappingString + ":" + getColumnNames() + ":" + getColumnTypes() + "]";
    }

    /**
     * Creates the key factory and initializes it with this instance.
     *
     * @throws SerDeException if creation or initialization fails; a {@link SerDeException} raised
     *                        while doing so is propagated unchanged, anything else is wrapped
     */
    private HBaseKeyFactory initKeyFactory(Configuration conf, Properties tbl) throws SerDeException {
        try {
            HBaseKeyFactory factory = createKeyFactory(conf, tbl);
            if (factory != null) {
                factory.init(this, tbl);
            }
            return factory;
        } catch (SerDeException e) {
            // Already the right type: rethrow instead of nesting it inside another SerDeException.
            throw e;
        } catch (Exception e) {
            throw new SerDeException(e);
        }
    }

    /**
     * Chooses the key factory from the table properties. An explicit factory class takes precedence,
     * then a composite key class, and otherwise the default key factory is used.
     */
    private static HBaseKeyFactory createKeyFactory(Configuration job, Properties tbl) throws Exception {
        String factoryClassName = tbl.getProperty(HBaseSerDe.HBASE_COMPOSITE_KEY_FACTORY);
        if (factoryClassName != null) {
            Class<?> factoryClazz = loadClass(factoryClassName, job);
            return (HBaseKeyFactory) ReflectionUtils.newInstance(factoryClazz, job);
        }
        String keyClassName = tbl.getProperty(HBaseSerDe.HBASE_COMPOSITE_KEY_CLASS);
        if (keyClassName != null) {
            Class<?> keyClass = loadClass(keyClassName, job);
            return new CompositeHBaseKeyFactory(keyClass);
        }
        return new DefaultHBaseKeyFactory();
    }

    /**
     * Loads a class by name, through the given configuration when one is available and through
     * {@link JavaUtils#loadClass(String)} otherwise.
     */
    private static Class<?> loadClass(String className, @Nullable Configuration configuration) throws Exception {
        if (configuration != null) {
            return configuration.getClassByName(className);
        }
        return JavaUtils.loadClass(className);
    }

    /** Creates the value factories and initializes each of them with this instance. */
    private List<HBaseValueFactory> initValueFactories(Configuration conf, Properties tbl) throws SerDeException {
        List<HBaseValueFactory> factories = createValueFactories(conf, tbl);

        for (HBaseValueFactory factory : factories) {
            factory.init(this, conf, tbl);
        }

        return factories;
    }

    /**
     * Creates one value factory per mapped column, chosen by the column's serialization type:
     * Avro, struct, or the default factory for everything else.
     *
     * @throws SerDeException if a factory cannot be created; a {@link SerDeException} raised while
     *                        doing so is propagated unchanged, anything else is wrapped
     */
    private List<HBaseValueFactory> createValueFactories(Configuration conf, Properties tbl) throws SerDeException {
        List<HBaseValueFactory> factories = new ArrayList<HBaseValueFactory>();

        try {
            for (int i = 0; i < columnMappings.size(); i++) {
                ColumnMapping colMap = columnMappings.getColumnsMapping()[i];
                String serType = getSerializationType(conf, tbl, colMap);

                if (AVRO_SERIALIZATION_TYPE.equals(serType)) {
                    Schema schema = getSchema(conf, tbl, colMap);
                    factories.add(new AvroHBaseValueFactory(i, schema));
                } else if (STRUCT_SERIALIZATION_TYPE.equals(serType)) {
                    String structValueClassName = tbl.getProperty(HBaseSerDe.HBASE_STRUCT_SERIALIZER_CLASS);

                    if (structValueClassName == null) {
                        throw new IllegalArgumentException(HBaseSerDe.HBASE_STRUCT_SERIALIZER_CLASS
                                + " must be set for hbase columns of type [" + STRUCT_SERIALIZATION_TYPE + "]");
                    }

                    Class<?> structValueClass = loadClass(structValueClassName, conf);
                    factories.add(new StructHBaseValueFactory(i, structValueClass));
                } else {
                    factories.add(new DefaultHBaseValueFactory(i));
                }
            }
        } catch (SerDeException e) {
            // Already the right type: rethrow instead of nesting it inside another SerDeException.
            throw e;
        } catch (Exception e) {
            throw new SerDeException(e);
        }

        return factories;
    }

    /**
     * Builds the prefix (including the trailing dot) of the per-column table properties for the
     * given mapping.
     *
     * @return {@code family.}, {@code family.qualifierPrefix.} or {@code family.qualifier.} (with a
     *         trailing {@code *} removed from the qualifier), or {@code null} when the mapping has a
     *         qualifier name and is flagged as the HBase row key
     */
    private static String getPropertyPrefix(ColumnMapping colMap) {
        if (colMap.qualifierName == null) {
            // only a column family, optionally restricted by a qualifier prefix
            if (colMap.qualifierPrefix != null) {
                return colMap.familyName + "." + colMap.qualifierPrefix + ".";
            }
            return colMap.familyName + ".";
        }

        if (colMap.hbaseRowKey) {
            return null;
        }

        // not an hbase row key. This should either be a prefix or an individual qualifier
        String qualifierName = colMap.qualifierName;
        if (qualifierName.endsWith("*")) {
            qualifierName = qualifierName.substring(0, qualifierName.length() - 1);
        }
        return colMap.familyName + "." + qualifierName + ".";
    }

    /**
     * Get the type for the given {@link ColumnMapping colMap}
     *
     * @return the value of the column's serialization-type property, or {@code null} when it is
     *         not set or no property prefix applies to the mapping
     * */
    private String getSerializationType(Configuration conf, Properties tbl, ColumnMapping colMap) throws Exception {
        String prefix = getPropertyPrefix(colMap);
        if (prefix == null) {
            return null;
        }
        return tbl.getProperty(prefix + HBaseSerDe.SERIALIZATION_TYPE);
    }

    /**
     * Resolves the Avro schema for the given column mapping from the table properties.
     *
     * <p>When no schema retriever class is configured, the schema comes from the first of these
     * that is set: the schema literal, the schema URL, or reflection on the serialization class.
     *
     * @return the schema, or {@code null} when a schema retriever class is configured or the
     *         serialization type is not Avro
     * @throws IllegalArgumentException if the serialization type is missing, or if none of the
     *                                  schema literal, schema URL, serialization class or schema
     *                                  retriever properties is set
     */
    private Schema getSchema(Configuration conf, Properties tbl, ColumnMapping colMap) throws Exception {
        String serType = null;
        String serClassName = null;
        String schemaLiteral = null;
        String schemaUrl = null;

        String prefix = getPropertyPrefix(colMap);
        if (prefix != null) {
            serType = tbl.getProperty(prefix + HBaseSerDe.SERIALIZATION_TYPE);
            serClassName = tbl.getProperty(prefix + serdeConstants.SERIALIZATION_CLASS);
            schemaLiteral = tbl.getProperty(prefix + AvroSerdeUtils.SCHEMA_LITERAL);
            schemaUrl = tbl.getProperty(prefix + AvroSerdeUtils.SCHEMA_URL);
        }

        if (serType == null) {
            throw new IllegalArgumentException("serialization.type property is missing");
        }

        String avroSchemaRetClass = tbl.getProperty(AvroSerdeUtils.SCHEMA_RETRIEVER);

        if (schemaLiteral == null && serClassName == null && schemaUrl == null && avroSchemaRetClass == null) {
            throw new IllegalArgumentException("serialization.type was set to [" + serType + "] but neither "
                    + AvroSerdeUtils.SCHEMA_LITERAL + ", " + AvroSerdeUtils.SCHEMA_URL + ", serialization.class or "
                    + AvroSerdeUtils.SCHEMA_RETRIEVER + " property was set");
        }

        // The class is loaded whenever it is named, so an unloadable class name fails here even
        // if the schema ends up coming from another source.
        Class<?> deserializerClass = null;
        if (serClassName != null) {
            deserializerClass = loadClass(serClassName, conf);
        }

        Schema schema = null;

        // only worry about getting schema if we are dealing with Avro, and bother about generating
        // one only if a schema retriever class wasn't provided
        if (serType.equalsIgnoreCase(AVRO_SERIALIZATION_TYPE) && avroSchemaRetClass == null) {
            if (schemaLiteral != null) {
                // Schema.Parser replaces the deprecated static Schema.parse(String).
                schema = new Schema.Parser().parse(schemaLiteral);
            } else if (schemaUrl != null) {
                schema = HBaseSerDeHelper.getSchemaFromFS(schemaUrl, conf);
            } else if (deserializerClass != null) {
                schema = ReflectData.get().getSchema(deserializerClass);
            }
        }

        return schema;
    }
}