com.ask.hive.hbase.HBaseTimeSerDe.java Source code


Introduction

Here is the source code for com.ask.hive.hbase.HBaseTimeSerDe.java, a Hive SerDe that extends the standard HBaseSerDe to deserialize rows from, and serialize objects to, an HBase table.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ask.hive.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.SerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.hbase.LazyHBaseRow;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.io.Writable;

/**
 * HBaseTimeSerDe extends HBaseSerDe and can be used to serialize objects to
 * an HBase table and to deserialize rows from an HBase table.
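 * The main entry points, inherited from the SerDe contract, are initialize(),
 * deserialize(), serialize() and getObjectInspector(), all overridden below.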
 */
public class HBaseTimeSerDe extends HBaseSerDe {

    public static final String HBASE_COLUMNS_MAPPING = "hbase.columns.mapping";
    public static final String HBASE_TABLE_NAME = "hbase.table.name";
    public static final String HBASE_KEY_COL = ":key";
    public static final Log LOG = LogFactory.getLog(HBaseTimeSerDe.class);

    private ObjectInspector cachedObjectInspector;
    private String hbaseColumnsMapping;
    private List<String> hbaseColumnFamilies;
    private List<byte[]> hbaseColumnFamiliesBytes;
    private List<String> hbaseColumnQualifiers;
    private List<byte[]> hbaseColumnQualifiersBytes;
    private SerDeParameters serdeParams;
    private boolean useJSONSerialize;
    private LazyHBaseRow cachedHBaseRow;
    private final ByteStream.Output serializeStream = new ByteStream.Output();
    private int iKey;

    // used for serializing a field
    private byte[] separators; // the separators array
    private boolean escaped; // whether we need to escape the data when writing out
    private byte escapeChar; // which char to use as the escape char, e.g. '\\'
    private boolean[] needsEscape; // which chars need to be escaped. This array should have size
                                   // of 128. Negative byte values (or byte values >= 128) are
                                   // never escaped.

    @Override
    public String toString() {
        return getClass().toString() + "[" + hbaseColumnsMapping + ":"
                + ((StructTypeInfo) serdeParams.getRowTypeInfo()).getAllStructFieldNames() + ":"
                + ((StructTypeInfo) serdeParams.getRowTypeInfo()).getAllStructFieldTypeInfos() + "]";
    }

    public HBaseTimeSerDe() throws SerDeException {
    }

    /**
     * Initialize the SerDe given parameters.
     * @see org.apache.hadoop.hive.serde2.SerDe#initialize(org.apache.hadoop.conf.Configuration, java.util.Properties)
     */
    @Override
    public void initialize(Configuration conf, Properties tbl) throws SerDeException {

        initHBaseSerDeParameters(conf, tbl, getClass().getName());

        cachedObjectInspector = LazyFactory.createLazyStructInspector(serdeParams.getColumnNames(),
                serdeParams.getColumnTypes(), serdeParams.getSeparators(), serdeParams.getNullSequence(),
                serdeParams.isLastColumnTakesRest(), serdeParams.isEscaped(), serdeParams.getEscapeChar());

        cachedHBaseRow = new LazyHBaseRow((LazySimpleStructObjectInspector) cachedObjectInspector);

        if (LOG.isDebugEnabled()) {
            LOG.debug("HBaseSerDe initialized with : columnNames = " + serdeParams.getColumnNames()
                    + " columnTypes = " + serdeParams.getColumnTypes() + " hbaseColumnMapping = "
                    + hbaseColumnsMapping);
        }
    }

    /**
     * Parses the HBase columns mapping to identify the column families and
     * qualifiers, and caches the byte arrays corresponding to them. One of the
     * Hive table columns maps to the HBase row key, by default the first column.
     *
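     * For example, the mapping ":key,cf1:q1,cf2:" fills colFamilies with
     * [":key", "cf1", "cf2"], fills colQualifiers with [null, "q1", null],
     * and returns 0 as the row key index.
     *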
     * @param columnMapping - the column mapping specification to be parsed
     * @param colFamilies - the list of HBase column family names
     * @param colFamiliesBytes - the corresponding byte array
     * @param colQualifiers - the list of HBase column qualifier names
     * @param colQualifiersBytes - the corresponding byte array
     * @return the row key index in the column names list
     * @throws org.apache.hadoop.hive.serde2.SerDeException
     */
    public static int parseColumnMapping(String columnMapping, List<String> colFamilies,
            List<byte[]> colFamiliesBytes, List<String> colQualifiers, List<byte[]> colQualifiersBytes)
            throws SerDeException {

        int rowKeyIndex = -1;

        if (colFamilies == null || colQualifiers == null) {
            throw new SerDeException(
                    "Error: caller must pass in lists for the column families and qualifiers.");
        }

        colFamilies.clear();
        colQualifiers.clear();

        if (columnMapping == null) {
            throw new SerDeException("Error: hbase.columns.mapping missing for this HBase table.");
        }

        if (columnMapping.equals("") || columnMapping.equals(HBASE_KEY_COL)) {
            throw new SerDeException("Error: hbase.columns.mapping specifies only the HBase table"
                    + " row key. A valid Hive-HBase table must specify at least one additional column.");
        }

        String[] mapping = columnMapping.split(",");

        for (int i = 0; i < mapping.length; i++) {
            String elem = mapping[i];
            int idxFirst = elem.indexOf(":");
            int idxLast = elem.lastIndexOf(":");

            if (idxFirst < 0 || idxFirst != idxLast) {
                throw new SerDeException("Error: the HBase columns mapping contains a badly formed "
                        + "column family, column qualifier specification.");
            }

            if (elem.equals(HBASE_KEY_COL)) {
                rowKeyIndex = i;
                colFamilies.add(elem);
                colQualifiers.add(null);
            } else {
                String[] parts = elem.split(":");
                assert (parts.length > 0 && parts.length <= 2);
                colFamilies.add(parts[0]);

                if (parts.length == 2) {
                    colQualifiers.add(parts[1]);
                } else {
                    colQualifiers.add(null);
                }
            }
        }

        if (rowKeyIndex == -1) {
            colFamilies.add(0, HBASE_KEY_COL);
            colQualifiers.add(0, null);
            rowKeyIndex = 0;
        }

        if (colFamilies.size() != colQualifiers.size()) {
            throw new SerDeException("Error in parsing the hbase columns mapping.");
        }

        // populate the corresponding byte [] if the client has passed in a non-null list
        if (colFamiliesBytes != null) {
            colFamiliesBytes.clear();

            for (String fam : colFamilies) {
                colFamiliesBytes.add(Bytes.toBytes(fam));
            }
        }

        if (colQualifiersBytes != null) {
            colQualifiersBytes.clear();

            for (String qual : colQualifiers) {
                if (qual == null) {
                    colQualifiersBytes.add(null);
                } else {
                    colQualifiersBytes.add(Bytes.toBytes(qual));
                }
            }
        }

        if (colFamiliesBytes != null && colQualifiersBytes != null) {
            if (colFamiliesBytes.size() != colQualifiersBytes.size()) {
                throw new SerDeException(
                        "Error in caching the bytes for the HBase column families and qualifiers.");
            }
        }

        return rowKeyIndex;
    }

    public static boolean isSpecialColumn(String hbaseColumnName) {
        return hbaseColumnName.equals(HBASE_KEY_COL);
    }

    private void initHBaseSerDeParameters(Configuration job, Properties tbl, String serdeName)
            throws SerDeException {

        // Read configuration parameters
        hbaseColumnsMapping = tbl.getProperty(HBaseTimeSerDe.HBASE_COLUMNS_MAPPING);
        String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES);

        // Parse the HBase columns mapping and initialize the col family & qualifiers
        hbaseColumnFamilies = new ArrayList<String>();
        hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
        hbaseColumnQualifiers = new ArrayList<String>();
        hbaseColumnQualifiersBytes = new ArrayList<byte[]>();
        iKey = parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                hbaseColumnQualifiers, hbaseColumnQualifiersBytes);

        // Build the type property string if not supplied
        if (columnTypeProperty == null) {
            StringBuilder sb = new StringBuilder();

            for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
                if (sb.length() > 0) {
                    sb.append(":");
                }
                String colFamily = hbaseColumnFamilies.get(i);
                String colQualifier = hbaseColumnQualifiers.get(i);
                if (isSpecialColumn(colFamily)) {
                    // the row key column becomes a STRING
                    sb.append(Constants.STRING_TYPE_NAME);
                } else if (colQualifier == null) {
                    // a column family becomes a MAP
                    sb.append(Constants.MAP_TYPE_NAME + "<" + Constants.STRING_TYPE_NAME + ","
                            + Constants.STRING_TYPE_NAME + ">");
                } else {
                    // an individual column becomes a STRING
                    sb.append(Constants.STRING_TYPE_NAME);
                }
            }
            tbl.setProperty(Constants.LIST_COLUMN_TYPES, sb.toString());
        }

        serdeParams = LazySimpleSerDe.initSerdeParams(job, tbl, serdeName);

        if (hbaseColumnFamilies.size() != serdeParams.getColumnNames().size()) {
            throw new SerDeException(serdeName + ": columns has " + serdeParams.getColumnNames().size()
                    + " elements while hbase.columns.mapping has " + hbaseColumnFamilies.size() + " elements"
                    + " (counting the key if implicit)");
        }

        separators = serdeParams.getSeparators();
        escaped = serdeParams.isEscaped();
        escapeChar = serdeParams.getEscapeChar();
        needsEscape = serdeParams.getNeedsEscape();

        // check that the mapping schema is right;
        // check that the "column-family:" is mapped to MAP<String,?>
        for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
            String colFamily = hbaseColumnFamilies.get(i);
            String colQualifier = hbaseColumnQualifiers.get(i);
            if (colQualifier == null && !isSpecialColumn(colFamily)) {
                TypeInfo typeInfo = serdeParams.getColumnTypes().get(i);
                if ((typeInfo.getCategory() != Category.MAP) || !Constants.STRING_TYPE_NAME
                        .equals(((MapTypeInfo) typeInfo).getMapKeyTypeInfo().getTypeName())) {

                    throw new SerDeException(serdeName + ": hbase column family '" + colFamily
                            + "' should be mapped to Map<String,?> but is mapped to " + typeInfo.getTypeName());
                }
            }
        }
    }

    /**
     * Deserialize a row from the HBase Result writable to a LazyObject
     * @param result the HBase Result Writable containing the row
     * @return the deserialized object
     * @see org.apache.hadoop.hive.serde2.SerDe#deserialize(org.apache.hadoop.io.Writable)
     */
    @Override
    public Object deserialize(Writable result) throws SerDeException {

        if (!(result instanceof Result)) {
            throw new SerDeException(getClass().getName() + ": expects Result!");
        }

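        // Reuse the single cached LazyHBaseRow; init() re-points it at the new Result.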
        cachedHBaseRow.init((Result) result, hbaseColumnFamilies, hbaseColumnFamiliesBytes, hbaseColumnQualifiers,
                hbaseColumnQualifiersBytes);

        return cachedHBaseRow;
    }

    @Override
    public ObjectInspector getObjectInspector() throws SerDeException {
        return cachedObjectInspector;
    }

    @Override
    public Class<? extends Writable> getSerializedClass() {
        return Put.class;
    }

    @Override
    public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
        if (objInspector.getCategory() != Category.STRUCT) {
            throw new SerDeException(getClass().toString() + " can only serialize struct types, but we got: "
                    + objInspector.getTypeName());
        }

        // Prepare the field ObjectInspectors
        StructObjectInspector soi = (StructObjectInspector) objInspector;
        List<? extends StructField> fields = soi.getAllStructFieldRefs();
        List<Object> list = soi.getStructFieldsDataAsList(obj);
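        // Field descriptors from the schema declared at initialization (if any);
        // serializeField() uses them to choose between JSON and delimited encoding.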
        List<? extends StructField> declaredFields = (serdeParams.getRowTypeInfo() != null
                && ((StructTypeInfo) serdeParams.getRowTypeInfo()).getAllStructFieldNames().size() > 0)
                        ? ((StructObjectInspector) getObjectInspector()).getAllStructFieldRefs()
                        : null;

        Put put = null;

        try {
            byte[] key = serializeField(iKey, null, fields, list, declaredFields);

            if (key == null) {
                throw new SerDeException("HBase row key cannot be NULL");
            }

            put = new Put(key);

            // Serialize each field
            for (int i = 0; i < fields.size(); i++) {
                if (i == iKey) {
                    // already processed the key above
                    continue;
                }
                serializeField(i, put, fields, list, declaredFields);
            }
        } catch (IOException e) {
            throw new SerDeException(e);
        }

        return put;
    }

    private byte[] serializeField(int i, Put put, List<? extends StructField> fields, List<Object> list,
            List<? extends StructField> declaredFields) throws IOException {

        // column name
        String hbaseColumnFamily = hbaseColumnFamilies.get(i);
        String hbaseColumnQualifier = hbaseColumnQualifiers.get(i);

        // Get the field objectInspector and the field object.
        ObjectInspector foi = fields.get(i).getFieldObjectInspector();
        Object f = (list == null ? null : list.get(i));

        if (f == null) {
            // a null field; nothing to serialize
            return null;
        }

        // If the field corresponds to a column family in HBase
        if (hbaseColumnQualifier == null && !isSpecialColumn(hbaseColumnFamily)) {
            MapObjectInspector moi = (MapObjectInspector) foi;
            ObjectInspector koi = moi.getMapKeyObjectInspector();
            ObjectInspector voi = moi.getMapValueObjectInspector();

            Map<?, ?> map = moi.getMap(f);
            if (map == null) {
                return null;
            } else {
                for (Map.Entry<?, ?> entry : map.entrySet()) {
                    // Get the Key
                    serializeStream.reset();
                    serialize(entry.getKey(), koi, 3);

                    // Get the column-qualifier
                    byte[] columnQualifierBytes = new byte[serializeStream.getCount()];
                    System.arraycopy(serializeStream.getData(), 0, columnQualifierBytes, 0,
                            serializeStream.getCount());

                    // Get the Value
                    serializeStream.reset();
                    boolean isNotNull = serialize(entry.getValue(), voi, 3);
                    if (!isNotNull) {
                        continue;
                    }
                    byte[] value = new byte[serializeStream.getCount()];
                    System.arraycopy(serializeStream.getData(), 0, value, 0, serializeStream.getCount());
                    put.add(hbaseColumnFamiliesBytes.get(i), columnQualifierBytes, value);
                }
            }
        } else {
            // If the field that is passed in is NOT a primitive, and either the
            // field is not declared (no schema was given at initialization), or
            // the field is declared as a primitive in initialization, serialize
            // the data to JSON string.  Otherwise serialize the data in the
            // delimited way.
            serializeStream.reset();
            boolean isNotNull;
            if (!foi.getCategory().equals(Category.PRIMITIVE) && (declaredFields == null
                    || declaredFields.get(i).getFieldObjectInspector().getCategory().equals(Category.PRIMITIVE)
                    || useJSONSerialize)) {

                isNotNull = serialize(SerDeUtils.getJSONString(f, foi),
                        PrimitiveObjectInspectorFactory.javaStringObjectInspector, 1);
            } else {
                isNotNull = serialize(f, foi, 1);
            }
            if (!isNotNull) {
                return null;
            }
            byte[] key = new byte[serializeStream.getCount()];
            System.arraycopy(serializeStream.getData(), 0, key, 0, serializeStream.getCount());
            if (i == iKey) {
                return key;
            }
            put.add(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i), key);
        }

        return null;
    }

    /**
     * Serialize the given object into the ByteStream.
     *
     * @param obj           The object for the current field.
     * @param objInspector  The ObjectInspector for the current Object.
     * @param level         The current separator nesting level.
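     *                      For example, a list serialized at level 1 joins
     *                      its elements with separators[1] and serializes
     *                      each element at level 2.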
     * @throws java.io.IOException
     * @return true if the object was non-null and was serialized; false otherwise.
     */
    private boolean serialize(Object obj, ObjectInspector objInspector, int level) throws IOException {

        switch (objInspector.getCategory()) {
        case PRIMITIVE: {
            LazyUtils.writePrimitiveUTF8(serializeStream, obj, (PrimitiveObjectInspector) objInspector, escaped,
                    escapeChar, needsEscape);
            return true;
        }
        case LIST: {
            char separator = (char) separators[level];
            ListObjectInspector loi = (ListObjectInspector) objInspector;
            List<?> list = loi.getList(obj);
            ObjectInspector eoi = loi.getListElementObjectInspector();
            if (list == null) {
                return false;
            } else {
                for (int i = 0; i < list.size(); i++) {
                    if (i > 0) {
                        serializeStream.write(separator);
                    }
                    serialize(list.get(i), eoi, level + 1);
                }
            }
            return true;
        }
        case MAP: {
            char separator = (char) separators[level];
            char keyValueSeparator = (char) separators[level + 1];
            MapObjectInspector moi = (MapObjectInspector) objInspector;
            ObjectInspector koi = moi.getMapKeyObjectInspector();
            ObjectInspector voi = moi.getMapValueObjectInspector();

            Map<?, ?> map = moi.getMap(obj);
            if (map == null) {
                return false;
            } else {
                boolean first = true;
                for (Map.Entry<?, ?> entry : map.entrySet()) {
                    if (first) {
                        first = false;
                    } else {
                        serializeStream.write(separator);
                    }
                    serialize(entry.getKey(), koi, level + 2);
                    serializeStream.write(keyValueSeparator);
                    serialize(entry.getValue(), voi, level + 2);
                }
            }
            return true;
        }
        case STRUCT: {
            char separator = (char) separators[level];
            StructObjectInspector soi = (StructObjectInspector) objInspector;
            List<? extends StructField> fields = soi.getAllStructFieldRefs();
            List<Object> list = soi.getStructFieldsDataAsList(obj);
            if (list == null) {
                return false;
            } else {
                for (int i = 0; i < list.size(); i++) {
                    if (i > 0) {
                        serializeStream.write(separator);
                    }
                    serialize(list.get(i), fields.get(i).getFieldObjectInspector(), level + 1);
                }
            }
            return true;
        }
        }

        throw new RuntimeException("Unknown category type: " + objInspector.getCategory());
    }

    /**
     * @return the useJSONSerialize
     */
    public boolean isUseJSONSerialize() {
        return useJSONSerialize;
    }

    /**
     * @param useJSONSerialize the useJSONSerialize to set
     */
    public void setUseJSONSerialize(boolean useJSONSerialize) {
        this.useJSONSerialize = useJSONSerialize;
    }

    /**
     * @return 0-based offset of the key column within the table
     */
    int getKeyColumnOffset() {
        return iKey;
    }
}
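
Example

For reference, here is a minimal sketch of how the static parseColumnMapping helper above behaves. The class name ParseColumnMappingDemo and the mapping string are illustrative only; the Hive, HBase, and Hadoop jars required by the source above are assumed to be on the classpath.

import java.util.ArrayList;
import java.util.List;

import com.ask.hive.hbase.HBaseTimeSerDe;

public class ParseColumnMappingDemo {
    public static void main(String[] args) throws Exception {
        List<String> families = new ArrayList<String>();
        List<byte[]> familyBytes = new ArrayList<byte[]>();
        List<String> qualifiers = new ArrayList<String>();
        List<byte[]> qualifierBytes = new ArrayList<byte[]>();

        // ":key" maps a Hive column to the HBase row key, "cf1:q1" maps one to
        // a single cell, and "cf2:" (no qualifier) to a whole column family.
        int keyIndex = HBaseTimeSerDe.parseColumnMapping(":key,cf1:q1,cf2:",
                families, familyBytes, qualifiers, qualifierBytes);

        System.out.println(keyIndex);   // 0 -- the row key is the first column
        System.out.println(families);   // [:key, cf1, cf2]
        System.out.println(qualifiers); // [null, q1, null]
    }
}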