Java tutorial: a custom Hive HBase SerDe (HBaseTimeSerDe)

The class below, adapted from Hive's built-in HBaseSerDe, shows how Hive rows are serialized into HBase Put objects and how HBase Result rows are deserialized back into lazy Hive objects.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ask.hive.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.hive.hbase.LazyHBaseRow;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.SerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.Writable;
/**
 * HBaseTimeSerDe can be used to serialize objects into an HBase table and
 * deserialize objects from an HBase table.
 */
public class HBaseTimeSerDe extends HBaseSerDe {

  public static final String HBASE_COLUMNS_MAPPING = "hbase.columns.mapping";
  public static final String HBASE_TABLE_NAME = "hbase.table.name";
  public static final String HBASE_KEY_COL = ":key";
  public static final Log LOG = LogFactory.getLog(HBaseTimeSerDe.class);

  private ObjectInspector cachedObjectInspector;
  private String hbaseColumnsMapping;
  private List<String> hbaseColumnFamilies;
  private List<byte[]> hbaseColumnFamiliesBytes;
  private List<String> hbaseColumnQualifiers;
  private List<byte[]> hbaseColumnQualifiersBytes;
  private SerDeParameters serdeParams;
  private boolean useJSONSerialize;
  private LazyHBaseRow cachedHBaseRow;
  private final ByteStream.Output serializeStream = new ByteStream.Output();
  private int iKey;

  // Used for serializing a field:
  private byte[] separators;     // the separators array
  private boolean escaped;       // whether we need to escape the data when writing out
  private byte escapeChar;       // which char to use as the escape char, e.g. '\\'
  private boolean[] needsEscape; // which chars need to be escaped. This array should have
                                 // size 128. Negative byte values (or byte values >= 128)
                                 // are never escaped.

  @Override
  public String toString() {
    return getClass().toString()
        + "["
        + hbaseColumnsMapping
        + ":"
        + ((StructTypeInfo) serdeParams.getRowTypeInfo()).getAllStructFieldNames()
        + ":"
        + ((StructTypeInfo) serdeParams.getRowTypeInfo()).getAllStructFieldTypeInfos()
        + "]";
  }

  public HBaseTimeSerDe() throws SerDeException {
  }

  /**
   * Initialize the SerDe given parameters.
   *
   * @see org.apache.hadoop.hive.serde2.SerDe#initialize(org.apache.hadoop.conf.Configuration,
   *      java.util.Properties)
   */
  @Override
  public void initialize(Configuration conf, Properties tbl) throws SerDeException {
    initHBaseSerDeParameters(conf, tbl, getClass().getName());

    cachedObjectInspector = LazyFactory.createLazyStructInspector(
        serdeParams.getColumnNames(),
        serdeParams.getColumnTypes(),
        serdeParams.getSeparators(),
        serdeParams.getNullSequence(),
        serdeParams.isLastColumnTakesRest(),
        serdeParams.isEscaped(),
        serdeParams.getEscapeChar());

    cachedHBaseRow = new LazyHBaseRow((LazySimpleStructObjectInspector) cachedObjectInspector);

    if (LOG.isDebugEnabled()) {
      LOG.debug("HBaseSerDe initialized with : columnNames = "
          + serdeParams.getColumnNames()
          + " columnTypes = " + serdeParams.getColumnTypes()
          + " hbaseColumnMapping = " + hbaseColumnsMapping);
    }
  }
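
  // Illustrative only (added for this tutorial, not part of the original
  // class): the kind of table properties Hive passes to initialize() for a
  // table using this SerDe:
  //   hbase.columns.mapping = ":key,cf1:col1,cf2:"
  //   columns               = rowkey,col1,family_map
  //   columns.types         = string:string:map<string,string>
  // columns.types may be omitted; initHBaseSerDeParameters() below infers it
  // from the mapping.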
  /**
   * Parses the HBase columns mapping to identify the column families and
   * qualifiers, and also caches the byte arrays corresponding to them. One of
   * the Hive table columns maps to the HBase row key, by default the first
   * column.
   *
   * @param columnMapping      the column mapping specification to be parsed
   * @param colFamilies        the list of HBase column family names
   * @param colFamiliesBytes   the corresponding byte arrays
   * @param colQualifiers      the list of HBase column qualifier names
   * @param colQualifiersBytes the corresponding byte arrays
   * @return the row key index in the column names list
   * @throws org.apache.hadoop.hive.serde2.SerDeException
   */
  public static int parseColumnMapping(
      String columnMapping,
      List<String> colFamilies,
      List<byte[]> colFamiliesBytes,
      List<String> colQualifiers,
      List<byte[]> colQualifiersBytes) throws SerDeException {

    int rowKeyIndex = -1;

    if (colFamilies == null || colQualifiers == null) {
      throw new SerDeException("Error: caller must pass in lists for the column families "
          + "and qualifiers.");
    }

    colFamilies.clear();
    colQualifiers.clear();

    if (columnMapping == null) {
      throw new SerDeException("Error: hbase.columns.mapping missing for this HBase table.");
    }

    if (columnMapping.equals("") || columnMapping.equals(HBASE_KEY_COL)) {
      throw new SerDeException("Error: hbase.columns.mapping specifies only the HBase table"
          + " row key. A valid Hive-HBase table must specify at least one additional column.");
    }

    String[] mapping = columnMapping.split(",");

    for (int i = 0; i < mapping.length; i++) {
      String elem = mapping[i];
      int idxFirst = elem.indexOf(":");
      int idxLast = elem.lastIndexOf(":");

      if (idxFirst < 0 || !(idxFirst == idxLast)) {
        throw new SerDeException("Error: the HBase columns mapping contains a badly formed "
            + "column family, column qualifier specification.");
      }

      if (elem.equals(HBASE_KEY_COL)) {
        rowKeyIndex = i;
        colFamilies.add(elem);
        colQualifiers.add(null);
      } else {
        String[] parts = elem.split(":");
        assert (parts.length > 0 && parts.length <= 2);
        colFamilies.add(parts[0]);

        if (parts.length == 2) {
          colQualifiers.add(parts[1]);
        } else {
          colQualifiers.add(null);
        }
      }
    }

    if (rowKeyIndex == -1) {
      colFamilies.add(0, HBASE_KEY_COL);
      colQualifiers.add(0, null);
      rowKeyIndex = 0;
    }

    if (colFamilies.size() != colQualifiers.size()) {
      throw new SerDeException("Error in parsing the hbase columns mapping.");
    }

    // Populate the corresponding byte[] lists if the client has passed in non-null lists.
    if (colFamiliesBytes != null) {
      colFamiliesBytes.clear();

      for (String fam : colFamilies) {
        colFamiliesBytes.add(Bytes.toBytes(fam));
      }
    }

    if (colQualifiersBytes != null) {
      colQualifiersBytes.clear();

      for (String qual : colQualifiers) {
        if (qual == null) {
          colQualifiersBytes.add(null);
        } else {
          colQualifiersBytes.add(Bytes.toBytes(qual));
        }
      }
    }

    if (colFamiliesBytes != null && colQualifiersBytes != null) {
      if (colFamiliesBytes.size() != colQualifiersBytes.size()) {
        throw new SerDeException("Error in caching the bytes for the hbase column families "
            + "and qualifiers.");
      }
    }

    return rowKeyIndex;
  }

  public static boolean isSpecialColumn(String hbaseColumnName) {
    return hbaseColumnName.equals(HBASE_KEY_COL);
  }
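
  // Illustrative example (added commentary, not in the original source):
  // given the mapping "cf1:col1,cf2:", no ":key" entry is present, so
  // parseColumnMapping prepends the row key and returns rowKeyIndex = 0,
  // leaving:
  //   colFamilies   = [":key", "cf1",  "cf2"]
  //   colQualifiers = [null,   "col1", null]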
  private void initHBaseSerDeParameters(Configuration job, Properties tbl, String serdeName)
      throws SerDeException {

    // Read configuration parameters.
    hbaseColumnsMapping = tbl.getProperty(HBaseTimeSerDe.HBASE_COLUMNS_MAPPING);
    String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES);

    // Parse the HBase columns mapping and initialize the column families and qualifiers.
    hbaseColumnFamilies = new ArrayList<String>();
    hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    hbaseColumnQualifiers = new ArrayList<String>();
    hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    iKey = parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
        hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);

    // Build the type property string if not supplied.
    if (columnTypeProperty == null) {
      StringBuilder sb = new StringBuilder();

      for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (sb.length() > 0) {
          sb.append(":");
        }

        String colFamily = hbaseColumnFamilies.get(i);
        String colQualifier = hbaseColumnQualifiers.get(i);

        if (isSpecialColumn(colFamily)) {
          // The row key column becomes a STRING.
          sb.append(Constants.STRING_TYPE_NAME);
        } else if (colQualifier == null) {
          // A whole column family becomes a MAP.
          sb.append(Constants.MAP_TYPE_NAME + "<" + Constants.STRING_TYPE_NAME + ","
              + Constants.STRING_TYPE_NAME + ">");
        } else {
          // An individual column becomes a STRING.
          sb.append(Constants.STRING_TYPE_NAME);
        }
      }

      tbl.setProperty(Constants.LIST_COLUMN_TYPES, sb.toString());
    }

    serdeParams = LazySimpleSerDe.initSerdeParams(job, tbl, serdeName);

    if (hbaseColumnFamilies.size() != serdeParams.getColumnNames().size()) {
      throw new SerDeException(serdeName + ": columns has "
          + serdeParams.getColumnNames().size()
          + " elements while hbase.columns.mapping has "
          + hbaseColumnFamilies.size() + " elements (counting the key if implicit)");
    }

    separators = serdeParams.getSeparators();
    escaped = serdeParams.isEscaped();
    escapeChar = serdeParams.getEscapeChar();
    needsEscape = serdeParams.getNeedsEscape();

    // Check that the mapping schema is right: every "column-family:" entry
    // must be mapped to a MAP<string,?> Hive column.
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
      String colFamily = hbaseColumnFamilies.get(i);
      String colQualifier = hbaseColumnQualifiers.get(i);

      if (colQualifier == null && !isSpecialColumn(colFamily)) {
        TypeInfo typeInfo = serdeParams.getColumnTypes().get(i);

        if ((typeInfo.getCategory() != Category.MAP)
            || (((MapTypeInfo) typeInfo).getMapKeyTypeInfo().getTypeName()
                != Constants.STRING_TYPE_NAME)) {
          throw new SerDeException(serdeName + ": hbase column family '" + colFamily
              + "' should be mapped to Map<String,?> but is mapped to "
              + typeInfo.getTypeName());
        }
      }
    }
  }

  /**
   * Deserialize a row from the HBase Result writable to a LazyObject.
   *
   * @param result the HBase Result Writable containing the row
   * @return the deserialized object
   * @see org.apache.hadoop.hive.serde2.SerDe#deserialize(org.apache.hadoop.io.Writable)
   */
  @Override
  public Object deserialize(Writable result) throws SerDeException {
    if (!(result instanceof Result)) {
      throw new SerDeException(getClass().getName() + ": expects Result!");
    }

    cachedHBaseRow.init((Result) result, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
        hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    return cachedHBaseRow;
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return cachedObjectInspector;
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Put.class;
  }
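
  /**
   * Serialize a Hive row object into an HBase Put: the field at position
   * iKey becomes the row key, and each remaining field is written under its
   * mapped column family and qualifier. (Javadoc added for this tutorial;
   * the original method was undocumented.)
   */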
  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
    if (objInspector.getCategory() != Category.STRUCT) {
      throw new SerDeException(getClass().toString()
          + " can only serialize struct types, but we got: " + objInspector.getTypeName());
    }

    // Prepare the field ObjectInspectors.
    StructObjectInspector soi = (StructObjectInspector) objInspector;
    List<? extends StructField> fields = soi.getAllStructFieldRefs();
    List<Object> list = soi.getStructFieldsDataAsList(obj);
    List<? extends StructField> declaredFields =
        (serdeParams.getRowTypeInfo() != null
            && ((StructTypeInfo) serdeParams.getRowTypeInfo())
                .getAllStructFieldNames().size() > 0)
        ? ((StructObjectInspector) getObjectInspector()).getAllStructFieldRefs()
        : null;

    Put put = null;

    try {
      byte[] key = serializeField(iKey, null, fields, list, declaredFields);

      if (key == null) {
        throw new SerDeException("HBase row key cannot be NULL");
      }

      put = new Put(key);

      // Serialize each field.
      for (int i = 0; i < fields.size(); i++) {
        if (i == iKey) {
          // Already processed the key above.
          continue;
        }
        serializeField(i, put, fields, list, declaredFields);
      }
    } catch (IOException e) {
      throw new SerDeException(e);
    }

    return put;
  }

  private byte[] serializeField(int i, Put put, List<? extends StructField> fields,
      List<Object> list, List<? extends StructField> declaredFields) throws IOException {

    // Column name.
    String hbaseColumnFamily = hbaseColumnFamilies.get(i);
    String hbaseColumnQualifier = hbaseColumnQualifiers.get(i);

    // Get the field objectInspector and the field object.
    ObjectInspector foi = fields.get(i).getFieldObjectInspector();
    Object f = (list == null ? null : list.get(i));

    if (f == null) {
      // A null object; we do not serialize it.
      return null;
    }

    // If the field corresponds to a column family in HBase...
    if (hbaseColumnQualifier == null && !isSpecialColumn(hbaseColumnFamily)) {
      MapObjectInspector moi = (MapObjectInspector) foi;
      ObjectInspector koi = moi.getMapKeyObjectInspector();
      ObjectInspector voi = moi.getMapValueObjectInspector();
      Map<?, ?> map = moi.getMap(f);

      if (map == null) {
        return null;
      }

      for (Map.Entry<?, ?> entry : map.entrySet()) {
        // Get the key: it becomes the column qualifier.
        serializeStream.reset();
        serialize(entry.getKey(), koi, 3);
        byte[] columnQualifierBytes = new byte[serializeStream.getCount()];
        System.arraycopy(serializeStream.getData(), 0, columnQualifierBytes, 0,
            serializeStream.getCount());

        // Get the value.
        serializeStream.reset();
        boolean isNotNull = serialize(entry.getValue(), voi, 3);

        if (!isNotNull) {
          continue;
        }

        byte[] value = new byte[serializeStream.getCount()];
        System.arraycopy(serializeStream.getData(), 0, value, 0, serializeStream.getCount());
        put.add(hbaseColumnFamiliesBytes.get(i), columnQualifierBytes, value);
      }
    } else {
      // If the field that is passed in is NOT a primitive, and either the
      // field is not declared (no schema was given at initialization), or
      // the field is declared as a primitive in initialization, serialize
      // the data to a JSON string. Otherwise serialize the data in the
      // delimited way.
      serializeStream.reset();
      boolean isNotNull;

      if (!foi.getCategory().equals(Category.PRIMITIVE)
          && (declaredFields == null
              || declaredFields.get(i).getFieldObjectInspector().getCategory()
                  .equals(Category.PRIMITIVE)
              || useJSONSerialize)) {
        isNotNull = serialize(SerDeUtils.getJSONString(f, foi),
            PrimitiveObjectInspectorFactory.javaStringObjectInspector, 1);
      } else {
        isNotNull = serialize(f, foi, 1);
      }

      if (!isNotNull) {
        return null;
      }

      byte[] key = new byte[serializeStream.getCount()];
      System.arraycopy(serializeStream.getData(), 0, key, 0, serializeStream.getCount());

      if (i == iKey) {
        return key;
      }

      put.add(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i), key);
    }

    return null;
  }
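
  // Note on separator levels (added commentary, not in the original):
  // top-level fields are serialized starting at level 1, so e.g. a
  // list-typed column uses separators[1] between its elements. Map entries
  // of a column-family field start at level 3, presumably because the
  // entry and key/value separators (levels 1 and 2) are replaced by the
  // HBase qualifier/cell structure itself.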
  /**
   * Serialize the given object into the serializeStream ByteStream.
   *
   * @param obj          the object for the current field
   * @param objInspector the ObjectInspector for the current object
   * @param level        the current level of separator
   * @return true if the object was not null (and was serialized); false otherwise
   * @throws java.io.IOException
   */
  private boolean serialize(Object obj, ObjectInspector objInspector, int level)
      throws IOException {

    switch (objInspector.getCategory()) {
      case PRIMITIVE: {
        LazyUtils.writePrimitiveUTF8(serializeStream, obj,
            (PrimitiveObjectInspector) objInspector, escaped, escapeChar, needsEscape);
        return true;
      }
      case LIST: {
        char separator = (char) separators[level];
        ListObjectInspector loi = (ListObjectInspector) objInspector;
        List<?> list = loi.getList(obj);
        ObjectInspector eoi = loi.getListElementObjectInspector();

        if (list == null) {
          return false;
        }

        for (int i = 0; i < list.size(); i++) {
          if (i > 0) {
            serializeStream.write(separator);
          }
          serialize(list.get(i), eoi, level + 1);
        }
        return true;
      }
      case MAP: {
        char separator = (char) separators[level];
        char keyValueSeparator = (char) separators[level + 1];
        MapObjectInspector moi = (MapObjectInspector) objInspector;
        ObjectInspector koi = moi.getMapKeyObjectInspector();
        ObjectInspector voi = moi.getMapValueObjectInspector();
        Map<?, ?> map = moi.getMap(obj);

        if (map == null) {
          return false;
        }

        boolean first = true;
        for (Map.Entry<?, ?> entry : map.entrySet()) {
          if (first) {
            first = false;
          } else {
            serializeStream.write(separator);
          }
          serialize(entry.getKey(), koi, level + 2);
          serializeStream.write(keyValueSeparator);
          serialize(entry.getValue(), voi, level + 2);
        }
        return true;
      }
      case STRUCT: {
        char separator = (char) separators[level];
        StructObjectInspector soi = (StructObjectInspector) objInspector;
        List<? extends StructField> fields = soi.getAllStructFieldRefs();
        List<Object> list = soi.getStructFieldsDataAsList(obj);

        if (list == null) {
          return false;
        }

        for (int i = 0; i < list.size(); i++) {
          if (i > 0) {
            serializeStream.write(separator);
          }
          serialize(list.get(i), fields.get(i).getFieldObjectInspector(), level + 1);
        }
        return true;
      }
    }

    throw new RuntimeException("Unknown category type: " + objInspector.getCategory());
  }

  /**
   * @return the useJSONSerialize flag
   */
  public boolean isUseJSONSerialize() {
    return useJSONSerialize;
  }

  /**
   * @param useJSONSerialize the useJSONSerialize flag to set
   */
  public void setUseJSONSerialize(boolean useJSONSerialize) {
    this.useJSONSerialize = useJSONSerialize;
  }

  /**
   * @return the 0-based offset of the key column within the table
   */
  int getKeyColumnOffset() {
    return iKey;
  }
}
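
To see the SerDe in action outside Hive, the sketch below initializes it by hand. This is a minimal, illustrative driver, not part of the original class: the column names and mapping are made up, and it assumes the Hive and HBase jars imported above are on the classpath.

package com.ask.hive.hbase;

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;

public class HBaseTimeSerDeDemo {

  public static void main(String[] args) throws Exception {
    Properties tbl = new Properties();

    // One Hive column per mapping entry; ":key" is the HBase row key.
    tbl.setProperty(HBaseTimeSerDe.HBASE_COLUMNS_MAPPING, ":key,cf1:col1,cf2:");
    tbl.setProperty("columns", "rowkey,col1,family_map");
    // "columns.types" is left unset on purpose: initHBaseSerDeParameters()
    // infers string:string:map<string,string> from the mapping.

    HBaseTimeSerDe serde = new HBaseTimeSerDe();
    serde.initialize(new Configuration(), tbl);

    // Prints the inferred row type, e.g.
    // struct<rowkey:string,col1:string,family_map:map<string,string>>
    System.out.println(serde.getObjectInspector().getTypeName());
  }
}

In a real deployment you would not drive the SerDe yourself: the Hive DDL wires it in through the HBase storage handler (STORED BY ... WITH SERDEPROPERTIES ("hbase.columns.mapping" = ...)), and Hive supplies these properties to initialize() automatically.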