Java tutorial: a Hive CSV SerDe built on opencsv (org.apache.lens.lib.query.CSVSerde)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lens.lib.query;

import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMNS;
import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMN_TYPES;

import java.io.*;
import java.util.*;

import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.lazy.LazyInteger;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;

/**
 * CSVSerde uses opencsv (http://opencsv.sourceforge.net/) to serialize/deserialize columns as CSV.
 */
public final class CSVSerde extends AbstractSerDe {

  /** The default null format. */
  public static final String DEFAULT_NULL_FORMAT = "NULL";

  /** The default collection separator. */
  public static final char DEFAULT_COLLECTION_SEPERATOR = ',';

  /** The default struct field separator. */
  public static final char DEFAULT_STRUCT_FIELD_SEPERATOR = ':';

  /** The default union tag field separator. */
  public static final char DEFAULT_UNION_TAG_FIELD_SEPERATOR = ':';

  /** The default map key-value separator. */
  public static final char DEFAULT_MAP_KEY_VALUE_SEPERATOR = '=';

  /** The object inspector for a row. */
  private ObjectInspector inspector;

  /** The output fields, reused across serialize() calls. */
  private String[] outputFields;

  /** The number of columns. */
  private int numCols;

  /** The row object, reused across deserialize() calls. */
  private List<Object> row;

  /** The column types. */
  private List<TypeInfo> columnTypes;

  /** The column object inspectors. */
  private List<ObjectInspector> columnObjectInspectors;

  /** The CSV separator character. */
  private char separatorChar;

  /** The CSV quote character. */
  private char quoteChar;

  /** The CSV escape character. */
  private char escapeChar;

  /** The collection separator. */
  private char collectionSeperator;

  /** The struct field separator. */
  private char structFieldSeperator;

  /** The union tag field separator. */
  private char unionTagFieldSeperator;

  /** The map key-value separator. */
  private char mapKeyValueSeperator;

  /** The string that represents NULL. */
  private String nullString;
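  // A sketch of how one column value is flattened before CSV encoding (illustrative values):
  //   array<int>                  [1, 2, 3]       -> 1,2,3    (collectionSeperator)
  //   struct<name:string,age:int> ("bob", 30)     -> bob:30   (structFieldSeperator)
  //   map<string,int>             {"a":1, "b":2}  -> a=1,b=2  (mapKeyValueSeperator within each entry)
  //   uniontype<int,string>       tag 1, "x"      -> 1:x      (unionTagFieldSeperator after the tag)
  // CSVWriter then quotes/escapes each flattened field when writing the full record.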
  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.hive.serde2.AbstractSerDe#initialize(org.apache.hadoop.conf.Configuration,
   * java.util.Properties)
   */
  @Override
  public void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
    List<String> columnNames = new ArrayList<String>();
    if (tbl.getProperty(LIST_COLUMNS) != null) {
      String[] names = tbl.getProperty(LIST_COLUMNS).split("(?!\"),(?!\")");
      for (String name : names) {
        columnNames.add(StringEscapeUtils.unescapeCsv(name));
      }
    }
    String columnTypeProperty = tbl.getProperty(LIST_COLUMN_TYPES);
    columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    numCols = columnNames.size();

    this.outputFields = new String[numCols];
    row = new ArrayList<Object>(numCols);
    for (int i = 0; i < numCols; i++) {
      row.add(null);
    }

    ObjectInspector colObjectInspector;
    columnObjectInspectors = new ArrayList<ObjectInspector>(numCols);
    for (int col = 0; col < numCols; col++) {
      colObjectInspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(columnTypes.get(col));
      columnObjectInspectors.add(colObjectInspector);
    }
    this.inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnObjectInspectors);

    separatorChar = getProperty(tbl, "separatorChar", CSVWriter.DEFAULT_SEPARATOR);
    quoteChar = getProperty(tbl, "quoteChar", CSVWriter.DEFAULT_QUOTE_CHARACTER);
    escapeChar = getProperty(tbl, "escapeChar", CSVWriter.DEFAULT_ESCAPE_CHARACTER);
    nullString = tbl.getProperty("nullString", DEFAULT_NULL_FORMAT);
    collectionSeperator = getProperty(tbl, "collectionSeperator", DEFAULT_COLLECTION_SEPERATOR);
    structFieldSeperator = getProperty(tbl, "structFieldSeperator", DEFAULT_STRUCT_FIELD_SEPERATOR);
    unionTagFieldSeperator = getProperty(tbl, "unionTagFieldSeperator", DEFAULT_UNION_TAG_FIELD_SEPERATOR);
    mapKeyValueSeperator = getProperty(tbl, "mapKeyValueSeperator", DEFAULT_MAP_KEY_VALUE_SEPERATOR);
  }

  /**
   * Gets a single-character table property.
   *
   * @param tbl      the table properties
   * @param property the property name
   * @param def      the default value
   * @return the first character of the property value, or the default if unset
   */
  private char getProperty(final Properties tbl, final String property, final char def) {
    final String val = tbl.getProperty(property);
    if (val != null) {
      return val.charAt(0);
    }
    return def;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.hive.serde2.AbstractSerDe#serialize(java.lang.Object,
   * org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector)
   */
  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
    final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
    final List<? extends StructField> outputFieldRefs = outputRowOI.getAllStructFieldRefs();
    if (outputFieldRefs.size() != numCols) {
      throw new SerDeException("Cannot serialize the object because there are " + outputFieldRefs.size()
        + " fields but the table has " + numCols + " columns.");
    }

    try {
      // Get all data out.
      for (int c = 0; c < numCols; c++) {
        final Object field = outputRowOI.getStructFieldData(obj, outputFieldRefs.get(c));
        // Get the field objectInspector and the field object.
        ObjectInspector fieldOI = outputFieldRefs.get(c).getFieldObjectInspector();
        outputFields[c] = serializeField(field, fieldOI);
      }

      final StringWriter writer = new StringWriter();
      final CSVWriter csv = newWriter(writer, separatorChar, quoteChar, escapeChar);
      csv.writeNext(outputFields);
      csv.close();
      return new Text(writer.toString());
    } catch (final IOException ioe) {
      throw new SerDeException(ioe);
    }
  }
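  // Example (illustrative values): with table properties columns = "id,tags" and
  // columns.types = "int:array<string>", serialize() turns the row {1, ["a", "b"]}
  // into a single CSV record whose second field is a,b (quoted by CSVWriter).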
  /**
   * Serializes a single field, recursing into nested types.
   *
   * @param field   the field data
   * @param fieldOI the field object inspector
   * @return the serialized string
   * @throws IOException    Signals that an I/O exception has occurred.
   * @throws SerDeException the SerDe exception
   */
  private String serializeField(Object field, ObjectInspector fieldOI) throws IOException, SerDeException {
    if (field == null) {
      return nullString;
    }
    List<?> list;
    switch (fieldOI.getCategory()) {
    case PRIMITIVE:
      if (fieldOI instanceof StringObjectInspector) {
        final StringObjectInspector fieldStringOI = (StringObjectInspector) fieldOI;
        return fieldStringOI.getPrimitiveJavaObject(field);
      } else {
        return field.toString();
      }
    case LIST:
      ListObjectInspector loi = (ListObjectInspector) fieldOI;
      list = loi.getList(field);
      ObjectInspector eoi = loi.getListElementObjectInspector();
      if (list == null) {
        return nullString;
      } else {
        StringBuilder listString = new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
          if (i > 0) {
            listString.append(collectionSeperator);
          }
          listString.append(serializeField(list.get(i), eoi));
        }
        return listString.toString();
      }
    case MAP:
      MapObjectInspector moi = (MapObjectInspector) fieldOI;
      ObjectInspector koi = moi.getMapKeyObjectInspector();
      ObjectInspector voi = moi.getMapValueObjectInspector();
      Map<?, ?> map = moi.getMap(field);
      if (map == null) {
        return nullString;
      } else {
        StringBuilder mapString = new StringBuilder();
        boolean first = true;
        for (Map.Entry<?, ?> entry : map.entrySet()) {
          if (first) {
            first = false;
          } else {
            mapString.append(collectionSeperator);
          }
          mapString.append(serializeField(entry.getKey(), koi));
          mapString.append(mapKeyValueSeperator);
          mapString.append(serializeField(entry.getValue(), voi));
        }
        return mapString.toString();
      }
    case STRUCT:
      StructObjectInspector soi = (StructObjectInspector) fieldOI;
      List<? extends StructField> fields = soi.getAllStructFieldRefs();
      list = soi.getStructFieldsDataAsList(field);
      if (list == null) {
        return nullString;
      } else {
        StringBuilder structString = new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
          if (i > 0) {
            structString.append(structFieldSeperator);
          }
          structString.append(serializeField(list.get(i), fields.get(i).getFieldObjectInspector()));
        }
        return structString.toString();
      }
    case UNION:
      UnionObjectInspector uoi = (UnionObjectInspector) fieldOI;
      List<? extends ObjectInspector> ois = uoi.getObjectInspectors();
      if (ois == null) {
        return nullString;
      } else {
        StringBuilder unionString = new StringBuilder();
        ByteArrayOutputStream tagStream = new ByteArrayOutputStream();
        LazyInteger.writeUTF8(tagStream, uoi.getTag(field));
        unionString.append(new String(tagStream.toByteArray(), "UTF-8"));
        unionString.append(unionTagFieldSeperator);
        unionString.append(serializeField(uoi.getField(field), ois.get(uoi.getTag(field))));
        return unionString.toString();
      }
    default:
      break;
    }
    throw new RuntimeException("Unknown category type: " + fieldOI.getCategory());
  }
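  // Note the recursion above: nested types reuse the same secondary separators at
  // every depth, so array<array<int>> [[1, 2], [3]] flattens to 1,2,3 - the format
  // is only unambiguous for a single level of nesting.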
  /**
   * Gets the Java object corresponding to the type, represented as a string.
   *
   * @param colString the column string
   * @param type      the type
   * @return the standard Java object for primitive types, a List of objects for array and struct types, a
   *         Map&lt;Object, Object&gt; for map types, and the contained object itself for union types
   */
  private Object getColumnObject(String colString, TypeInfo type) {
    if (colString.equals(nullString)) {
      return null;
    }
    switch (type.getCategory()) {
    case PRIMITIVE:
      return ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector,
        TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(type)).convert(colString);
    case LIST:
      TypeInfo elementType = ((ListTypeInfo) type).getListElementTypeInfo();
      List<Object> olist = new ArrayList<Object>();
      List<String> inlist = Arrays.asList(StringUtils.split(colString, collectionSeperator));
      for (String ins : inlist) {
        olist.add(getColumnObject(ins, elementType));
      }
      return olist;
    case MAP:
      TypeInfo keyType = ((MapTypeInfo) type).getMapKeyTypeInfo();
      TypeInfo valueType = ((MapTypeInfo) type).getMapValueTypeInfo();
      Map<Object, Object> omap = new LinkedHashMap<Object, Object>();
      List<String> maplist = Arrays.asList(StringUtils.split(colString, collectionSeperator));
      for (String ins : maplist) {
        String[] entry = StringUtils.split(ins, mapKeyValueSeperator);
        omap.put(getColumnObject(entry[0], keyType), getColumnObject(entry[1], valueType));
      }
      return omap;
    case STRUCT:
      List<TypeInfo> elementTypes = ((StructTypeInfo) type).getAllStructFieldTypeInfos();
      List<Object> slist = new ArrayList<Object>();
      List<String> instructlist = Arrays.asList(StringUtils.split(colString, structFieldSeperator));
      for (int i = 0; i < elementTypes.size(); i++) {
        slist.add(getColumnObject(instructlist.get(i), elementTypes.get(i)));
      }
      return slist;
    case UNION:
      List<TypeInfo> unionTypes = ((UnionTypeInfo) type).getAllUnionObjectTypeInfos();
      String[] unionElements = StringUtils.split(colString, unionTagFieldSeperator);
      int tag = Integer.parseInt(unionElements[0]);
      // Recurse on the value portion only; the serialized form is <tag><separator><value>.
      return getColumnObject(unionElements[1], unionTypes.get(tag));
    }
    return null;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.hive.serde2.AbstractSerDe#deserialize(org.apache.hadoop.io.Writable)
   */
  @Override
  public Object deserialize(final Writable blob) throws SerDeException {
    Text rowText = (Text) blob;
    CSVReader csv = null;
    try {
      csv = newReader(new CharArrayReader(rowText.toString().toCharArray()), separatorChar, quoteChar, escapeChar);
      final String[] read = csv.readNext();

      for (int i = 0; i < numCols; i++) {
        if (read != null && i < read.length && !read[i].equals(nullString)) {
          row.set(i, getColumnObject(read[i], columnTypes.get(i)));
        } else {
          row.set(i, null);
        }
      }
      return row;
    } catch (final Exception e) {
      throw new SerDeException(e);
    } finally {
      if (csv != null) {
        try {
          csv.close();
        } catch (final Exception e) {
          // ignore
        }
      }
    }
  }

  /**
   * New reader.
   *
   * @param reader    the reader
   * @param separator the separator
   * @param quote     the quote
   * @param escape    the escape
   * @return the CSV reader
   */
  private CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
    // CSVReader will throw an exception if any of separator, quote, or escape is the same, but
    // the CSV format specifies that the escape character and quote char are the same... very weird
    if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
      return new CSVReader(reader, separator, quote);
    } else {
      return new CSVReader(reader, separator, quote, escape);
    }
  }
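  // Background for the check above: RFC 4180 escapes a quote inside a quoted field
  // by doubling it ("a""b"), i.e. the escape and quote characters coincide, while
  // opencsv rejects constructors where they are equal - hence the two code paths.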
  /**
   * New writer.
   *
   * @param writer    the writer
   * @param separator the separator
   * @param quote     the quote
   * @param escape    the escape
   * @return the CSV writer
   */
  private CSVWriter newWriter(final Writer writer, char separator, char quote, char escape) {
    if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
      return new CSVWriter(writer, separator, quote, "");
    } else {
      return new CSVWriter(writer, separator, quote, escape, "");
    }
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return inspector;
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  @Override
  public SerDeStats getSerDeStats() {
    return null;
  }
}
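To see the SerDe end to end, here is a minimal, hypothetical driver. The class name CSVSerdeDemo and all column names and values are illustrative only; it assumes the Hive serde2, Hadoop common, and opencsv jars are on the classpath. It initializes the SerDe from table properties, deserializes one CSV record, and serializes the resulting row back.

import java.util.Properties;

import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.Text;

import org.apache.lens.lib.query.CSVSerde;

public class CSVSerdeDemo {

  public static void main(String[] args) throws Exception {
    CSVSerde serde = new CSVSerde();

    // Table-level properties: column names and their Hive types.
    Properties props = new Properties();
    props.setProperty(serdeConstants.LIST_COLUMNS, "id,name,tags");
    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int:string:array<string>");
    serde.initialize(null, props); // initialize() never touches the Configuration

    // Deserialize one CSV record into the reusable row (a List<Object>).
    Object row = serde.deserialize(new Text("1,bob,\"x,y\""));
    System.out.println(row); // [1, bob, [x, y]]

    // Serialize the row back to a CSV record via the row's object inspector.
    Text line = (Text) serde.serialize(row, serde.getObjectInspector());
    System.out.println(line); // one CSV record; this opencsv version quotes each field
  }
}

Inside Hive these same properties come from the table's SERDEPROPERTIES and column definitions; passing null for the Configuration works here only because initialize() never reads it.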