Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ebay.nest.io.sede;

import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

import com.ebay.nest.io.nestfile.HiveDecimal;
import com.ebay.nest.io.nestfile.HiveVarchar;
import com.ebay.nest.io.sede.objectinspector.ObjectInspector;
import com.ebay.nest.io.sede.objectinspector.ObjectInspectorFactory;
import com.ebay.nest.io.sede.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import com.ebay.nest.io.sede.objectinspector.StructObjectInspector;
import com.ebay.nest.io.sede.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import com.ebay.nest.io.sede.typeinfo.ParameterizedPrimitiveTypeUtils;
import com.ebay.nest.io.sede.typeinfo.PrimitiveTypeInfo;
import com.ebay.nest.io.sede.typeinfo.TypeInfo;
import com.ebay.nest.io.sede.typeinfo.TypeInfoUtils;
import com.ebay.nest.io.sede.typeinfo.VarcharTypeParams;

/**
 * RegexSerDe uses regular expression (regex) to deserialize data. It doesn't
 * support data serialization.
 *
 * It can deserialize the data using regex and extracts groups as columns.
 *
 * In deserialization stage, if a row does not match the regex, then
 * {@link #deserialize(Writable)} returns {@code null} for that row. If a row
 * matches the regex but a group did not take part in the match, or its text
 * cannot be converted to the column type, that column is NULL. The number of
 * capturing groups in the regex must equal the number of columns; otherwise
 * {@link #deserialize(Writable)} throws a {@link SerDeException}.
 *
 * NOTE: Regex SerDe supports primitive column types such as TINYINT, SMALLINT,
 * INT, BIGINT, FLOAT, DOUBLE, STRING, BOOLEAN, TIMESTAMP, DATE, DECIMAL and
 * VARCHAR.
 *
 * NOTE: This implementation uses javaStringObjectInspector for STRING. A
 * more efficient implementation should use UTF-8 encoded Text and
 * writableStringObjectInspector. We should switch to that when we have a UTF-8
 * based Regex library.
 */
public class RegexSerDe extends AbstractSerDe {

  public static final Log LOG = LogFactory.getLog(RegexSerDe.class.getName());

  int numColumns;
  String inputRegex;

  Pattern inputPattern;

  StructObjectInspector rowOI;
  // Reused for every deserialized row; callers must not hold on to it.
  List<Object> row;
  List<TypeInfo> columnTypes;
  Object[] outputFields;
  Text outputRowText;

  // Each kind of problem is logged only once to avoid flooding the log.
  boolean alreadyLoggedNoMatch = false;
  boolean alreadyLoggedPartialMatch = false;

  // Number of rows not matching the regex
  long unmatchedRowsCount = 0;
  // Number of columns that could not be filled in for rows matching the regex
  // (missing group or unparsable text).
  long partialMatchedRowsCount = 0;

  /**
   * Reads the serde properties from the table definition, compiles the input
   * regex and builds the row object inspector.
   *
   * @param conf job configuration (not read by this method)
   * @param tbl table properties; must contain "input.regex" as well as the
   *        column names and column types properties
   * @throws SerDeException if "input.regex" or the column properties are
   *         missing, if the number of column names and types differ, or if a
   *         column has a type this SerDe does not support
   */
  @Override
  public void initialize(Configuration conf, Properties tbl) throws SerDeException {

    // We can get the table definition from tbl.

    // Read the configuration parameters
    inputRegex = tbl.getProperty("input.regex");
    String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
    String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    boolean inputRegexIgnoreCase =
        "true".equalsIgnoreCase(tbl.getProperty("input.regex.case.insensitive"));

    // output format string is not supported anymore, warn user of deprecation
    if (null != tbl.getProperty("output.format.string")) {
      LOG.warn("output.format.string has been deprecated");
    }

    // Parse the configuration parameters
    if (inputRegex == null) {
      inputPattern = null;
      throw new SerDeException("This table does not have serde property \"input.regex\"!");
    }
    int flags = Pattern.DOTALL;
    if (inputRegexIgnoreCase) {
      flags |= Pattern.CASE_INSENSITIVE;
    }
    inputPattern = Pattern.compile(inputRegex, flags);

    // Fail with a descriptive error instead of a NullPointerException.
    if (columnNameProperty == null || columnTypeProperty == null) {
      throw new SerDeException("This table does not have serde properties \""
          + serdeConstants.LIST_COLUMNS + "\" and \""
          + serdeConstants.LIST_COLUMN_TYPES + "\"!");
    }

    List<String> columnNames = Arrays.asList(columnNameProperty.split(","));
    columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    // A plain assert is skipped unless the JVM runs with -ea, so check explicitly.
    if (columnNames.size() != columnTypes.size()) {
      throw new SerDeException("Number of column names (" + columnNames.size()
          + ") doesn't match the number of column types (" + columnTypes.size() + ")");
    }
    numColumns = columnNames.size();

    /* Constructing the row ObjectInspector:
     * The row consists of some set of primitive columns, each column will
     * be a java object of primitive type.
     */
    List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(numColumns);
    for (int c = 0; c < numColumns; c++) {
      ObjectInspector columnOI = objectInspectorFor(columnTypes.get(c));
      if (columnOI == null) {
        throw new SerDeException(getClass().getName()
            + " doesn't allow column [" + c + "] named "
            + columnNames.get(c) + " with type " + columnTypes.get(c));
      }
      columnOIs.add(columnOI);
    }

    // StandardStruct uses ArrayList to store the row.
    rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);

    // Constructing the row object, etc, which will be reused for all rows.
    row = new ArrayList<Object>(numColumns);
    for (int c = 0; c < numColumns; c++) {
      row.add(null);
    }
    outputFields = new Object[numColumns];
    outputRowText = new Text();
  }

  /**
   * Picks the Java object inspector for one column type.
   *
   * @return the inspector, or {@code null} if the type is not supported
   */
  private static ObjectInspector objectInspectorFor(TypeInfo typeInfo) {
    String typeName = typeInfo.getTypeName();
    if (typeName.equals(serdeConstants.STRING_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    } else if (typeName.equals(serdeConstants.TINYINT_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaByteObjectInspector;
    } else if (typeName.equals(serdeConstants.SMALLINT_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaShortObjectInspector;
    } else if (typeName.equals(serdeConstants.INT_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    } else if (typeName.equals(serdeConstants.BIGINT_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaLongObjectInspector;
    } else if (typeName.equals(serdeConstants.FLOAT_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaFloatObjectInspector;
    } else if (typeName.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector;
    } else if (typeName.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector;
    } else if (typeName.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaTimestampObjectInspector;
    } else if (typeName.equals(serdeConstants.DATE_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaDateObjectInspector;
    } else if (typeName.equals(serdeConstants.DECIMAL_TYPE_NAME)) {
      return PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector;
    } else if (typeInfo instanceof PrimitiveTypeInfo
        && ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() == PrimitiveCategory.VARCHAR) {
      return PrimitiveObjectInspectorFactory
          .getPrimitiveJavaObjectInspector((PrimitiveTypeInfo) typeInfo);
    }
    return null;
  }

  /** Returns the row object inspector built by {@code initialize}. */
  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return rowOI;
  }

  /** Returns {@code Text.class}. */
  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  /**
   * Matches one line against the input regex and converts each capturing
   * group to the Java object of the corresponding column type.
   *
   * @param blob the line to parse; cast to {@link Text}
   * @return the shared, reused row list, or {@code null} if the line does not
   *         match the regex
   * @throws SerDeException if the number of capturing groups in the regex
   *         differs from the number of columns
   */
  @Override
  public Object deserialize(Writable blob) throws SerDeException {

    Text rowText = (Text) blob;
    Matcher m = inputPattern.matcher(rowText.toString());

    if (m.groupCount() != numColumns) {
      throw new SerDeException("Number of matching groups doesn't match the number of columns");
    }

    // If do not match, ignore the line.
    if (!m.matches()) {
      unmatchedRowsCount++;
      if (!alreadyLoggedNoMatch) {
        // Report the row if its the first time
        LOG.warn("" + unmatchedRowsCount + " unmatched rows are found: " + rowText);
        alreadyLoggedNoMatch = true;
      }
      return null;
    }

    // Otherwise, return the row.
    for (int c = 0; c < numColumns; c++) {
      String groupText = m.group(c + 1);
      Object value = null;
      if (groupText == null) {
        // The group did not take part in the match. Handle it explicitly:
        // Boolean.valueOf(null) would silently yield FALSE instead of NULL.
        recordPartialMatch(c, rowText);
      } else {
        try {
          value = convertField(groupText, columnTypes.get(c));
        } catch (RuntimeException e) {
          // Text cannot be converted to the column type: the column becomes NULL.
          recordPartialMatch(c, rowText);
        }
      }
      row.set(c, value);
    }
    return row;
  }

  /**
   * Converts the non-null text of one group to the Java object for the given
   * column type. Conversion failures surface as runtime exceptions.
   */
  private static Object convertField(String text, TypeInfo typeInfo) {
    String typeName = typeInfo.getTypeName();
    if (typeName.equals(serdeConstants.STRING_TYPE_NAME)) {
      return text;
    } else if (typeName.equals(serdeConstants.TINYINT_TYPE_NAME)) {
      return Byte.valueOf(text);
    } else if (typeName.equals(serdeConstants.SMALLINT_TYPE_NAME)) {
      return Short.valueOf(text);
    } else if (typeName.equals(serdeConstants.INT_TYPE_NAME)) {
      return Integer.valueOf(text);
    } else if (typeName.equals(serdeConstants.BIGINT_TYPE_NAME)) {
      return Long.valueOf(text);
    } else if (typeName.equals(serdeConstants.FLOAT_TYPE_NAME)) {
      return Float.valueOf(text);
    } else if (typeName.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
      return Double.valueOf(text);
    } else if (typeName.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
      return Boolean.valueOf(text);
    } else if (typeName.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      return Timestamp.valueOf(text);
    } else if (typeName.equals(serdeConstants.DATE_TYPE_NAME)) {
      return Date.valueOf(text);
    } else if (typeName.equals(serdeConstants.DECIMAL_TYPE_NAME)) {
      return new HiveDecimal(text);
    } else if (typeInfo instanceof PrimitiveTypeInfo
        && ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() == PrimitiveCategory.VARCHAR) {
      VarcharTypeParams varcharParams = (VarcharTypeParams) ParameterizedPrimitiveTypeUtils
          .getTypeParamsFromTypeInfo(typeInfo);
      return new HiveVarchar(text, varcharParams != null ? varcharParams.length : -1);
    }
    // Not reachable for column types accepted by initialize().
    return null;
  }

  /** Counts a column that could not be filled in and logs the first occurrence. */
  private void recordPartialMatch(int column, Text rowText) {
    partialMatchedRowsCount++;
    if (!alreadyLoggedPartialMatch) {
      // Report the row if its the first row
      LOG.warn("" + partialMatchedRowsCount
          + " partially unmatched rows are found, " + " cannot find group "
          + column + ": " + rowText);
      alreadyLoggedPartialMatch = true;
    }
  }

  /**
   * Serialization is not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
    throw new UnsupportedOperationException(
        "Regex SerDe doesn't support the serialize() method");
  }

  /** Statistics are not supported; always returns {@code null}. */
  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }
}