Java tutorial
/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2018 by Hitachi Vantara : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.hadoop.shim.common.format.avro; import org.apache.avro.Conversions; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.file.DataFileStream; import org.apache.avro.generic.GenericContainer; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.io.Decoder; import org.apache.avro.io.DecoderFactory; import org.apache.avro.util.Utf8; import org.apache.commons.vfs2.FileObject; import org.apache.commons.vfs2.FileSystemException; import org.pentaho.di.core.Const; import org.pentaho.di.core.exception.KettleException; import org.pentaho.di.core.logging.LogChannelInterface; import org.pentaho.di.core.row.RowDataUtil; import org.pentaho.di.core.row.RowMetaInterface; import org.pentaho.di.core.row.ValueMeta; import org.pentaho.di.core.row.ValueMetaInterface; import org.pentaho.di.core.row.value.ValueMetaBase; import org.pentaho.di.core.variables.VariableSpace; import org.pentaho.di.core.vfs.KettleVFS; import org.pentaho.di.i18n.BaseMessages; import org.pentaho.hadoop.shim.api.format.AvroSpec; import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.net.UnknownHostException; import java.nio.ByteBuffer; import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.time.LocalDate; import java.time.ZoneId; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; /** * The bulk of this code was taken from legacy AvroInputData and then logic from legacy AvroInputField was merged into * it. The idea here is to remove the dependency on the schema from AvroInputField so that the current new * AvroInputField can remain visible by AEL. AEL cannot see any classes dependant on shim. So code that normally be in * format-meta is here, instead. */ public class AvroNestedReader { /** * For reading container files - will be null if file is not a container file */ protected DataFileStream m_containerReader; /** * If the top level is a record */ protected GenericData.Record m_topLevelRecord; /** * If the top level is an array */ protected GenericData.Array m_topLevelArray; /** * If the top level is a map */ protected Map<Utf8, Object> m_topLevelMap; /** * True if decoding from an incoming field */ protected boolean m_decodingFromField; /** * For logging */ protected LogChannelInterface m_log; /** * The input data format */ protected RowMetaInterface m_incomingRowMeta; /** * The output data format */ protected RowMetaInterface m_outputRowMeta; /** * For reading from files of just serialized objects */ protected GenericDatumReader m_datumReader; protected Decoder m_decoder; protected InputStream m_inStream; /** * The schema used to write the file - will be null if the file is not a container file */ protected Schema m_writerSchema; /** * The schema to use for extracting values */ protected Schema m_schemaToUse; /** * The default schema to use (in the case where the schema is in an incoming field and a particular row has a null (or * unparsable/unavailable) schema */ protected Schema m_defaultSchema; /** * The default datum reader (constructed with the default schema) */ protected GenericDatumReader m_defaultDatumReader; protected Object m_defaultTopLevelObject; /** * Schema cache. Map of strings (actual schema or path to schema) to two element array. Element 0 = GenericDatumReader * configured with schema; 2 = top level structure object to use. */ protected Map<String, Object[]> m_schemaCache = new HashMap<String, Object[]>(); /** * True if the data to be decoded is json rather than binary */ protected boolean m_jsonEncoded; protected List<AvroInputField> m_normalFields; protected AvroArrayExpansion m_expansionHandler; /** * The index that the decoded fields start at in the output row */ protected int m_newFieldOffset; /** * If decoding from an incoming field, this holds its index */ protected int m_fieldToDecodeIndex = -1; /** * True if schema is in an incoming field */ protected boolean m_schemaInField; /** * If decoding from an incoming field and schema is in an incoming field, then this holds the schema field's index */ protected int m_schemaFieldIndex = -1; /** * True if the schema field contains a path to a schema rather than the schema itself */ protected boolean m_schemaFieldIsPath; /** * True if schemas read from incoming fields are to be cached in memory */ protected boolean m_cacheSchemas; /** * True if null should be output for a field if it is not present in the schema being used (otherwise an exeption is * raised) */ protected boolean m_dontComplainAboutMissingFields; protected DataFileStream<Object> m_avroDataFileStream; protected AvroToPdiConverter m_avroToPdiConverter; /** * Factory for obtaining a decoder */ protected DecoderFactory m_factory; //protected static Class<?> PKG = AvroInputMeta.class; protected static Class<?> PKG = AvroNestedReader.class; protected void init() throws KettleException { if (m_schemaToUse != null) { m_avroToPdiConverter = new AvroToPdiConverter(m_schemaToUse); initTopLevelStructure(m_schemaToUse, true); } if (m_normalFields == null || m_normalFields.size() == 0) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.NoFieldPathsDefined")); } m_expansionHandler = checkFieldPaths(m_normalFields, m_outputRowMeta); int killmeIndex = 0; for (AvroInputField f : m_normalFields) { //bypass this for now: int outputIndex = m_outputRowMeta.indexOfValue( f.getPentahoFieldName() ); int outputIndex = killmeIndex++; fieldInit(f, outputIndex); } if (m_expansionHandler != null) { m_expansionHandler.init(); } m_factory = new DecoderFactory(); } /** * Reset this field. Should be called prior to processing a new field value from the avro file * * @param space environment variables (values that environment variables resolve to cannot contain "."s) */ public void resetField(AvroInputField avroInputField, VariableSpace space) { // first clear because there may be stuff left over from processing // the previous avro object (especially if a path exited early due to // non-existent map key or array index out of bounds) avroInputField.getTempParts().clear(); for (String part : avroInputField.getPathParts()) { if (space == null) { avroInputField.getTempParts().add(part); } else { avroInputField.getTempParts().add(space.environmentSubstitute(part)); } } } /** * Initialize this field by parsing the path etc. * * @param outputIndex the index in the output row structure for this field * @throws KettleException if a problem occurs */ public void fieldInit(AvroInputField avroInputField, int outputIndex) throws KettleException { if (Const.isEmpty(avroInputField.getAvroFieldName())) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.NoPathSet")); } if (avroInputField.getPathParts() != null) { return; } String fieldPath = cleansePath(avroInputField.getAvroFieldName()); String[] temp = fieldPath.split("\\."); List<String> pathParts = new ArrayList<>(); avroInputField.setPathParts(pathParts); for (String part : temp) { pathParts.add(part); } if (pathParts.get(0).equals("$")) { pathParts.remove(0); // root record indicator } else if (pathParts.get(0).startsWith("$[")) { // strip leading $ off of array String r = pathParts.get(0).substring(1, pathParts.get(0).length()); pathParts.set(0, r); } //Re-init the temp vars? Should probably move them here (tk) avroInputField.setTempParts(new ArrayList<String>()); ValueMeta resetMeta = new ValueMeta(); resetMeta.setType(avroInputField.getPentahoType()); avroInputField.setTempValueMeta(resetMeta); avroInputField.setOutputIndex(outputIndex); } protected void initTopLevelStructure(Schema schema, boolean setDefault) throws KettleException { // what top-level structure are we using? if (schema.getType() == Schema.Type.RECORD) { m_topLevelRecord = new GenericData.Record(schema); if (setDefault) { m_defaultTopLevelObject = m_topLevelRecord; } } else if (schema.getType() == Schema.Type.UNION) { // ASSUMPTION: if the top level structure is a union then each // object we will read will be a record. We'll assume that any // non-record types in the top-level union are named types that // are referenced in the record types. We'll scan the union for the // first record type to construct our // our initial top-level object. When reading, the read method will give // us a new object (with appropriate schema) if this top level object's // schema does not match the schema of the record being currently read Schema firstUnion = null; for (Schema uS : schema.getTypes()) { if (uS.getType() == Schema.Type.RECORD) { firstUnion = uS; break; } } m_topLevelRecord = new GenericData.Record(firstUnion); if (setDefault) { m_defaultTopLevelObject = m_topLevelRecord; } } else if (schema.getType() == Schema.Type.ARRAY) { m_topLevelArray = new GenericData.Array(1, schema); // capacity, // schema if (setDefault) { m_defaultTopLevelObject = m_topLevelArray; } } else if (schema.getType() == Schema.Type.MAP) { m_topLevelMap = new HashMap<Utf8, Object>(); if (setDefault) { m_defaultTopLevelObject = m_topLevelMap; } } else { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnsupportedTopLevelStructure")); } } /** * Examines the user-specified paths for the presence of a map/array expansion. If such an expansion is detected it * checks that it is valid and, if so, creates an expansion handler for processing it. * * @param normalFields the original user-specified paths. This is modified to contain only non-expansion paths. * @param outputRowMeta the output row format * @return an AvroArrayExpansion object to handle expansions or null if no expansions are present in the user-supplied * path definitions. * @throws KettleException if a problem occurs */ protected AvroArrayExpansion checkFieldPaths(List<AvroInputField> normalFields, RowMetaInterface outputRowMeta) throws KettleException { // here we check whether there are any full map/array expansions // specified in the paths (via [*]). If so, we want to make sure // that only one is present across all paths. E.g. we can handle // multiple fields like $.person[*].first, $.person[*].last etc. // but not $.person[*].first, $.person[*].address[*].street. String expansion = null; List<AvroInputField> normalList = new ArrayList<AvroInputField>(); List<AvroInputField> expansionList = new ArrayList<AvroInputField>(); for (AvroInputField f : normalFields) { String path = f.getAvroFieldName(); if (path != null && path.lastIndexOf("[*]") >= 0) { if (path.indexOf("[*]") != path.lastIndexOf("[*]")) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.PathContainsMultipleExpansions", path)); } String pathPart = path.substring(0, path.lastIndexOf("[*]") + 3); if (expansion == null) { expansion = pathPart; } else { if (!expansion.equals(pathPart)) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.MutipleDifferentExpansions")); } } expansionList.add(f); } else { normalList.add(f); } } normalFields.clear(); for (AvroInputField f : normalList) { normalFields.add(f); } if (expansionList.size() > 0) { List<AvroInputField> subFields = new ArrayList<AvroInputField>(); for (AvroInputField ef : expansionList) { AvroInputField subField = new AvroInputField(); subField.setPentahoFieldName(ef.getPentahoFieldName()); String path = ef.getAvroFieldName(); if (path.charAt(path.length() - 2) == '*') { path = "dummy"; // pulling a primitive out of the map/array (path // doesn't matter) } else { path = path.substring(path.lastIndexOf("[*]") + 3, path.length()); path = "$" + path; } subField.setAvroFieldName(path); subField.setIndexedVals(ef.getIndexedVals()); subField.setPentahoType(ef.getPentahoType()); subFields.add(subField); } AvroArrayExpansion exp = this.new AvroArrayExpansion(subFields); exp.m_expansionPath = expansion; exp.m_outputRowMeta = outputRowMeta; return exp; } return null; } /////////////////////// ******* from Legacy AvroInputField *************** /////////////////////////// /** * Processes a map at this point in the path. * * @param map the map to process * @param s the current schema at this point in the path * @param ignoreMissing true if null is to be returned for user fields that don't appear in the schema * @return the field value or null for out-of-bounds array indexes, non-existent map keys or unsupported avro types. * @throws KettleException if a problem occurs */ public Object convertToKettleValue(AvroInputField avroInputField, Map<Utf8, Object> map, Schema s, Schema defaultSchema, boolean ignoreMissing) throws KettleException { if (map == null) { return null; } if (avroInputField.getTempParts().size() == 0) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathMap")); } String part = avroInputField.getTempParts().remove(0); if (!(part.charAt(0) == '[')) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathMap2", part)); } String key = part.substring(1, part.indexOf(']')); if (part.indexOf(']') < part.length() - 1) { // more dimensions to the array/map part = part.substring(part.indexOf(']') + 1, part.length()); avroInputField.getTempParts().add(0, part); } Object value = map.get(new Utf8(key)); if (value == null) { return null; } Schema valueType = s.getValueType(); if (valueType.getType() == Schema.Type.UNION) { if (value instanceof GenericContainer) { // we can ask these things for their schema (covers // records, arrays, enums and fixed) valueType = ((GenericContainer) value).getSchema(); } else { // either have a map or primitive here if (value instanceof Map) { // now have to look for the schema of the map Schema mapSchema = null; for (Schema ts : valueType.getTypes()) { if (ts.getType() == Schema.Type.MAP) { mapSchema = ts; break; } } if (mapSchema == null) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.UnableToFindSchemaForUnionMap")); } valueType = mapSchema; } else { if (avroInputField.getTempValueMeta().getType() != ValueMetaInterface.TYPE_STRING) { // we have a two element union, where one element is the type // "null". So in this case we actually have just one type and can // output specific values of it (instead of using String as a // catch all for varying primitive types in the union) valueType = checkUnion(valueType); } else { // use the string representation of the value valueType = Schema.create(Schema.Type.STRING); } } } } // what have we got? if (valueType.getType() == Schema.Type.RECORD) { return convertToKettleValue(avroInputField, (GenericData.Record) value, valueType, defaultSchema, ignoreMissing); } else if (valueType.getType() == Schema.Type.ARRAY) { return convertToKettleValue(avroInputField, (GenericData.Array) value, valueType, defaultSchema, ignoreMissing); } else if (valueType.getType() == Schema.Type.MAP) { return convertToKettleValue(avroInputField, (Map<Utf8, Object>) value, valueType, defaultSchema, ignoreMissing); } else { // assume a primitive return getPrimitive(avroInputField, value, valueType); } } /** * Processes an array at this point in the path. * * @param array the array to process * @param s the current schema at this point in the path * @param ignoreMissing true if null is to be returned for user fields that don't appear in the schema * @return the field value or null for out-of-bounds array indexes, non-existent map keys or unsupported avro types. * @throws KettleException if a problem occurs */ public Object convertToKettleValue(AvroInputField avroInputField, GenericData.Array array, Schema s, Schema defaultSchema, boolean ignoreMissing) throws KettleException { if (array == null) { return null; } if (avroInputField.getTempParts().size() == 0) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathArray")); } String part = avroInputField.getTempParts().remove(0); if (!(part.charAt(0) == '[')) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathArray2", part)); } String index = part.substring(1, part.indexOf(']')); int arrayI = 0; try { arrayI = Integer.parseInt(index.trim()); } catch (NumberFormatException e) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.UnableToParseArrayIndex", index)); } if (part.indexOf(']') < part.length() - 1) { // more dimensions to the array part = part.substring(part.indexOf(']') + 1, part.length()); avroInputField.getTempParts().add(0, part); } if (arrayI >= array.size() || arrayI < 0) { return null; } Object element = array.get(arrayI); Schema elementType = s.getElementType(); if (element == null) { return null; } if (elementType.getType() == Schema.Type.UNION) { if (element instanceof GenericContainer) { // we can ask these things for their schema (covers // records, arrays, enums and fixed) elementType = ((GenericContainer) element).getSchema(); } else { // either have a map or primitive here if (element instanceof Map) { // now have to look for the schema of the map Schema mapSchema = null; for (Schema ts : elementType.getTypes()) { if (ts.getType() == Schema.Type.MAP) { mapSchema = ts; break; } } if (mapSchema == null) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.UnableToFindSchemaForUnionMap")); } elementType = mapSchema; } else { if (avroInputField.getTempValueMeta().getType() != ValueMetaInterface.TYPE_STRING) { // we have a two element union, where one element is the type // "null". So in this case we actually have just one type and can // output specific values of it (instead of using String as a // catch all for varying primitive types in the union) elementType = checkUnion(elementType); } else { // use the string representation of the value elementType = Schema.create(Schema.Type.STRING); } } } } // what have we got? if (elementType.getType() == Schema.Type.RECORD) { return convertToKettleValue(avroInputField, (GenericData.Record) element, elementType, defaultSchema, ignoreMissing); } else if (elementType.getType() == Schema.Type.ARRAY) { return convertToKettleValue(avroInputField, (GenericData.Array) element, elementType, defaultSchema, ignoreMissing); } else if (elementType.getType() == Schema.Type.MAP) { return convertToKettleValue(avroInputField, (Map<Utf8, Object>) element, elementType, defaultSchema, ignoreMissing); } else { // assume a primitive (covers bytes encapsulated in FIXED type) return getPrimitive(avroInputField, element, elementType); } } /** * Processes a record at this point in the path. * * @param record the record to process * @param s the current schema at this point in the path * @param ignoreMissing true if null is to be returned for user fields that don't appear in the schema * @return the field value or null for out-of-bounds array indexes, non-existent map keys or unsupported avro types. * @throws KettleException if a problem occurs */ public Object convertToKettleValue(AvroInputField avroInputField, GenericData.Record record, Schema s, Schema defaultSchema, boolean ignoreMissing) throws KettleException { if (record == null) { return null; } if (avroInputField.getTempParts().size() == 0) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathRecord")); } String part = avroInputField.getTempParts().remove(0); if (part.charAt(0) == '[') { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.InvalidPath") + avroInputField.getTempParts()); } if (part.indexOf('[') > 0) { String arrayPart = part.substring(part.indexOf('[')); part = part.substring(0, part.indexOf('[')); // put the array section back into location zero avroInputField.getTempParts().add(0, arrayPart); } // part is a named field of the record Schema.Field fieldS = s.getField(part); if (fieldS == null && !ignoreMissing) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.NonExistentField", part)); } Object field = record.get(part); if (field == null) { if (defaultSchema != null) { fieldS = defaultSchema.getField(part); } if (fieldS == null || fieldS.defaultValue() == null) { return null; } field = fieldS.defaultValue(); } Schema.Type fieldT = fieldS.schema().getType(); Schema fieldSchema = fieldS.schema(); if (fieldT == Schema.Type.UNION) { if (field instanceof GenericContainer) { // we can ask these things for their schema (covers // records, arrays, enums and fixed) fieldSchema = ((GenericContainer) field).getSchema(); fieldT = fieldSchema.getType(); } else { // either have a map or primitive here if (field instanceof Map) { // now have to look for the schema of the map Schema mapSchema = null; for (Schema ts : fieldSchema.getTypes()) { if (ts.getType() == Schema.Type.MAP) { mapSchema = ts; break; } } if (mapSchema == null) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.UnableToFindSchemaForUnionMap")); } fieldSchema = mapSchema; fieldT = Schema.Type.MAP; } else { fieldSchema = checkUnion(fieldSchema); fieldT = fieldSchema.getType(); } } } // what have we got? if (fieldT == Schema.Type.RECORD) { return convertToKettleValue(avroInputField, (GenericData.Record) field, fieldSchema, defaultSchema, ignoreMissing); } else if (fieldT == Schema.Type.ARRAY) { return convertToKettleValue(avroInputField, (GenericData.Array) field, fieldSchema, defaultSchema, ignoreMissing); } else if (fieldT == Schema.Type.MAP) { return convertToKettleValue(avroInputField, (Map<Utf8, Object>) field, fieldSchema, defaultSchema, ignoreMissing); } else if (fieldT == Schema.Type.BYTES) { return convertToKettleValue(avroInputField, (ByteBuffer) field, fieldSchema); } else { // assume primitive (covers bytes encapsulated in FIXED type) return getPrimitive(avroInputField, field, fieldSchema); } } /** * @param pentahoType * @param avroData * @param fieldSchema * @return */ public Object convertToKettleValue(AvroInputField pentahoType, ByteBuffer avroData, Schema fieldSchema) { Object pentahoData = null; if (avroData != null) { try { switch (pentahoType.getPentahoType()) { case ValueMetaInterface.TYPE_BIGNUMBER: Conversions.DecimalConversion converter = new Conversions.DecimalConversion(); Schema schema = fieldSchema; if (schema.getType().equals(Schema.Type.UNION)) { List<Schema> schemas = schema.getTypes(); for (Schema s : schemas) { if (!s.getName().equalsIgnoreCase("null")) { schema = s; break; } } } Object precision = schema.getObjectProp(AvroSpec.DECIMAL_PRECISION); Object scale = schema.getObjectProp(AvroSpec.DECIMAL_SCALE); LogicalTypes.Decimal decimalType = LogicalTypes.decimal(Integer.parseInt(precision.toString()), Integer.parseInt(scale.toString())); pentahoData = converter.fromBytes(avroData, m_schemaToUse, decimalType); break; case ValueMetaInterface.TYPE_BINARY: pentahoData = new byte[avroData.remaining()]; avroData.get((byte[]) pentahoData); break; } } catch (Exception e) { // If unable to do the type conversion just ignore. null will be returned. } } return pentahoData; } /** * Get the value of the Avro leaf primitive with respect to the Kettle type for this path. * * @param fieldValue the Avro leaf value * @param s the schema for the leaf value * @return the appropriate Kettle typed value * @throws KettleException if a problem occurs */ protected Object getPrimitive(AvroInputField avroInputField, Object fieldValue, Schema s) throws KettleException { return m_avroToPdiConverter.converAvroToPdi(fieldValue, avroInputField, s); } /** * Perform Kettle type conversions for the Avro leaf field value. * * @param fieldValue the leaf value from the Avro structure * @return an Object of the appropriate Kettle type * @throws KettleException if a problem occurs */ protected Object getKettleValue(AvroInputField avroInputField, Object fieldValue) throws KettleException { switch (avroInputField.getTempValueMeta().getType()) { case ValueMetaInterface.TYPE_BIGNUMBER: return avroInputField.getTempValueMeta().getBigNumber(fieldValue); case ValueMetaInterface.TYPE_BINARY: return avroInputField.getTempValueMeta().getBinary(fieldValue); case ValueMetaInterface.TYPE_BOOLEAN: return avroInputField.getTempValueMeta().getBoolean(fieldValue); case ValueMetaInterface.TYPE_DATE: if (avroInputField.getAvroType().getBaseType() == AvroSpec.DataType.INTEGER.getBaseType()) { LocalDate localDate = LocalDate.ofEpochDay(0).plusDays((Long) fieldValue); return Date.from(localDate.atStartOfDay(ZoneId.systemDefault()).toInstant()); } else if (avroInputField.getAvroType().getBaseType() == AvroSpec.DataType.STRING.getBaseType()) { Object pentahoData = null; String dateFormatStr = avroInputField.getStringFormat(); if ((dateFormatStr == null) || (dateFormatStr.trim().length() == 0)) { dateFormatStr = ValueMetaBase.DEFAULT_DATE_FORMAT_MASK; } SimpleDateFormat datePattern = new SimpleDateFormat(dateFormatStr); try { return datePattern.parse(fieldValue.toString()); } catch (Exception e) { return null; } } return avroInputField.getTempValueMeta().getDate(fieldValue); case ValueMetaInterface.TYPE_TIMESTAMP: return new Timestamp((Long) fieldValue); case ValueMetaInterface.TYPE_INTEGER: return avroInputField.getTempValueMeta().getInteger(fieldValue); case ValueMetaInterface.TYPE_NUMBER: return avroInputField.getTempValueMeta().getNumber(fieldValue); case ValueMetaInterface.TYPE_STRING: return avroInputField.getTempValueMeta().getString(fieldValue); case ValueMetaInterface.TYPE_INET: try { return InetAddress.getByName(fieldValue.toString()); } catch (UnknownHostException ex) { return null; } default: return null; } } /** * Helper function that checks the validity of a union. We can only handle unions that contain two elements: a type * and null. * * @param s the union schema to check * @return the type of the element that is not null. * @throws KettleException if a problem occurs. */ protected static Schema checkUnion(Schema s) throws KettleException { boolean ok = false; List<Schema> types = s.getTypes(); // the type other than null Schema otherSchema = null; if (types.size() != 2) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnionError1")); } for (Schema p : types) { if (p.getType() == Schema.Type.NULL) { ok = true; } else { otherSchema = p; } } if (!ok) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnionError2")); } return otherSchema; } /** * Cleanses a string path by ensuring that any variables names present in the path do not contain "."s (replaces any * dots with underscores). * * @param path the path to cleanse * @return the cleansed path */ public static String cleansePath(String path) { // look for variables and convert any "." to "_" int index = path.indexOf("${"); int endIndex = 0; String tempStr = path; while (index >= 0) { index += 2; endIndex += tempStr.indexOf("}"); if (endIndex > 0 && endIndex > index + 1) { String key = path.substring(index, endIndex); String cleanKey = key.replace('.', '_'); path = path.replace(key, cleanKey); } else { break; } if (endIndex + 1 < path.length()) { tempStr = path.substring(endIndex + 1, path.length()); } else { break; } index = tempStr.indexOf("${"); if (index > 0) { index += endIndex; } } return path; } /** * Inner class that handles a single array/map expansion process. Expands an array or map to multiple Kettle rows. * Delegates to AvroInptuField objects to handle the extraction of leaf primitives. * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision$ */ public class AvroArrayExpansion { /** * The prefix of the full path that defines the expansion */ public String m_expansionPath; /** * Subfield objects that handle the processing of the path after the expansion prefix */ protected List<AvroInputField> m_subFields; private List<String> m_pathParts; private List<String> m_tempParts; protected RowMetaInterface m_outputRowMeta; public AvroArrayExpansion(List<AvroInputField> subFields) { m_subFields = subFields; } /** * Initialize this field by parsing the path etc. * * @throws KettleException if a problem occurs */ public void init() throws KettleException { if (Const.isEmpty(m_expansionPath)) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.NoPathSet")); } if (m_pathParts != null) { return; } String expansionPath = AvroNestedReader.cleansePath(m_expansionPath); String[] temp = expansionPath.split("\\."); m_pathParts = new ArrayList<String>(); for (String part : temp) { m_pathParts.add(part); } if (m_pathParts.get(0).equals("$")) { m_pathParts.remove(0); // root record indicator } else if (m_pathParts.get(0).startsWith("$[")) { // strip leading $ off of array String r = m_pathParts.get(0).substring(1, m_pathParts.get(0).length()); m_pathParts.set(0, r); } m_tempParts = new ArrayList<String>(); // initialize the sub fields if (m_subFields != null) { for (AvroInputField f : m_subFields) { int outputIndex = m_outputRowMeta.indexOfValue(f.getPentahoFieldName()); fieldInit(f, outputIndex); } } } /** * Reset this field. Should be called prior to processing a new field value from the avro file * * @param space environment variables (values that environment variables resolve to cannot contain "."s) */ public void reset(VariableSpace space) { m_tempParts.clear(); for (String part : m_pathParts) { if (space == null) { m_tempParts.add(part); } else { m_tempParts.add(space.environmentSubstitute(part)); } } // reset sub fields for (AvroInputField f : m_subFields) { resetField(f, space); } } /** * Processes a map at this point in the path. * * @param map the map to process * @param s the current schema at this point in the path * @param space environment variables * @param ignoreMissing true if null is to be returned for user fields that don't appear in the schema * @return an array of Kettle rows corresponding to the expanded map/array and containing all leaf values as defined * in the paths * @throws KettleException if a problem occurs */ public Object[][] convertToKettleValues(Map<Utf8, Object> map, Schema s, Schema defaultSchema, VariableSpace space, boolean ignoreMissing) throws KettleException { if (map == null) { return null; } if (m_tempParts.size() == 0) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathMap")); } String part = m_tempParts.remove(0); if (!(part.charAt(0) == '[')) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathMap2", part)); } String key = part.substring(1, part.indexOf(']')); if (part.indexOf(']') < part.length() - 1) { // more dimensions to the array/map part = part.substring(part.indexOf(']') + 1, part.length()); m_tempParts.add(0, part); } if (key.equals("*")) { // start the expansion - we delegate conversion to our subfields Schema valueType = s.getValueType(); Object[][] result = new Object[map.keySet().size()][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; int i = 0; for (Utf8 mk : map.keySet()) { Object value = map.get(mk); for (int j = 0; j < m_subFields.size(); j++) { AvroInputField sf = m_subFields.get(j); resetField(sf, space); // what have we got if (valueType.getType() == Schema.Type.RECORD) { result[i][sf.getOutputIndex()] = convertToKettleValue(sf, (GenericData.Record) value, valueType, defaultSchema, ignoreMissing); } else if (valueType.getType() == Schema.Type.ARRAY) { result[i][sf.getOutputIndex()] = convertToKettleValue(sf, (GenericData.Array) value, valueType, defaultSchema, ignoreMissing); } else if (valueType.getType() == Schema.Type.MAP) { result[i][sf.getOutputIndex()] = convertToKettleValue(sf, (Map<Utf8, Object>) value, valueType, defaultSchema, ignoreMissing); } else { // assume a primitive result[i][sf.getOutputIndex()] = getPrimitive(sf, value, valueType); } } i++; // next row } return result; } else { Object value = map.get(new Utf8(key)); if (value == null) { // key doesn't exist in map Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; for (int i = 0; i < m_subFields.size(); i++) { AvroInputField sf = m_subFields.get(i); result[0][sf.getOutputIndex()] = null; } return result; } Schema valueType = s.getValueType(); if (valueType.getType() == Schema.Type.UNION) { if (value instanceof GenericContainer) { // we can ask these things for their schema (covers // records, arrays, enums and fixed) valueType = ((GenericContainer) value).getSchema(); } else { // either have a map or primitive here if (value instanceof Map) { // now have to look for the schema of the map Schema mapSchema = null; for (Schema ts : valueType.getTypes()) { if (ts.getType() == Schema.Type.MAP) { mapSchema = ts; break; } } if (mapSchema == null) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnableToFindSchemaForUnionMap")); } valueType = mapSchema; } else { // We shouldn't have a primitive here if (!ignoreMissing) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.EncounteredAPrimitivePriorToMapExpansion")); } Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; return result; } } } // what have we got? if (valueType.getType() == Schema.Type.RECORD) { return convertToKettleValues((GenericData.Record) value, valueType, defaultSchema, space, ignoreMissing); } else if (valueType.getType() == Schema.Type.ARRAY) { return convertToKettleValues((GenericData.Array) value, valueType, defaultSchema, space, ignoreMissing); } else if (valueType.getType() == Schema.Type.MAP) { return convertToKettleValues((Map<Utf8, Object>) value, valueType, defaultSchema, space, ignoreMissing); } else { // we shouldn't have a primitive at this point. If we are // extracting a particular key from the map then we're not to the // expansion phase, // so normally there must be a non-primitive sub-structure. Only if // the user is switching schema versions on a per-row basis or the // schema is a union at the top level could we end up here if (!ignoreMissing) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnexpectedMapValueTypeAtNonExpansionPoint")); } Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; return result; } } } /** * Processes an array at this point in the path. * * @param array the array to process * @param s the current schema at this point in the path * @param space environment variables * @param ignoreMissing true if null is to be returned for user fields that don't appear in the schema * @return an array of Kettle rows corresponding to the expanded map/array and containing all leaf values as defined * in the paths * @throws KettleException if a problem occurs */ public Object[][] convertToKettleValues(GenericData.Array array, Schema s, Schema defaultSchema, VariableSpace space, boolean ignoreMissing) throws KettleException { if (array == null) { return null; } if (m_tempParts.size() == 0) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathArray")); } String part = m_tempParts.remove(0); if (!(part.charAt(0) == '[')) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathArray2", part)); } String index = part.substring(1, part.indexOf(']')); if (part.indexOf(']') < part.length() - 1) { // more dimensions to the array part = part.substring(part.indexOf(']') + 1, part.length()); m_tempParts.add(0, part); } if (index.equals("*")) { // start the expansion - we delegate conversion to our subfields Schema elementType = s.getElementType(); Object[][] result = new Object[array.size()][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; for (int i = 0; i < array.size(); i++) { Object value = array.get(i); for (int j = 0; j < m_subFields.size(); j++) { AvroInputField sf = m_subFields.get(j); resetField(sf, space); // what have we got if (elementType.getType() == Schema.Type.RECORD) { result[i][sf.getOutputIndex()] = convertToKettleValue(sf, (GenericData.Record) value, elementType, defaultSchema, ignoreMissing); } else if (elementType.getType() == Schema.Type.ARRAY) { result[i][sf.getOutputIndex()] = convertToKettleValue(sf, (GenericData.Array) value, elementType, defaultSchema, ignoreMissing); } else if (elementType.getType() == Schema.Type.MAP) { result[i][sf.getOutputIndex()] = convertToKettleValue(sf, (Map<Utf8, Object>) value, elementType, defaultSchema, ignoreMissing); } else { // assume a primitive result[i][sf.getOutputIndex()] = getPrimitive(sf, value, elementType); } } } return result; } else { int arrayI = 0; try { arrayI = Integer.parseInt(index.trim()); } catch (NumberFormatException e) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.UnableToParseArrayIndex", index)); } if (arrayI >= array.size() || arrayI < 0) { // index is out of bounds Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; for (int i = 0; i < m_subFields.size(); i++) { AvroInputField sf = m_subFields.get(i); result[0][sf.getOutputIndex()] = null; } return result; } Object value = array.get(arrayI); Schema elementType = s.getElementType(); if (elementType.getType() == Schema.Type.UNION) { if (value instanceof GenericContainer) { // we can ask these things for their schema (covers // records, arrays, enums and fixed) elementType = ((GenericContainer) value).getSchema(); } else { // either have a map or primitive here if (value instanceof Map) { // now have to look for the schema of the map Schema mapSchema = null; for (Schema ts : elementType.getTypes()) { if (ts.getType() == Schema.Type.MAP) { mapSchema = ts; break; } } if (mapSchema == null) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnableToFindSchemaForUnionMap")); } elementType = mapSchema; } else { // We shouldn't have a primitive here if (!ignoreMissing) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.EncounteredAPrimitivePriorToMapExpansion")); } Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; return result; } } } // what have we got? if (elementType.getType() == Schema.Type.RECORD) { return convertToKettleValues((GenericData.Record) value, elementType, defaultSchema, space, ignoreMissing); } else if (elementType.getType() == Schema.Type.ARRAY) { return convertToKettleValues((GenericData.Array) value, elementType, defaultSchema, space, ignoreMissing); } else if (elementType.getType() == Schema.Type.MAP) { return convertToKettleValues((Map<Utf8, Object>) value, elementType, defaultSchema, space, ignoreMissing); } else { // we shouldn't have a primitive at this point. If we are // extracting a particular index from the array then we're not to the // expansion phase, // so normally there must be a non-primitive sub-structure. Only if // the user is switching schema versions on a per-row basis or the // schema is a union at the top level could we end up here if (!ignoreMissing) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnexpectedArrayElementTypeAtNonExpansionPoint")); } else { Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; return result; } } } } /** * Processes a record at this point in the path. * * @param record the record to process * @param s the current schema at this point in the path * @param space environment variables * @param ignoreMissing true if null is to be returned for user fields that don't appear in the schema * @return an array of Kettle rows corresponding to the expanded map/array and containing all leaf values as defined * in the paths * @throws KettleException if a problem occurs */ public Object[][] convertToKettleValues(GenericData.Record record, Schema s, Schema defaultSchema, VariableSpace space, boolean ignoreMissing) throws KettleException { if (record == null) { return null; } if (m_tempParts.size() == 0) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.MalformedPathRecord")); } String part = m_tempParts.remove(0); if (part.charAt(0) == '[') { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.InvalidPath") + m_tempParts); } if (part.indexOf('[') > 0) { String arrayPart = part.substring(part.indexOf('[')); part = part.substring(0, part.indexOf('[')); // put the array section back into location zero m_tempParts.add(0, arrayPart); } // part is a named field of the record Schema.Field fieldS = s.getField(part); if (fieldS == null) { if (!ignoreMissing) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.NonExistentField", part)); } } Object field = record.get(part); if (field == null) { // field is null and we haven't hit the expansion yet. There will be // nothing // to return for all the sub-fields grouped in the expansion Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; return result; } Schema.Type fieldT = fieldS.schema().getType(); Schema fieldSchema = fieldS.schema(); if (fieldT == Schema.Type.UNION) { if (field instanceof GenericContainer) { // we can ask these things for their schema (covers // records, arrays, enums and fixed) fieldSchema = ((GenericContainer) field).getSchema(); fieldT = fieldSchema.getType(); } else { // either have a map or primitive here if (field instanceof Map) { // now have to look for the schema of the map Schema mapSchema = null; for (Schema ts : fieldSchema.getTypes()) { if (ts.getType() == Schema.Type.MAP) { mapSchema = ts; break; } } if (mapSchema == null) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.UnableToFindSchemaForUnionMap")); } fieldSchema = mapSchema; fieldT = Schema.Type.MAP; } else { // We shouldn't have a primitive here if (!ignoreMissing) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.EncounteredAPrimitivePriorToMapExpansion")); } Object[][] result = new Object[1][m_outputRowMeta.size() + RowDataUtil.OVER_ALLOCATE_SIZE]; return result; } } } // what have we got? if (fieldT == Schema.Type.RECORD) { return convertToKettleValues((GenericData.Record) field, fieldSchema, defaultSchema, space, ignoreMissing); } else if (fieldT == Schema.Type.ARRAY) { return convertToKettleValues((GenericData.Array) field, fieldSchema, defaultSchema, space, ignoreMissing); } else if (fieldT == Schema.Type.MAP) { return convertToKettleValues((Map<Utf8, Object>) field, fieldSchema, defaultSchema, space, ignoreMissing); } else { // primitives will always be handled by the subField delegates, so we // should'nt // get here throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.UnexpectedRecordFieldTypeAtNonExpansionPoint")); } } } // ----------------- End AvroArrayExpansion inner class -------------------------------- /** * Converts an incoming row to outgoing format. Extracts fields from either an Avro object in the incoming row or from * the next structure in the container or non-container Avro file. May return more than one row if a map/array is * being expanded. * * @param incoming incoming kettle row - may be null if decoding from a file rather than a field * @param space the variables to use * @return one or more rows in the outgoing format * @throws KettleException if a problem occurs */ public Object[][] avroObjectToKettle(Object[] incoming, VariableSpace space) throws KettleException { if (m_containerReader != null) { // container file try { if (m_containerReader.hasNext()) { if (m_topLevelRecord != null) { // special case for top-level record. In case we actually // have a top level union, reassign the record so that // we have the correctly populated object in the case // where our last record instance can't be reused (i.e. // the next record read is a different one from the union // than the last one). m_topLevelRecord = (GenericData.Record) m_containerReader.next(m_topLevelRecord); } else if (m_topLevelArray != null) { m_containerReader.next(m_topLevelArray); } else { m_containerReader.next(m_topLevelMap); } return setKettleFields(incoming, space); } else { return null; // no more input } } catch (IOException e) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.ObjectReadError")); } } else { // non-container file try { /* * if (m_decoder.isEnd()) { return null; } */ // reading from an incoming field if (m_decodingFromField) { if (incoming == null || incoming.length == 0) { // must be done - just return null return null; } ValueMetaInterface fieldMeta = m_incomingRowMeta.getValueMeta(m_fieldToDecodeIndex); // incoming avro field null? - all decoded fields are null if (fieldMeta.isNull(incoming[m_fieldToDecodeIndex])) { Object[][] result = new Object[1][]; // just resize the existing incoming array (if necessary) and return // the incoming values result[0] = RowDataUtil.resizeArray(incoming, m_outputRowMeta.size()); return result; } // if necessary, set the current datum reader and top level structure // for the incoming schema if (m_schemaInField) { ValueMetaInterface schemaMeta = m_incomingRowMeta.getValueMeta(m_schemaFieldIndex); String schemaToUse = schemaMeta.getString(incoming[m_schemaFieldIndex]); setSchemaToUse(schemaToUse, m_cacheSchemas, space); } if (m_jsonEncoded) { try { String fieldValue = fieldMeta.getString(incoming[m_fieldToDecodeIndex]); m_decoder = m_factory.jsonDecoder(m_schemaToUse, fieldValue); } catch (IOException e) { throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.JsonDecoderError")); } } else { byte[] fieldValue = fieldMeta.getBinary(incoming[m_fieldToDecodeIndex]); m_decoder = m_factory.binaryDecoder(fieldValue, null); } } if (m_topLevelRecord != null) { // special case for top-level record. In case we actually // have a top level union, reassign the record so that // we have the correctly populated object in the case // where our last record instance can't be reused (i.e. // the next record read is a different one from the union // than the last one). m_topLevelRecord = (GenericData.Record) m_datumReader.read(m_topLevelRecord, m_decoder); } else if (m_topLevelArray != null) { m_datumReader.read(m_topLevelArray, m_decoder); } else { m_datumReader.read(m_topLevelMap, m_decoder); } return setKettleFields(incoming, space); } catch (IOException ex) { // some IO problem or no more input return null; } } } private Object[][] setKettleFields(Object[] outputRowData, VariableSpace space) throws KettleException { Object[][] result = null; // expand map/array in path structure to multiple rows (if necessary) if (m_expansionHandler != null) { m_expansionHandler.reset(space); if (m_schemaToUse.getType() == Schema.Type.RECORD || m_schemaToUse.getType() == Schema.Type.UNION) { // call getSchema() on the top level record here in case it has been // read as one of the elements from a top-level union result = m_expansionHandler.convertToKettleValues(m_topLevelRecord, m_topLevelRecord.getSchema(), m_defaultSchema, space, m_dontComplainAboutMissingFields); } else if (m_schemaToUse.getType() == Schema.Type.ARRAY) { result = m_expansionHandler.convertToKettleValues(m_topLevelArray, m_schemaToUse, m_defaultSchema, space, m_dontComplainAboutMissingFields); } else { result = m_expansionHandler.convertToKettleValues(m_topLevelMap, m_schemaToUse, m_defaultSchema, space, m_dontComplainAboutMissingFields); } } else { result = new Object[1][]; } // if there are no incoming rows (i.e. we're decoding from a file rather // than a field if (outputRowData == null) { outputRowData = RowDataUtil.allocateRowData(m_outputRowMeta.size()); } else { // make sure we allocate enough space for the new fields outputRowData = RowDataUtil.resizeArray(outputRowData, m_outputRowMeta.size()); } // get the normal (non expansion-related fields) Object value = null; int incomingFieldsOffset = m_outputRowMeta.size() - m_normalFields.size(); for (AvroInputField f : m_normalFields) { resetField(f, space); if (m_schemaToUse.getType() == Schema.Type.RECORD || m_schemaToUse.getType() == Schema.Type.UNION) { // call getSchema() on the top level record here in case it has been // read as one of the elements from a top-level union value = convertToKettleValue(f, m_topLevelRecord, m_topLevelRecord.getSchema(), m_defaultSchema, m_dontComplainAboutMissingFields); } else if (m_schemaToUse.getType() == Schema.Type.ARRAY) { value = convertToKettleValue(f, m_topLevelArray, m_schemaToUse, m_defaultSchema, m_dontComplainAboutMissingFields); } else { value = convertToKettleValue(f, m_topLevelMap, m_schemaToUse, m_defaultSchema, m_dontComplainAboutMissingFields); } outputRowData[f.getOutputIndex() + incomingFieldsOffset] = value; } // copy normal fields and existing incoming over to each expansion row (if // necessary) if (m_expansionHandler == null) { result[0] = outputRowData; } else if (m_normalFields.size() > 0 || m_newFieldOffset > 0) { for (int i = 0; i < result.length; i++) { Object[] row = result[i]; // existing incoming fields for (int j = 0; j < m_newFieldOffset; j++) { row[j] = outputRowData[j]; } int rowIndex = 0; for (int x = 0; x < outputRowData.length; x++) { if (outputRowData[x] != null && rowIndex < row.length) { row[rowIndex++] = outputRowData[x]; } } } } return result; } public void close() throws IOException { if (m_containerReader != null) { m_containerReader.close(); } if (m_inStream != null) { m_inStream.close(); } } protected void setSchemaToUse(String schemaKey, boolean useCache, VariableSpace space) throws KettleException { if (Const.isEmpty(schemaKey)) { // switch to default if (m_defaultDatumReader == null) { // no key, no default schema - can't continue with this row throw new KettleException( BaseMessages.getString(PKG, "AvroInput.Error.IncommingSchemaIsMissingAndNoDefault")); } if (m_log.isDetailed()) { m_log.logDetailed(BaseMessages.getString(PKG, "AvroInput.Message.IncommingSchemaIsMissing")); } m_datumReader = m_defaultDatumReader; m_schemaToUse = m_datumReader.getSchema(); setTopLevelStructure(m_defaultTopLevelObject); return; } else { schemaKey = schemaKey.trim(); schemaKey = space.environmentSubstitute(schemaKey); } Object[] cached = null; if (useCache) { cached = m_schemaCache.get(schemaKey); if (m_log.isDetailed() && cached != null) { m_log.logDetailed(BaseMessages.getString(PKG, "AvroInput.Message.UsingCachedSchema", schemaKey)); } } if (!useCache || cached == null) { Schema toUse = null; if (m_schemaFieldIsPath) { // load the schema from disk if (m_log.isDetailed()) { m_log.logDetailed(BaseMessages.getString(PKG, "AvroInput.Message.LoadingSchema", schemaKey)); } try { toUse = loadSchema(schemaKey); } catch (KettleException ex) { // fall back to default (if possible) if (m_defaultDatumReader != null) { if (m_log.isBasic()) { m_log.logBasic(BaseMessages.getString(PKG, "AvroInput.Message.FailedToLoadSchmeaUsingDefault", schemaKey)); } m_datumReader = m_defaultDatumReader; m_schemaToUse = m_datumReader.getSchema(); setTopLevelStructure(m_defaultTopLevelObject); return; } else { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.CantLoadIncommingSchemaAndNoDefault", schemaKey)); } } } else { // use the supplied schema if (m_log.isDetailed()) { m_log.logDetailed(BaseMessages.getString(PKG, "AvroInput.Message.ParsingSchema", schemaKey)); } Schema.Parser p = new Schema.Parser(); toUse = p.parse(schemaKey); } m_schemaToUse = toUse; m_datumReader = new GenericDatumReader(toUse); initTopLevelStructure(toUse, false); if (useCache) { Object[] schemaInfo = new Object[2]; schemaInfo[0] = m_datumReader; schemaInfo[1] = (m_topLevelArray != null) ? m_topLevelArray : ((m_topLevelRecord != null) ? m_topLevelRecord : m_topLevelMap); if (m_log.isDetailed()) { m_log.logDetailed(BaseMessages.getString(PKG, "AvroInput.Message.StoringSchemaInCache")); } m_schemaCache.put(schemaKey, schemaInfo); } } else if (useCache) { // got one from the cache m_datumReader = (GenericDatumReader) cached[0]; m_schemaToUse = m_datumReader.getSchema(); setTopLevelStructure(cached[1]); } } protected void setTopLevelStructure(Object topLevel) { if (topLevel instanceof GenericData.Record) { m_topLevelRecord = (GenericData.Record) topLevel; m_topLevelArray = null; m_topLevelMap = null; } else if (topLevel instanceof GenericData.Array) { m_topLevelArray = (GenericData.Array<?>) topLevel; m_topLevelRecord = null; m_topLevelMap = null; } else { m_topLevelMap = (HashMap<Utf8, Object>) topLevel; m_topLevelRecord = null; m_topLevelArray = null; } } /** * Load a schema from a file * * @param schemaFile the file to load from * @return the schema * @throws KettleException if a problem occurs */ protected static Schema loadSchema(String schemaFile) throws KettleException { Schema s = null; Schema.Parser p = new Schema.Parser(); FileObject fileO = KettleVFS.getFileObject(schemaFile); try { InputStream in = KettleVFS.getInputStream(fileO); s = p.parse(in); in.close(); } catch (FileSystemException e) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.SchemaError"), e); } catch (IOException e) { throw new KettleException(BaseMessages.getString(PKG, "AvroInput.Error.SchemaError"), e); } return s; } /** * Load a schema from a Avro container file * * @param containerFilename the name of the Avro container file * @return the schema * @throws KettleException if a problem occurs */ protected static Schema loadSchemaFromContainer(String containerFilename) throws KettleException { Schema s = null; FileObject fileO = KettleVFS.getFileObject(containerFilename); InputStream in = null; try { in = KettleVFS.getInputStream(fileO); GenericDatumReader dr = new GenericDatumReader(); DataFileStream reader = new DataFileStream(in, dr); s = reader.getSchema(); reader.close(); } catch (FileSystemException e) { throw new KettleException(BaseMessages.getString(PKG, "AvroInputDialog.Error.KettleFileException"), e); } catch (IOException e) { throw new KettleException(BaseMessages.getString(PKG, "AvroInputDialog.Error.KettleFileException"), e); } return s; } }