/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.util;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.SchemaCompatibility;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.util.Utf8;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
import org.codehaus.jackson.JsonNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closer;

import javax.annotation.Nonnull;
import lombok.extern.slf4j.Slf4j;


/**
 * A Utils class for dealing with Avro objects
 */
@Slf4j
public class AvroUtils {

  private static final Logger LOG = LoggerFactory.getLogger(AvroUtils.class);

  public static final String FIELD_LOCATION_DELIMITER = ".";

  private static final String AVRO_SUFFIX = ".avro";
  /**
   * Validates that the provided reader schema can be used to decode avro data written with the
   * provided writer schema.
   * @param readerSchema schema to check.
   * @param writerSchema schema to check.
   * @param ignoreNamespace whether name and namespace should be ignored in validation
   * @return true if validation passes
   */
  public static boolean checkReaderWriterCompatibility(Schema readerSchema, Schema writerSchema,
      boolean ignoreNamespace) {
    if (ignoreNamespace) {
      List<Schema.Field> fields = deepCopySchemaFields(readerSchema);
      readerSchema = Schema.createRecord(writerSchema.getName(), writerSchema.getDoc(), writerSchema.getNamespace(),
          readerSchema.isError());
      readerSchema.setFields(fields);
    }

    return SchemaCompatibility.checkReaderWriterCompatibility(readerSchema, writerSchema).getType()
        .equals(SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE);
  }

  public static List<Field> deepCopySchemaFields(Schema readerSchema) {
    return readerSchema.getFields().stream()
        .map(field -> {
          Field f = new Field(field.name(), field.schema(), field.doc(), field.defaultValue(), field.order());
          field.getProps().forEach((key, value) -> f.addProp(key, value));
          return f;
        })
        .collect(Collectors.toList());
  }

  public static class AvroPathFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
      return path.getName().endsWith(AVRO_SUFFIX);
    }
  }

  /**
   * Given a GenericRecord, this method will return the schema of the field specified by the path parameter. The
   * fieldLocation parameter is an ordered string specifying the location of the nested field to retrieve. For example,
   * field1.nestedField1 takes the schema of the field "field1", and retrieves the schema "nestedField1" from it.
   * @param schema is the record to retrieve the schema from
   * @param fieldLocation is the location of the field
   * @return the schema of the field
   */
  public static Optional<Schema> getFieldSchema(Schema schema, String fieldLocation) {
    Preconditions.checkNotNull(schema);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

    Splitter splitter = Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults();
    List<String> pathList = Lists.newArrayList(splitter.split(fieldLocation));

    if (pathList.size() == 0) {
      return Optional.absent();
    }

    return AvroUtils.getFieldSchemaHelper(schema, pathList, 0);
  }

  /**
   * Helper method that does the actual work for {@link #getFieldSchema(Schema, String)}
   * @param schema passed from {@link #getFieldSchema(Schema, String)}
   * @param pathList passed from {@link #getFieldSchema(Schema, String)}
   * @param field keeps track of the index used to access the list pathList
   * @return the schema of the field
   */
  private static Optional<Schema> getFieldSchemaHelper(Schema schema, List<String> pathList, int field) {
    if (schema.getType() == Type.RECORD && schema.getField(pathList.get(field)) == null) {
      return Optional.absent();
    }
    switch (schema.getType()) {
      case UNION:
        if (AvroSerdeUtils.isNullableType(schema)) {
          return AvroUtils.getFieldSchemaHelper(AvroSerdeUtils.getOtherTypeFromNullableType(schema), pathList, field);
        }
        throw new AvroRuntimeException("Union of complex types cannot be handled : " + schema);
      case MAP:
        if ((field + 1) == pathList.size()) {
          return Optional.fromNullable(schema.getValueType());
        }
        return AvroUtils.getFieldSchemaHelper(schema.getValueType(), pathList, ++field);
      case RECORD:
        if ((field + 1) == pathList.size()) {
          return Optional.fromNullable(schema.getField(pathList.get(field)).schema());
        }
        return AvroUtils.getFieldSchemaHelper(schema.getField(pathList.get(field)).schema(), pathList, ++field);
      default:
        throw new AvroRuntimeException("Invalid type in schema : " + schema);
    }
  }
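  // Usage sketch (not part of the original class): how getFieldSchema might be
  // called for a nested field. The "User" schema literal below is hypothetical.
  //
  //   Schema schema = new Schema.Parser().parse(
  //       "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
  //           + "{\"name\":\"address\",\"type\":{\"type\":\"record\",\"name\":\"Address\","
  //           + "\"fields\":[{\"name\":\"city\",\"type\":\"string\"}]}}]}");
  //   Optional<Schema> citySchema = AvroUtils.getFieldSchema(schema, "address.city");
  //   // citySchema.get().getType() == Schema.Type.STRING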
  /**
   * Given a GenericRecord, this method will return the field specified by the path parameter. The
   * fieldLocation parameter is an ordered string specifying the location of the nested field to retrieve. For example,
   * field1.nestedField1 takes field "field1", and retrieves "nestedField1" from it.
   * @param schema is the record to retrieve the schema from
   * @param fieldLocation is the location of the field
   * @return the field
   */
  public static Optional<Field> getField(Schema schema, String fieldLocation) {
    Preconditions.checkNotNull(schema);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

    Splitter splitter = Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults();
    List<String> pathList = Lists.newArrayList(splitter.split(fieldLocation));

    if (pathList.size() == 0) {
      return Optional.absent();
    }

    return AvroUtils.getFieldHelper(schema, pathList, 0);
  }

  /**
   * Helper method that does the actual work for {@link #getField(Schema, String)}
   * @param schema passed from {@link #getField(Schema, String)}
   * @param pathList passed from {@link #getField(Schema, String)}
   * @param field keeps track of the index used to access the list pathList
   * @return the field
   */
  private static Optional<Field> getFieldHelper(Schema schema, List<String> pathList, int field) {
    Field curField = schema.getField(pathList.get(field));
    if (field + 1 == pathList.size()) {
      return Optional.fromNullable(curField);
    }

    Schema fieldSchema = curField.schema();
    switch (fieldSchema.getType()) {
      case UNION:
        throw new AvroRuntimeException("Union of complex types cannot be handled : " + schema);
      case MAP:
        return AvroUtils.getFieldHelper(fieldSchema.getValueType(), pathList, ++field);
      case RECORD:
        return AvroUtils.getFieldHelper(fieldSchema, pathList, ++field);
      default:
        throw new AvroRuntimeException("Invalid type in schema : " + schema);
    }
  }
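  // Usage sketch (illustrative): retrieving the nested Field object rather than
  // its Schema, reusing the hypothetical "User" schema from the sketch above.
  //
  //   Optional<Field> cityField = AvroUtils.getField(schema, "address.city");
  //   // cityField.get().name() equals "city"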
  /**
   * Given a GenericRecord, this method will return the field specified by the path parameter. The fieldLocation
   * parameter is an ordered string specifying the location of the nested field to retrieve. For example,
   * field1.nestedField1 takes the value of the field "field1", and retrieves the field "nestedField1" from it.
   * @param record is the record to retrieve the field from
   * @param fieldLocation is the location of the field
   * @return the value of the field
   */
  public static Optional<Object> getFieldValue(GenericRecord record, String fieldLocation) {
    Map<String, Object> ret = getMultiFieldValue(record, fieldLocation);
    return Optional.fromNullable(ret.get(fieldLocation));
  }

  public static Map<String, Object> getMultiFieldValue(GenericRecord record, String fieldLocation) {
    Preconditions.checkNotNull(record);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

    Splitter splitter = Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults();
    List<String> pathList = splitter.splitToList(fieldLocation);

    if (pathList.size() == 0) {
      return Collections.emptyMap();
    }

    HashMap<String, Object> retVal = new HashMap<String, Object>();
    AvroUtils.getFieldHelper(retVal, record, pathList, 0);
    return retVal;
  }

  /**
   * Helper method that does the actual work for {@link #getFieldValue(GenericRecord, String)}
   * @param retVal map into which the located field values are collected
   * @param data passed from {@link #getFieldValue(GenericRecord, String)}
   * @param pathList passed from {@link #getFieldValue(GenericRecord, String)}
   * @param field keeps track of the index used to access the list pathList
   */
  private static void getFieldHelper(Map<String, Object> retVal, Object data, List<String> pathList, int field) {
    if (data == null) {
      return;
    }

    if ((field + 1) == pathList.size()) {
      Object val = null;
      Joiner joiner = Joiner.on(".");
      String key = joiner.join(pathList.iterator());

      if (data instanceof Map) {
        val = getObjectFromMap((Map) data, pathList.get(field));
      } else if (data instanceof List) {
        val = getObjectFromArray((List) data, Integer.parseInt(pathList.get(field)));
      } else {
        val = ((GenericRecord) data).get(pathList.get(field));
      }

      if (val != null) {
        retVal.put(key, val);
      }
      return;
    }

    if (data instanceof Map) {
      AvroUtils.getFieldHelper(retVal, getObjectFromMap((Map) data, pathList.get(field)), pathList, ++field);
      return;
    }

    if (data instanceof List) {
      if (pathList.get(field).trim().equals("*")) {
        List arr = (List) data;
        Iterator it = arr.iterator();
        int i = 0;
        while (it.hasNext()) {
          Object val = it.next();
          List<String> newPathList = new ArrayList<>(pathList);
          newPathList.set(field, String.valueOf(i));
          AvroUtils.getFieldHelper(retVal, val, newPathList, field + 1);
          i++;
        }
      } else {
        AvroUtils.getFieldHelper(retVal, getObjectFromArray((List) data, Integer.parseInt(pathList.get(field))),
            pathList, ++field);
      }
      return;
    }

    AvroUtils.getFieldHelper(retVal, ((GenericRecord) data).get(pathList.get(field)), pathList, ++field);
  }

  /**
   * Given a map: key -> value, return a map: key.toString() -> value.toString(). Avro serializer wraps a String
   * into {@link Utf8}. This method helps to restore the original string map object.
   *
   * @param map a map object
   * @return a map of strings
   */
  @SuppressWarnings("unchecked")
  public static Map<String, String> toStringMap(Object map) {
    if (map == null) {
      return null;
    }

    if (map instanceof Map) {
      Map<Object, Object> rawMap = (Map<Object, Object>) map;
      Map<String, String> stringMap = new HashMap<>();
      for (Entry<Object, Object> entry : rawMap.entrySet()) {
        stringMap.put(entry.getKey().toString(), entry.getValue().toString());
      }
      return stringMap;
    } else {
      throw new AvroRuntimeException("value must be a map");
    }
  }
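  // Usage sketch (illustrative): getFieldValue/getMultiFieldValue resolve dotted
  // paths against record data; a "*" token fans out over array elements, with the
  // wildcard replaced by the element index in each returned key. The record shape
  // below is hypothetical.
  //
  //   // record = {"emails": ["a@x.com", "b@x.com"]}
  //   Map<String, Object> values = AvroUtils.getMultiFieldValue(record, "emails.*");
  //   // values == {"emails.0" -> "a@x.com", "emails.1" -> "b@x.com"}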
  /**
   * This method is to get an object from a map given a key as string.
   * Avro persists strings as {@link Utf8}.
   * @param map passed from {@link #getFieldHelper(Map, Object, List, int)}
   * @param key passed from {@link #getFieldHelper(Map, Object, List, int)}
   * @return the value, which could again be a GenericRecord
   */
  private static Object getObjectFromMap(Map map, String key) {
    Utf8 utf8Key = new Utf8(key);
    Object value = map.get(utf8Key);
    if (value == null) {
      return map.get(key);
    }
    return value;
  }

  /**
   * Get an object from an array given an index.
   */
  private static Object getObjectFromArray(List array, int index) {
    return array.get(index);
  }

  /**
   * Change the schema of an Avro record.
   * @param record The Avro record whose schema is to be changed.
   * @param newSchema The target schema. It must be compatible as reader schema with record.getSchema() as writer schema.
   * @return a new Avro record with the new schema.
   * @throws IOException if conversion failed.
   */
  public static GenericRecord convertRecordSchema(GenericRecord record, Schema newSchema) throws IOException {
    if (record.getSchema().equals(newSchema)) {
      return record;
    }

    try {
      BinaryDecoder decoder = new DecoderFactory().binaryDecoder(recordToByteArray(record), null);
      DatumReader<GenericRecord> reader = new GenericDatumReader<>(record.getSchema(), newSchema);
      return reader.read(null, decoder);
    } catch (IOException e) {
      throw new IOException(
          String.format("Cannot convert avro record to new schema. Original schema = %s, new schema = %s",
              record.getSchema(), newSchema),
          e);
    }
  }

  /**
   * Convert a GenericRecord to a byte array.
   */
  public static byte[] recordToByteArray(GenericRecord record) throws IOException {
    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
      Encoder encoder = EncoderFactory.get().directBinaryEncoder(out, null);
      DatumWriter<GenericRecord> writer = new GenericDatumWriter<>(record.getSchema());
      writer.write(record, encoder);
      byte[] byteArray = out.toByteArray();
      return byteArray;
    }
  }

  /**
   * Get Avro schema from an Avro data file.
   */
  public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
    try (SeekableInput sin = new FsInput(dataFile, fs.getConf());
        DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, new GenericDatumReader<GenericRecord>())) {
      return reader.getSchema();
    }
  }

  /**
   * Parse Avro schema from a schema file.
   */
  public static Schema parseSchemaFromFile(Path filePath, FileSystem fs) throws IOException {
    Preconditions.checkArgument(fs.exists(filePath), filePath + " does not exist");

    try (FSDataInputStream in = fs.open(filePath)) {
      return new Schema.Parser().parse(in);
    }
  }

  public static void writeSchemaToFile(Schema schema, Path filePath, FileSystem fs, boolean overwrite)
      throws IOException {
    writeSchemaToFile(schema, filePath, null, fs, overwrite);
  }

  public static void writeSchemaToFile(Schema schema, Path filePath, Path tempFilePath, FileSystem fs,
      boolean overwrite) throws IOException {
    writeSchemaToFile(schema, filePath, tempFilePath, fs, overwrite,
        new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.READ));
  }

  public static void writeSchemaToFile(Schema schema, Path filePath, FileSystem fs, boolean overwrite,
      FsPermission perm) throws IOException {
    writeSchemaToFile(schema, filePath, null, fs, overwrite, perm);
  }
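  // Usage sketch (illustrative): convertRecordSchema re-encodes the record and
  // decodes it with newSchema as the reader schema, so newSchema must be
  // read-compatible with record.getSchema(). The evolvedSchema below is
  // hypothetical.
  //
  //   // evolvedSchema adds an optional field with a default to the original schema
  //   GenericRecord upgraded = AvroUtils.convertRecordSchema(record, evolvedSchema);
  //   byte[] bytes = AvroUtils.recordToByteArray(record);  // raw binary, no schema header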
  /**
   * Write a schema to a file
   * @param schema the schema
   * @param filePath the target file
   * @param tempFilePath if not null then this path is used for a temporary file used to stage the write
   * @param fs a {@link FileSystem}
   * @param overwrite should any existing target file be overwritten?
   * @param perm permissions
   * @throws IOException
   */
  public static void writeSchemaToFile(Schema schema, Path filePath, Path tempFilePath, FileSystem fs,
      boolean overwrite, FsPermission perm) throws IOException {
    boolean fileExists = fs.exists(filePath);

    if (!overwrite) {
      Preconditions.checkState(!fileExists, filePath + " already exists");
    } else {
      // delete the target file now if not using a staging file
      if (fileExists && null == tempFilePath) {
        HadoopUtils.deletePath(fs, filePath, true);
        // file has been removed
        fileExists = false;
      }
    }

    // If the file exists then write to a temp file to make the replacement as close to atomic as possible
    Path writeFilePath = fileExists ? tempFilePath : filePath;

    try (DataOutputStream dos = fs.create(writeFilePath)) {
      dos.writeChars(schema.toString());
    }
    fs.setPermission(writeFilePath, perm);

    // Replace existing file with the staged file
    if (fileExists) {
      if (!fs.delete(filePath, true)) {
        throw new IOException(
            String.format("Failed to delete %s while renaming %s to %s", filePath, tempFilePath, filePath));
      }

      HadoopUtils.movePath(fs, tempFilePath, fs, filePath, true, fs.getConf());
    }
  }

  /**
   * Get the latest avro schema for a directory
   * @param directory the input dir that contains avro files
   * @param fs the {@link FileSystem} for the given directory.
   * @param latest true to return latest schema, false to return oldest schema
   * @return the latest/oldest schema in the directory
   * @throws IOException
   */
  public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
    Schema schema = null;
    try (Closer closer = Closer.create()) {
      List<FileStatus> files = getDirectorySchemaHelper(directory, fs);
      if (files == null || files.size() == 0) {
        LOG.warn("There is no previous avro file in the directory: " + directory);
      } else {
        FileStatus file = latest ? files.get(0) : files.get(files.size() - 1);
        LOG.debug("Path to get the avro schema: " + file);
        FsInput fi = new FsInput(file.getPath(), fs.getConf());
        GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<>();
        schema = closer.register(new DataFileReader<>(fi, genReader)).getSchema();
      }
    } catch (IOException ioe) {
      throw new IOException("Cannot get the schema for directory " + directory, ioe);
    }
    return schema;
  }

  /**
   * Get the latest avro schema for a directory
   * @param directory the input dir that contains avro files
   * @param conf configuration
   * @param latest true to return latest schema, false to return oldest schema
   * @return the latest/oldest schema in the directory
   * @throws IOException
   */
  public static Schema getDirectorySchema(Path directory, Configuration conf, boolean latest) throws IOException {
    return getDirectorySchema(directory, FileSystem.get(conf), latest);
  }

  private static List<FileStatus> getDirectorySchemaHelper(Path directory, FileSystem fs) throws IOException {
    List<FileStatus> files = Lists.newArrayList();
    if (fs.exists(directory)) {
      getAllNestedAvroFiles(fs.getFileStatus(directory), files, fs);
      if (files.size() > 0) {
        Collections.sort(files, FileListUtils.LATEST_MOD_TIME_ORDER);
      }
    }
    return files;
  }

  private static void getAllNestedAvroFiles(FileStatus dir, List<FileStatus> files, FileSystem fs) throws IOException {
    if (dir.isDirectory()) {
      FileStatus[] filesInDir = fs.listStatus(dir.getPath());
      if (filesInDir != null) {
        for (FileStatus f : filesInDir) {
          getAllNestedAvroFiles(f, files, fs);
        }
      }
    } else if (dir.getPath().getName().endsWith(AVRO_SUFFIX)) {
      files.add(dir);
    }
  }
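  // Usage sketch (illustrative): reading the schema of the most recently
  // modified .avro file under a directory tree. The path is hypothetical.
  //
  //   Configuration conf = new Configuration();
  //   Schema latest = AvroUtils.getDirectorySchema(new Path("/data/events"), conf, true);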
  /**
   * Merge oldSchema and newSchema. Set a field default value to null, if this field exists in the old schema but
   * not in the new schema.
   * @param oldSchema
   * @param newSchema
   * @return schema that contains all the fields in both old and new schema.
   */
  public static Schema nullifyFieldsForSchemaMerge(Schema oldSchema, Schema newSchema) {
    if (oldSchema == null) {
      LOG.warn("No previous schema available, use the new schema instead.");
      return newSchema;
    }

    if (!(oldSchema.getType().equals(Type.RECORD) && newSchema.getType().equals(Type.RECORD))) {
      LOG.warn("Both previous schema and new schema need to be record type. Quit merging schema.");
      return newSchema;
    }

    List<Field> combinedFields = Lists.newArrayList();
    for (Field newFld : newSchema.getFields()) {
      combinedFields.add(new Field(newFld.name(), newFld.schema(), newFld.doc(), newFld.defaultValue()));
    }

    for (Field oldFld : oldSchema.getFields()) {
      if (newSchema.getField(oldFld.name()) == null) {
        List<Schema> union = Lists.newArrayList();
        Schema oldFldSchema = oldFld.schema();

        if (oldFldSchema.getType().equals(Type.UNION)) {
          union.add(Schema.create(Type.NULL));
          for (Schema itemInUnion : oldFldSchema.getTypes()) {
            if (!itemInUnion.getType().equals(Type.NULL)) {
              union.add(itemInUnion);
            }
          }
          Schema newFldSchema = Schema.createUnion(union);
          combinedFields.add(new Field(oldFld.name(), newFldSchema, oldFld.doc(), oldFld.defaultValue()));
        } else {
          union.add(Schema.create(Type.NULL));
          union.add(oldFldSchema);
          Schema newFldSchema = Schema.createUnion(union);
          combinedFields.add(new Field(oldFld.name(), newFldSchema, oldFld.doc(), oldFld.defaultValue()));
        }
      }
    }

    Schema mergedSchema =
        Schema.createRecord(newSchema.getName(), newSchema.getDoc(), newSchema.getNamespace(), newSchema.isError());
    mergedSchema.setFields(combinedFields);
    return mergedSchema;
  }
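  // Usage sketch (illustrative): fields dropped between oldSchema and newSchema
  // are carried over into the merged schema as nullable unions. The field names
  // below are hypothetical.
  //
  //   // oldSchema: {id: long, legacyFlag: boolean}; newSchema: {id: long}
  //   Schema merged = AvroUtils.nullifyFieldsForSchemaMerge(oldSchema, newSchema);
  //   // merged has fields id (long) and legacyFlag (union of null and boolean)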
  /**
   * Remove map, array, enum fields, as well as union fields that contain map, array or enum,
   * from an Avro schema. A schema with these fields cannot be used as Mapper key in a
   * MapReduce job.
   */
  public static Optional<Schema> removeUncomparableFields(Schema schema) {
    return removeUncomparableFields(schema, Maps.newHashMap());
  }

  private static Optional<Schema> removeUncomparableFields(Schema schema, Map<Schema, Optional<Schema>> processed) {
    switch (schema.getType()) {
      case RECORD:
        return removeUncomparableFieldsFromRecord(schema, processed);
      case UNION:
        return removeUncomparableFieldsFromUnion(schema, processed);
      case MAP:
        return Optional.absent();
      case ARRAY:
        return Optional.absent();
      case ENUM:
        return Optional.absent();
      default:
        return Optional.of(schema);
    }
  }

  private static Optional<Schema> removeUncomparableFieldsFromRecord(Schema record,
      Map<Schema, Optional<Schema>> processed) {
    Preconditions.checkArgument(record.getType() == Schema.Type.RECORD);

    Optional<Schema> result = processed.get(record);
    if (null != result) {
      return result;
    }

    List<Field> fields = Lists.newArrayList();
    for (Field field : record.getFields()) {
      Optional<Schema> newFieldSchema = removeUncomparableFields(field.schema(), processed);
      if (newFieldSchema.isPresent()) {
        fields.add(new Field(field.name(), newFieldSchema.get(), field.doc(), field.defaultValue()));
      }
    }

    Schema newSchema = Schema.createRecord(record.getName(), record.getDoc(), record.getNamespace(), false);
    newSchema.setFields(fields);
    result = Optional.of(newSchema);
    processed.put(record, result);

    return result;
  }

  private static Optional<Schema> removeUncomparableFieldsFromUnion(Schema union,
      Map<Schema, Optional<Schema>> processed) {
    Preconditions.checkArgument(union.getType() == Schema.Type.UNION);

    Optional<Schema> result = processed.get(union);
    if (null != result) {
      return result;
    }

    List<Schema> newUnion = Lists.newArrayList();
    for (Schema unionType : union.getTypes()) {
      Optional<Schema> newType = removeUncomparableFields(unionType, processed);
      if (newType.isPresent()) {
        newUnion.add(newType.get());
      }
    }

    // Discard the union field if one or more types are removed from the union.
    if (newUnion.size() != union.getTypes().size()) {
      result = Optional.absent();
    } else {
      result = Optional.of(Schema.createUnion(newUnion));
    }
    processed.put(union, result);

    return result;
  }

  /**
   * Copies the input {@link org.apache.avro.Schema} but changes the schema name.
   * @param schema {@link org.apache.avro.Schema} to copy.
   * @param newName name for the copied {@link org.apache.avro.Schema}.
   * @return A {@link org.apache.avro.Schema} that is a copy of schema, but has the name newName.
   */
  public static Schema switchName(Schema schema, String newName) {
    if (schema.getName().equals(newName)) {
      return schema;
    }

    Schema newSchema = Schema.createRecord(newName, schema.getDoc(), schema.getNamespace(), schema.isError());

    List<Field> fields = schema.getFields();
    Iterable<Field> fieldsNew = Iterables.transform(fields, new Function<Field, Field>() {
      @Override
      public Schema.Field apply(Field input) {
        // this should never happen but the API has marked input as Nullable
        if (null == input) {
          return null;
        }
        Field field = new Field(input.name(), input.schema(), input.doc(), input.defaultValue(), input.order());
        return field;
      }
    });

    newSchema.setFields(Lists.newArrayList(fieldsNew));
    return newSchema;
  }
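  // Usage sketch (illustrative): renaming a record schema while keeping its
  // fields, doc, and namespace intact.
  //
  //   Schema renamed = AvroUtils.switchName(schema, "UserV2");
  //   // renamed.getName() equals "UserV2"; fields are copied from schema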
  /**
   * Copies the input {@link org.apache.avro.Schema} but changes the schema namespace.
   * @param schema {@link org.apache.avro.Schema} to copy.
   * @param namespaceOverride namespace for the copied {@link org.apache.avro.Schema}.
   * @return A {@link org.apache.avro.Schema} that is a copy of schema, but has the new namespace.
   */
  public static Schema switchNamespace(Schema schema, Map<String, String> namespaceOverride) {
    Schema newSchema;
    String newNamespace = StringUtils.EMPTY;

    // Process all Schema Types
    // (Primitives are simply cloned)
    switch (schema.getType()) {
      case ENUM:
        newNamespace = namespaceOverride.containsKey(schema.getNamespace())
            ? namespaceOverride.get(schema.getNamespace()) : schema.getNamespace();
        newSchema = Schema.createEnum(schema.getName(), schema.getDoc(), newNamespace, schema.getEnumSymbols());
        break;
      case FIXED:
        newNamespace = namespaceOverride.containsKey(schema.getNamespace())
            ? namespaceOverride.get(schema.getNamespace()) : schema.getNamespace();
        newSchema = Schema.createFixed(schema.getName(), schema.getDoc(), newNamespace, schema.getFixedSize());
        break;
      case MAP:
        newSchema = Schema.createMap(switchNamespace(schema.getValueType(), namespaceOverride));
        break;
      case RECORD:
        newNamespace = namespaceOverride.containsKey(schema.getNamespace())
            ? namespaceOverride.get(schema.getNamespace()) : schema.getNamespace();
        List<Schema.Field> newFields = new ArrayList<>();
        if (schema.getFields().size() > 0) {
          for (Schema.Field oldField : schema.getFields()) {
            Field newField = new Field(oldField.name(), switchNamespace(oldField.schema(), namespaceOverride),
                oldField.doc(), oldField.defaultValue(), oldField.order());
            newFields.add(newField);
          }
        }
        newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), newNamespace, schema.isError());
        newSchema.setFields(newFields);
        break;
      case UNION:
        List<Schema> newUnionMembers = new ArrayList<>();
        if (null != schema.getTypes() && schema.getTypes().size() > 0) {
          for (Schema oldUnionMember : schema.getTypes()) {
            newUnionMembers.add(switchNamespace(oldUnionMember, namespaceOverride));
          }
        }
        newSchema = Schema.createUnion(newUnionMembers);
        break;
      case ARRAY:
        newSchema = Schema.createArray(switchNamespace(schema.getElementType(), namespaceOverride));
        break;
      case BOOLEAN:
      case BYTES:
      case DOUBLE:
      case FLOAT:
      case INT:
      case LONG:
      case NULL:
      case STRING:
        newSchema = Schema.create(schema.getType());
        break;
      default:
        String exceptionMessage = String.format("Schema namespace replacement failed for \"%s\" ", schema);
        LOG.error(exceptionMessage);
        throw new AvroRuntimeException(exceptionMessage);
    }

    // Copy schema metadata
    copyProperties(schema, newSchema);

    return newSchema;
  }

  /***
   * Copy properties from old Avro Schema to new Avro Schema
   * @param oldSchema Old Avro Schema to copy properties from
   * @param newSchema New Avro Schema to copy properties to
   */
  private static void copyProperties(Schema oldSchema, Schema newSchema) {
    Preconditions.checkNotNull(oldSchema);
    Preconditions.checkNotNull(newSchema);

    Map<String, JsonNode> props = oldSchema.getJsonProps();
    copyProperties(props, newSchema);
  }

  /***
   * Copy properties to an Avro Schema
   * @param props Properties to copy to Avro Schema
   * @param schema Avro Schema to copy properties to
   */
  private static void copyProperties(Map<String, JsonNode> props, Schema schema) {
    Preconditions.checkNotNull(schema);

    // (if null, don't copy but do not throw exception)
    if (null != props) {
      for (Map.Entry<String, JsonNode> prop : props.entrySet()) {
        schema.addProp(prop.getKey(), prop.getValue());
      }
    }
  }
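  // Usage sketch (illustrative): remapping one namespace to another across an
  // entire schema tree. The namespaces below are hypothetical.
  //
  //   Map<String, String> override = new HashMap<>();
  //   override.put("com.example.source", "com.example.dest");
  //   Schema moved = AvroUtils.switchNamespace(schema, override);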
  /**
   * Serialize a generic record as a relative {@link Path}. Useful for converting {@link GenericRecord} type keys
   * into file system locations. For example {field1=v1, field2=v2} returns field1=v1/field2=v2 if includeFieldNames
   * is true, or v1/v2 if it is false. Illegal HDFS tokens such as ':' and '\\' will be replaced with '_'.
   * Additionally, the parameter replacePathSeparators controls whether to replace path separators ('/') with '_'.
   *
   * @param record {@link GenericRecord} to serialize.
   * @param includeFieldNames If true, each token in the path will be of the form key=value, otherwise, only the value
   *                          will be included.
   * @param replacePathSeparators If true, path separators ('/') in each token will be replaced with '_'.
   * @return A relative path where each level is a field in the input record.
   */
  public static Path serializeAsPath(GenericRecord record, boolean includeFieldNames, boolean replacePathSeparators) {
    if (record == null) {
      return new Path("");
    }
    List<String> tokens = Lists.newArrayList();
    for (Schema.Field field : record.getSchema().getFields()) {
      String sanitizedName = HadoopUtils.sanitizePath(field.name(), "_");
      String sanitizedValue = HadoopUtils.sanitizePath(record.get(field.name()).toString(), "_");
      if (replacePathSeparators) {
        sanitizedName = sanitizedName.replaceAll(Path.SEPARATOR, "_");
        sanitizedValue = sanitizedValue.replaceAll(Path.SEPARATOR, "_");
      }
      if (includeFieldNames) {
        tokens.add(String.format("%s=%s", sanitizedName, sanitizedValue));
      } else if (!Strings.isNullOrEmpty(sanitizedValue)) {
        tokens.add(sanitizedValue);
      }
    }
    return new Path(Joiner.on(Path.SEPARATOR).join(tokens));
  }

  /**
   * Deserialize a {@link GenericRecord} from a byte array. This method is not intended for high performance.
   */
  public static GenericRecord slowDeserializeGenericRecord(byte[] serializedRecord, Schema schema) throws IOException {
    Decoder decoder = DecoderFactory.get().binaryDecoder(serializedRecord, null);
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema);
    return reader.read(null, decoder);
  }

  /**
   * Decorate the {@link Schema} for a record with additional {@link Field}s.
   * @param inputSchema: must be a {@link Record} schema.
   * @return the decorated Schema. Fields are appended to the inputSchema.
   */
  public static Schema decorateRecordSchema(Schema inputSchema, @Nonnull List<Field> fieldList) {
    Preconditions.checkState(inputSchema.getType().equals(Type.RECORD));
    List<Field> outputFields = deepCopySchemaFields(inputSchema);
    List<Field> newOutputFields = Stream.concat(outputFields.stream(), fieldList.stream())
        .collect(Collectors.toList());

    Schema outputSchema = Schema.createRecord(inputSchema.getName(), inputSchema.getDoc(),
        inputSchema.getNamespace(), inputSchema.isError());
    outputSchema.setFields(newOutputFields);
    copyProperties(inputSchema, outputSchema);
    return outputSchema;
  }
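  // Usage sketch (illustrative): turning a record into a partition-style path.
  // The record shape below is hypothetical.
  //
  //   // record = {"year": 2020, "month": 7}
  //   Path p = AvroUtils.serializeAsPath(record, true, true);
  //   // p.toString() equals "year=2020/month=7"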
  /**
   * Decorate a {@link GenericRecord} with additional fields and make it conform to an extended Schema.
   * It is the caller's responsibility to ensure that the outputSchema is the merge of the inputRecord's schema
   * and the additional fields. The method does not check this for performance reasons, because it is expected to be
   * called in the critical path of processing a record.
   * Use {@link AvroUtils#decorateRecordSchema(Schema, List)} to generate such a Schema before calling this method.
   * @param inputRecord: record with data to be copied into the output record
   * @param fieldMap: values can be primitive types or GenericRecords if nested
   * @param outputSchema: the schema that the decoratedRecord will conform to
   * @return an outputRecord that contains a union of the fields in the inputRecord and the field-values in the fieldMap
   */
  public static GenericRecord decorateRecord(GenericRecord inputRecord, @Nonnull Map<String, Object> fieldMap,
      Schema outputSchema) {
    GenericRecord outputRecord = new GenericData.Record(outputSchema);
    inputRecord.getSchema().getFields().forEach(f -> outputRecord.put(f.name(), inputRecord.get(f.name())));
    fieldMap.forEach((key, value) -> outputRecord.put(key, value));
    return outputRecord;
  }

  /**
   * Given a generic record, override the name and namespace of the schema and return a new generic record.
   * @param input input record whose name and namespace need to be overridden
   * @param nameOverride new name for the record schema
   * @param namespaceOverride Optional map containing namespace overrides
   * @return an output record with overridden name and possibly namespace
   */
  public static GenericRecord overrideNameAndNamespace(GenericRecord input, String nameOverride,
      Optional<Map<String, String>> namespaceOverride) {
    GenericRecord output = input;
    Schema newSchema = switchName(input.getSchema(), nameOverride);
    if (namespaceOverride.isPresent()) {
      newSchema = switchNamespace(newSchema, namespaceOverride.get());
    }

    try {
      output = convertRecordSchema(output, newSchema);
    } catch (Exception e) {
      log.error("Unable to generate generic data record", e);
    }

    return output;
  }

  /**
   * Given an input schema, override the name and namespace of the schema and return a new schema.
   * @param input
   * @param nameOverride
   * @param namespaceOverride
   * @return a schema with overridden name and possibly namespace
   */
  public static Schema overrideNameAndNamespace(Schema input, String nameOverride,
      Optional<Map<String, String>> namespaceOverride) {
    Schema newSchema = switchName(input, nameOverride);
    if (namespaceOverride.isPresent()) {
      newSchema = switchNamespace(newSchema, namespaceOverride.get());
    }
    return newSchema;
  }
}
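// Usage sketch (illustrative, not part of the original class): decorating a
// record with one extra field. The field name and value are hypothetical.
//
//   Field flag = new Field("isTest", Schema.create(Schema.Type.BOOLEAN), "added flag", null);
//   Schema decorated = AvroUtils.decorateRecordSchema(inputSchema, Lists.newArrayList(flag));
//   GenericRecord out = AvroUtils.decorateRecord(inputRecord,
//       Collections.singletonMap("isTest", Boolean.TRUE), decorated);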