co.cask.cdap.internal.io.Schema.java Source code

Java tutorial

Introduction

Here is the source code for co.cask.cdap.internal.io.Schema.java

Source

/*
 * Copyright  2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.internal.io;

import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableBiMap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.io.CharStreams;
import com.google.gson.stream.JsonWriter;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * This class represents schema of data types.
 */
public final class Schema {

    /**
     * The kinds of schema that a {@link Schema} can represent. Each constant records whether
     * it is a simple type or not.
     */
    public enum Type {
        // Simple types
        NULL(true),
        BOOLEAN(true),
        INT(true),
        LONG(true),
        FLOAT(true),
        DOUBLE(true),
        BYTES(true),
        STRING(true),

        // Non-simple types
        ENUM(false),
        ARRAY(false),
        MAP(false),
        RECORD(false),
        UNION(false);

        private final boolean simple;

        Type(boolean simple) {
            this.simple = simple;
        }

        /**
         * @return {@code true} if this constant denotes a simple schema type, {@code false} otherwise.
         */
        public boolean isSimpleType() {
            return this.simple;
        }
    }

    /**
     * A named member of a {@link Type#RECORD} schema: a field name paired with the
     * {@link Schema} of that field.
     */
    public static final class Field {
        private final String name;
        private final Schema schema;

        private Field(String name, Schema schema) {
            this.name = name;
            this.schema = schema;
        }

        /**
         * Factory method that builds a {@link Field} from a name and a {@link Schema}.
         *
         * @param name Name of the field.
         * @param schema Schema of the field.
         * @return A new {@link Field} instance.
         */
        public static Field of(String name, Schema schema) {
            Field field = new Field(name, schema);
            return field;
        }

        /**
         * @return The field name given at creation.
         */
        public String getName() {
            return this.name;
        }

        /**
         * @return The field schema given at creation.
         */
        public Schema getSchema() {
            return this.schema;
        }
    }

    /**
     * Creates a {@link Schema} for a simple type. Only types for which
     * {@link Schema.Type#isSimpleType()} returns {@code true} are accepted.
     *
     * @param type Type of the schema to create.
     * @return A {@link Schema} with the given type.
     * @throws IllegalArgumentException if the given type is not a simple type.
     */
    public static Schema of(Type type) {
        boolean simple = type.isSimpleType();
        Preconditions.checkArgument(simple, "Type %s is not a simple type.", type);
        return new Schema(type, null, null, null, null, null, null, null);
    }

    /**
     * Creates an {@link Type#ENUM ENUM} {@link Schema} from the given enum values.
     * The values must be unique and at least one value must be given.
     * The enum values in the schema keep the order in which they are passed in.
     *
     * @param values Enum values.
     * @return A {@link Schema} of {@link Type#ENUM ENUM} type.
     */
    public static Schema enumWith(String... values) {
        List<String> valueList = ImmutableList.copyOf(values);
        return enumWith(valueList);
    }

    /**
     * Creates a {@link Schema} of {@link Type#ENUM ENUM} type, with the given enum values.
     * The values given must be unique and must contain at least one value.
     * The ordering of values in the enum type schema would be the same as the {@link Iterable#iterator()} order.
     *
     * @param values Enum values.
     * @return A {@link Schema} of {@link Type#ENUM ENUM} type.
     * @throws IllegalArgumentException if no value is given or if the values contain duplicates.
     */
    public static Schema enumWith(Iterable<String> values) {
        // Snapshot the values exactly once, so the emptiness and duplicate checks operate on the very same
        // elements even if the given Iterable can only be traversed once or changes between traversals.
        List<String> valueList = ImmutableList.copyOf(values);
        Set<String> uniqueValues = ImmutableSet.copyOf(valueList);
        Preconditions.checkArgument(uniqueValues.size() > 0, "No enum value provided.");
        Preconditions.checkArgument(valueList.size() == uniqueValues.size(),
                "Duplicate enum value is not allowed.");
        return new Schema(Type.ENUM, uniqueValues, null, null, null, null, null, null);
    }

    /**
     * Creates a {@link Schema} of {@link Type#ENUM ENUM} type, with values extracted from the given {@link Enum} class.
     * The enum values are the {@link Enum#name() names} of the constants returned by
     * {@link Class#getEnumConstants()}, in the order that method returns them.
     *
     * @param enumClass The enum class to extract the enum values from.
     * @return A {@link Schema} of {@link Type#ENUM ENUM} type.
     * @throws IllegalArgumentException if the given class does not provide any enum constants.
     */
    public static Schema enumWith(Class<? extends Enum<?>> enumClass) {
        Enum<?>[] enumConstants = enumClass.getEnumConstants();
        // getEnumConstants() yields null when the class is not an enum type (e.g. Enum.class itself).
        Preconditions.checkArgument(enumConstants != null, "Class %s is not an enum type.", enumClass);
        String[] names = new String[enumConstants.length];
        for (int i = 0; i < enumConstants.length; i++) {
            names[i] = enumConstants[i].name();
        }
        return enumWith(names);
    }

    /**
     * Creates a {@link Schema} of {@link Type#ARRAY ARRAY} type whose elements have the given schema.
     *
     * @param componentSchema Schema of the array component.
     * @return A {@link Schema} of {@link Type#ARRAY ARRAY} type.
     */
    public static Schema arrayOf(Schema componentSchema) {
        Schema arraySchema = new Schema(Type.ARRAY, null, componentSchema, null, null, null, null, null);
        return arraySchema;
    }

    /**
     * Creates a {@link Schema} of {@link Type#MAP MAP} type with the given key and value schemas.
     *
     * @param keySchema Schema of the map key.
     * @param valueSchema Schema of the map value.
     * @return A {@link Schema} of {@link Type#MAP MAP} type.
     */
    public static Schema mapOf(Schema keySchema, Schema valueSchema) {
        Schema mapSchema = new Schema(Type.MAP, null, null, keySchema, valueSchema, null, null, null);
        return mapSchema;
    }

    /**
     * Creates a name-only {@link Type#RECORD RECORD} {@link Schema}. The resulting schema has no
     * fields, so it is only useful as a component of another schema, where the actual record
     * definition is resolved from the top level container schema.
     *
     * @param name Name of the record.
     * @return A {@link Schema} of {@link Type#RECORD RECORD} type.
     * @throws NullPointerException if the given name is {@code null}.
     */
    public static Schema recordOf(String name) {
        String recordName = Preconditions.checkNotNull(name, "Record name cannot be null.");
        return new Schema(Type.RECORD, null, null, null, null, recordName, null, null);
    }

    /**
     * Creates a {@link Type#RECORD RECORD} {@link Schema} with the given name and {@link Field Fields}.
     * The record fields keep the order in which they are passed in.
     *
     * @param name Name of the record.
     * @param fields All the fields that the record contains.
     * @return A {@link Schema} of {@link Type#RECORD RECORD} type.
     */
    public static Schema recordOf(String name, Field... fields) {
        List<Field> fieldList = ImmutableList.copyOf(fields);
        return recordOf(name, fieldList);
    }

    /**
     * Creates a {@link Type#RECORD RECORD} {@link Schema} with the given name and {@link Field Fields}.
     * The record fields follow the {@link Iterable#iterator()} order of the given fields.
     *
     * @param name Name of the record.
     * @param fields All the fields that the record contains.
     * @return A {@link Schema} of {@link Type#RECORD RECORD} type.
     * @throws NullPointerException if the given name is {@code null}.
     * @throws IllegalArgumentException if no field is provided.
     */
    public static Schema recordOf(String name, Iterable<Field> fields) {
        Preconditions.checkNotNull(name, "Record name cannot be null.");

        // Index the fields by name, keeping the iteration order.
        ImmutableMap.Builder<String, Field> byName = ImmutableMap.builder();
        for (Field each : fields) {
            byName.put(each.getName(), each);
        }
        Map<String, Field> namedFields = byName.build();

        Preconditions.checkArgument(!namedFields.isEmpty(), "No record field provided for %s", name);
        return new Schema(Type.RECORD, null, null, null, null, name, namedFields, null);
    }

    /**
     * Creates a {@link Type#UNION UNION} {@link Schema} made of all the given schemas.
     * The schemas inside the union keep the order in which they are passed in.
     *
     * @param schemas All the {@link Schema Schemas} that constitute the union.
     * @return A {@link Schema} of {@link Type#UNION UNION} type.
     */
    public static Schema unionOf(Schema... schemas) {
        List<Schema> members = ImmutableList.copyOf(schemas);
        return unionOf(members);
    }

    /**
     * Creates a {@link Type#UNION UNION} {@link Schema} made of all the given schemas.
     * The schemas inside the union follow the {@link Iterable#iterator()} order of the given schemas.
     *
     * @param schemas All the {@link Schema Schemas} that constitute the union.
     * @return A {@link Schema} of {@link Type#UNION UNION} type.
     * @throws IllegalArgumentException if no schema is provided.
     */
    public static Schema unionOf(Iterable<Schema> schemas) {
        List<Schema> members = ImmutableList.copyOf(schemas);
        Preconditions.checkArgument(!members.isEmpty(), "No union schema provided.");
        return new Schema(Type.UNION, null, null, null, null, null, null, members);
    }

    // The kind of schema this instance represents.
    private final Type type;

    // Enum value -> 0-based index, following the iteration order of the value set given to the constructor.
    // Null when the constructor is given no enum value set.
    private final BiMap<String, Integer> enumValues;
    // Inverse view of enumValues (index -> enum value); null when enumValues is null.
    private final BiMap<Integer, String> enumIndexes;

    // Component schema as given to the constructor (arrayOf() is the factory that supplies it).
    private final Schema componentSchema;

    // Key / value schemas as given to the constructor (mapOf() is the factory that supplies them).
    private final Schema keySchema;
    private final Schema valueSchema;
    // Immutable (key schema, value schema) pair; null unless both key and value schemas are non-null.
    private final Map.Entry<Schema, Schema> mapSchema;

    // Record name as given to the constructor (the recordOf() factories supply it).
    private final String recordName;
    // Field name -> field, with field schemas resolved by populateRecordFields(); null when no field map is given.
    private final Map<String, Field> fieldMap;
    // The values of fieldMap as an immutable list; null when fieldMap is null.
    private final List<Field> fields;

    // Member schemas as given to the constructor (unionOf() is the factory that supplies them).
    private final List<Schema> unionSchemas;

    // Caches that are filled in on the first call to toString() and getSchemaHash() respectively.
    private String schemaString;
    private SchemaHash schemaHash;

    /**
     * Creates a schema. Instances are only created through the static factory methods of this class,
     * each of which passes {@code null} for the arguments that do not apply to the type being created.
     *
     * @param type Type of the schema.
     * @param enumValues Enum values, or {@code null}.
     * @param componentSchema Array component schema, or {@code null}.
     * @param keySchema Map key schema, or {@code null}.
     * @param valueSchema Map value schema, or {@code null}.
     * @param recordName Record name, or {@code null}.
     * @param fieldMap Record fields keyed by field name, or {@code null}.
     * @param unionSchemas Union member schemas, or {@code null}.
     */
    private Schema(Type type, Set<String> enumValues, Schema componentSchema, Schema keySchema, Schema valueSchema,
            String recordName, Map<String, Field> fieldMap, List<Schema> unionSchemas) {
        this.type = type;
        this.enumValues = createIndex(enumValues);
        this.enumIndexes = this.enumValues == null ? null : this.enumValues.inverse();
        this.componentSchema = componentSchema;
        this.keySchema = keySchema;
        this.valueSchema = valueSchema;
        this.mapSchema = (keySchema == null || valueSchema == null) ? null
                : Maps.immutableEntry(keySchema, valueSchema);
        // recordName has to be assigned before populateRecordFields() runs, because that method
        // reads this.recordName to register this schema as an already known record.
        this.recordName = recordName;
        this.fieldMap = populateRecordFields(fieldMap);
        this.fields = this.fieldMap == null ? null : ImmutableList.copyOf(this.fieldMap.values());
        this.unionSchemas = unionSchemas;
    }

    /**
     * @return The {@link Type} of this schema.
     */
    public Type getType() {
        return this.type;
    }

    /**
     * @return An immutable {@link Set} of enum values or {@code null} if this is not a {@link Type#ENUM ENUM} schema.
     *         The {@link Set#iterator()} order would be the enum values orders.
     */
    public Set<String> getEnumValues() {
        // enumValues is null for every non-ENUM schema; return null as documented instead of failing with a
        // NullPointerException.
        return enumValues == null ? null : enumValues.keySet();
    }

    /**
     * Looks up the position of an enum value.
     *
     * @param value The enum value
     * @return The 0-based index of the given value in the enum values, or {@code -1} if the value is not
     *         one of the enum values or this is not a {@link Type#ENUM ENUM} schema.
     */
    public int getEnumIndex(String value) {
        Integer index = (enumValues == null) ? null : enumValues.get(value);
        return (index == null) ? -1 : index;
    }

    /**
     * Looks up the enum value at a given position.
     *
     * @param idx The index in the enum values
     * @return The enum value at the given index, or {@code null} if there is no value at that index
     *         or this is not a {@link Type#ENUM ENUM} schema.
     */
    public String getEnumValue(int idx) {
        return (enumIndexes != null) ? enumIndexes.get(idx) : null;
    }

    /**
     * @return The array component schema, or {@code null} if this is not a {@link Type#ARRAY ARRAY} schema.
     */
    public Schema getComponentSchema() {
        return this.componentSchema;
    }

    /**
     * @return An immutable {@link java.util.Map.Entry} if this is a {@link Type#MAP MAP} schema, or {@code null}
     *         otherwise. {@link java.util.Map.Entry#getKey()} returns the key schema and
     *         {@link java.util.Map.Entry#getValue()} returns the value schema.
     */
    public Map.Entry<Schema, Schema> getMapSchema() {
        return this.mapSchema;
    }

    /**
     * @return The record name if this is a {@link Type#RECORD RECORD} schema, or {@code null} otherwise.
     */
    public String getRecordName() {
        return this.recordName;
    }

    /**
     * @return An immutable {@link List} of the record {@link Field Fields}, or {@code null} if this schema
     *         carries no record fields (e.g. it is not a {@link Type#RECORD RECORD} schema).
     */
    public List<Field> getFields() {
        return this.fields;
    }

    /**
     * Looks up a record {@link Field} by name.
     *
     * @param name Name of the field
     * @return The {@link Field} with the given name, or {@code null} if this record has no such field
     *         or this is not a {@link Type#RECORD RECORD} schema.
     */
    public Field getField(String name) {
        return (fieldMap != null) ? fieldMap.get(name) : null;
    }

    /**
     * @return An immutable {@link List} of the schemas that make up this union,
     *         or {@code null} if this is not a {@link Type#UNION UNION} schema.
     */
    public List<Schema> getUnionSchemas() {
        return this.unionSchemas;
    }

    /**
     * Looks up a member schema of this union by position.
     *
     * @param idx Index to the union schemas
     * @return The {@link Schema} at the given union index, or {@code null} if the index is out of range
     *         or this is not a {@link Type#UNION UNION} schema.
     */
    public Schema getUnionSchema(int idx) {
        if (unionSchemas == null) {
            return null;
        }
        boolean inRange = idx >= 0 && idx < unionSchemas.size();
        return inRange ? unionSchemas.get(idx) : null;
    }

    @Override
    public String toString() {
        // The string form is computed lazily and cached. No locking is used: per the original author's note,
        // buildString() only depends on immutable fields, so concurrent callers may compute it more than once
        // but are expected to arrive at the same result.
        String cached = schemaString;
        if (cached != null) {
            return cached;
        }
        cached = buildString();
        schemaString = cached;
        return cached;
    }

    @Override
    public boolean equals(Object other) {
        if (other == this) {
            return true;
        }
        if (other == null || other.getClass() != getClass()) {
            return false;
        }

        // Two schemas are considered equal when their schema hashes are equal.
        Schema that = (Schema) other;
        return getSchemaHash().equals(that.getSchemaHash());
    }

    @Override
    public int hashCode() {
        // Derived from the schema hash, the same value equals() compares.
        SchemaHash hash = getSchemaHash();
        return hash.hashCode();
    }

    /**
     * Returns the {@link SchemaHash} of this schema (originally documented as an MD5 hash).
     * The hash is created on first use and cached for subsequent calls.
     *
     * @return The {@link SchemaHash} of this schema.
     */
    public SchemaHash getSchemaHash() {
        SchemaHash cached = schemaHash;
        if (cached != null) {
            return cached;
        }
        cached = new SchemaHash(this);
        schemaHash = cached;
        return cached;
    }

    /**
     * Checks if the given target schema is compatible with this schema, meaning a datum written with this
     * schema could be projected correctly into the given target schema. Equal schemas are always compatible.
     *
     * TODO: Add link to document of the target type projection.
     *
     * @param target Schema to check for compatibility with this schema
     * @return {@code true} if the schemas are compatible, {@code false} otherwise.
     */
    public boolean isCompatible(Schema target) {
        if (!equals(target)) {
            // Tracks the (source record name, target record name) pairs that have already been visited.
            Multimap<String, String> comparedRecords = HashMultimap.create();
            return checkCompatible(target, comparedRecords);
        }
        return true;
    }

    /**
     * Recursively checks whether a datum written with this (source) schema can be projected into the
     * given target schema.
     *
     * <ul>
     *   <li>Simple source type: compatible with the same type, with the widening targets listed in the
     *       switch below (e.g. INT to LONG/FLOAT/DOUBLE, anything but NULL and BYTES to STRING), or with
     *       a target union that has at least one compatible member.</li>
     *   <li>Same non-simple type on both sides: compared structurally (enum values, array component,
     *       map key and value, common record fields by name, union members).</li>
     *   <li>Exactly one side is a union: compatible if the non-union side is compatible with at least one
     *       union member, always checked in the source-to-target direction.</li>
     * </ul>
     *
     * @param target The schema to project into.
     * @param recordCompared Pairs of (source record name, target record name) that have already been
     *                       visited; used to avoid comparing the same pair of records again, which also
     *                       stops the recursion on self-referencing records.
     * @return {@code true} if this schema is compatible with the target, {@code false} otherwise.
     */
    private boolean checkCompatible(Schema target, Multimap<String, String> recordCompared) {
        if (type.isSimpleType()) {
            if (type == target.getType()) {
                // Same simple type are always compatible
                return true;
            }
            switch (target.getType()) {
            case LONG:
                return type == Type.INT;
            case FLOAT:
                return type == Type.INT || type == Type.LONG;
            case DOUBLE:
                return type == Type.INT || type == Type.LONG || type == Type.FLOAT;
            case STRING:
                return type != Type.NULL && type != Type.BYTES;
            case UNION:
                // Compatible if this simple type can be projected into any member of the target union
                for (Schema targetSchema : target.unionSchemas) {
                    if (checkCompatible(targetSchema, recordCompared)) {
                        return true;
                    }
                }
                return false;
            default:
                return false;
            }
        }

        if (type == target.type) {
            switch (type) {
            case ENUM:
                // Every source enum value must exist in the target
                return target.getEnumValues().containsAll(getEnumValues());
            case ARRAY:
                // The component schema must be compatible
                return componentSchema.checkCompatible(target.getComponentSchema(), recordCompared);
            case MAP:
                // Both key and value schemas must be compatible
                return keySchema.checkCompatible(target.keySchema, recordCompared)
                        && valueSchema.checkCompatible(target.valueSchema, recordCompared);
            case RECORD:
                // For every common field (by name), their schema must be compatible
                if (!recordCompared.containsEntry(recordName, target.recordName)) {
                    // Mark the pair as visited before descending, so that recursive records terminate
                    recordCompared.put(recordName, target.recordName);
                    for (Field field : fields) {
                        Field targetField = target.getField(field.getName());
                        if (targetField == null) {
                            continue;
                        }
                        if (!field.getSchema().checkCompatible(targetField.getSchema(), recordCompared)) {
                            return false;
                        }
                    }
                }
                return true;
            case UNION:
                // Compare each source union to target union
                for (Schema sourceSchema : unionSchemas) {
                    for (Schema targetSchema : target.unionSchemas) {
                        if (sourceSchema.checkCompatible(targetSchema, recordCompared)) {
                            return true;
                        }
                    }
                }
                return false;
            default:
                // Not reachable: simple types have been handled above
                break;
            }
        }

        // From here on the two types differ, so at most one side is a union.
        if (type == Type.UNION) {
            // Source is a union: compatible if any source member can be projected into the target
            for (Schema sourceSchema : unionSchemas) {
                if (sourceSchema.checkCompatible(target, recordCompared)) {
                    return true;
                }
            }
        } else if (target.type == Type.UNION) {
            // Target is a union: compatible if this schema can be projected into any target member.
            // The check must run from this (source) to the member (target), not the other way around.
            for (Schema targetSchema : target.unionSchemas) {
                if (checkCompatible(targetSchema, recordCompared)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Assigns a 0-based index to every value of the given set, following the set iteration order.
     *
     * @param values Set of values to create index on
     * @return A bi-directional map from the values to their indexes in the set iteration order,
     *         or {@code null} if the given set is {@code null}.
     */
    private <V> BiMap<V, Integer> createIndex(Set<V> values) {
        if (values == null) {
            return null;
        }

        ImmutableBiMap.Builder<V, Integer> indexBuilder = ImmutableBiMap.builder();
        int position = 0;
        for (V each : values) {
            indexBuilder.put(each, position);
            position++;
        }
        return indexBuilder.build();
    }

    /**
     * Resolves the schemas of all the given record fields. This record itself is registered as a known
     * record under its own name, so that fields may refer to it by name.
     *
     * @param fields All the fields that need to be resolved.
     * @return A {@link Map} which has all the field schemas resolved, or {@code null} if the given
     *         map is {@code null}.
     * @see #resolveSchema(Schema, java.util.Map)
     */
    private Map<String, Field> populateRecordFields(Map<String, Field> fields) {
        if (fields == null) {
            return null;
        }

        Map<String, Schema> knownRecords = Maps.newHashMap();
        knownRecords.put(recordName, this);

        ImmutableMap.Builder<String, Field> resolvedFields = ImmutableMap.builder();
        for (Map.Entry<String, Field> entry : fields.entrySet()) {
            String name = entry.getKey();
            Field original = entry.getValue();
            Schema resolved = resolveSchema(original.getSchema(), knownRecords);

            // Reuse the original field instance when resolution did not change its schema.
            Field result = (resolved == original.getSchema()) ? original : Field.of(name, resolved);
            resolvedFields.put(name, result);
        }
        return resolvedFields.build();
    }

    /**
     * Recursively replaces every name-only record schema inside the given schema with the full record
     * schema registered under the same name.
     *
     * @param schema The schema that needs to be resolved.
     * @param knownRecordSchemas The mapping from record name to the already known record schemas. Record
     *                           schemas with fields that are encountered along the way are added to it.
     * @return A {@link Schema} that is structurally the same as the input schema, but with all
     *         name-only record schemas resolved to full schemas (i.e. with fields set).
     *         If nothing in the given schema needs to be resolved, the same schema instance is returned,
     *         otherwise a new instance is returned.
     * @throws IllegalArgumentException if a name-only record refers to a record that is not known.
     */
    private Schema resolveSchema(final Schema schema, final Map<String, Schema> knownRecordSchemas) {
        Type schemaType = schema.getType();

        if (schemaType == Type.ARRAY) {
            Schema originalComponent = schema.getComponentSchema();
            Schema resolvedComponent = resolveSchema(originalComponent, knownRecordSchemas);
            return (resolvedComponent == originalComponent) ? schema : Schema.arrayOf(resolvedComponent);
        }

        if (schemaType == Type.MAP) {
            Map.Entry<Schema, Schema> mapEntry = schema.getMapSchema();
            Schema resolvedKey = resolveSchema(mapEntry.getKey(), knownRecordSchemas);
            Schema resolvedValue = resolveSchema(mapEntry.getValue(), knownRecordSchemas);
            boolean unchanged = resolvedKey == mapEntry.getKey() && resolvedValue == mapEntry.getValue();
            return unchanged ? schema : Schema.mapOf(resolvedKey, resolvedValue);
        }

        if (schemaType == Type.UNION) {
            ImmutableList.Builder<Schema> members = ImmutableList.builder();
            boolean anyResolved = false;
            for (Schema member : schema.getUnionSchemas()) {
                Schema resolvedMember = resolveSchema(member, knownRecordSchemas);
                anyResolved = anyResolved || (resolvedMember != member);
                members.add(resolvedMember);
            }
            return anyResolved ? Schema.unionOf(members.build()) : schema;
        }

        if (schemaType == Type.RECORD) {
            if (schema.fields != null) {
                // A concrete record: remember it so that later name-only references can be resolved
                knownRecordSchemas.put(schema.recordName, schema);
                return schema;
            }
            // A name-only record that must refer to a previously defined record
            Schema known = knownRecordSchemas.get(schema.recordName);
            Preconditions.checkArgument(known != null, "Undefined schema %s", schema.recordName);
            return known;
        }

        // All remaining types have nothing to resolve
        return schema;
    }

    /**
     * Helper method to encode this schema into json string.
     *
     * Simple types are encoded as the lower-cased type name surrounded by double quotes; every other
     * type is written through {@link SchemaTypeAdapter}.
     *
     * @return A json string representing this schema.
     */
    private String buildString() {
        if (type.isSimpleType()) {
            // Lower-case with a fixed locale: with the default locale the result depends on the JVM settings
            // (e.g. under a Turkish locale "INT" becomes "\u0131nt"), which would corrupt the schema string.
            return '"' + type.name().toLowerCase(Locale.ROOT) + '"';
        }
        StringBuilder builder = new StringBuilder();
        JsonWriter writer = new JsonWriter(CharStreams.asWriter(builder));
        try {
            new co.cask.cdap.internal.io.SchemaTypeAdapter().write(writer, this);
            writer.close();
            return builder.toString();
        } catch (IOException e) {
            // It should never throw IOException on the StringBuilder Writer, if it does, something very wrong.
            throw Throwables.propagate(e);
        }
    }
}