org.kitesdk.data.spi.Constraints.java Source code

Java tutorial

Introduction

Here is the source code for org.kitesdk.data.spi.Constraints.java

Source

/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.data.spi;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import javax.annotation.concurrent.Immutable;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Parser;
import org.apache.avro.reflect.ReflectData;
import org.apache.commons.codec.binary.Base64;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.DatasetIOException;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.spi.partition.CalendarFieldPartitioner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.google.common.base.Predicates.alwaysTrue;

/**
 * A set of simultaneous constraints.
 *
 * This class accumulates, combines, and manages a set of logical constraints.
 */
@Immutable
public class Constraints implements Serializable {

    // Version identifier used by Java serialization for this class.
    private static final long serialVersionUID = -155119355851820161L;

    private static final Logger LOG = LoggerFactory.getLogger(Constraints.class);

    // All state is transient: default serialization skips these fields and
    // writeObject/readObject write and restore them explicitly.
    private transient Schema schema;
    // May be null; the single-argument constructor passes null here.
    private transient PartitionStrategy strategy;
    // Predicates keyed by name; lookups in this class use both source field
    // names and partition field names as keys.
    private transient Map<String, Predicate> constraints;

    /**
     * Creates an empty constraint set for the given schema with no partition
     * strategy.
     *
     * @param schema the entity {@link Schema} the constraints apply to
     */
    public Constraints(Schema schema) {
        this(schema, null);
    }

    /**
     * Creates an empty constraint set for the given schema and strategy.
     *
     * @param schema the entity {@link Schema} the constraints apply to
     * @param strategy a {@link PartitionStrategy}, or null if there is none
     */
    public Constraints(Schema schema, PartitionStrategy strategy) {
        this(schema, strategy, ImmutableMap.<String, Predicate>of());
    }

    /**
     * Creates a constraint set that uses the given map as-is (no copy is made).
     */
    private Constraints(Schema schema, PartitionStrategy strategy, Map<String, Predicate> constraints) {
        this.constraints = constraints;
        this.strategy = strategy;
        this.schema = schema;
    }

    /**
     * Creates a constraint set from an existing map plus one entry; an entry
     * already stored under {@code name} is replaced by {@code predicate}.
     */
    private Constraints(Schema schema, PartitionStrategy strategy, Map<String, Predicate> constraints, String name,
            Predicate predicate) {
        // copy into a mutable map first so an existing entry can be overwritten
        Map<String, Predicate> updated = Maps.newHashMap(constraints);
        updated.put(name, predicate);
        this.constraints = ImmutableMap.copyOf(updated);
        this.strategy = strategy;
        this.schema = schema;
    }

    /**
     * Returns a copy of this constraint set that uses the given strategy; the
     * schema and the constraint map are carried over unchanged.
     */
    @VisibleForTesting
    Constraints partitionedBy(PartitionStrategy strategy) {
        // the parameter shadows the field, so the other fields are qualified
        final Constraints repartitioned = new Constraints(this.schema, strategy, this.constraints);
        return repartitioned;
    }

    /**
     * Builds a {@link Predicate} that tests entity objects against every
     * constraint in this set.
     *
     * @param <E> The type of entities to be matched
     * @return a Predicate to test if entity objects satisfy this constraint set
     */
    public <E> Predicate<E> toEntityPredicate() {
        return Constraints.<E>entityPredicate(this.constraints, this.schema, this.strategy);
    }

    /**
     * Builds a {@link Predicate} for testing entity objects that are stored
     * under the given {@link StorageKey}. Constraints that the key already
     * satisfies are dropped before the predicate is built.
     *
     * @param <E> The type of entities to be matched
     * @param key a StorageKey for entities tested with the Predicate; when
     *            null, this behaves like {@link #toEntityPredicate()}
     * @return a Predicate to test if entity objects satisfy this constraint set
     */
    public <E> Predicate<E> toEntityPredicate(StorageKey key) {
        if (key == null) {
            return toEntityPredicate();
        }

        Map<String, Predicate> remaining = minimizeFor(key);
        if (!remaining.isEmpty()) {
            return entityPredicate(remaining, schema, strategy);
        }
        // the key satisfies everything, so nothing is left to check per entity
        return alwaysTrue();
    }

    /**
     * Returns the constraints that are not already guaranteed by the given
     * {@link StorageKey}.
     *
     * A constraint is dropped when its predicate accepts the key's partition
     * value directly, when the partitioner's strict projection of a source
     * predicate accepts the partition value, or when the strict time-domain
     * projection of a source predicate accepts the whole key.
     *
     * @param key a StorageKey; its own partition strategy is used
     * @return an immutable map of the constraints that still must be checked
     */
    @VisibleForTesting
    @SuppressWarnings("unchecked")
    Map<String, Predicate> minimizeFor(StorageKey key) {
        Map<String, Predicate> unsatisfied = Maps.newHashMap(constraints);
        // the key's strategy is used here, not this object's strategy field
        PartitionStrategy keyStrategy = key.getPartitionStrategy();
        Set<String> timeFields = Sets.newHashSet();
        int i = 0;
        for (FieldPartitioner fp : keyStrategy.getFieldPartitioners()) {
            // a predicate stored under the partition name applies to the key value as-is
            String partition = fp.getName();
            Predicate partitionPredicate = unsatisfied.get(partition);
            if (partitionPredicate != null && partitionPredicate.apply(key.get(i))) {
                unsatisfied.remove(partition);
                LOG.debug("removing {} satisfied by {}", partition, key.get(i));
            }

            String source = fp.getSourceName();
            if (fp instanceof CalendarFieldPartitioner) {
                // keep track of time fields to consider
                timeFields.add(source);
            }
            // remove the field if it is satisfied by the StorageKey
            Predicate original = unsatisfied.get(source);
            if (original != null) {
                Predicate isSatisfiedBy = fp.projectStrict(original);
                LOG.debug("original: {}, strict: {}", original, isSatisfiedBy);
                // a null strict projection means nothing can be concluded from this partition
                if ((isSatisfiedBy != null) && isSatisfiedBy.apply(key.get(i))) {
                    LOG.debug("removing {} satisfied by {}", source, key.get(i));
                    unsatisfied.remove(source);
                }
            }
            i += 1;
        }
        // remove fields satisfied by the time predicates
        for (String timeField : timeFields) {
            Predicate<Long> original = unsatisfied.get(timeField);
            if (original != null) {
                Predicate<Marker> isSatisfiedBy = TimeDomain.get(keyStrategy, timeField).projectStrict(original);
                LOG.debug("original: {}, strict: {}", original, isSatisfiedBy);
                if ((isSatisfiedBy != null) && isSatisfiedBy.apply(key)) {
                    LOG.debug("removing {} satisfied by {}", timeField, key);
                    unsatisfied.remove(timeField);
                }
            }
        }
        return ImmutableMap.copyOf(unsatisfied);
    }

    /**
     * Builds a {@link Predicate} that tests {@link StorageKey} objects.
     *
     * A {@code StorageKey} that matches the predicate <em>may</em> represent a
     * partition holding entities that match this set of constraints. A key that
     * does not match cannot be responsible for any matching entity.
     *
     * @return a Predicate for testing StorageKey objects
     * @throws NullPointerException if no partition strategy is defined
     */
    public Predicate<StorageKey> toKeyPredicate() {
        final PartitionStrategy required = Preconditions.checkNotNull(strategy,
                "Cannot produce a key predicate without a partition strategy");
        return new KeyPredicate(constraints, required);
    }

    /**
     * Builds a set of {@link MarkerRange} objects covering the possible
     * {@link StorageKey} partitions for this constraint set. A
     * {@code StorageKey} that falls outside every returned range identifies a
     * partition that cannot contain entities satisfying this constraint set.
     *
     * @return an Iterable of MarkerRange
     * @throws NullPointerException if no partition strategy is defined
     */
    public Iterable<MarkerRange> toKeyRanges() {
        final PartitionStrategy required = Preconditions.checkNotNull(strategy,
                "Cannot produce key ranges without a partition strategy");
        return new KeyRangeIterable(required, constraints);
    }

    /**
     * If this returns true, the entities selected by this set of constraints
     * align to partition boundaries.
     *
     * For example, for a partition strategy [hash(num), identity(num)],
     * any constraint on the "num" field will be correctly enforced by the
     * partition predicate for this constraint set. However, a "color" field
     * wouldn't be satisfied by considering partition values alone and would
     * require further checks.
     *
     * An alternate explanation: This returns whether the key {@link Predicate}
     * from {@link #toKeyPredicate()} is equivalent to this set of constraints
     * under the given {@link PartitionStrategy}. The key predicate must accept a
     * key if that key's partition might include entities matched by this
     * constraint set. If this method returns true, then all entities in the
     * partitions it matches are guaranteed to match this constraint set. So, the
     * partitions are equivalent to the constraints.
     *
     * A constraint is considered satisfied as soon as any one of the field
     * partitioners for its source field (or the combined time domain of its
     * calendar partitioners) satisfies it, regardless of the order in which
     * the partitioners are visited.
     *
     * @return true if this constraint set is satisfied by partitioning
     * @throws NullPointerException if no partition strategy is defined
     */
    @SuppressWarnings("unchecked")
    public boolean alignedWithBoundaries() {
        Preconditions.checkNotNull(strategy, "Cannot check boundary alignment without a partition strategy");
        // group the partitioners by the source field they are derived from
        Multimap<String, FieldPartitioner> partitioners = HashMultimap.create();
        for (FieldPartitioner fp : strategy.getFieldPartitioners()) {
            partitioners.put(fp.getSourceName(), fp);
        }

        // The key predicate is equivalent to a constraint set when the permissive
        // projection for each predicate can be used in its place. This happens if
        // fp.project(predicate) == fp.projectStrict(predicate):
        //
        // let D = some value domain
        // let pred : D -> {0, 1}
        // let D_{pred} = {x \in D | pred(x) == 1} (a subset of D selected by pred)
        //
        // let fp : D -> S (also a value domain)
        // let fp.strict(pred) = pred_{fp.strict} : S -> {0, 1}    (project strict)
        //      s.t. pred_{fp.strict}(fp(x)) == 1 => pred(x) == 1
        // let fp.project(pred) = pred_{fp.project} : S -> {0, 1}         (project)
        //      s.t. pred(x) == 1 => pred_{fp.project}(fp(x)) == 1
        //
        // lemma. {x \in D | pred_{fp.strict}(fp(x))} is a subset of D_{pred}
        //     pred_{fp.strict}(fp(x)) == 1 => pred(x) == 1 => x \in D_{pred}
        //
        // theorem. (pred_{fp.project}(s) => pred_{fp.strict}(s)) =>
        //                D_{pred} == {x \in D | pred_{fp.strict}(fp(x))}
        //
        //  => let x \in D_{pred}. then pred_{fp.project}(fp(x)) == 1 by def
        //                         then pred_{fp.strict}(fp(x)) == 1 by premise
        //     therefore D_{pred} \subsetOf {x \in D | pred_{fp.strict}(fp(x))}
        //  <= by previous lemma
        //
        // Note: if projectStrict is too conservative or project is too permissive,
        // then this logic cannot determine that the original predicate is
        // satisfied
        for (Map.Entry<String, Predicate> entry : constraints.entrySet()) {
            Collection<FieldPartitioner> fps = partitioners.get(entry.getKey());
            if (fps.isEmpty()) {
                LOG.debug("No field partitioners for key {}", entry.getKey());
                return false;
            }

            Predicate predicate = entry.getValue();
            if (predicate instanceof Predicates.Exists) {
                // exists predicates are not checked against the projections
                continue;
            }

            boolean satisfied = false;
            // the time domain covers every calendar partitioner for this field,
            // so it only needs to be evaluated once per constraint
            boolean timeDomainChecked = false;
            for (FieldPartitioner fp : fps) {
                Predicate strict;
                Predicate permissive;
                if (fp instanceof CalendarFieldPartitioner) {
                    if (timeDomainChecked) {
                        continue;
                    }
                    timeDomainChecked = true;
                    TimeDomain domain = TimeDomain.get(strategy, entry.getKey());
                    strict = domain.projectStrict(predicate);
                    permissive = domain.project(predicate);
                    LOG.debug("Time predicate strict: {}", strict);
                    LOG.debug("Time predicate permissive: {}", permissive);
                } else {
                    strict = fp.projectStrict(predicate);
                    permissive = fp.project(predicate);
                }
                // stop only when satisfied: the multimap's iteration order is
                // unspecified, so a later partitioner must still get its chance
                if (strict != null && strict.equals(permissive)) {
                    satisfied = true;
                    break;
                }
            }
            // this predicate cannot be satisfied by the partition information
            if (!satisfied) {
                LOG.debug("Predicate not satisfied: {}", predicate);
                return false;
            }
        }

        return true;
    }

    /**
     * Returns a constraint set that additionally restricts {@code name} to the
     * given values, or, when no values are given, requires only that
     * {@code name} exists.
     *
     * @param name a String field name
     * @param values the allowed values; may be empty
     * @return a new Constraints, or this instance when no values are given and
     *         a constraint for {@code name} is already present
     * @throws IllegalArgumentException if a value is rejected by the existing
     *         predicate for {@code name}
     */
    @SuppressWarnings("unchecked")
    public Constraints with(String name, Object... values) {
        SchemaUtil.checkTypeConsistency(schema, strategy, name, values);

        if (values.length == 0) {
            if (constraints.containsKey(name)) {
                // satisfied by an existing constraint
                return this;
            }
            // no other constraint => add the exists
            return new Constraints(schema, strategy, constraints, name, Predicates.exists());
        }

        checkContained(name, values);
        // this is the most specific constraint and is idempotent under "and"
        return new Constraints(schema, strategy, constraints, name, Predicates.in(values));
    }

    /**
     * Returns a constraint set that additionally restricts {@code name} to
     * values that are at least {@code value}.
     *
     * @param name a String field name
     * @param value the bound to add
     * @return a new Constraints with the bound combined into any existing
     *         predicate for {@code name}
     * @throws IllegalArgumentException if {@code value} is rejected by the
     *         existing predicate for {@code name}
     */
    public Constraints from(String name, Comparable value) {
        SchemaUtil.checkTypeConsistency(schema, strategy, name, value);
        checkContained(name, value);

        Predicate updated = Ranges.atLeast(value);
        if (constraints.containsKey(name)) {
            updated = combine(constraints.get(name), updated);
        }
        return new Constraints(schema, strategy, constraints, name, updated);
    }

    /**
     * Returns a constraint set that additionally restricts {@code name} to
     * values that are greater than {@code value}.
     *
     * @param name a String field name
     * @param value the bound to add
     * @return a new Constraints with the bound combined into any existing
     *         predicate for {@code name}
     * @throws IllegalArgumentException if {@code value} is rejected by the
     *         existing predicate for {@code name}
     */
    public Constraints fromAfter(String name, Comparable value) {
        SchemaUtil.checkTypeConsistency(schema, strategy, name, value);
        checkContained(name, value);

        Predicate updated = Ranges.greaterThan(value);
        if (constraints.containsKey(name)) {
            updated = combine(constraints.get(name), updated);
        }
        return new Constraints(schema, strategy, constraints, name, updated);
    }

    /**
     * Returns a constraint set that additionally restricts {@code name} to
     * values that are at most {@code value}.
     *
     * @param name a String field name
     * @param value the bound to add
     * @return a new Constraints with the bound combined into any existing
     *         predicate for {@code name}
     * @throws IllegalArgumentException if {@code value} is rejected by the
     *         existing predicate for {@code name}
     */
    public Constraints to(String name, Comparable value) {
        SchemaUtil.checkTypeConsistency(schema, strategy, name, value);
        checkContained(name, value);

        Predicate updated = Ranges.atMost(value);
        if (constraints.containsKey(name)) {
            updated = combine(constraints.get(name), updated);
        }
        return new Constraints(schema, strategy, constraints, name, updated);
    }

    /**
     * Returns a constraint set that additionally restricts {@code name} to
     * values that are less than {@code value}.
     *
     * @param name a String field name
     * @param value the bound to add
     * @return a new Constraints with the bound combined into any existing
     *         predicate for {@code name}
     * @throws IllegalArgumentException if {@code value} is rejected by the
     *         existing predicate for {@code name}
     */
    public Constraints toBefore(String name, Comparable value) {
        SchemaUtil.checkTypeConsistency(schema, strategy, name, value);
        checkContained(name, value);

        Predicate updated = Ranges.lessThan(value);
        if (constraints.containsKey(name)) {
            updated = combine(constraints.get(name), updated);
        }
        return new Constraints(schema, strategy, constraints, name, updated);
    }

    /**
     * Looks up the predicate stored for a named field. Intended for tests.
     *
     * @param name a String field name
     * @return a Predicate for the given field, or null if none is set
     */
    @VisibleForTesting
    Predicate get(String name) {
        final Predicate stored = this.constraints.get(name);
        return stored;
    }

    /**
     * Verifies that every given value is accepted by the predicate currently
     * stored for {@code name}; does nothing when no predicate is stored.
     *
     * @param name a String field name
     * @param values the values to check
     * @throws IllegalArgumentException if the current predicate rejects a value
     */
    @SuppressWarnings("unchecked")
    private void checkContained(String name, Object... values) {
        if (!constraints.containsKey(name)) {
            return;
        }
        Predicate current = constraints.get(name);
        for (Object candidate : values) {
            Preconditions.checkArgument(current.apply(candidate), "%s does not match %s", current, candidate);
        }
    }

    /**
     * Two constraint sets are equal when they have the same class, equal
     * constraint maps and equal schemas. The partition strategy is not compared.
     */
    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null || o.getClass() != getClass()) {
            return false;
        }

        Constraints other = (Constraints) o;
        if (!Objects.equal(constraints, other.constraints)) {
            return false;
        }
        return Objects.equal(schema, other.schema);
    }

    /**
     * Hashes the schema and the constraint map, the same fields that
     * {@code equals} compares.
     */
    @Override
    public int hashCode() {
        final int hash = Objects.hashCode(this.schema, this.constraints);
        return hash;
    }

    /**
     * Renders this object using the string form of the constraint map.
     */
    @Override
    public String toString() {
        final Objects.ToStringHelper helper = Objects.toStringHelper(this);
        helper.addValue(constraints);
        return helper.toString();
    }

    /**
     * Writes out the {@link Constraints} using Java serialization.
     *
     * The stream layout is: default fields, the schema as a string, the
     * partition strategy as a string (empty when there is no strategy), then
     * the constraints as written by {@code ConstraintsSerialization}.
     *
     * NOTE(review): {@code writeUTF} is limited to 65535 encoded bytes, so a
     * very large schema or strategy string would fail here.
     *
     * @param out the stream to write to
     * @throws IOException if writing to the stream fails
     */
    private void writeObject(java.io.ObjectOutputStream out) throws IOException {
        out.defaultWriteObject();

        final String schemaString = schema.toString();
        out.writeUTF(schemaString);

        // an empty string marks a missing strategy for readObject
        final String strategyString = (strategy == null) ? "" : strategy.toString();
        out.writeUTF(strategyString);

        ConstraintsSerialization.writeConstraints(schema, strategy, constraints, out);
    }

    /**
     * Restores the {@link Constraints} from the provided {@code in} stream,
     * reading the values in the order {@code writeObject} wrote them.
     *
     * @param in the stream from which to deserialize the object.
     * @throws IOException error deserializing the {@link Constraints}
     * @throws ClassNotFoundException Unable to properly access values inside the {@link Constraints}
     */
    private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();

        final String schemaString = in.readUTF();
        schema = new Parser().parse(schemaString);

        // an empty string means no strategy was written; the field stays null
        final String strategyString = in.readUTF();
        if (strategyString.length() > 0) {
            strategy = PartitionStrategyParser.parse(strategyString);
        }

        constraints = ImmutableMap.copyOf(ConstraintsSerialization.readConstraints(schema, strategy, in));
    }

    /**
     * Serializes a constraint set with Java serialization and encodes the
     * resulting bytes as a Base64 string.
     *
     * @param constraints the Constraints to serialize
     * @return the Base64-encoded serialized form
     * @throws DatasetIOException if writing the object fails
     */
    public static String serialize(Constraints constraints) {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try {
            ObjectOutputStream objectOut = new ObjectOutputStream(buffer);
            objectOut.writeObject(constraints);
            // closing flushes the object stream into the buffer
            objectOut.close();
        } catch (IOException e) {
            throw new DatasetIOException("Cannot serialize constraints " + constraints, e);
        }
        return Base64.encodeBase64String(buffer.toByteArray());
    }

    /**
     * Decodes a Base64 string and reads a constraint set from it with Java
     * serialization; the inverse of {@code serialize}.
     *
     * NOTE(review): this uses Java-native deserialization, so the input should
     * come from a trusted source.
     *
     * @param s the Base64-encoded serialized form
     * @return the deserialized Constraints
     * @throws DatasetIOException if reading the object fails
     * @throws DatasetException if a class needed for deserialization is missing
     */
    public static Constraints deserialize(String s) {
        final byte[] decoded = Base64.decodeBase64(s);
        try {
            ObjectInputStream objectIn = new ObjectInputStream(new ByteArrayInputStream(decoded));
            Object restored = objectIn.readObject();
            return (Constraints) restored;
        } catch (IOException e) {
            throw new DatasetIOException("Cannot deserialize constraints", e);
        } catch (ClassNotFoundException e) {
            throw new DatasetException("Cannot deserialize constraints", e);
        }
    }

    /**
     * Combines two predicates into one that requires both.
     *
     * The cases are checked in order: identical references, a null side, an
     * exists predicate on either side, an in-set on either side (filtered by
     * the other predicate), two ranges (intersected), and finally a generic
     * "and" of the two.
     *
     * @param left a Predicate, possibly null
     * @param right a Predicate, possibly null
     * @return the combined Predicate; null only if both arguments are null
     */
    @SuppressWarnings("unchecked")
    static Predicate combine(Predicate left, Predicate right) {
        if (left == right) {
            return left;
        }
        if (left == null) {
            // right must be non-null because the references differ
            return right;
        }
        if (right == null || right instanceof Predicates.Exists) {
            // left is non-null, which satisfies exists
            return left;
        }
        if (left instanceof Predicates.Exists) {
            // right is non-null, which satisfies exists
            return right;
        }
        if (left instanceof Predicates.In) {
            Predicates.In leftSet = (Predicates.In) left;
            return leftSet.filter(right);
        }
        if (right instanceof Predicates.In) {
            Predicates.In rightSet = (Predicates.In) right;
            return rightSet.filter(left);
        }
        if (left instanceof Range && right instanceof Range) {
            Range leftRange = (Range) left;
            return leftRange.intersection((Range) right);
        }
        return com.google.common.base.Predicates.and(left, right);
    }

    /**
     * Creates an entity predicate for the given predicates. Schemas that are
     * not records get a predicate that accepts everything.
     */
    private static <E> Predicate<E> entityPredicate(Map<String, Predicate> predicates, Schema schema,
            PartitionStrategy strategy) {
        final boolean isRecord = (schema.getType() == Schema.Type.RECORD);
        if (isRecord) {
            return new EntityPredicate<E>(predicates, schema, strategy);
        }
        return alwaysTrue();
    }

    /**
     * Reports whether this constraint set is empty.
     *
     * @return {@code true} if there are no constraints, {@code false} otherwise
     */
    public boolean isUnbounded() {
        final boolean noConstraints = this.constraints.isEmpty();
        return noConstraints;
    }

    /**
     * A {@link Predicate} for testing entities against a set of predicates.
     *
     * @param <E> The type of entities this predicate tests
     */
    private static class EntityPredicate<E> implements Predicate<E> {
        private final List<Map.Entry<Schema.Field, Predicate>> predicatesByField;

        @SuppressWarnings("unchecked")
        public EntityPredicate(Map<String, Predicate> predicates, Schema schema, PartitionStrategy strategy) {

            List<Schema.Field> fields = schema.getFields();
            Map<Schema.Field, Predicate> predicateMap = Maps.newHashMap();

            // in the case of identical source and partition names, the predicate
            // will be applied for both source and partition values.

            for (Schema.Field field : fields) {
                Predicate sourcePredicate = predicates.get(field.name());
                if (sourcePredicate != null) {
                    predicateMap.put(field, sourcePredicate);
                }
            }

            if (strategy != null) {
                // there could be partition predicates to add
                for (FieldPartitioner fp : strategy.getFieldPartitioners()) {
                    Predicate partitionPredicate = predicates.get(fp.getName());
                    if (partitionPredicate != null) {
                        Predicate transformPredicate = new TransformPredicate(fp, partitionPredicate);
                        Schema.Field field = schema.getField(fp.getSourceName());
                        Predicate sourcePredicate = predicateMap.get(field);
                        if (sourcePredicate != null) {
                            // combine the source and the transform-wrapped predicates
                            predicateMap.put(field, combine(sourcePredicate, transformPredicate));
                        } else {
                            predicateMap.put(field, transformPredicate);
                        }
                    }
                }
            }

            this.predicatesByField = ImmutableList.copyOf(predicateMap.entrySet());
        }

        @Override
        @SuppressWarnings("unchecked")
        public boolean apply(@Nullable E entity) {
            if (entity == null) {
                return false;
            }

            // check each constraint and fail immediately
            for (Map.Entry<Schema.Field, Predicate> entry : predicatesByField) {
                Object eValue = get(entity, entry.getKey());
                if (!entry.getValue().apply(eValue)) {
                    return false;
                }
            }

            // all constraints were satisfied
            return true;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            EntityPredicate other = (EntityPredicate) obj;
            return Objects.equal(predicatesByField, other.predicatesByField);
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(predicatesByField);
        }

        @Override
        public String toString() {
            return Objects.toStringHelper(this).add("predicates", predicatesByField).toString();
        }
    }

    /**
     * Reads a field's value from an entity through Avro's reflect data model.
     *
     * @param entity the object to read from
     * @param field the schema field whose name and position identify the value
     * @return the value of the field in the entity
     */
    private static Object get(Object entity, Schema.Field field) {
        // TODO: this should use the correct Avro data model, not just reflect
        // if this fails to find the field, it throws AvroRuntimeException
        final ReflectData model = ReflectData.get();
        return model.getField(entity, field.name(), field.pos());
    }

    /**
     * A {@link Predicate} for testing a {@link StorageKey} against a set of
     * predicates.
     *
     * One predicate is kept per partition field, in partitioner order, plus
     * one time-domain predicate per source field that has a calendar
     * partitioner. A key is accepted only if all of them accept it.
     */
    private static class KeyPredicate implements Predicate<StorageKey> {
        // one entry per field partitioner, at the partitioner's index
        private final List<Predicate> partitionPredicates;
        // applied to the whole key rather than to a single partition value
        private final List<Predicate<Marker>> timePredicates;

        /**
         * @param predicates constraints keyed by source or partition field name
         * @param strategy the partition strategy; must not be null
         * @throws NullPointerException if {@code strategy} is null
         */
        @SuppressWarnings("unchecked")
        private KeyPredicate(Map<String, Predicate> predicates, PartitionStrategy strategy) {
            Preconditions.checkNotNull(strategy, "Cannot produce KeyPredicate without a PartitionStrategy");

            // in the case of identical source and partition names, there is only one
            // predicate and it is used like normal. the only time this conflicts is
            // when the source and predicate name for a single field are the same, in
            // which case the result will be the projected predicate combined with
            // itself. usually the function is identity when this happens and there is
            // no problem because of the combine identity check.

            List<FieldPartitioner> partitioners = strategy.getFieldPartitioners();
            Predicate[] preds = new Predicate[partitioners.size()];

            Map<String, Predicate> timeFields = Maps.newHashMap();
            for (int i = 0; i < preds.length; i += 1) {
                FieldPartitioner fp = partitioners.get(i);
                Predicate sourcePredicate = predicates.get(fp.getSourceName());
                if (sourcePredicate != null) {
                    // a null projection leaves this slot empty for now
                    Predicate projectedPredicate = fp.project(sourcePredicate);
                    if (projectedPredicate != null) {
                        preds[i] = projectedPredicate;
                    }
                    if (fp instanceof CalendarFieldPartitioner) {
                        timeFields.put(fp.getSourceName(), sourcePredicate);
                    }
                }

                Predicate partitionPredicate = predicates.get(fp.getName());
                if (preds[i] != null) {
                    if (partitionPredicate != null) {
                        preds[i] = combine(partitionPredicate, preds[i]);
                    }
                } else {
                    if (partitionPredicate != null) {
                        preds[i] = partitionPredicate;
                    } else {
                        // nothing constrains this partition field
                        preds[i] = alwaysTrue();
                    }
                }
            }
            this.partitionPredicates = ImmutableList.copyOf(preds);

            List<Predicate<Marker>> timePreds = Lists.newArrayList();
            for (Map.Entry<String, Predicate> entry : timeFields.entrySet()) {
                Predicate<Marker> timePred = TimeDomain.get(strategy, entry.getKey()).project(entry.getValue());
                // NOTE(review): assumes TimeDomain.project can return null like
                // FieldPartitioner.project above. ImmutableList rejects null
                // elements, so a null projection is skipped, which leaves the
                // key unconstrained by that time field.
                if (timePred != null) {
                    timePreds.add(timePred);
                }
            }
            this.timePredicates = ImmutableList.copyOf(timePreds);
        }

        /**
         * @param key the StorageKey to test; null is never accepted
         * @return true if every partition and time predicate accepts the key
         */
        @Override
        @SuppressWarnings("unchecked")
        public boolean apply(StorageKey key) {
            if (key == null) {
                return false;
            }

            // this is fail-fast: if the key fails a constraint, then drop it
            for (int i = 0; i < partitionPredicates.size(); i += 1) {
                Object pValue = key.get(i);
                if (!partitionPredicates.get(i).apply(pValue)) {
                    return false;
                }
            }

            for (Predicate<Marker> timePredicate : timePredicates) {
                if (!timePredicate.apply(key)) {
                    return false;
                }
            }

            // if we made it this far, everything passed
            return true;
        }

        @Override
        public String toString() {
            return Objects.toStringHelper(this).add("predicates", partitionPredicates)
                    .add("timePredicates", timePredicates).toString();
        }
    }

    /**
     * A Predicate that returns the result of transforming its input and applying
     * a predicate to the transformed value.
     * @param <S> The type of input to this predicate
     * @param <T> The type of input to the wrapped predicate.
     */
    private static class TransformPredicate<S, T> implements Predicate<S> {
        private final Function<S, T> function;
        private final Predicate<T> predicate;

        public TransformPredicate(Function<S, T> function, Predicate<T> predicate) {
            this.function = function;
            this.predicate = predicate;
        }

        @Override
        public boolean apply(@Nullable S input) {
            return predicate.apply(function.apply(input));
        }

        @Override
        public String toString() {
            return Objects.toStringHelper(this).add("function", function).add("predicate", predicate).toString();
        }
    }
}