com.cinchapi.concourse.plugin.data.TrackingMultimap.java Source code

Introduction

Here is the source code for com.cinchapi.concourse.plugin.data.TrackingMultimap.java
Source

/*
 * Copyright (c) 2013-2016 Cinchapi Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.concourse.plugin.data;

import java.util.AbstractMap;
import java.util.AbstractSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import javax.annotation.concurrent.NotThreadSafe;

import com.cinchapi.concourse.Link;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.zaxxer.sparsebits.SparseBitSet;

/**
 * <p>
 * An implementation of a {@code Map} that characterizes entries on the fly as
 * they are added or removed. This is used to characterize user data as it is
 * being entered, so that the visualization engine can query this map to
 * immediately view data characterization in constant time.
 * </p>
 * <p>
 * Apart from {@link #put(Object, Set)}, {@link #remove(Object)},
 * {@link #putAll(Map)}, and {@link #clear()}, all methods are delegated to an
 * internal map. The four aforementioned methods are overridden in terms of
 * functionality to characterize the entries in the map before performing the
 * original intended function.
 * </p>
 * <p>
 * {@link TrackingMultimap} is parametrized by type-parameters K and V, but the
 * underlying internal map is in the form {@code Map<K, Set<V>>}. This is to
 * comply with the format of data, which is either a Map from Strings (keys) to
 * Objects (values), or Objects (values) to Longs (records).
 * </p>
 * 
 * @author Jeff Nelson
 */
// TODO talk about what is tracked for keys and what is tracked for values
@NotThreadSafe
public abstract class TrackingMultimap<K, V> extends AbstractMap<K, Set<V>> {

    /**
     * Return the correct {@link DataType} for the {@code clazz}.
     * 
     * @param clazz the {@link Class} to translate
     * @return the correct {@link DataType}
     */
    private static DataType getDataTypeForClass(Class<?> clazz) {
        if (clazz == Link.class) {
            return DataType.LINK;
        } else if ((Number.class.isAssignableFrom(clazz) || OTHER_NUMBER_CLASSES.contains(clazz))) {
            return DataType.NUMBER;
        } else if (clazz == String.class) {
            return DataType.STRING;
        } else if (clazz == Boolean.class || clazz == boolean.class) {
            return DataType.BOOLEAN;
        } else {
            return DataType.UNKNOWN;
        }
    }

    /**
     * A collection of classes that don't extend {@link Number} should be
     * considered {@link DataType#NUMBER numerical}.
     */
    private static Set<Class<?>> OTHER_NUMBER_CLASSES = Sets.newIdentityHashSet();
    static {
        OTHER_NUMBER_CLASSES.add(int.class);
        OTHER_NUMBER_CLASSES.add(long.class);
        OTHER_NUMBER_CLASSES.add(float.class);
        OTHER_NUMBER_CLASSES.add(double.class);
        OTHER_NUMBER_CLASSES.add(short.class);
        OTHER_NUMBER_CLASSES.add(byte.class);
    }

    /**
     * An internal map where the data is actually stored.
     */
    private Map<K, Set<V>> data;

    /**
     * A mapping from each of the {@link DataType data types} to the number of
     * stored keys that are characterized as such.
     */
    private final Map<DataType, AtomicInteger> keyTypes;

    /**
     * The total number of values (including duplicates) added across all the
     * keys.
     */
    private final AtomicLong totalValueCount;

    /**
     * The total number of unique values (e.g. excluding duplicates) that are
     * stored across all the keys.
     */
    private final AtomicLong uniqueValueCount;

    /**
     * An approximate cache of values stored across all the keys.
     * <p>
     * Whenever a value is added to the map, the bit for its
     * {@link Object#hashCode() hash code} is flipped to indicate that the value
     * is stored. However, hash codes are not guaranteed to be unique among
     * objects, so its necessary to look through all the values and test the
     * equality for a potential match to determine if an object is actually
     * contained or not.
     * </p>
     */
    private final SparseBitSet valueCache;

    /**
     * Construct a new instance.
     * 
     * @param delegate an {@link Map#isEmpty() empty} map
     */
    protected TrackingMultimap(Map<K, Set<V>> delegate) {
        Preconditions.checkState(delegate.isEmpty());
        this.data = delegate;
        this.keyTypes = Maps.newIdentityHashMap();
        for (DataType type : DataType.values()) {
            this.keyTypes.put(type, new AtomicInteger(0));
        }
        this.totalValueCount = new AtomicLong(0);
        this.uniqueValueCount = new AtomicLong(0);
        this.valueCache = new SparseBitSet();
    }

    /**
     * Returns whether the {@link TrackingMultimap} contains values of the
     * specified {@link DataType}.
     * 
     * @param type the {@link DataType} being queried
     * @return {@code true} if the {@code Map} contains this {@link DataType},
     *         false otherwise
     */
    public boolean containsDataType(DataType type) {
        return percentKeyDataType(type) > 0;
    }

    /**
     * Remove the association between {@code key} and {@code value} from the
     * map.
     * 
     * @param key the key
     * @param value the value
     * @return {@code true} if the association previously existed and is removed
     */
    public boolean delete(K key, V value) {
        Set<V> values = data.get(key);
        if (values != null && values.remove(value)) {
            if (values.isEmpty()) {
                data.remove(values);
            }
            return true;
        } else {
            return false;
        }
    }

    @Override
    public Set<Entry<K, Set<V>>> entrySet() {
        return data.entrySet();
    }

    @Override
    public Set<V> get(Object key) {
        return data.get(key);
    }

    /**
     * Return {@code true} if this map associates {@code value} with at least
     * one key.
     * <p>
     * This method is different from {@link #containsValue(Object)} because it
     * checks for values <strong>within</strong> the Sets that are mapped from
     * keys. Use the aforementioned if you need to check for the existence of an
     * entire Set as opposed to an individual value.
     * </p>
     * 
     * @param value the value to checks
     * @return {@code true} if the value is contained, {@code false} otherwise
     */
    public boolean hasValue(V value) {
        int hashCode = Math.abs(value.hashCode());
        if (valueCache.get(hashCode)) {
            for (Set<V> values : data.values()) {
                if (values.contains(value)) {
                    return true;
                }
            }
            return false;
        } else {
            return false;
        }
    }

    /**
     * Merge all the {@code values} into the set of values that is mapped from
     * {@code key}.
     * 
     * @param key the key
     * @param values the values to merge
     * @return all the values mapped from {@code key} after the merge
     */
    public Set<V> merge(K key, Set<V> values) {
        for (V value : values) {
            insert(key, value);
        }
        return get(key);
    }

    /**
     * Return the percent (between 0 and 1) of keys that are an instance of the
     * specified {@link DataType type}.
     * 
     * @param type the {@link DataType} of interest
     * @return the percent of keys of the {@code type}
     */
    public double percentKeyDataType(DataType type) {
        return ((double) keyTypes.get(type).get()) / totalValueCount.get();
    }

    /**
     * Determines the proportion of occurrence of a particular key. This is
     * merely the frequency of that key divided by the total number of key
     * frequencies.
     * 
     * @param element the key for which the proportion is being sought
     * @return the proportion of the key
     */
    public double proportion(K element) {
        double frequency = data.get(element).size();
        return frequency / totalValueCount.get();
    }

    /**
     * <p>
     * <strong>NOTE:</strong> This implementation will replace all the existing
     * values mapped from {@code key} with those specified in the {@code value}.
     * If you want "merge-like" functionality call the {@link #merge(Set)}
     * method.
     * </p>
     * {@inheritDoc}
     */
    @Override
    public Set<V> put(K key, Set<V> value) {
        Set<V> stored = data.get(key);
        if (stored == null) {
            stored = new ValueSetWrapper(key);
            data.put(key, stored);
        }
        for (V element : stored) {
            delete(key, element);
        }
        for (V element : value) {
            insert(key, element);
        }
        return stored;
    }

    /**
     * Add a new association between {@code key} and {@code value} to the map if
     * it doesn't already exist.
     * 
     * @param key the key
     * @param value the value
     * @return {@code true} if the association didn't previously exist and is
     *         not added
     */
    public boolean insert(K key, V value) {
        Set<V> values = data.get(key);
        if (values == null) {
            values = new ValueSetWrapper(key);
            data.put(key, values);
        }
        if (values.add(value)) {
            return true;
        } else {
            return false;
        }
    }

    @SuppressWarnings("unchecked")
    @Override
    public Set<V> remove(Object key) {
        Set<V> stored = data.get(key);
        if (stored != null) {
            for (V element : stored) {
                delete((K) key, element); // type cast is valid because the
                                          // presence of elements over which to
                                          // iterate ensures that #put(K key, V
                                          // value) was called, which performs
                                          // type checking
            }
        }
        return stored;

    }

    /**
     * Calculates the uniqueness of the data by summing the squares of the
     * proportions of each key within the {@link #keySet() key set},
     * determining the square root of the sum, and subtracting it from 1. This
     * always results in a number between 0 and 1.
     * <p>
     * For datasets with a large number of distinct values appearing in
     * relatively similar frequency, this function returns a relatively high
     * number, since there are many unique values. Mathematically, each
     * contributes a small amount to the proportion, so the square root term is
     * small, returning a large end result.
     * </p>
     * <p>
     * Conversely, for datasets with a few dominating values, this function
     * returns a fairly low number. This is because the higher proportions from
     * the dominating values contribute more heavily towards the sum of squares.
     * The square root is therefore higher, and when subtracted from 1, returns
     * a lower number.
     * </p>
     * 
     * @return the uniqueness of the data, on a scale from 0 to 1.
     */
    public double uniqueness() {
        double sumOfSquares = 0;
        for (K key : this.keySet()) {
            sumOfSquares += Math.pow(proportion(key), 2);
        }
        return 1 - Math.sqrt(sumOfSquares);
    }

    /**
     * Determines how many unique values exist within the {@link Map} and
     * returns the appropriate {@link VariableType}.
     * 
     * The three possible return types are:
     * <ol>
     * <li><strong>DICHOTOMOUS</strong>: if there are 1 or 2 unique values</li>
     * <li><strong>NOMINAL</strong>: if the number of unique values is greater
     * than 2 and less than or equal to 12</li>
     * <li><strong>INTERVAL</strong>: if there are more than 12 unique values</li>
     * </ol>
     * 
     * @return
     */
    public VariableType variableType() {
        // NOTE: The boundary between nominal and interval is arbitrary, and may
        // require tweaking since it is a heuristic model.
        if (data.keySet().size() <= 2) {
            return VariableType.DICHOTOMOUS;
        } else if (data.keySet().size() <= 12) {
            return VariableType.NOMINAL;
        } else {
            return VariableType.INTERVAL;
        }
    }

    /**
     * Return a new {@link Set} (of the appropriate type) to use for storing the
     * values that are mapped from a key.
     * 
     * @return a new {@link Set}
     */
    protected abstract Set<V> createValueSet();

    /**
     * A broad classification of objects that describes the nature of the data.
     * 
     * @author Jeff Nelson
     */
    public static enum DataType {
        BOOLEAN, LINK, NUMBER, STRING, UNKNOWN;
    }

    /**
     * A classification of objects that describes how data is categorized
     */
    public static enum VariableType {
        DICHOTOMOUS, INTERVAL, NOMINAL;
    }

    /**
     * An internal wrapper around a Set returned from the
     * {@link #createValueSet()} method.
     * <p>
     * The wrapper is responsible for tracking stats for the individual set and
     * updating the appropriate variables of the outer class. This ensures that
     * the caller can interact with individual value sets without breaking
     * tracking semantics.
     * </p>
     * 
     * @author Jeff Nelson
     */
    private class ValueSetWrapper extends AbstractSet<V> {

        /**
         * The wrapped set that actually stores the data.
         */
        private final Set<V> values = createValueSet();

        /**
         * The key from which this {@link Set} is mapped in the outer
         * TrackingMultimap.
         */
        private K key;

        /**
         * Construct a new instance.
         * 
         * @param key
         */
        ValueSetWrapper(K key) {
            this.key = key;
        }

        @Override
        public boolean add(V element) {
            boolean contained = hasValue(element);
            if (values.add(element)) {
                totalValueCount.incrementAndGet();
                DataType keyType = getDataTypeForClass(key.getClass());
                keyTypes.get(keyType).incrementAndGet();
                if (!contained) {
                    // The value was not previously contained, so we must update
                    // the number of unique values stored across all the keys.
                    uniqueValueCount.incrementAndGet();
                    valueCache.set(Math.abs(element.hashCode()));
                }
                return true;
            } else {
                return false;
            }
        }

        @Override
        public Iterator<V> iterator() {

            return new Iterator<V>() {

                /**
                 * The delegate iterator that controls state.
                 */
                private final Iterator<V> delegate = values.iterator();

                /**
                 * The last value returned from the {@link #next()} method.
                 */
                private V next = null;

                @Override
                public boolean hasNext() {
                    return delegate.hasNext();
                }

                @Override
                public V next() {
                    next = delegate.next();
                    return next;
                }

                @Override
                public void remove() {
                    ValueSetWrapper.this.remove(next);
                    next = null;
                }

            };
        }

        @SuppressWarnings("unchecked")
        @Override
        public boolean remove(Object element) {
            if (values.remove(element)) {
                totalValueCount.decrementAndGet();
                DataType keyType = getDataTypeForClass(key.getClass());
                keyTypes.get(keyType).decrementAndGet();
                boolean contained = hasValue((V) element);
                if (!contained) {
                    // Since the value is no longer "contained" we are free to
                    // decrement the number of unique values stored across all
                    // the keys
                    uniqueValueCount.decrementAndGet();
                }
                return true;
            } else {
                return false;
            }
        }

        @Override
        public int size() {
            return values.size();
        }
    }

}