org.deidentifier.arx.aggregates.StatisticsBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.deidentifier.arx.aggregates.StatisticsBuilder.java

Source

/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2016 Fabian Prasser, Florian Kohlmayer and contributors
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.aggregates;

import java.text.ParseException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.commons.math3.stat.descriptive.moment.GeometricMean;
import org.deidentifier.arx.ARXLogisticRegressionConfiguration;
import org.deidentifier.arx.DataHandleInternal;
import org.deidentifier.arx.DataHandleInternal.InterruptHandler;
import org.deidentifier.arx.DataScale;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.DataType.ARXString;
import org.deidentifier.arx.DataType.DataTypeWithRatioScale;
import org.deidentifier.arx.aggregates.StatisticsContingencyTable.Entry;
import org.deidentifier.arx.aggregates.StatisticsSummary.StatisticsSummaryOrdinal;
import org.deidentifier.arx.common.Groupify;
import org.deidentifier.arx.common.Groupify.Group;
import org.deidentifier.arx.common.TupleWrapper;
import org.deidentifier.arx.common.WrappedBoolean;
import org.deidentifier.arx.exceptions.ComputationInterruptedException;

import cern.colt.GenericSorting;
import cern.colt.Swapper;
import cern.colt.function.IntComparator;

/**
 * A class offering basic descriptive statistics about data handles.
 *
 * @author Fabian Prasser
 */
public class StatisticsBuilder {

    /** The data handle that all statistics of this builder are computed from. */
    private DataHandleInternal handle;

    /**
     * The stop flag. It is reset to {@code false} at the beginning of the computations in this
     * class and polled by {@code checkInterrupt()}, which aborts the running computation once
     * the flag has been set.
     * <p>
     * NOTE(review): {@code volatile} only covers the reference to the wrapper. Whether writes to
     * its {@code value} field are visible to other threads depends on {@code WrappedBoolean} -
     * confirm.
     */
    private volatile WrappedBoolean interrupt = new WrappedBoolean(false);

    /**
     * Creates a new statistics builder for the given data handle.
     *
     * @param handle The handle for which statistics will be computed; it is stored as is
     */
    public StatisticsBuilder(DataHandleInternal handle) {
        this.handle = handle;
    }

    /**
     * Creates a new set of statistics for the given classification task. No feature attributes
     * are specified explicitly: an empty feature array is passed on to the overload that
     * accepts features.
     *
     * @param clazz The class attribute
     * @param config The configuration
     * @return The classification statistics
     * @throws ParseException Propagated from the overload this method delegates to
     */
    public StatisticsClassification getClassificationPerformance(String clazz,
            ARXLogisticRegressionConfiguration config) throws ParseException {
        final String[] noFeatures = new String[0];
        return getClassificationPerformance(noFeatures, clazz, config);
    }

    /**
     * Creates a new set of statistics for the given classification task. The statistics object
     * is created from the input associated with this builder's handle, the handle itself, the
     * given attributes and configuration, and this builder's stop flag.
     *
     * @param features The feature attributes
     * @param clazz The class attribute
     * @param config The configuration
     * @return The classification statistics
     * @throws ParseException If raised while the statistics object is being created
     */
    public StatisticsClassification getClassificationPerformance(String[] features, String clazz,
            ARXLogisticRegressionConfiguration config) throws ParseException {

        // Clear a stop request that may be left over from an earlier computation
        this.interrupt.value = false;

        // Hand everything over, including the shared stop flag
        return new StatisticsClassification(this.handle.getAssociatedInput(),
                                            this.handle,
                                            features,
                                            clazz,
                                            config,
                                            this.interrupt);
    }

    /**
     * Returns a contingency table for the given columns. The hierarchy used for ordering each
     * column is resolved via {@code getHierarchy(column, orderFromDefinition)} before the
     * hierarchy-based overload is called.
     *
     * @param column1 The first column
     * @param orderFromDefinition1 Indicates whether the order that should be assumed for string data items
     *            of the first column can (and should) be derived from the hierarchy provided in the
     *            data definition (if any)
     * @param column2 The second column
     * @param orderFromDefinition2 Indicates whether the order that should be assumed for string data items
     *            of the second column can (and should) be derived from the hierarchy provided in the
     *            data definition (if any)
     * @return The contingency table
     */
    public StatisticsContingencyTable getContingencyTable(int column1, boolean orderFromDefinition1, int column2,
            boolean orderFromDefinition2) {

        // Resolve the hierarchies (first column first), then delegate
        final String[][] hierarchy1 = getHierarchy(column1, orderFromDefinition1);
        final String[][] hierarchy2 = getHierarchy(column2, orderFromDefinition2);
        return getContingencyTable(column1, hierarchy1, column2, hierarchy2);
    }

    /**
     * Returns a contingency table for the given columns. For both columns, the order of string
     * data items is requested to be derived from the hierarchies provided in the data
     * definition (if any).
     *
     * @param column1 The first column
     * @param column2 The second column
     * @return The contingency table
     */
    public StatisticsContingencyTable getContingencyTable(int column1, int column2) {
        final boolean orderFromDefinition = true;
        return getContingencyTable(column1, orderFromDefinition, column2, orderFromDefinition);
    }

    /**
     * Returns a size-limited contingency table for the given columns. The hierarchy used for
     * ordering each column is resolved via {@code getHierarchy(column, orderFromDefinition)}
     * before the hierarchy-based overload is called.
     *
     * @param column1 The first column
     * @param size1 The maximal size in this dimension
     * @param orderFromDefinition1 Indicates whether the order that should be assumed for string data items
     *            of the first column can (and should) be derived from the hierarchy provided in the
     *            data definition (if any)
     * @param column2 The second column
     * @param size2 The maximal size in this dimension
     * @param orderFromDefinition2 Indicates whether the order that should be assumed for string data items
     *            of the second column can (and should) be derived from the hierarchy provided in the
     *            data definition (if any)
     * @return The contingency table
     */
    public StatisticsContingencyTable getContingencyTable(int column1, int size1, boolean orderFromDefinition1,
            int column2, int size2, boolean orderFromDefinition2) {

        // Resolve the hierarchies (first column first), then delegate
        final String[][] hierarchy1 = getHierarchy(column1, orderFromDefinition1);
        final String[][] hierarchy2 = getHierarchy(column2, orderFromDefinition2);
        return getContingencyTable(column1, size1, hierarchy1, column2, size2, hierarchy2);
    }

    /**
     * Returns a size-limited contingency table for the given columns. For both columns, the
     * order of string data items is requested to be derived from the hierarchies provided in
     * the data definition (if any).
     *
     * @param column1 The first column
     * @param size1 The maximal size in this dimension
     * @param column2 The second column
     * @param size2 The maximal size in this dimension
     * @return The contingency table
     */
    public StatisticsContingencyTable getContingencyTable(int column1, int size1, int column2, int size2) {
        final boolean orderFromDefinition = true;
        return getContingencyTable(column1, size1, orderFromDefinition, column2, size2, orderFromDefinition);
    }

    /**
     * Returns a contingency table for the given columns in which each dimension is limited to a
     * maximal number of entries. The order for string data items is derived from the provided
     * hierarchies.
     * <p>
     * The full table is computed first. If it already fits into the requested sizes it is
     * returned as is. Otherwise, the indexes of every oversized dimension are scaled by the
     * factor {@code size / numberOfValues}, and the frequencies of all cells that are mapped to
     * the same position are summed up.
     *
     * @param column1 The first column
     * @param size1 The maximal size in this dimension, must be &gt; 0
     * @param hierarchy1 The hierarchy for the first column, may be null
     * @param column2 The second column
     * @param size2 The maximal size in this dimension, must be &gt; 0
     * @param hierarchy2 The hierarchy for the second column, may be null
     * @return The (potentially scaled) contingency table. The iterator of a scaled table throws
     *         a {@code NoSuchElementException} if {@code next()} is called after the last entry
     * @throws IllegalArgumentException If one of the sizes is not positive
     */
    public StatisticsContingencyTable getContingencyTable(int column1, int size1, String[][] hierarchy1,
            int column2, int size2, String[][] hierarchy2) {

        // Reset stop flag
        interrupt.value = false;

        // Check
        if (size1 <= 0 || size2 <= 0) {
            throw new IllegalArgumentException("Size must be > 0");
        }

        // Obtain default (unscaled) table
        StatisticsContingencyTable table = getContingencyTable(column1, hierarchy1, column2, hierarchy2);

        // Check if suitable
        if (table.values1.length <= size1 && table.values2.length <= size2) {
            return table;
        }

        // Init
        String[] values1;
        String[] values2;
        double factor1;
        double factor2;

        // Compute factors and values: only dimensions that are too large are scaled
        if (table.values1.length > size1) {
            factor1 = (double) size1 / (double) table.values1.length;
            values1 = getScaledValues(table.values1, size1);
        } else {
            factor1 = 1;
            values1 = table.values1;
        }
        if (table.values2.length > size2) {
            factor2 = (double) size2 / (double) table.values2.length;
            values2 = getScaledValues(table.values2, size2);
        } else {
            factor2 = 1;
            values2 = table.values2;
        }

        // Create entry set: merge the cells of the original table into the scaled cells
        final Map<Entry, Double> entries = new HashMap<Entry, Double>();
        Iterator<Entry> iter = table.iterator;
        double max = 0d;
        while (iter.hasNext()) {
            checkInterrupt();
            Entry old = iter.next();

            // Rounding may yield an index equal to the size, so clamp to the last valid index
            int index1 = Math.min((int) Math.round((double) old.value1 * factor1), size1 - 1);
            int index2 = Math.min((int) Math.round((double) old.value2 * factor2), size2 - 1);

            Entry entry = new Entry(index1, index2);
            Double previous = entries.get(entry);
            double value = previous != null ? previous + old.frequency : old.frequency;
            max = Math.max(value, max);
            entries.put(entry, value);
        }

        // Create iterator
        final Iterator<Entry> internal = entries.keySet().iterator();
        final Iterator<Entry> iterator = new Iterator<Entry>() {

            private Map<Entry, Double> _entries = entries;
            private Iterator<Entry> _internal = internal;

            @Override
            public boolean hasNext() {

                if (_internal == null) {
                    return false;
                }
                boolean result = _internal.hasNext();

                // Try to release resources as early as possible
                if (!result) {
                    _internal = null;
                    _entries = null;
                }
                return result;
            }

            @Override
            public Entry next() {
                // Behave as required by the Iterator contract, also after the resources have
                // been released: the wrapped iterator would have thrown this exception as well
                if (_internal == null) {
                    throw new java.util.NoSuchElementException();
                }
                Entry e = _internal.next();
                e.frequency = _entries.get(e);
                return e;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };

        // Return result
        return new StatisticsContingencyTable(values1, values2, table.count, max, iterator);
    }

    /**
     * Returns a contingency table for the given columns. The order for string data items is
     * derived from the provided hierarchies.
     * <p>
     * Every row of the handle contributes to the cell identified by the positions of its two
     * values in the ordered lists of distinct values. Frequencies are reported relative to the
     * number of rows of the handle.
     *
     * @param column1 The first column
     * @param hierarchy1 The hierarchy for the first column, may be null
     * @param column2 The second column
     * @param hierarchy2 The hierarchy for the second column, may be null
     * @return The contingency table. Its iterator throws a {@code NoSuchElementException} if
     *         {@code next()} is called after the last entry
     */
    public StatisticsContingencyTable getContingencyTable(int column1, String[][] hierarchy1, int column2,
            String[][] hierarchy2) {

        // Reset stop flag
        interrupt.value = false;

        // Init
        String[] values1 = getDistinctValuesOrdered(column1, hierarchy1);
        String[] values2 = getDistinctValuesOrdered(column2, hierarchy2);

        // Create maps from value to its position in the ordered lists
        Map<String, Integer> indexes1 = new HashMap<String, Integer>();
        for (int i = 0; i < values1.length; i++) {
            checkInterrupt();
            indexes1.put(values1[i], i);
        }
        Map<String, Integer> indexes2 = new HashMap<String, Integer>();
        for (int i = 0; i < values2.length; i++) {
            checkInterrupt();
            indexes2.put(values2[i], i);
        }

        // Create entry set: count the rows per cell and track the largest count
        int max = Integer.MIN_VALUE;
        final Map<Entry, Integer> entries = new HashMap<Entry, Integer>();
        for (int row = 0; row < handle.getNumRows(); row++) {
            checkInterrupt();
            int index1 = indexes1.get(handle.getValue(row, column1));
            int index2 = indexes2.get(handle.getValue(row, column2));
            Entry entry = new Entry(index1, index2);
            Integer previous = entries.get(entry);
            int value = previous != null ? previous + 1 : 1;
            max = Math.max(max, value);
            entries.put(entry, value);
        }

        // Create iterator
        final int count = handle.getNumRows();
        final Iterator<Entry> internal = entries.keySet().iterator();
        final Iterator<Entry> iterator = new Iterator<Entry>() {

            private Map<Entry, Integer> _entries = entries;
            private Iterator<Entry> _internal = internal;

            @Override
            public boolean hasNext() {

                if (_internal == null) {
                    return false;
                }
                boolean result = _internal.hasNext();

                // Try to release resources as early as possible
                if (!result) {
                    _internal = null;
                    _entries = null;
                }
                return result;
            }

            @Override
            public Entry next() {
                // Behave as required by the Iterator contract, also after the resources have
                // been released: the wrapped iterator would have thrown this exception as well
                if (_internal == null) {
                    throw new java.util.NoSuchElementException();
                }
                Entry e = _internal.next();

                // Convert the absolute count into a relative frequency
                e.frequency = (double) _entries.get(e) / (double) count;
                return e;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };

        // Return result
        return new StatisticsContingencyTable(values1, values2, count, (double) max / (double) count, iterator);
    }

    /**
     * Returns the distinct set of data items from the given column, as provided by the handle.
     *
     * @param column The column
     * @return The distinct values
     */
    public String[] getDistinctValues(int column) {

        // Bridges the handle's interrupt callback to the stop flag of this builder
        final InterruptHandler handler = new InterruptHandler() {
            @Override
            public void checkInterrupt() {
                StatisticsBuilder.this.checkInterrupt();
            }
        };
        return this.handle.getDistinctValues(column, handler);
    }

    /**
     * Returns an ordered list of the distinct set of data items from the given column. The order
     * of string data items is requested to be derived from the hierarchy provided in the data
     * definition (if any).
     *
     * @param column The column
     * @return The ordered distinct values
     */
    public String[] getDistinctValuesOrdered(int column) {
        final boolean orderFromDefinition = true;
        return getDistinctValuesOrdered(column, orderFromDefinition);
    }

    /**
     * Returns an ordered list of the distinct set of data items from the given column. The
     * hierarchy used for ordering is resolved via {@code getHierarchy(column, orderFromDefinition)}.
     *
     * @param column The column
     * @param orderFromDefinition Indicates whether the order that should be assumed for string data
     *            items can (and should) be derived from the hierarchy provided in the
     *            data definition (if any)
     * @return The ordered distinct values
     */
    public String[] getDistinctValuesOrdered(int column, boolean orderFromDefinition) {
        final String[][] hierarchy = getHierarchy(column, orderFromDefinition);
        return getDistinctValuesOrdered(column, hierarchy);
    }

    /**
     * Returns an ordered list of the distinct set of data items from the given column.
     * <p>
     * If no hierarchy is given, or if the attribute's generalization level is 0, the values are
     * sorted according to the attribute's data type. Otherwise, an order is derived from the
     * hierarchy: the elements of its first column that are valid for the base data type are
     * sorted by that type, every generalized value is assigned the position of the first such
     * base element it was encountered with, and the suppression string is placed after all of them.
     *
     * @param column The column
     * @param hierarchy The hierarchy, may be null
     * @return The ordered distinct values
     */
    public String[] getDistinctValuesOrdered(int column, String[][] hierarchy) {

        // Clear a stop request that may be left over from an earlier computation
        interrupt.value = false;

        // Distinct values plus the meta data required for sorting them
        final String[] values = getDistinctValues(column);
        final String attribute = handle.getAttributeName(column);
        final DataType<?> type = handle.getDataType(attribute);
        final int generalization = handle.getGeneralization(attribute);

        // Simple case: the data type alone defines the order
        if (hierarchy == null || generalization == 0) {
            sort(values, type);
            return values;
        }

        // Collect the base elements (first column of the hierarchy). Only elements that
        // are valid for the base data type are considered.
        // TODO: Calling isValid is only a work-around
        final DataType<?> baseType = handle.getBaseDataType(attribute);
        final Set<String> baseElements = new HashSet<String>();
        for (String[] row : hierarchy) {
            final String base = row[0];
            checkInterrupt();
            if (baseType.isValid(base)) {
                baseElements.add(base);
            }
        }

        // Sort the base elements and remember the position of each of them
        final String[] sortedBase = baseElements.toArray(new String[baseElements.size()]);
        sort(sortedBase, handle.getBaseDataType(attribute));
        final Map<String, Integer> basePositions = new HashMap<String, Integer>();
        int index = 0;
        for (String base : sortedBase) {
            checkInterrupt();
            basePositions.put(base, index++);
        }

        // Range of hierarchy levels to consider: all levels above the base level for
        // optimized handles, only the current generalization level otherwise
        final int lower = handle.isOptimized() ? 1 : generalization;
        final int upper = handle.isOptimized() ? hierarchy[0].length : generalization + 1;

        // Derive the order of the generalized values from the order of the base elements
        final Map<String, Integer> order = new HashMap<String, Integer>();
        int suppressedPosition = 0; // The order to use for the suppression string
        for (String[] row : hierarchy) {
            checkInterrupt();
            for (int level = lower; level < upper; level++) {
                final String generalized = row[level];
                if (order.containsKey(generalized)) {
                    continue;
                }
                final Integer position = basePositions.get(row[0]);
                if (position == null) {
                    continue;
                }
                order.put(generalized, position);
                suppressedPosition = Math.max(position, suppressedPosition) + 1;
            }
        }

        // The suppression string comes last
        order.put(DataType.ANY_VALUE, suppressedPosition);

        // Sort by the derived order
        sort(values, order);
        return values;
    }

    /**
     * Returns statistics about the equivalence classes. Rows are grouped by their values in the
     * quasi-identifying attributes. Sizes (average, minimum, maximum) and the number of classes
     * are reported twice: once over all groups and once without the groups flagged as outliers.
     *
     * @return The equivalence class statistics
     */
    public StatisticsEquivalenceClasses getEquivalenceClassStatistics() {

        // Clear a stop request that may be left over from an earlier computation
        interrupt.value = false;

        // Determine the indices of the quasi-identifying columns
        final Set<String> quasiIdentifiers = handle.getDefinition().getQuasiIdentifyingAttributes();
        final int[] indices = new int[quasiIdentifiers.size()];
        int next = 0;
        for (int column = 0; column < handle.getNumColumns(); column++) {
            if (quasiIdentifiers.contains(handle.getAttributeName(column))) {
                indices[next] = column;
                next++;
            }
        }

        // Group the rows into equivalence classes
        final int capacity = Math.max(handle.getNumRows() / 10, 10);
        final Groupify<TupleWrapper> groups = new Groupify<TupleWrapper>(capacity);
        final int numRows = handle.getNumRows();
        for (int row = 0; row < numRows; row++) {
            groups.add(new TupleWrapper(handle, indices, row, false));
            checkInterrupt();
        }

        // Accumulators, "All" refers to the values including outliers
        double sizeSum = 0d;
        double sizeSumAll = 0d;
        int sizeMax = Integer.MIN_VALUE;
        int sizeMaxAll = Integer.MIN_VALUE;
        int sizeMin = Integer.MAX_VALUE;
        int sizeMinAll = Integer.MAX_VALUE;
        final int classesAll = groups.size();
        int tuples = 0;
        int outlyingTuples = 0;
        boolean outliersFound = false;

        // Visit all groups
        for (Group<TupleWrapper> group = groups.first(); group != null; group = group.next()) {

            checkInterrupt();
            final int size = group.getCount();

            // Every group counts for the values including outliers
            sizeMaxAll = Math.max(size, sizeMaxAll);
            sizeMinAll = Math.min(size, sizeMinAll);
            sizeSumAll += size;
            tuples += size;

            if (group.getElement().isOutlier()) {
                outliersFound = true;
                outlyingTuples = size;
            } else {
                sizeMax = Math.max(size, sizeMax);
                sizeMin = Math.min(size, sizeMin);
                sizeSum += size;
            }
        }

        // If outliers exist, one of the groups is not a regular equivalence class
        final int classes = outliersFound ? classesAll - 1 : classesAll;

        // Turn the sums into averages
        final double sizeAverage = sizeSum / (double) classes;
        final double sizeAverageAll = sizeSumAll / (double) classesAll;

        // Done
        return new StatisticsEquivalenceClasses(sizeAverage,
                                                sizeAverageAll,
                                                sizeMax,
                                                sizeMaxAll,
                                                sizeMin,
                                                sizeMinAll,
                                                classes,
                                                classesAll,
                                                tuples,
                                                outlyingTuples);
    }

    /**
     * Returns a frequency distribution for the values in the given column. The order of string
     * data items is requested to be derived from the hierarchy provided in the data definition
     * (if any).
     *
     * @param column The column
     * @return The frequency distribution
     */
    public StatisticsFrequencyDistribution getFrequencyDistribution(int column) {
        final boolean orderFromDefinition = true;
        return getFrequencyDistribution(column, orderFromDefinition);
    }

    /**
     * Returns a frequency distribution for the values in the given column. The hierarchy used
     * for ordering is resolved via {@code getHierarchy(column, orderFromDefinition)}.
     *
     * @param column The column
     * @param orderFromDefinition Indicates whether the order that should be assumed for string data items
     *            can (and should) be derived from the hierarchy provided in the data
     *            definition (if any)
     * @return The frequency distribution
     */
    public StatisticsFrequencyDistribution getFrequencyDistribution(int column, boolean orderFromDefinition) {
        final String[][] hierarchy = getHierarchy(column, orderFromDefinition);
        return getFrequencyDistribution(column, hierarchy);
    }

    /**
     * Returns a frequency distribution for the values in the given column. The order for string
     * data items is derived from the provided hierarchy. Frequencies are reported relative to
     * the number of rows of the handle.
     *
     * @param column The column
     * @param hierarchy The hierarchy, may be null
     * @return The frequency distribution
     */
    public StatisticsFrequencyDistribution getFrequencyDistribution(int column, String[][] hierarchy) {

        // Clear a stop request that may be left over from an earlier computation
        interrupt.value = false;

        // Ordered distinct values and one counter per value
        final String[] values = getDistinctValuesOrdered(column, hierarchy);
        final double[] frequencies = new double[values.length];

        // Remember the position of each value
        final Map<String, Integer> positions = new HashMap<String, Integer>();
        int position = 0;
        for (String value : values) {
            checkInterrupt();
            positions.put(value, position++);
        }

        // Count the occurrences of each value
        for (int row = 0; row < handle.getNumRows(); row++) {
            checkInterrupt();
            final int slot = positions.get(handle.getValue(row, column));
            frequencies[slot] += 1d;
        }

        // Turn the absolute counts into relative frequencies
        final int count = handle.getNumRows();
        for (int i = 0; i < frequencies.length; i++) {
            checkInterrupt();
            frequencies[i] = frequencies[i] / (double) count;
        }

        // Done
        return new StatisticsFrequencyDistribution(values, frequencies, count);
    }

    /**
     * Returns an interruptible instance of this object, created for the same handle.
     *
     * @return A new {@code StatisticsBuilderInterruptible}
     */
    public StatisticsBuilderInterruptible getInterruptibleInstance() {
        final StatisticsBuilderInterruptible interruptible = new StatisticsBuilderInterruptible(this.handle);
        return interruptible;
    }

    /**
     * Returns summary statistics for all attributes.
     * <p>
     * The method works in three passes: it first determines the scale of measure of every
     * attribute and creates the accumulators, then feeds all included values of all rows into
     * the accumulators, and finally converts the accumulated values into one
     * {@code StatisticsSummary} per attribute. Which measures are reported depends on the scale
     * (nominal, ordinal, interval or ratio). Values that are suppressed or null are never
     * added to the accumulators.
     * 
     * @param listwiseDeletion A flag enabling list-wise deletion. If set, rows that are outliers
     *            or that contain a null value in any column are ignored completely
     * @return A map from attribute name to the summary statistics of the attribute. No entry is
     *         created for an attribute whose scale matches none of the four handled scales
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) {

        // Reset stop flag
        interrupt.value = false;

        // Accumulators, all of them keyed by attribute name
        Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>();
        Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>();
        Map<String, DataScale> scales = new HashMap<String, DataScale>();
        Map<String, GeometricMean> geomean = new HashMap<String, GeometricMean>();

        // Detect scales
        for (int col = 0; col < handle.getNumColumns(); col++) {

            // Meta
            String attribute = handle.getAttributeName(col);
            DataType<?> type = handle.getDataType(attribute);

            // Scale
            DataScale scale = type.getDescription().getScale();

            // Try to replace nominal scale with ordinal scale based on base data type:
            // this is done for generalized attributes that have a hierarchy and whose
            // base data type is not a string
            if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) {
                if (!(handle.getBaseDataType(attribute) instanceof ARXString) && getHierarchy(col, true) != null) {
                    scale = DataScale.ORDINAL;
                }
            }

            // Store
            scales.put(attribute, scale);
            statistics.put(attribute, new DescriptiveStatistics());
            geomean.put(attribute, new GeometricMean());
            ordinal.put(attribute, getSummaryStatisticsOrdinal(handle.getGeneralization(attribute),
                    handle.getDataType(attribute), handle.getBaseDataType(attribute), getHierarchy(col, true)));
        }

        // Compute summary statistics
        for (int row = 0; row < handle.getNumRows(); row++) {

            // Check, if we should include this row
            // NOTE(review): handle.isOutlier(row) does not depend on col but is evaluated
            // inside the column loop; it looks like it could be checked once per row - confirm
            boolean include = true;
            if (listwiseDeletion) {
                for (int col = 0; col < handle.getNumColumns(); col++) {
                    if (handle.isOutlier(row) || DataType.isNull(handle.getValue(row, col))) {
                        include = false;
                        break;
                    }
                }
            }

            // Check
            checkInterrupt();

            // If yes, add
            if (include) {

                // For each column
                for (int col = 0; col < handle.getNumColumns(); col++) {

                    // Meta
                    String value = handle.getValue(row, col);
                    String attribute = handle.getAttributeName(col);
                    DataType<?> type = handle.getDataType(attribute);

                    // Analyze: suppressed and null values are skipped
                    if (!DataType.isAny(value) && !DataType.isNull(value)) {
                        ordinal.get(attribute).addValue(value);

                        // Numeric measures are only collected for types with a ratio scale
                        if (type instanceof DataTypeWithRatioScale) {
                            double doubleValue = ((DataTypeWithRatioScale) type).toDouble(type.parse(value));
                            statistics.get(attribute).addValue(doubleValue);

                            // The value is shifted by one here; the shift is reverted by
                            // subtracting one from the result when the summary is created
                            geomean.get(attribute).increment(doubleValue + 1d);
                        }
                    }
                }
            }
        }

        // Convert
        Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>();
        for (int col = 0; col < handle.getNumColumns(); col++) {

            // Check
            checkInterrupt();

            // Depending on scale
            String attribute = handle.getAttributeName(col);
            DataScale scale = scales.get(attribute);
            DataType<T> type = (DataType<T>) handle.getDataType(attribute);
            ordinal.get(attribute).analyze();
            if (scale == DataScale.NOMINAL) {

                // Nominal: number of measures and mode
                StatisticsSummaryOrdinal stats = ordinal.get(attribute);
                result.put(attribute, new StatisticsSummary<T>(DataScale.NOMINAL, stats.getNumberOfMeasures(),
                        stats.getMode(), type.parse(stats.getMode())));
            } else if (scale == DataScale.ORDINAL) {

                // Ordinal: additionally median, minimum and maximum
                StatisticsSummaryOrdinal stats = ordinal.get(attribute);
                result.put(attribute,
                        new StatisticsSummary<T>(DataScale.ORDINAL, stats.getNumberOfMeasures(), stats.getMode(),
                                type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()),
                                stats.getMin(), type.parse(stats.getMin()), stats.getMax(),
                                type.parse(stats.getMax())));
            } else if (scale == DataScale.INTERVAL) {

                // Interval: additionally mean, variances, standard deviation, range and kurtosis
                StatisticsSummaryOrdinal stats = ordinal.get(attribute);
                DescriptiveStatistics stats2 = statistics.get(attribute);

                // Types wrapping dates get their spread measures formatted differently (see the
                // isPeriod argument of toString) - TODO confirm exact formatting in toString
                boolean isPeriod = type.getDescription().getWrappedClass() == Date.class;

                // TODO: Something is wrong with commons math's kurtosis
                // Negative results are therefore reported as NaN
                double kurtosis = stats2.getKurtosis();
                kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
                double range = stats2.getMax() - stats2.getMin();
                double stddev = Math.sqrt(stats2.getVariance());

                // Each measure is passed as string, as value of the data type and as double
                result.put(attribute, new StatisticsSummary<T>(DataScale.INTERVAL, stats.getNumberOfMeasures(),
                        stats.getMode(), type.parse(stats.getMode()), stats.getMedian(),
                        type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(),
                        type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false),
                        toValue(type, stats2.getMean()), stats2.getMean(),
                        toString(type, stats2.getVariance(), isPeriod, true), toValue(type, stats2.getVariance()),
                        stats2.getVariance(), toString(type, stats2.getPopulationVariance(), isPeriod, true),
                        toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(),
                        toString(type, stddev, isPeriod, false), toValue(type, stddev), stddev,
                        toString(type, range, isPeriod, false), toValue(type, range),
                        stats2.getMax() - stats2.getMin(), toString(type, kurtosis, isPeriod, false),
                        toValue(type, kurtosis), kurtosis));
            } else if (scale == DataScale.RATIO) {

                // Ratio: additionally the geometric mean
                StatisticsSummaryOrdinal stats = ordinal.get(attribute);
                DescriptiveStatistics stats2 = statistics.get(attribute);
                GeometricMean geo = geomean.get(attribute);

                // TODO: Something is wrong with commons math's kurtosis
                // Negative results are therefore reported as NaN
                double kurtosis = stats2.getKurtosis();
                kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
                double range = stats2.getMax() - stats2.getMin();
                double stddev = Math.sqrt(stats2.getVariance());

                // Each measure is passed as string, as value of the data type and as double.
                // NOTE(review): the string and the typed value of the geometric mean are derived
                // from the shifted accumulator (geo.getResult() - 1d), whereas the double is
                // taken from stats2.getGeometricMean(), which was fed with the unshifted
                // values. The three representations may therefore disagree - confirm intent
                result.put(attribute, new StatisticsSummary<T>(DataScale.RATIO, stats.getNumberOfMeasures(),
                        stats.getMode(), type.parse(stats.getMode()), stats.getMedian(),
                        type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(),
                        type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false),
                        toValue(type, stats2.getMean()), stats2.getMean(),
                        toString(type, stats2.getVariance(), false, false), toValue(type, stats2.getVariance()),
                        stats2.getVariance(), toString(type, stats2.getPopulationVariance(), false, false),
                        toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(),
                        toString(type, stddev, false, false), toValue(type, stddev), stddev,
                        toString(type, range, false, false), toValue(type, range), range,
                        toString(type, kurtosis, false, false), toValue(type, kurtosis), kurtosis,
                        toString(type, geo.getResult() - 1d, false, false), toValue(type, geo.getResult() - 1d),
                        stats2.getGeometricMean()));
            }
        }

        return result;
    }

    /**
     * Checks whether an interruption happened.
     */
    private void checkInterrupt() {
        // Nothing to do as long as no interruption has been requested
        if (!interrupt.value) {
            return;
        }
        throw new ComputationInterruptedException("Interrupted");
    }

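    /**
     * Returns the hierarchy defined for the attribute in the given column, if any.
     *
     * @param column Index of the column holding the attribute
     * @param orderFromDefinition Whether the hierarchy from the data definition is requested
     * @return The hierarchy if it was requested, the attribute is a string and a hierarchy
     *         is defined; <code>null</code> otherwise
     */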
    private String[][] getHierarchy(int column, boolean orderFromDefinition) {

        // Look up everything that is known about the attribute in this column
        final String name = handle.getAttributeName(column);
        final String[][] defined = handle.getDefinition().getHierarchy(name);
        final DataType<?> dataType = handle.getDataType(name);

        // Only hand out the hierarchy if requested, for string attributes, and if one is defined
        final boolean usable = orderFromDefinition && (dataType instanceof ARXString) && (defined != null);
        return usable ? defined : null;
    }

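    /**
     * Scales the given string array to the given length by aggregating
     * consecutive values that fall into the same target position.
     *
     * @param values The values to scale
     * @param length The resulting length
     * @return An array of the given length
     */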
    private String[] getScaledValues(String[] values, int length) {

        // Set-based aggregate that merges all values falling into the same slot
        final AggregateFunction<String> merger = AggregateFunction.forType(DataType.STRING).createSetFunction();
        final double ratio = (double) length / (double) values.length;
        final String[] scaled = new String[length];
        final int last = length - 1;

        // Walk over the input and flush the bucket whenever the target slot changes
        final List<String> bucket = new ArrayList<String>();
        int slot = 0;
        for (int position = 0; position < values.length; position++) {

            checkInterrupt();

            // Target slot of this value, clamped to the last slot
            final int target = Math.min((int) Math.round((double) position * ratio), last);

            if (target != slot) {
                scaled[slot] = merger.aggregate(bucket.toArray(new String[bucket.size()]));
                bucket.clear();
                slot = target;
            }
            bucket.add(values[position]);
        }

        // Whatever is left over ends up in the last slot
        scaled[last] = merger.aggregate(bucket.toArray(new String[bucket.size()]));
        return scaled;
    }

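    /**
     * Returns a summary statistics object for the given attribute.
     *
     * @param generalization The generalization level, used as column index into the hierarchy
     * @param dataType The data type of the attribute
     * @param baseDataType The data type used to parse and compare the level-0 hierarchy values
     * @param hierarchy The hierarchy of the attribute, may be <code>null</code>
     * @return A summary object created from the data type or, for generalized string attributes
     *         with a non-string base data type and a hierarchy, from a comparator that orders
     *         values by their associated level-0 values
     */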
    private <U, V> StatisticsSummaryOrdinal getSummaryStatisticsOrdinal(final int generalization,
            final DataType<U> dataType, final DataType<V> baseDataType, final String[][] hierarchy) {

        // TODO: It would be cleaner to return an ARXOrderedString for generalized variables
        // TODO: that have a suitable data type directly from the DataHandle
        final boolean useDataType = generalization == 0
                                    || !(dataType instanceof ARXString)
                                    || baseDataType instanceof ARXString
                                    || hierarchy == null;
        if (useDataType) {
            return new StatisticsSummaryOrdinal(dataType);
        }

        // Associate each generalized value with a level-0 value from the hierarchy
        final Map<String, String> baseValues = new HashMap<String, String>();
        for (final String[] row : hierarchy) {
            baseValues.put(row[generalization], row[0]);
        }

        // Order generalized values by comparing their associated level-0 values
        return new StatisticsSummaryOrdinal(new Comparator<String>() {

            @Override
            public int compare(String o1, String o2) {
                final V first = parseOrNull(o1);
                final V second = parseOrNull(o2);
                try {
                    return baseDataType.compare(first, second);
                } catch (Exception e) {
                    // Values that cannot be compared are treated as equal
                    return 0;
                }
            }

            /** Parses the level-0 value associated with the given value, null on any failure. */
            private V parseOrNull(String generalized) {
                try {
                    return baseDataType.parse(baseValues.get(generalized));
                } catch (Exception e) {
                    // Nothing to do
                    return null;
                }
            }
        });
    }

    /**
     * Orders the given array by data type.
     *
     * @param array The array, which is sorted in place
     * @param type The data type used to compare the values
     */
    private void sort(final String[] array, final DataType<?> type) {
        GenericSorting.mergeSort(0, array.length, new IntComparator() {

            @Override
            public int compare(int arg0, int arg1) {
                checkInterrupt();

                final String s1 = array[arg0];
                final String s2 = array[arg1];

                // Match the wildcard by content rather than by reference: an equal but
                // distinct string instance must not be handed to the data type
                final boolean any1 = DataType.ANY_VALUE.equals(s1);
                final boolean any2 = DataType.ANY_VALUE.equals(s2);

                // Wildcards are equal to each other and ordered after all other values
                if (any1 || any2) {
                    return any1 == any2 ? 0 : (any1 ? +1 : -1);
                }

                // Everything else is ordered by the data type
                try {
                    return type.compare(s1, s2);
                } catch (IllegalArgumentException | ParseException e) {
                    throw new RuntimeException("Some values seem to not conform to the data type.", e);
                }
            }
        }, new Swapper() {
            @Override
            public void swap(int arg0, int arg1) {
                String temp = array[arg0];
                array[arg0] = array[arg1];
                array[arg1] = temp;
            }
        });
    }

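    /**
     * Orders the given array by the given sort order.
     *
     * @param array The array, which is sorted in place
     * @param order Maps each value to its position in the sort order; must contain every value in the array
     */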
    private void sort(final String[] array, final Map<String, Integer> order) {

        // Compares two array positions by the rank assigned to their values
        final IntComparator byRank = new IntComparator() {
            @Override
            public int compare(int arg0, int arg1) {
                checkInterrupt();
                final Integer left = order.get(array[arg0]);
                final Integer right = order.get(array[arg1]);
                if (left != null && right != null) {
                    return left.compareTo(right);
                }
                throw new RuntimeException("The hierarchy seems to not cover all data values");
            }
        };

        // Exchanges two elements of the array
        final Swapper exchange = new Swapper() {
            @Override
            public void swap(int arg0, int arg1) {
                final String held = array[arg0];
                array[arg0] = array[arg1];
                array[arg1] = held;
            }
        };

        GenericSorting.mergeSort(0, array.length, byRank, exchange);
    }

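    /**
     * Used for building summary statistics. Renders the given value as a string.
     * @param type The data type used to format values that are not time periods
     * @param value The value to render
     * @param isPeriod Defines whether the parameter is a time period
     * @param isSquare Defines whether the period is a squared period
     * @return The rendered value, or "Not available" if the value is not a number
     */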
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private String toString(DataType<?> type, double value, boolean isPeriod, boolean isSquare) {

        // Values without a meaningful representation
        if (Double.isNaN(value)) {
            return "Not available";
        }
        if (Double.isInfinite(value)) {
            return value < 0 ? "-Infinity" : "+Infinity";
        }

        // Plain values are formatted by the data type, if it supports this
        if (!isPeriod) {
            if (!(type instanceof DataTypeWithRatioScale)) {
                return String.valueOf(value);
            }
            final DataTypeWithRatioScale ratioType = (DataTypeWithRatioScale) type;
            return ratioType.format(ratioType.fromDouble(value));
        }

        // Units in milliseconds, ordered from largest to smallest
        final long second = 1000L;
        final long minute = 60L * second;
        final long hour = 60L * minute;
        final long day = 24L * hour;
        final long week = 7L * day;
        final long[] units = { week, day, hour, minute, second };
        final String[] labels = { "w", "d", "h", "m", "s" };
        final String suffix = isSquare ? "^2" : "";

        // Split the value into one count per unit, skipping units with a count of zero
        final StringBuilder rendered = new StringBuilder();
        double remainder = value;
        for (int i = 0; i < units.length; i++) {
            final long unit = isSquare ? units[i] * units[i] : units[i];
            final int count = (int) (remainder / unit);
            remainder = remainder % unit;
            if (count != 0) {
                rendered.append(count).append(labels[i]).append(suffix).append(", ");
            }
        }

        // The rest is always printed, even if it is zero
        rendered.append((int) remainder).append("ms").append(suffix);
        return rendered.toString();
    }

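    /**
     * Used for building summary statistics. Converts the given value into an
     * object of the class wrapped by the data type.
     * @param type The data type determining the class of the result
     * @param value The value to convert
     * @return The converted value, or <code>null</code> if the value is NaN or infinite
     */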
    @SuppressWarnings("unchecked")
    private <T> T toValue(DataType<T> type, double value) {

        // NaN and infinite values have no typed representation
        if (Double.isNaN(value) || Double.isInfinite(value)) {
            return null;
        }

        // Pick the representation matching the class wrapped by the data type
        final Class<?> wrapped = type.getDescription().getWrappedClass();
        final Object converted;
        if (wrapped == Long.class) {
            converted = Long.valueOf((long) value);
        } else if (wrapped == Double.class) {
            converted = Double.valueOf(value);
        } else if (wrapped == Date.class) {
            converted = new Date((long) value);
        } else {
            // Any other class falls back to the string form of the value
            converted = String.valueOf(value);
        }
        return (T) converted;
    }

    /**
     * Stops all computations. May lead to exceptions being thrown. Use with care.
     */
    void interrupt() {
        // Raise the flag that checkInterrupt() inspects
        interrupt.value = true;
    }
}