net.sf.javaml.tools.DatasetTools.java Source code

Java tutorial

Introduction

Here is the source code for net.sf.javaml.tools.DatasetTools.java

Source

/**
 * This file is part of the Java Machine Learning Library
 * 
 * The Java Machine Learning Library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * The Java Machine Learning Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with the Java Machine Learning Library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * Copyright (c) 2006-2012, Thomas Abeel
 * 
 * Project: http://java-ml.sourceforge.net/
 * 
 */
package net.sf.javaml.tools;

import java.util.Random;

import net.sf.javaml.core.Dataset;
import net.sf.javaml.core.DefaultDataset;
import net.sf.javaml.core.DenseInstance;
import net.sf.javaml.core.Instance;
import net.sf.javaml.core.SparseInstance;

import org.apache.commons.math3.stat.StatUtils;

/**
 * This class provides utility methods on data sets.
 * 
 * @see Dataset
 * @see DefaultDataset
 * 
 * @version 0.1.7
 * 
 * @author Thomas Abeel
 * 
 */
final public class DatasetTools {

    /**
     * All data will be merged together in the first supplied data set.
     * 
     * @param datasets
     *            a number of data sets
     * 
     */
    public static void merge(Dataset... datasets) {
        Dataset out = null;
        for (Dataset data : datasets) {
            if (out == null)
                out = data;
            else
                out.addAll(data);
        }
    }

    /**
     * Generate a bootstrap sample from the data set with a particular size,
     * using the given random generator.
     * 
     * This is done by sampling with replacement.
     * 
     * @param data
     *            data set to bootstrap from
     * @param size
     *            number of instances in the output data
     * @param rg
     *            random generator to use for the bootstrapping
     * @return bootstrap of the supplied data
     */
    @Deprecated
    public static Dataset bootstrap(Dataset data, int size, Random rg) {
        Dataset out = new DefaultDataset();
        while (out.size() < size) {
            out.add(data.instance(rg.nextInt(data.size())).copy());
        }
        return out;
    }

    /**
     * Create an instance that contains all the maximum values for the
     * attributes.
     * 
     * @param data
     *            data set to find minimum attribute values for
     * @return Instance representing the minimum values for each attribute
     */
    public static Instance maxAttributes(Dataset data) {
        Instance max = new SparseInstance();
        for (Instance i : data) {
            for (Integer index : i.keySet()) {
                double val = i.value(index);
                if (!max.containsKey(index))
                    max.put(index, val);
                else if (max.get(index) < val)
                    max.put(index, val);

            }

        }
        return max;
    }

    /**
     * Create an instance that contains all the minimum values for the
     * attributes.
     * 
     * @param data
     *            data set to calculate minimum attribute values for
     * @return Instance representing all minimum attribute values
     */
    public static Instance minAttributes(Dataset data) {
        Instance min = new SparseInstance();
        for (Instance i : data) {
            for (Integer index : i.keySet()) {
                double val = i.value(index);
                if (!min.containsKey(index))
                    min.put(index, val);
                else if (min.get(index) > val)
                    min.put(index, val);
            }
        }
        return min;
    }

    /**
     * Creates an instance that contains the standard deviation of the values
     * for each attribute.
     * 
     * @param data
     *            data set to calculate attribute value standard deviations for
     * @param avg
     *            the average instance for the data set
     * @return Instance representing the standard deviation of the values for
     *         each attribute
     */
    public static Instance standardDeviation(Dataset data, Instance avg) {
        Instance sum = new DenseInstance(new double[avg.noAttributes()]);
        for (Instance i : data) {
            Instance diff = i.minus(avg);
            sum = sum.add(diff.multiply(diff));
        }
        sum = sum.divide(data.size());
        return sum.sqrt();

    }

    /**
     * Creates an instance that contains the average values for the attributes.
     * 
     * @param data
     *            data set to calculate average attribute values for
     * @return Instance representing the average attribute values
     */
    public static Instance average(Dataset data) {
        double[] tmpOut = new double[data.noAttributes()];

        for (int i = 0; i < data.noAttributes(); i++) {
            double sum = 0;
            for (int j = 0; j < data.size(); j++) {
                sum += data.get(j).value(i);
            }
            tmpOut[i] = sum / data.size();

        }
        return new DenseInstance(tmpOut);
    }

    /**
     * Calculates the percentile hinge for a given percentile.
     * 
     * @param data
     *            data set to calculate percentile for
     * @param perc
     *            percentile to calculate, Q1=25, Q2=median=50,Q3=75
     * @return
     */
    public static Instance percentile(Dataset data, double perc) {
        double[] tmpOut = new double[data.noAttributes()];
        for (int i = 0; i < data.noAttributes(); i++) {
            double[] vals = new double[data.size()];
            for (int j = 0; j < data.size(); j++) {
                vals[j] = data.get(j).value(i);
            }
            tmpOut[i] = StatUtils.percentile(vals, perc);

        }
        return new DenseInstance(tmpOut);
    }

    /**
     * Creates an Instance from the class labels over all Instances in a data
     * set.
     * 
     * The indices of the class labels are used because the class labels can be
     * any Object.
     * 
     * @param data
     *            data set to create class label instance for
     * @return instance with class label indices as values.
     */
    public static Instance createInstanceFromClass(Dataset data) {
        Instance out = new DenseInstance(data.size());
        int index = 0;
        for (Instance inst : data)
            out.put(index++, (double) data.classIndex(inst.classValue()));
        return out;
    }

    /**
     * Creates an Instance from the values of one particular attribute over all
     * Instances in a data set.
     * 
     * @param data
     * @param i
     * @return
     */
    public static Instance createInstanceFromAttribute(Dataset data, int i) {
        Instance out = new DenseInstance(data.size());
        int index = 0;
        for (Instance inst : data)
            out.put(index++, inst.value(i));
        return out;
    }

    /**
     * Finds the double[][] array where the first index reflects the attributes
     * and the second index the minimum (index 0) and maximum (index 1) of the
     * attributes.
     * 
     * @param data
     *            data set to compute this array for
     * 
     * @return a two-dimensional array with the minimum and maximum values per
     *         attribute
     */
    private static double[][] getMinMax(Dataset data) {
        final int noAttributes = data.get(0).noAttributes();
        final int noInstances = data.size();
        final int MIN_INDEX = 0;
        final int MAX_INDEX = 1;

        // second index contains min and max -> therefore size = 2
        double[][] ret = new double[noAttributes][2];

        // reset min and max indices to Double.MAX_VALUE and Double.MIN_VALUE
        // resp;
        for (int a = 0; a < noAttributes; a++) {
            ret[a][MIN_INDEX] = Double.MAX_VALUE;
            ret[a][MAX_INDEX] = Double.MIN_VALUE;
        }

        // find min and max for each attribute
        double attrVal;
        Instance instance = null;
        for (int i = 0; i < noInstances; i++) { // for each instance
            instance = data.get(i);
            for (int a = 0; a < noAttributes; a++) { // for each attribute
                attrVal = instance.value(a);
                if (attrVal < ret[a][MIN_INDEX])
                    ret[a][MIN_INDEX] = attrVal;
                if (attrVal > ret[a][MAX_INDEX])
                    ret[a][MAX_INDEX] = attrVal;

            }
        }
        return ret;
    }

    /**
     * Creates a random instance using the
     * {@link DatasetTools#getMinMax(Dataset)} method. The random instance is
     * created by choosing a random value for each attribute (which is between
     * the min and max value of the attribute).
     * 
     * @param data
     *            the data set
     * @return a random instance
     */
    public static double[] getRandomInstance(Dataset data, Random r) {
        final int MIN_INDEX = 0;
        final int MAX_INDEX = 1;
        double[][] minMax = getMinMax(data);
        final int noAttributes = minMax.length;
        double[] ret = new double[noAttributes];
        for (int a = 0; a < noAttributes; a++) {
            ret[a] = minMax[a][MIN_INDEX] + (minMax[a][MAX_INDEX] - minMax[a][MIN_INDEX]) * r.nextDouble();
        }
        return ret;
    }

}