papaya.Rank.java Source code

Java tutorial

Introduction

Here is the source code for papaya.Rank.java

Source

/**
 * papaya: A collection of utilities for Statistics and Matrix-related manipulations
 * http://adilapapaya.com/papayastatistics.html, 1
 * Created by Adila Faruk, http://adilapapaya.com, May 1st 2012
 *
 *
 * Copyright (C) 2012 Adila Faruk http://adilapapaya.com 
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General
 * Public License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA  02111-1307  USA
 */
package papaya;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.io.*;
import processing.core.*;

/**
 * <p>Ranking based on the natural ordering on floats for a sequence of data that may also 
 * contain NaNs.</p>
 * <p>When present, NaNs are treated according to the configured NaNStrategy constants and ties
 * are handled using the configured tiesStrategy constants as follows:
 * 
 * <p> Strategies for handling NaN values in rank transformations.
 * <ul>
 * <li>0 (REMOVED, default) - NaNs are removed before the rank transform is applied</li> 
 * <li>1 (MINIMAL) - NaNs are treated as minimal in the ordering, equivalent to
 * (that is, tied with) <code>Float.NEGATIVE_INFINITY</code>.</li>
 * <li>2 (MAXIMAL) - NaNs are treated as maximal in the ordering, equivalent to
 * <code>Float.POSITIVE_INFINITY</code></li>
 * <li>3 (FIXED) - NaNs are left "in place," that is the rank transformation is
 * applied to the other elements in the input array, but the NaN elements
 * are returned unchanged.</li>
 * </ul>
 * </p>
 * 
 * <p>Strategies for handling tied values in rank transformations:
 * <ul>
 * <li>0 (AVERAGE, default) - Tied values are assigned the average of the applicable ranks.
 * For example, (1,3,4,3) is ranked as (1,2.5,4,2.5)</li>
 * <li>1 (MINIMUM) - Tied values are assigned the minimum applicable rank, or the rank
 * of the first occurrence. For example, (1,3,4,3) is ranked as (1,2,4,2)</li>
 * <li>2 (MAXIMUM) - Tied values are assigned the maximum applicable rank, or the rank
 * of the last occurrence. For example, (1,3,4,3) is ranked as (1,3,4,3)</li>
 * <li>3 (SEQUENTIAL) - Ties are assigned ranks in order of occurrence in the original array,
 * for example (1,3,4,3) is ranked as (1,2,4,3)</li>
 * </ul>
 * </p>
 *
 * The defaults are 0 (REMOVED) and 0 (AVERAGE) for the NaNStrategy and TiesStrategy respectively.
 *
 * <p>Examples:
 * <table border="1" cellpadding="3">
 * <tr><th colspan="3">
 * Input data: (20, 17, 30, 42.3, 17, 50, Float.NaN, Float.NEGATIVE_INFINITY, 17)
 * </th></tr>
 * <tr><th>NaNStrategy</th><th>TiesStrategy</th>
 * <th><code>rank(data)</code></th></tr>
 * <tr>
 * <td>0 (default = NaNs removed)</td>
 * <td>0 (default = ties averaged)</td>
 * <td>(5, 3, 6, 7, 3, 8, 1, 3)</td></tr>
 * <tr>
 * <td>0 (default = NaNs removed)</td>
 * <td>1 (MINIMUM)</td>
 * <td>(5, 2, 6, 7, 2, 8, 1, 2)</td></tr>
 * <tr>
 * <td>1 (MINIMAL)</td>
 * <td>0 (default = ties averaged)</td>
 * <td>(6, 4, 7, 8, 4, 9, 1.5, 1.5, 4)</td></tr>
 * <tr>
 * <td>1 (MINIMAL)</td>
 * <td>2 (MAXIMUM)</td>
 * <td>(6, 5, 7, 8, 5, 9, 2, 2, 5)</td></tr>
 * <tr>
 * <td>2 (MAXIMAL)</td>
 * <td>2 (MAXIMUM)</td>
 * <td>(5, 4, 6, 7, 4, 8, 9, 1, 4)</td></tr>
 * </table></p>
 *
 * (Code adapted from the <a href="http://commons.apache.org/math/userguide/stat.html#a1.6_Rank_transformations"
 * target="_blank">org.apache.commons.math.stat.ranking</a> package, and modified extensively).
 */
public class Rank {

    // NaN strategies ------------------------------
    /** NaNs are removed before computing ranks. Corresponds to the default. */
    private static final int REMOVED = 0;

    /** NaNs are considered minimal in the ordering (recoded to negative infinity). */
    private static final int MINIMAL = 1;

    /** NaNs are considered maximal in the ordering (recoded to positive infinity). */
    private static final int MAXIMAL = 2;

    /** NaNs are left in place. */
    private static final int FIXED = 3;

    // Ties strategies -----------------------------
    /** Ties get the average of applicable ranks. Corresponds to the default. */
    private static final int AVERAGE = 0;

    /** Ties get the minimum applicable rank. */
    private static final int MINIMUM = 1;

    /** Ties get the maximum applicable rank. */
    private static final int MAXIMUM = 2;

    /** Ties are assigned sequential ranks in order of occurrence. */
    private static final int SEQUENTIAL = 3;

    /**
     * Makes this class non instantiable, but still lets others inherit from it.
     */
    protected Rank() {
    }

    /**
     * Ranks an int array using the natural ordering on floats, with ties resolved
     * using <code>tiesStrategy</code>.
     *
     * <p>The values are cast to <code>float</code> before ranking, so the data can never
     * contain NaNs; <code>nanStrategy</code> is validated but otherwise has no effect.
     * Note that distinct ints too large to be represented exactly as a float may be
     * cast to the same float value, in which case they are ranked as ties.</p>
     *
     * <p>See {@link #rank(float[], int, int)} for a full description of both strategies.
     * If you do not need to pass a NaN strategy, use {@link #rank(int[], int)} instead.</p>
     *
     * @param data array to be ranked. This is cast to a float array prior to ranking.
     * @param tiesStrategy 0 (AVERAGE), 1 (MINIMUM), 2 (MAXIMUM) or 3 (SEQUENTIAL).
     * @param nanStrategy 0 (REMOVED), 1 (MINIMAL), 2 (MAXIMAL) or 3 (FIXED).
     * @return array of ranks, the same length as <code>data</code>
     * @throws IllegalArgumentException if either strategy is not one of 0, 1, 2 or 3
     */
    public static float[] rank(int[] data, int tiesStrategy, int nanStrategy) {
        checkNaNStrategy(nanStrategy);
        checkTiesStrategy(tiesStrategy);
        return sortAndRank(toPairs(data), tiesStrategy);
    }

    /**
     * Ranks an array that may contain NaN values using the natural ordering on floats, with
     * NaN values handled according to <code>nanStrategy</code> and ties
     * resolved using <code>tiesStrategy</code>.
     *
     * <p>Input values that specify which strategy to use for handling tied values in the
     * rank transformations:</p>
     * <ul>
     * <li>0 (AVERAGE, default) - Tied values are assigned the average of the applicable ranks.
     * For example, (1,3,4,3) is ranked as (1,2.5,4,2.5)</li>
     * <li>1 (MINIMUM) - Tied values are assigned the minimum applicable rank, or the rank
     * of the first occurrence. For example, (1,3,4,3) is ranked as (1,2,4,2)</li>
     * <li>2 (MAXIMUM) - Tied values are assigned the maximum applicable rank, or the rank
     * of the last occurrence. For example, (1,3,4,3) is ranked as (1,3,4,3)</li>
     * <li>3 (SEQUENTIAL) - Ties are assigned ranks in order of occurrence in the original array,
     * for example (1,3,4,3) is ranked as (1,2,4,3)</li>
     * </ul>
     *
     * <p>Input values that specify which strategy to use for handling NaN values in the
     * rank transformations:</p>
     * <ul>
     * <li>0 (REMOVED, default) - NaNs are removed before the rank transform is applied, so the
     * returned array is shorter than <code>data</code> by the number of NaNs</li>
     * <li>1 (MINIMAL) - NaNs are treated as minimal in the ordering, equivalent to
     * (that is, tied with) <code>Float.NEGATIVE_INFINITY</code>.</li>
     * <li>2 (MAXIMAL) - NaNs are treated as maximal in the ordering, equivalent to
     * <code>Float.POSITIVE_INFINITY</code></li>
     * <li>3 (FIXED) - NaNs are left "in place," that is the rank transformation is
     * applied to the other elements in the input array, but the NaN elements
     * are returned unchanged.</li>
     * </ul>
     *
     * <p>The input array is not modified. If the data array has no NaN values, use
     * {@link #rank(float[], int)} instead. It is quicker.</p>
     *
     * @param data array to be ranked
     * @param tiesStrategy 0,1,2 or 3 corresponding to the ties strategy to employ.
     * @param nanStrategy 0,1,2 or 3 corresponding to the NaN strategy to employ.
     * @return array of ranks
     * @throws IllegalArgumentException if either strategy is not one of 0, 1, 2 or 3
     */
    public static float[] rank(float[] data, int tiesStrategy, int nanStrategy) {
        checkNaNStrategy(nanStrategy);
        checkTiesStrategy(tiesStrategy);
        IntFloatPair[] ranks = toPairs(data);

        // Recode, remove or record positions of NaNs. This is done here rather than in a
        // helper so that the filtered array produced by REMOVED is the one that gets ranked.
        List<Integer> nanPositions = null;
        switch (nanStrategy) {
        case MAXIMAL: // Replace NaNs with +INFs
            recodeNaNs(ranks, Float.POSITIVE_INFINITY);
            break;
        case MINIMAL: // Replace NaNs with -INFs
            recodeNaNs(ranks, Float.NEGATIVE_INFINITY);
            break;
        case REMOVED: // Drop NaNs from data
            ranks = removeNaNs(ranks);
            break;
        case FIXED: // Record positions of NaNs
            nanPositions = getNanPositions(ranks);
            break;
        default: // unreachable: nanStrategy was validated above
            break;
        }

        float[] out = sortAndRank(ranks, tiesStrategy);
        if (nanPositions != null) {
            restoreNaNs(out, nanPositions);
        }
        return out;
    }

    /**
     * Ranks an int array using the natural ordering on floats with ties
     * resolved using <code>tiesStrategy</code>.
     *
     * <p>The values are cast to <code>float</code> before ranking. Distinct ints too large
     * to be represented exactly as a float may be cast to the same float value, in which
     * case they are ranked as ties. See {@link #rank(float[], int)} for a description of
     * the ties strategies.</p>
     *
     * @param data array to be ranked. The array is cast to a float array prior to ranking
     * @param tiesStrategy 0 (AVERAGE), 1 (MINIMUM), 2 (MAXIMUM) or 3 (SEQUENTIAL).
     * @return array of ranks, the same length as <code>data</code>
     * @throws IllegalArgumentException if <code>tiesStrategy</code> is not one of 0, 1, 2 or 3
     */
    public static float[] rank(int[] data, int tiesStrategy) {
        checkTiesStrategy(tiesStrategy);
        return sortAndRank(toPairs(data), tiesStrategy);
    }

    /**
     * Ranks an array (with no NaNs) using the natural ordering on floats with ties
     * resolved using <code>tiesStrategy</code>.
     *
     * <p>Input values that specify which strategy to use for handling tied values in the
     * rank transformations:</p>
     * <ul>
     * <li>0 (AVERAGE, default) - Tied values are assigned the average of the applicable ranks.
     * For example, (1,3,4,3) is ranked as (1,2.5,4,2.5)</li>
     * <li>1 (MINIMUM) - Tied values are assigned the minimum applicable rank, or the rank
     * of the first occurrence. For example, (1,3,4,3) is ranked as (1,2,4,2)</li>
     * <li>2 (MAXIMUM) - Tied values are assigned the maximum applicable rank, or the rank
     * of the last occurrence. For example, (1,3,4,3) is ranked as (1,3,4,3)</li>
     * <li>3 (SEQUENTIAL) - Ties are assigned ranks in order of occurrence in the original array,
     * for example (1,3,4,3) is ranked as (1,2,4,3)</li>
     * </ul>
     *
     * <p>No NaN handling is applied here: any NaNs present are ordered by
     * <code>Float.compare</code>, which places them after every other value and treats them
     * as equal to each other. Use {@link #rank(float[], int, int)} to control NaN handling.</p>
     *
     * @param data array to be ranked.
     * @param tiesStrategy the strategy to employ for ties.
     * @return array of ranks, the same length as <code>data</code>
     * @throws IllegalArgumentException if <code>tiesStrategy</code> is not one of 0, 1, 2 or 3
     */
    public static float[] rank(float[] data, int tiesStrategy) {
        checkTiesStrategy(tiesStrategy);
        return sortAndRank(toPairs(data), tiesStrategy);
    }

    /**
     * Pairs each float value with its index in the input array.
     *
     * @param data values to wrap
     * @return array of pairs, in input order, recording the initial position of each value
     */
    private static IntFloatPair[] toPairs(float[] data) {
        IntFloatPair[] pairs = new IntFloatPair[data.length];
        for (int i = 0; i < data.length; i++) {
            pairs[i] = new IntFloatPair(data[i], i);
        }
        return pairs;
    }

    /**
     * Pairs each int value, cast to float, with its index in the input array.
     *
     * @param data values to wrap
     * @return array of pairs, in input order, recording the initial position of each value
     */
    private static IntFloatPair[] toPairs(int[] data) {
        IntFloatPair[] pairs = new IntFloatPair[data.length];
        for (int i = 0; i < data.length; i++) {
            pairs[i] = new IntFloatPair((float) data[i], i);
        }
        return pairs;
    }

    /**
     * Sorts the pairs by value (in place) and computes the rank of every pair.
     *
     * @param ranks pairs to rank; their positions must be valid indices into an array of
     * length <code>ranks.length</code>
     * @param tiesStrategy the (already validated) ties strategy
     * @return the ranks, indexed by the position stored in each pair
     */
    private static float[] sortAndRank(IntFloatPair[] ranks, int tiesStrategy) {
        // Arrays.sort on objects is stable, so tied values keep their order of occurrence,
        // which is what the SEQUENTIAL ties strategy relies on.
        Arrays.sort(ranks);
        float[] out = new float[ranks.length];
        rankIt(ranks, out, tiesStrategy);
        return out;
    }

    /**
     * The actual ranking takes place here.
     *
     * @param ranks the IntFloatPairs, sorted by value
     * @param out the output ranks, filled in at the position stored in each pair
     * @param tiesStrategy the strategy used to resolve runs of equal values
     */
    private static void rankIt(IntFloatPair[] ranks, float[] out, int tiesStrategy) {
        if (ranks.length == 0) {
            return; // nothing to rank (empty input, or every value was a removed NaN)
        }
        int pos = 1; // position in sorted array

        // Walk the sorted array, filling output array using sorted positions,
        // resolving ties as we go
        out[ranks[0].getPosition()] = pos;
        List<Integer> tiesTrace = new ArrayList<Integer>();
        tiesTrace.add(ranks[0].getPosition());
        for (int i = 1; i < ranks.length; i++) {
            if (Float.compare(ranks[i].getValue(), ranks[i - 1].getValue()) > 0) {
                // tie sequence has ended (or had length 1)
                pos = i + 1;
                if (tiesTrace.size() > 1) { // if seq is nontrivial, resolve
                    resolveTie(out, tiesTrace, tiesStrategy);
                }
                tiesTrace = new ArrayList<Integer>();
            }
            // either starts a new sequence or continues the current tie sequence
            tiesTrace.add(ranks[i].getPosition());
            out[ranks[i].getPosition()] = pos;
        }
        if (tiesTrace.size() > 1) { // handle tie sequence at end
            resolveTie(out, tiesTrace, tiesStrategy);
        }
    }

    /* Check if the nanStrategy input is one of those specified */
    private static void checkNaNStrategy(int num) {
        if (num < 0 || num > 3) {
            throw new IllegalArgumentException("NaN strategy option has to be 0, 1, 2 or 3.");
        }
    }

    /* Check if the tiesStrategy input is one of those specified */
    private static void checkTiesStrategy(int num) {
        if (num < 0 || num > 3) {
            throw new IllegalArgumentException("Ties strategy option has to be 0, 1, 2 or 3.");
        }
    }

    /**
     * Returns a new array holding the non-NaN pairs of the input, in the same order, with
     * each position reduced by the number of NaN entries that preceded it. For an input
     * whose positions are 0..n-1 in order, the result's positions are 0..m-1 in order.
     * The input array is not modified.
     *
     * @param ranks input array
     * @return array with NaN-valued entries removed
     */
    private static IntFloatPair[] removeNaNs(IntFloatPair[] ranks) {
        IntFloatPair[] kept = new IntFloatPair[ranks.length];
        int keptCount = 0;
        int dropped = 0;
        for (IntFloatPair pair : ranks) {
            if (Float.isNaN(pair.getValue())) {
                dropped++;
            } else {
                // shift the position left past every NaN dropped so far
                kept[keptCount++] = new IntFloatPair(pair.getValue(), pair.getPosition() - dropped);
            }
        }
        return Arrays.copyOf(kept, keptCount);
    }

    /**
     * Recodes NaN values to the given value.
     *
     * @param ranks array to recode
     * @param value the value to replace NaNs with
     */
    private static void recodeNaNs(IntFloatPair[] ranks, float value) {
        for (int i = 0; i < ranks.length; i++) {
            if (Float.isNaN(ranks[i].getValue())) {
                ranks[i] = new IntFloatPair(value, ranks[i].getPosition());
            }
        }
    }

    /**
     * Checks for presence of NaNs in <code>ranks.</code>
     *
     * @param ranks array to be searched for NaNs
     * @return true iff ranks contains one or more NaNs
     */
    private static boolean containsNaNs(IntFloatPair[] ranks) {
        for (IntFloatPair pair : ranks) {
            if (Float.isNaN(pair.getValue())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Resolve a sequence of ties, using the given ties strategy.
     * The input <code>ranks</code> array is expected to take the same value
     * for all indices in <code>tiesTrace</code>.  The common value is recoded
     * according to the tiesStrategy. For example, if ranks = (5,8,2,6,2,7,1,2),
     * tiesTrace = (2,4,7) and tiesStrategy is MINIMUM, ranks will be unchanged.
     * The same array and trace with tiesStrategy AVERAGE will come out
     * (5,8,3,6,3,7,1,3).
     *
     * @param ranks array of ranks
     * @param tiesTrace list of indices where <code>ranks</code> is constant
     * -- that is, for any i and j in TiesTrace, <code> ranks[i] == ranks[j]
     * </code>
     * @param tiesStrategy 0 (AVERAGE), 1 (MINIMUM), 2 (MAXIMUM) or 3 (SEQUENTIAL)
     */
    private static void resolveTie(float[] ranks, List<Integer> tiesTrace, int tiesStrategy) {

        // constant value of ranks over tiesTrace (the minimum applicable rank)
        final float c = ranks[tiesTrace.get(0)];

        // length of sequence of tied ranks
        final int length = tiesTrace.size();

        switch (tiesStrategy) {
        case AVERAGE: // Replace ranks with average of c .. c + length - 1
            double tmp = (2 * c + length - 1) / 2d;
            fill(ranks, tiesTrace, (float) tmp);
            break;
        case MAXIMUM: // Replace ranks with maximum values
            fill(ranks, tiesTrace, c + length - 1);
            break;
        case MINIMUM: // Replace ties with minimum
            fill(ranks, tiesTrace, c);
            break;
        case SEQUENTIAL: // Fill sequentially from c to c + length - 1
            float f = (float) Math.round(c);
            int i = 0;
            for (Integer index : tiesTrace) {
                ranks[index] = f + i++;
            }
            break;
        default: // unreachable: tiesStrategy is validated by the public entry points
            break;
        }
    }

    /**
     * Sets <code>data[i] = value</code> for each i in <code>tiesTrace.</code>
     *
     * @param data array to modify
     * @param tiesTrace list of index values to set
     * @param value value to set
     */
    private static void fill(float[] data, List<Integer> tiesTrace, float value) {
        for (Integer index : tiesTrace) {
            data[index] = value;
        }
    }

    /**
     * Set <code>ranks[i] = Float.NaN</code> for each i in <code>nanPositions.</code>
     *
     * @param ranks array to modify
     * @param nanPositions list of index values to set to <code>Float.NaN</code>
     */
    private static void restoreNaNs(float[] ranks, List<Integer> nanPositions) {
        for (Integer index : nanPositions) {
            ranks[index] = Float.NaN;
        }
    }

    /**
     * Returns a list of indexes where <code>ranks</code> is <code>NaN.</code>
     *
     * @param ranks array to search for <code>NaNs</code>
     * @return list of indexes i such that <code>ranks[i] = NaN</code>
     */
    private static List<Integer> getNanPositions(IntFloatPair[] ranks) {
        List<Integer> out = new ArrayList<Integer>();
        for (int i = 0; i < ranks.length; i++) {
            if (Float.isNaN(ranks[i].getValue())) {
                out.add(Integer.valueOf(i));
            }
        }
        return out;
    }

    /**
     * Represents the position of a float value in an ordering.
     * Comparable interface is implemented so Arrays.sort can be used
     * to sort an array of IntFloatPairs by value.  Note that the
     * implicitly defined natural ordering is NOT consistent with equals.
     */
    private static class IntFloatPair implements Comparable<IntFloatPair> {

        /** Value of the pair */
        private final float value;

        /** Original position of the pair */
        private final int position;

        /**
         * Construct an IntFloatPair with the given value and position.
         * @param value the value of the pair
         * @param position the original position
         */
        public IntFloatPair(float value, int position) {
            this.value = value;
            this.position = position;
        }

        /**
         * Compare this IntFloatPair to another pair.
         * Only the <strong>values</strong> are compared.
         *
         * @param other the other pair to compare this to
         * @return result of <code>Float.compare(value, other.value)</code>
         */
        @Override
        public int compareTo(IntFloatPair other) {
            return Float.compare(value, other.value);
        }

        /**
         * Returns the value of the pair.
         * @return value
         */
        public float getValue() {
            return value;
        }

        /**
         * Returns the original position of the pair.
         * @return position
         */
        public int getPosition() {
            return position;
        }
    }
}

//public class Rank {
//   private int[] rank;//= new int[numProducts];
//   private int[] idx;// = new int[numProducts]; 
//
//   public Rank(float[] sortedArray) {
//      int size = sortedArray.length;
//      rankTemp = new int[size];
//      int i=0;
//      while(i<sortedArray.length)) {
//         int j =i+1; int numRepeats = 1;
//         int r =(i+1); 
//         rank[i] = i+1;
//         while(sortedArray[j++]==sortedArray[i]){
//            r+=j+1;
//            numRepeats++; j++;
//         }
//         r /=numRepeats;
//         for(int k=i; k < j; k++){
//            rank[k] = r;
//         }
//         i=j;         
//      }
//   }
//
//   public int[] getRank() {
//      return rank;
//   }
//
//   public int[] getIndexSort(){
//      return idx;
//   }
//}