au.org.ala.delta.intkey.model.SortingUtils.java Source code

Introduction

Here is the source code for au.org.ala.delta.intkey.model.SortingUtils.java
Source

/*******************************************************************************
 * Copyright (C) 2011 Atlas of Living Australia
 * All Rights Reserved.
 * 
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ******************************************************************************/
package au.org.ala.delta.intkey.model;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.math.FloatRange;

import au.org.ala.delta.best.DiagType;
import au.org.ala.delta.model.Attribute;
import au.org.ala.delta.model.Character;
import au.org.ala.delta.model.CharacterType;
import au.org.ala.delta.model.IntegerAttribute;
import au.org.ala.delta.model.IntegerCharacter;
import au.org.ala.delta.model.Item;
import au.org.ala.delta.model.MultiStateAttribute;
import au.org.ala.delta.model.MultiStateCharacter;
import au.org.ala.delta.model.RealAttribute;
import au.org.ala.delta.model.RealCharacter;
import au.org.ala.delta.model.TextCharacter;
import au.org.ala.delta.util.Pair;

/**
 * Utilities for sorting characters
 * 
 * @author ChrisF
 * 
 */
public class SortingUtils {

    /**
     * The kind of character ordering that doOrdering() is asked to perform.
     * BEST is computed over the available taxa as a whole; SEPARATE and
     * DIAGNOSE are computed relative to a single taxon.
     */
    private enum OrderingType {
        BEST, SEPARATE, DIAGNOSE
    }

    /**
     * Rounding threshold for separating powers: a separating power whose
     * absolute value is less than or equal to this is treated as zero.
     */
    private static double minimumSeparatingPower = 0.0001;

    // TODO change arguments, pass in something other than the entire context.
    /**
     * Computes the BEST ordering of the characters in the supplied context's
     * dataset, together with the separating power of each character.
     * 
     * @param context
     *            the application's global state
     * @return a map of characters to their separating powers. Iterating the
     *         key set of the returned map yields the characters in best
     *         order.
     */
    public static LinkedHashMap<au.org.ala.delta.model.Character, Double> orderBest(IntkeyContext context) {
        // The BEST ordering is not computed relative to any particular taxon.
        final Item noTaxon = null;
        return doOrdering(context, OrderingType.BEST, noTaxon);
    }

    /**
     * Computes the SEPARATE ordering of the characters in the supplied
     * context's dataset, i.e. the ordering evaluated relative to a single
     * taxon.
     * 
     * @param context
     *            the application's global state
     * @param taxonToSeparate
     *            the taxon that the ordering is computed against
     * @return a map of characters to their separating powers, iterated in the
     *         computed order
     */
    public static LinkedHashMap<au.org.ala.delta.model.Character, Double> orderSeparate(IntkeyContext context,
            Item taxonToSeparate) {
        final LinkedHashMap<Character, Double> separatingPowers = doOrdering(context, OrderingType.SEPARATE,
                taxonToSeparate);
        return separatingPowers;
    }

    /**
     * Computes the DIAGNOSE ordering of the characters in the supplied
     * context's dataset, i.e. the ordering evaluated relative to a single
     * taxon, with characters that have no separating power left out.
     * 
     * @param context
     *            the application's global state
     * @param taxonToSeparate
     *            the taxon that the ordering is computed against
     * @param diagType
     *            currently not consulted here: the ordering algorithm reads
     *            the diag type from the context (context.getDiagType()).
     *            NOTE(review): confirm whether this argument should take
     *            precedence over the context's value.
     * @return a map of characters to their separating powers, iterated in the
     *         computed order
     */
    public static LinkedHashMap<au.org.ala.delta.model.Character, Double> orderDiagnose(IntkeyContext context,
            Item taxonToSeparate, DiagType diagType) {
        // Must be DIAGNOSE (not SEPARATE): otherwise none of the
        // DIAGNOSE-specific handling in doOrdering() is ever reached.
        return doOrdering(context, OrderingType.DIAGNOSE, taxonToSeparate);
    }

    /**
     * Determines the order and separating power of the available characters
     * in the supplied context's dataset, using the requested ordering type.
     * <p>
     * Characters are left out of the result if their reliability is zero, if
     * they are text characters, if they are real characters and the dataset
     * has no key state boundaries for real characters, or if the algorithm
     * finds them unsuitable (for example because they place all taxa in a
     * single subgroup). Only taxa that are included and not eliminated are
     * taken into account.
     * 
     * @param context
     *            the application's global state
     * @param orderingType
     *            the kind of ordering to compute
     * @param taxonToSeparate
     *            the taxon the ordering is computed against. Required for
     *            SEPARATE and DIAGNOSE, not consulted for BEST.
     * @return a map of characters to their separating powers. The map is
     *         iterated in ascending order of character suitability value
     *         (ties broken by character number). An empty map is returned if
     *         there are no usable characters or no available taxa.
     * @throws IllegalArgumentException
     *             if the ordering type is SEPARATE or DIAGNOSE and
     *             taxonToSeparate is null
     */
    public static LinkedHashMap<au.org.ala.delta.model.Character, Double> doOrdering(IntkeyContext context,
            OrderingType orderingType, Item taxonToSeparate) {
        LinkedHashMap<Character, Double> retMap = new LinkedHashMap<Character, Double>();

        IntkeyDataset dataset = context.getDataset();

        List<Character> allCharacters = dataset.getCharactersAsList();

        // All three arrays are indexed by (character id - 1)
        final double[] suVals = new double[allCharacters.size()];
        double[] sepVals = new double[allCharacters.size()];

        double[] charCosts = new double[allCharacters.size()];

        // The cost of a character decreases as its reliability increases
        for (Character ch : allCharacters) {
            double charCost = Math.pow(context.getRBase(), 5.0 - Math.min(10.0, ch.getReliability()));
            charCosts[ch.getCharacterId() - 1] = charCost;
        }

        double varw = (1 - context.getVaryWeight()) / Math.max(context.getVaryWeight(), 0.01);

        // Build list of available characters
        List<Character> availableCharacters = new ArrayList<Character>(context.getAvailableCharacters());
        Collections.sort(availableCharacters);
        List<Character> ignoredCharacters = new ArrayList<Character>();
        for (Character ch : availableCharacters) {

            // TODO ignore EXACT characters that have been eliminated????

            // Ignore character if its reliability is zero
            boolean zeroReliability = ch.getReliability() == 0;

            // Ignore character if it is a text character
            boolean textCharacter = ch instanceof TextCharacter;

            // Ignore real characters if there are no key states
            // for real characters
            boolean realWithoutKeyStates = ch instanceof RealCharacter
                    && !dataset.realCharacterKeyStateBoundariesPresent();

            // A single test so that a character matching several of the rules
            // is only recorded once
            if (zeroReliability || textCharacter || realWithoutKeyStates) {
                ignoredCharacters.add(ch);
            }
        }
        availableCharacters.removeAll(ignoredCharacters);

        if (availableCharacters.isEmpty()) {
            // no available characters, so just return an empty map.
            return retMap;
        }

        // Build list of remaining taxa
        int numAvailableTaxa = 0;

        // Put includedTaxa and eliminated taxa into hash sets to speed up
        // membership lookup
        Set<Item> includedTaxa = new HashSet<Item>(context.getIncludedTaxa());
        Set<Item> eliminatedTaxa = new HashSet<Item>(context.getEliminatedTaxa());
        Map<Item, Boolean> taxaAvailability = new HashMap<Item, Boolean>();

        for (Item taxon : dataset.getItemsAsList()) {

            boolean ignore = false;

            if (eliminatedTaxa.contains(taxon)) {
                ignore = true;
            }

            // skip if taxon is not included
            if (!includedTaxa.contains(taxon)) {
                ignore = true;
            }

            // TODO skip if there are EXACT characters and this taxon has
            // been eliminated

            if (ignore) {
                taxaAvailability.put(taxon, false);
            } else {
                numAvailableTaxa++;
                taxaAvailability.put(taxon, true);
            }

        }

        if (numAvailableTaxa == 0) {
            // no taxa are available - return empty map
            return retMap;
        }

        // sort available characters by reliability (descending)
        Collections.sort(availableCharacters, new ReliabilityComparator());

        // minimum cost - this will always be the cost of the available
        // character with the greatest reliability
        double cmin = charCosts[availableCharacters.get(0).getCharacterId() - 1];

        List<Character> unsuitableCharacters = new ArrayList<Character>();

        // SEPARATE and DIAGNOSE are both evaluated against taxonToSeparate
        boolean relativeToTaxon = orderingType == OrderingType.SEPARATE || orderingType == OrderingType.DIAGNOSE;
        if (relativeToTaxon && taxonToSeparate == null) {
            throw new IllegalArgumentException(
                    "taxonToSeparate must not be null for ordering type " + orderingType);
        }

        characterLoop: for (Character ch : availableCharacters) {
            int sumNumTaxaInSubgroups = 0;
            double sumSubgroupsFrequencies = 0;
            int numSubgroupsSameSizeAsOriginalGroup = 0;
            double sup0 = 0; // theoretical partition component of sup.
            double dupf = 0; // arbitrary intra-taxon variability component of
                             // sup.
            double sep = 0; // separating power of the character
            double sup = 0; // total partition component of su. sup = sup0 +
                            // dupf
            double su = 0; // character suitability

            // NOTE: to simplify the algorithm, all characters are treated as
            // multistate characters. Integer and real
            // characters are converted into multistate representations.

            // Determine the total available states for each character
            int totalNumStates = 0;
            if (ch instanceof MultiStateCharacter) {
                totalNumStates = ((MultiStateCharacter) ch).getNumberOfStates();
            } else if (ch instanceof IntegerCharacter) {
                // for an integer character, 1 state for each value between
                // the minimum and
                // maximum (inclusive), 1 state for all values below the
                // minimum, and 1 state for
                // all values above the maximum
                IntegerCharacter intChar = (IntegerCharacter) ch;
                totalNumStates = intChar.getMaximumValue() - intChar.getMinimumValue() + 3;
            } else if (ch instanceof RealCharacter) {
                // the real character's key state boundaries are used to convert
                // a real value into a
                // multistate value (see below). The total number of possible
                // states is equal to the number of
                // key state boundaries.
                totalNumStates = ((RealCharacter) ch).getKeyStateBoundaries().size();
            } else {
                throw new RuntimeException("Invalid character type " + ch.toString());
            }

            // number of taxa in character subgroups
            int[] subgroupsNumTaxa = new int[totalNumStates];

            // frequency of character subgroups
            double[] subgroupFrequencies = new double[totalNumStates];

            List<Attribute> charAttributes = dataset.getAllAttributesForCharacter(ch.getCharacterId());

            // examine taxon to be diagnosed or separated first
            boolean[] taxonToSeparateStatePresence = new boolean[totalNumStates];

            // Starts at 1 to count taxonToSeparate itself; incremented below
            // for every other available taxon sharing a state with it
            int ndgSum = 1;
            if (relativeToTaxon) {
                Attribute attr = charAttributes.get(taxonToSeparate.getItemNumber() - 1);

                if (attr.isUnknown() && attr.isInapplicable()) {
                    unsuitableCharacters.add(ch);
                    continue;
                }

                taxonToSeparateStatePresence = getStatePresenceForAttribute(attr, totalNumStates, orderingType,
                        context.getDiagType()).getFirst();
            }

            for (Attribute attr : charAttributes) {
                Item taxon = attr.getItem();

                // Skip any attributes that pertain to taxa that are not
                // available. A taxon with no availability entry is treated as
                // unavailable rather than failing on unboxing null.
                if (!Boolean.TRUE.equals(taxaAvailability.get(taxon))) {
                    continue;
                }

                Pair<boolean[], Integer> statePresencePair = getStatePresenceForAttribute(attr, totalNumStates,
                        orderingType, context.getDiagType());

                boolean[] statePresence = statePresencePair.getFirst();
                int numStatesPresent = statePresencePair.getSecond();

                if (orderingType == OrderingType.BEST) {
                    // work out size of character subgroups.
                    for (int i = 0; i < totalNumStates; i++) {
                        if (statePresence[i]) {
                            subgroupsNumTaxa[i]++;

                            // frequency of items with current state of current
                            // character
                            subgroupFrequencies[i] += 1.0 / (double) numStatesPresent;
                        }
                    }
                } else {
                    for (int i = 0; i < totalNumStates; i++) {
                        if (statePresence[i]) {
                            subgroupsNumTaxa[i]++;
                        }
                    }

                    // count the other taxa that share at least one state with
                    // the taxon being separated
                    if (!taxon.equals(taxonToSeparate)) {
                        for (int i = 0; i < totalNumStates; i++) {
                            if (statePresence[i] && taxonToSeparateStatePresence[i]) {
                                ndgSum++;
                                break;
                            }
                        }
                    }
                }

            }

            if (orderingType == OrderingType.BEST) {
                // total number of non-empty character subgroups
                int totalNumSubgroups = 0;

                // work out sum of subgroup sizes and frequencies
                for (int i = 0; i < totalNumStates; i++) {
                    sumNumTaxaInSubgroups += subgroupsNumTaxa[i];
                    sumSubgroupsFrequencies += subgroupFrequencies[i];

                    if (subgroupsNumTaxa[i] > 0) {
                        totalNumSubgroups++;
                    }
                }

                for (int i = 0; i < totalNumStates; i++) {
                    int numTaxaInSubgroup = subgroupsNumTaxa[i];

                    if (numTaxaInSubgroup == sumNumTaxaInSubgroups) {
                        // character is unsuitable if it places the taxa into
                        // a single subgroup. Move straight on to the next
                        // character: nothing computed below would be used.
                        unsuitableCharacters.add(ch);
                        continue characterLoop;
                    }

                    if (numTaxaInSubgroup == numAvailableTaxa) {
                        numSubgroupsSameSizeAsOriginalGroup++;
                    }

                    if (numTaxaInSubgroup > 0) {
                        sup0 += (subgroupFrequencies[i] * log2(numTaxaInSubgroup));
                    }
                }

                boolean isControllingChar = !ch.getDependentCharacters().isEmpty();
                // TODO what is this test for???
                if (!isControllingChar && (totalNumSubgroups == numSubgroupsSameSizeAsOriginalGroup
                        || (sumNumTaxaInSubgroups > numAvailableTaxa
                                && numSubgroupsSameSizeAsOriginalGroup == totalNumStates))) {
                    unsuitableCharacters.add(ch);
                    continue;
                }

                sup0 = sup0 / sumSubgroupsFrequencies;

                if (numAvailableTaxa > 1 && sumNumTaxaInSubgroups > numAvailableTaxa) {
                    dupf = varw * (1 + 100 * numSubgroupsSameSizeAsOriginalGroup)
                            * (sumNumTaxaInSubgroups - numAvailableTaxa)
                            * ((numAvailableTaxa + 8) / (numAvailableTaxa * log2(numAvailableTaxa)));
                } else {
                    dupf = 0;
                }

                sep = -sup0 + log2(numAvailableTaxa);

                // handle rounding errors
                if (Math.abs(sep) <= minimumSeparatingPower) {
                    sep = 0.0;
                }

                // don't display controlling characters with 0 separation
                if (isControllingChar && sep == 0) {
                    unsuitableCharacters.add(ch);
                    continue;
                }

                sup = sup0 + dupf;
            } else { // SEPARATE or DIAGNOSE
                // total number of non-empty character subgroups
                int totalNumSubgroups = 0;

                for (int i = 0; i < totalNumStates; i++) {
                    sumNumTaxaInSubgroups += subgroupsNumTaxa[i];

                    if (subgroupsNumTaxa[i] > 0) {
                        totalNumSubgroups++;
                    }
                }

                for (int i = 0; i < totalNumStates; i++) {
                    int numTaxaInSubgroup = subgroupsNumTaxa[i];

                    // character is unsuitable if it places the taxa into a
                    // single subgroup. Move straight on to the next
                    // character: nothing computed below would be used.
                    if (numTaxaInSubgroup == sumNumTaxaInSubgroups) {
                        unsuitableCharacters.add(ch);
                        continue characterLoop;
                    }

                    if (numTaxaInSubgroup == numAvailableTaxa) {
                        numSubgroupsSameSizeAsOriginalGroup++;
                    }
                }

                // TODO what is this test for???
                if (orderingType == OrderingType.DIAGNOSE
                        && (totalNumSubgroups == numSubgroupsSameSizeAsOriginalGroup
                                || (sumNumTaxaInSubgroups > numAvailableTaxa
                                        && numSubgroupsSameSizeAsOriginalGroup == totalNumStates))) {
                    unsuitableCharacters.add(ch);
                    continue;
                }

                sup0 = log2(ndgSum);
                sep = -sup0 + log2(numAvailableTaxa);

                // handle rounding errors
                if (Math.abs(sep) <= minimumSeparatingPower) {
                    sep = 0.0;
                }

                // for DIAGNOSE, characters with zero separation are ignored
                if (sep <= 0.0 && orderingType == OrderingType.DIAGNOSE) {
                    unsuitableCharacters.add(ch);
                    continue;
                }

                sup = sup0;
            }

            su = charCosts[ch.getCharacterId() - 1] + cmin * sup;

            sepVals[ch.getCharacterId() - 1] = sep;
            suVals[ch.getCharacterId() - 1] = su;
        }

        availableCharacters.removeAll(unsuitableCharacters);

        // Order the remaining characters by ascending suitability value,
        // falling back to the character number when the values are equal
        List<Character> sortedChars = new ArrayList<Character>(availableCharacters);
        Collections.sort(sortedChars, new Comparator<Character>() {

            @Override
            public int compare(Character c1, Character c2) {
                double suValC1 = suVals[c1.getCharacterId() - 1];
                double suValC2 = suVals[c2.getCharacterId() - 1];

                if (suValC1 == suValC2) {
                    return Integer.valueOf(c1.getCharacterId()).compareTo(Integer.valueOf(c2.getCharacterId()));
                } else {
                    return Double.valueOf(suValC1).compareTo(Double.valueOf(suValC2));
                }
            }
        });

        for (Character ch : sortedChars) {
            retMap.put(ch, sepVals[ch.getCharacterId() - 1]);
        }

        return retMap;
    }

    /**
     * Computes the base-2 logarithm of the supplied value as the quotient of
     * two natural logarithms.
     * 
     * @param x
     *            the value to take the logarithm of
     * @return the base-2 logarithm of x
     */
    private static double log2(double x) {
        final double naturalLogOfTwo = Math.log(2);
        final double naturalLogOfX = Math.log(x);
        return naturalLogOfX / naturalLogOfTwo;
    }

    /**
     * Converts a real character's value range into a multistate
     * representation, using the real character's key state boundaries.
     * 
     * @param realChar
     *            The real character whose key state boundaries are used
     * @param realValue
     *            The range of values recorded for the real character
     * @param statePresence
     *            Output array, filled in by this method. An element is set to
     *            true when the corresponding key state is present; elements
     *            for absent key states are left untouched.
     * @return the number of key states present for the supplied real value
     */
    private static int generateKeyStatesForRealCharacter(RealCharacter realChar, FloatRange realValue,
            boolean[] statePresence) {
        List<Float> keyStateBoundaries = realChar.getKeyStateBoundaries();
        final int boundaryCount = keyStateBoundaries.size();

        final float lowest = realValue.getMinimumFloat();
        final float highest = realValue.getMaximumFloat();

        // Find the first boundary that is not below the bottom of the range
        int first = 0;
        while (first < boundaryCount && !(lowest <= keyStateBoundaries.get(first))) {
            first++;
        }

        if (first == boundaryCount) {
            // the bottom of the range lies above every boundary, so no key
            // state is marked
            return 0;
        }

        statePresence[first] = true;
        int present = 1;

        // Every further boundary that the top of the range exceeds makes the
        // following key state present as well
        for (int boundary = first; boundary < boundaryCount - 1; boundary++) {
            if (highest > keyStateBoundaries.get(boundary)) {
                statePresence[boundary + 1] = true;
                present++;
            }
        }

        return present;
    }

    /**
     * Works out which character states are present for the supplied
     * attribute.
     * <p>
     * For integer attributes, each value from one below the character's
     * minimum to one above its maximum is treated as a state. Real attributes
     * are converted to multistate representations using the character's key
     * state boundaries. An unknown attribute is treated as variable (all
     * states present), as is an inapplicable attribute when the ordering type
     * is SEPARATE, or DIAGNOSE with a diag type of SPECIMENS.
     * 
     * @param attr
     *            the attribute to examine
     * @param totalNumStates
     *            the number of states in the multistate representation of the
     *            attribute's character
     * @param orderingType
     *            the kind of ordering being computed
     * @param diagType
     *            the diag type, only consulted for inapplicable attributes
     *            when the ordering type is DIAGNOSE
     * @return a pair holding the state presence flags (one per state) and the
     *         number of states that are present
     */
    private static Pair<boolean[], Integer> getStatePresenceForAttribute(Attribute attr, int totalNumStates,
            OrderingType orderingType, DiagType diagType) {
        Character character = attr.getCharacter();

        // one flag per character state, all initially absent
        boolean[] presence = new boolean[totalNumStates];

        boolean inapplicableIsVariable = orderingType == OrderingType.SEPARATE
                || (orderingType == OrderingType.DIAGNOSE && diagType == DiagType.SPECIMENS);

        if (attr.isUnknown() || (attr.isInapplicable() && inapplicableIsVariable)) {
            // treat attribute as variable
            Arrays.fill(presence, true);
            return new Pair<boolean[], Integer>(presence, totalNumStates);
        }

        CharacterType type = character.getCharacterType();
        int presentCount;

        if (type == CharacterType.OrderedMultiState || type == CharacterType.UnorderedMultiState) {
            // state numbers start at 1
            Set<Integer> presentStates = ((MultiStateAttribute) attr).getPresentStates();
            presentCount = flagPresentValues(presentStates, 1, presence);
        } else if (type == CharacterType.IntegerNumeric) {
            IntegerCharacter intChar = (IntegerCharacter) character;
            Set<Integer> presentValues = ((IntegerAttribute) attr).getPresentValues();

            // the first state stands for the value just below the
            // character's minimum, the last one for the value just above its
            // maximum
            presentCount = flagPresentValues(presentValues, intChar.getMinimumValue() - 1, presence);
        } else if (type == CharacterType.RealNumeric) {
            RealCharacter realChar = (RealCharacter) character;
            FloatRange presentRange = ((RealAttribute) attr).getPresentRange();

            // convert real value into multistate value.
            presentCount = generateKeyStatesForRealCharacter(realChar, presentRange, presence);
        } else {
            throw new RuntimeException("Invalid character type " + character.toString());
        }

        return new Pair<boolean[], Integer>(presence, presentCount);
    }

    /**
     * Flags every state whose value is contained in the supplied set.
     * 
     * @param presentValues
     *            the values that are present
     * @param valueOfFirstState
     *            the value represented by the first element of statePresence;
     *            each following element represents the next higher value
     * @param statePresence
     *            the flags to fill in; only elements for present values are
     *            modified
     * @return the number of flags that were set
     */
    private static int flagPresentValues(Set<Integer> presentValues, int valueOfFirstState,
            boolean[] statePresence) {
        int flagged = 0;
        for (int state = 0; state < statePresence.length; state++) {
            if (presentValues.contains(state + valueOfFirstState)) {
                statePresence[state] = true;
                flagged++;
            }
        }
        return flagged;
    }

    /**
     * Orders characters by reliability, most reliable first. Used when
     * sorting the available characters.
     * 
     * @author ChrisF
     * 
     */
    private static class ReliabilityComparator implements Comparator<Character> {

        @Override
        public int compare(Character c1, Character c2) {
            float firstReliability = c1.getReliability();
            float secondReliability = c2.getReliability();

            // operands are swapped to get descending order
            return Float.compare(secondReliability, firstReliability);
        }
    }
}