// Java tutorial
/*******************************************************************************
 * Copyright (C) 2011 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ******************************************************************************/
package au.org.ala.delta.model;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.FloatRange;

/**
 * Utility methods used to compare character values. Used by the DIFFERENCES,
 * SIMILARITIES and USE intkey directives, among others.
 *
 * @author ChrisF
 *
 */
public class DiffUtils {

    /**
     * Determines the characters whose coded values differ between the supplied
     * taxa (and the specimen, if one is supplied). This is the logic used by
     * the differences directive. It is located here to aid unit testing.
     *
     * @param dataset
     *            the dataset the attributes are read from
     * @param characters
     *            the characters to examine
     * @param taxa
     *            the taxa to compare
     * @param specimen
     *            the specimen to include in the comparison, or null. When
     *            non-null, characters for which the specimen has no value are
     *            skipped.
     * @param matchUnknowns
     *            passed through to
     *            {@link #compareForTaxa(DeltaDataSet, Character, List, Specimen, boolean, boolean, MatchType)}
     * @param matchInapplicables
     *            passed through to
     *            {@link #compareForTaxa(DeltaDataSet, Character, List, Specimen, boolean, boolean, MatchType)}
     * @param matchType
     *            the match type - exact, subset or overlap
     * @param omitTextCharacters
     *            true if text characters should be skipped
     * @return the characters, in input order, for which the comparison did not
     *         report a match
     */
    public static List<Character> determineDifferingCharactersForTaxa(DeltaDataSet dataset, List<Character> characters, List<Item> taxa, Specimen specimen, boolean matchUnknowns,
            boolean matchInapplicables, MatchType matchType, boolean omitTextCharacters) {
        List<Character> differencesList = new ArrayList<Character>();

        for (au.org.ala.delta.model.Character ch : characters) {
            // If the specimen is included in the comparison, ignore any
            // characters for which there is no value set in the specimen
            if (specimen != null && !specimen.hasValueFor(ch)) {
                continue;
            }

            if (ch instanceof TextCharacter && omitTextCharacters) {
                continue;
            }

            if (!compareForTaxa(dataset, ch, taxa, specimen, matchUnknowns, matchInapplicables, matchType)) {
                differencesList.add(ch);
            }
        }

        return differencesList;
    }

    /**
     * Determines the characters whose coded values match across the supplied
     * taxa (and the specimen, if one is supplied).
     *
     * @param dataset
     *            the dataset the attributes are read from
     * @param characters
     *            the characters to examine
     * @param taxa
     *            the taxa to compare
     * @param specimen
     *            the specimen to include in the comparison, or null. When
     *            non-null, characters for which the specimen has no value are
     *            skipped.
     * @param matchUnknowns
     *            passed through to
     *            {@link #compareForTaxa(DeltaDataSet, Character, List, Specimen, boolean, boolean, MatchType)}
     * @param matchInapplicables
     *            passed through to
     *            {@link #compareForTaxa(DeltaDataSet, Character, List, Specimen, boolean, boolean, MatchType)}
     * @param matchType
     *            the match type - exact, subset or overlap
     * @return the characters, in input order, for which the comparison
     *         reported a match
     */
    public static List<Character> determineSimilaritiesForTaxa(DeltaDataSet dataset, List<Character> characters, List<Item> taxa, Specimen specimen, boolean matchUnknowns,
            boolean matchInapplicables, MatchType matchType) {
        List<Character> similaritiesList = new ArrayList<Character>();

        for (au.org.ala.delta.model.Character ch : characters) {
            // If the specimen is included in the comparison, ignore any
            // characters for which there is no value set in the specimen
            if (specimen != null && !specimen.hasValueFor(ch)) {
                continue;
            }

            if (compareForTaxa(dataset, ch, taxa, specimen, matchUnknowns, matchInapplicables, matchType)) {
                similaritiesList.add(ch);
            }
        }

        return similaritiesList;
    }

    /**
     * Compare the values coded for the supplied character for each of the
     * supplied taxa. If supplied, also include the specimen in the comparison
     *
     * @param dataset
     *            the currently loaded dataset
     * @param ch
     *            the character
     * @param taxa
     *            the list of taxa
     * @param specimen
     *            the specimen, or null if no specimen takes part in the
     *            comparison
     * @param matchUnknowns
     *            true if unknown matches any value
     * @param matchInapplicables
     *            true if inapplicable matches any value
     * @param matchType
     *            the match type - exact, subset or overlap
     * @return true if the values coded for the specified character match for
     *         all of the supplied taxa, as well as for the specimen if it was
     *         supplied.
     * @throws RuntimeException
     *             if the character is not a multistate, integer, real or text
     *             character and more than one coded value has to be compared
     */
    public static boolean compareForTaxa(DeltaDataSet dataset, Character ch, List<Item> taxa, Specimen specimen, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        int countUnknown = 0;
        int countNotApplicable = 0;

        List<Attribute> allAttrsForChar = dataset.getAllAttributesForCharacter(ch.getCharacterId());

        // The coded (i.e. not unknown) attributes that take part in the
        // comparison.
        List<Attribute> attrs = new ArrayList<Attribute>();

        for (Item t : taxa) {
            // Item numbers are one-based, the attribute list is zero-based.
            Attribute attr = allAttrsForChar.get(t.getItemNumber() - 1);
            if (attr.isUnknown()) {
                if (attr.isInapplicable()) {
                    countNotApplicable++;
                } else {
                    countUnknown++;
                }
            } else {
                attrs.add(attr);
            }
        }

        if (specimen != null) {
            if (!specimen.hasValueFor(ch)) {
                if (specimen.isCharacterInapplicable(ch)) {
                    countNotApplicable++;
                } else {
                    countUnknown++;
                }
            } else {
                // Use an attribute containing the same value as the specimen
                // value. This is done to simplify the comparison code as only
                // the one datatype then needs to be handled.
                attrs.add(specimen.getAttributeForCharacter(ch));
            }
        }

        // NOTE(review): the checks below compare against taxa.size() even when
        // a specimen is included, so the specimen is not part of the expected
        // total - confirm whether that is intended before changing it.

        // If all attributes are either unknown or inapplicable, this is
        // considered a match.
        if (countUnknown + countNotApplicable == taxa.size()) {
            return true;
        }

        if (countUnknown > 0 && !matchUnknowns) {
            return countUnknown == taxa.size();
        }

        if (countNotApplicable > 0 && !matchInapplicables) {
            return countNotApplicable == taxa.size();
        }

        // Fewer than two coded values: there is nothing to disagree.
        if (attrs.size() <= 1) {
            return true;
        }

        if (ch instanceof MultiStateCharacter || ch instanceof IntegerCharacter) {
            return doCompareMultistateOrIntegerForTaxa(attrs, matchUnknowns, matchInapplicables, matchType);
        } else if (ch instanceof RealCharacter) {
            List<RealAttribute> realAttrs = new ArrayList<RealAttribute>();
            for (Attribute attr : attrs) {
                realAttrs.add((RealAttribute) attr);
            }
            return doCompareRealForTaxa(realAttrs, matchUnknowns, matchInapplicables, matchType);
        } else if (ch instanceof TextCharacter) {
            List<TextAttribute> textAttrs = new ArrayList<TextAttribute>();
            for (Attribute attr : attrs) {
                textAttrs.add((TextAttribute) attr);
            }
            return doCompareTextForTaxa(textAttrs, matchUnknowns, matchInapplicables, matchType);
        } else {
            throw new RuntimeException("Unrecognised character type");
        }
    }

    // comparator used to sort attributes by some value, e.g. number of states
    // or text length. Every attribute being sorted must have an entry in the
    // supplied map.
    private static class AttributeValueSizeComparator implements Comparator<Attribute> {

        private final Map<Attribute, Double> _valueSizes;

        public AttributeValueSizeComparator(Map<Attribute, Double> valueSizes) {
            _valueSizes = valueSizes;
        }

        @Override
        public int compare(Attribute a1, Attribute a2) {
            // Double.compare gives a consistent total order, including for
            // NaN, which a hand written </== chain does not.
            return Double.compare(_valueSizes.get(a1), _valueSizes.get(a2));
        }
    }

    // Returns the present states of a multistate attribute or the present
    // values of an integer attribute.
    private static Set<Integer> presentValuesOf(Attribute attr, boolean typeMultistate) {
        if (typeMultistate) {
            return ((MultiStateAttribute) attr).getPresentStates();
        }
        return ((IntegerAttribute) attr).getPresentValues();
    }

    // Helper method to compare lists of multistate or integer attributes.
    // "attrs" list must be non-empty and contain all multistates or all
    // integers, otherwise a class cast exception will be thrown. The list is
    // sorted in place.
    private static boolean doCompareMultistateOrIntegerForTaxa(List<Attribute> attrs, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        boolean typeMultistate = attrs.get(0) instanceof MultiStateAttribute;

        // sort attributes in ascending order based on the number of states
        // present
        Map<Attribute, Double> attrNumberOfStates = new HashMap<Attribute, Double>();
        for (Attribute attr : attrs) {
            attrNumberOfStates.put(attr, (double) presentValuesOf(attr, typeMultistate).size());
        }
        Collections.sort(attrs, new AttributeValueSizeComparator(attrNumberOfStates));

        if (matchType == MatchType.EXACT || matchType == MatchType.SUBSET) {
            // Each attribute is compared with the next larger one.
            for (int i = 0; i < attrs.size() - 1; i++) {
                boolean same;
                if (typeMultistate) {
                    same = compareMultistate((MultiStateAttribute) attrs.get(i), (MultiStateAttribute) attrs.get(i + 1), matchUnknowns, matchInapplicables, matchType);
                } else {
                    same = compareInteger((IntegerAttribute) attrs.get(i), (IntegerAttribute) attrs.get(i + 1), matchUnknowns, matchInapplicables, matchType);
                }
                if (!same) {
                    return false;
                }
            }
            return true;
        }

        // overlap - every pair of taxa must have at least one state in common.
        // Fetch the value sets once up front.
        List<Set<Integer>> valueSets = new ArrayList<Set<Integer>>(attrs.size());
        for (Attribute attr : attrs) {
            valueSets.add(presentValuesOf(attr, typeMultistate));
        }

        // Note that each pair only needs to be compared once.
        for (int i = 0; i < valueSets.size(); i++) {
            for (int j = i + 1; j < valueSets.size(); j++) {
                if (Collections.disjoint(valueSets.get(i), valueSets.get(j))) {
                    return false;
                }
            }
        }

        return true;
    }

    // Helper method to compare a list of real attributes. The list is sorted
    // in place.
    private static boolean doCompareRealForTaxa(List<RealAttribute> attrs, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        // sort attributes in ascending order according to range length
        Map<Attribute, Double> attrRangeLengths = new HashMap<Attribute, Double>();
        for (RealAttribute attr : attrs) {
            FloatRange range = attr.getPresentRange();
            attrRangeLengths.put(attr, (double) (range.getMaximumFloat() - range.getMinimumFloat()));
        }
        Collections.sort(attrs, new AttributeValueSizeComparator(attrRangeLengths));

        if (matchType == MatchType.EXACT || matchType == MatchType.SUBSET) {
            // Each attribute is compared with the next larger one.
            for (int i = 0; i < attrs.size() - 1; i++) {
                if (!compareReal(attrs.get(i), attrs.get(i + 1), matchUnknowns, matchInapplicables, matchType)) {
                    return false;
                }
            }
            return true;
        }

        // overlap - the ranges of every pair of taxa must overlap
        int numAttrs = attrs.size();
        for (int i = 0; i < numAttrs; i++) {
            FloatRange a1Range = attrs.get(i).getPresentRange();
            for (int j = i + 1; j < numAttrs; j++) {
                FloatRange a2Range = attrs.get(j).getPresentRange();
                if (!a2Range.overlapsRange(a1Range)) {
                    return false;
                }
            }
        }

        return true;
    }

    // Helper method to compare a list of text attributes. The list is sorted
    // in place and each attribute is compared with the next longer one.
    private static boolean doCompareTextForTaxa(List<TextAttribute> attrs, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        // sort attributes in ascending order according to text length
        Map<Attribute, Double> attrTextLengths = new HashMap<Attribute, Double>();
        for (TextAttribute attr : attrs) {
            attrTextLengths.put(attr, (double) attr.getText().length());
        }
        Collections.sort(attrs, new AttributeValueSizeComparator(attrTextLengths));

        for (int i = 0; i < attrs.size() - 1; i++) {
            if (!compareText(attrs.get(i), attrs.get(i + 1), matchUnknowns, matchInapplicables, matchType)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Compare two multistate values. Both values must correspond to the same
     * character
     *
     * @param attr1
     *            first multistate value
     * @param attr2
     *            second multistate value
     * @param matchUnknowns
     *            true if unknown matches any value
     * @param matchInapplicables
     *            true if inapplicable matches any value
     * @param matchType
     *            the match type - exact, subset (attr1 within attr2) or
     *            overlap
     * @return true if the two values match
     * @throws IllegalArgumentException
     *             if the two attributes belong to different characters
     */
    public static boolean compareMultistate(MultiStateAttribute attr1, MultiStateAttribute attr2, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        if (!attr1.getCharacter().equals(attr2.getCharacter())) {
            throw new IllegalArgumentException(String.format("Specimen value character %s does not match attribute character %s", attr1.getCharacter(), attr2.getCharacter()));
        }

        return doCompareMultiStateOrInteger(attr1.getPresentStates(), attr2.getPresentStates(), attr1.isUnknown(), attr2.isUnknown(), attr1.isInapplicable(), attr2.isInapplicable(),
                matchUnknowns, matchInapplicables, matchType);
    }

    /**
     * Compare two integer values. Both values must correspond to the same
     * character
     *
     * @param attr1
     *            first integer value
     * @param attr2
     *            second integer value
     * @param matchUnknowns
     *            true if unknown matches any value
     * @param matchInapplicables
     *            true if inapplicable matches any value
     * @param matchType
     *            the match type - exact, subset (attr1 within attr2) or
     *            overlap
     * @return true if the two values match
     * @throws IllegalArgumentException
     *             if the two attributes belong to different characters
     */
    public static boolean compareInteger(IntegerAttribute attr1, IntegerAttribute attr2, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        if (!attr1.getCharacter().equals(attr2.getCharacter())) {
            throw new IllegalArgumentException(String.format("Specimen value character %s does not match attribute character %s", attr1.getCharacter(), attr2.getCharacter()));
        }

        return doCompareMultiStateOrInteger(attr1.getPresentValues(), attr2.getPresentValues(), attr1.isUnknown(), attr2.isUnknown(), attr1.isInapplicable(), attr2.isInapplicable(),
                matchUnknowns, matchInapplicables, matchType);
    }

    // Shared implementation for multistate and integer comparisons, working on
    // the sets of present states/values.
    private static boolean doCompareMultiStateOrInteger(Set<Integer> attr1Values, Set<Integer> attr2Values, boolean attr1Unknown, boolean attr2Unknown, boolean attr1Inapplicable,
            boolean attr2Inapplicable, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {

        // If both attributes are unknown or inapplicable this is considered a
        // match. Otherwise, the return value depends on setting for
        // matchInapplicable or matchUnknown.
        if ((attr1Unknown || attr1Inapplicable) && (attr2Unknown || attr2Inapplicable)) {
            return true;
        }

        if ((attr1Unknown && attr1Inapplicable) || (attr2Unknown && attr2Inapplicable)) {
            return matchInapplicables;
        }

        if ((attr1Unknown && !attr1Inapplicable) || (attr2Unknown && !attr2Inapplicable)) {
            return matchUnknowns;
        }

        boolean match = false;

        switch (matchType) {
        case EXACT:
            match = attr1Values.equals(attr2Values);
            break;
        case SUBSET:
            // is the first a subset of the second
            match = attr2Values.containsAll(attr1Values);
            break;
        case OVERLAP:
            for (int stateVal : attr1Values) {
                if (attr2Values.contains(stateVal)) {
                    match = true;
                    break;
                }
            }
            break;
        default:
            throw new RuntimeException(String.format("Unrecognized match type %s", matchType.toString()));
        }

        return match;
    }

    /**
     * Compare two real values. Both values must correspond to the same
     * character
     *
     * @param attr1
     *            first real value
     * @param attr2
     *            second real value
     * @param matchUnknowns
     *            true if unknown matches any value
     * @param matchInapplicables
     *            true if inapplicable matches any value
     * @param matchType
     *            the match type - exact, subset (attr1 within attr2) or
     *            overlap
     * @return true if the two values match
     * @throws IllegalArgumentException
     *             if the two attributes belong to different characters
     */
    public static boolean compareReal(RealAttribute attr1, RealAttribute attr2, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        if (!attr1.getCharacter().equals(attr2.getCharacter())) {
            throw new IllegalArgumentException(String.format("Specimen value character %s does not match attribute character %s", attr1.getCharacter(), attr2.getCharacter()));
        }

        return doCompareRange(attr1.getPresentRange(), attr2.getPresentRange(), attr1.isUnknown(), attr2.isUnknown(), attr1.isInapplicable(), attr2.isInapplicable(), matchUnknowns,
                matchInapplicables, matchType);
    }

    // Implementation of the real comparison, working on the present ranges.
    private static boolean doCompareRange(FloatRange attr1Range, FloatRange attr2Range, boolean attr1Unknown, boolean attr2Unknown, boolean attr1Inapplicable, boolean attr2Inapplicable,
            boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {

        // If both attributes are unknown or inapplicable this is considered a
        // match. Otherwise, the return value depends on setting for
        // matchInapplicable or matchUnknown.
        if ((attr1Unknown || attr1Inapplicable) && (attr2Unknown || attr2Inapplicable)) {
            return true;
        }

        if ((attr1Unknown && attr1Inapplicable) || (attr2Unknown && attr2Inapplicable)) {
            return matchInapplicables;
        }

        if ((attr1Unknown && !attr1Inapplicable) || (attr2Unknown && !attr2Inapplicable)) {
            return matchUnknowns;
        }

        boolean match = false;

        switch (matchType) {
        case EXACT:
            match = attr1Range.equals(attr2Range);
            break;
        case SUBSET:
            // is the first a subset of the second
            match = attr2Range.containsRange(attr1Range);
            break;
        case OVERLAP:
            match = attr1Range.overlapsRange(attr2Range);
            break;
        default:
            throw new RuntimeException(String.format("Unrecognized match type %s", matchType.toString()));
        }

        return match;
    }

    /**
     * Compare two text values. Both values must correspond to the same
     * character. The following rules apply:
     *
     * 1. MATCH INAPPLICABLE and MATCH UNKNOWN are ignored. If both values are
     * unknown or inapplicable they match; if only one of them is, they do not
     * match.
     *
     * 2. The text of the first value may consist of a number of sub-strings
     * separated by '/'. For MATCH EXACT the two texts must be equal. For MATCH
     * SUBSET, each sub-string must exist separately in the text of the second
     * value. For MATCH OVERLAP, the presence of any sub-string will result in
     * a match. All comparisons ignore case.
     *
     * @param attr1
     *            first text value - the text to be found
     * @param attr2
     *            second text value - the text to be searched
     * @param matchUnknowns
     *            not used for text values
     * @param matchInapplicables
     *            not used for text values
     * @param matchType
     *            the match type - exact, subset or overlap
     * @return true if the two values match
     * @throws IllegalArgumentException
     *             if the two attributes belong to different characters
     */
    public static boolean compareText(TextAttribute attr1, TextAttribute attr2, boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {
        if (!attr1.getCharacter().equals(attr2.getCharacter())) {
            throw new IllegalArgumentException(String.format("Specimen value character %s does not match attribute character %s", attr1.getCharacter(), attr2.getCharacter()));
        }

        // The text for the first attribute passed in may be multiple values set
        // for a text character in a specimen, delimited by "/".
        List<String> attr1Values = Arrays.asList(attr1.getText().split("/"));

        return doCompareText(attr1Values, attr2.getText(), attr1.isUnknown(), attr2.isUnknown(), attr1.isInapplicable(), attr2.isInapplicable(), matchUnknowns, matchInapplicables,
                matchType);
    }

    /**
     * Compares two text values, ignoring case, applying the following rules -
     *
     * 1. MATCH INAPPLICABLE and MATCH UNKNOWN are ignored. If both values are
     * unknown or inapplicable they match; if only one of them is, they do not
     * match.
     *
     * 2. For MATCH EXACT the sub-strings joined with '/' must equal the
     * searched text. For MATCH SUBSET, each sub-string must exist separately
     * in the searched text. For MATCH OVERLAP, the presence of any sub-string
     * will result in a match.
     *
     * @param attr1Values
     *            the sub-strings to be found
     * @param attr2Value
     *            the text to be searched
     * @param attr1Unknown
     *            true if the first value is unknown
     * @param attr2Unknown
     *            true if the second value is unknown
     * @param attr1Inapplicable
     *            true if the first value is inapplicable
     * @param attr2Inapplicable
     *            true if the second value is inapplicable
     * @param matchUnknowns
     *            not used
     * @param matchInapplicables
     *            not used
     * @param matchType
     *            the match type - exact, subset or overlap
     * @return true if the values match
     */
    private static boolean doCompareText(List<String> attr1Values, String attr2Value, boolean attr1Unknown, boolean attr2Unknown, boolean attr1Inapplicable, boolean attr2Inapplicable,
            boolean matchUnknowns, boolean matchInapplicables, MatchType matchType) {

        // If both attributes are unknown or inapplicable this is considered a
        // match.
        if ((attr1Unknown || attr1Inapplicable) && (attr2Unknown || attr2Inapplicable)) {
            return true;
        }

        // One attribute unknown and inapplicable always equates to no match
        // for text attributes
        if ((attr1Unknown && attr1Inapplicable) || (attr2Unknown && attr2Inapplicable)) {
            return false;
        }

        if ((attr1Unknown && !attr1Inapplicable) || (attr2Unknown && !attr2Inapplicable)) {
            return false;
        }

        boolean match = false;

        // Lower-cased once; every branch below compares case-insensitively.
        String searchedText = attr2Value.toLowerCase();

        switch (matchType) {
        case EXACT:
            match = StringUtils.join(attr1Values, "/").toLowerCase().equals(searchedText);
            // Without this break the result was overwritten by the SUBSET
            // logic below, turning an exact match into a substring match.
            break;
        case SUBSET:
            match = true;
            for (String txtVal : attr1Values) {
                if (!searchedText.contains(txtVal.toLowerCase())) {
                    match = false;
                    break;
                }
            }
            break;
        case OVERLAP:
            for (String txtVal : attr1Values) {
                if (searchedText.contains(txtVal.toLowerCase())) {
                    match = true;
                    break;
                }
            }
            break;
        default:
            throw new RuntimeException(String.format("Unrecognized match type %s", matchType.toString()));
        }

        return match;
    }
}