Java tutorial
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package meka.core; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; import weka.core.Utils; import java.util.*; /** * PSUtils.java - Handy Utils for working with Pruned Sets. * Essentially, we have a <code>P</code> parameter for pruning and an <code>N</code> parameter for reintroduction. * @author Jesse Read * @version June 2014 */ public abstract class PSUtils { /** * Sum Counts - sum all the values in 'map'. */ public static int sumCounts(HashMap<LabelSet, Integer> map) { int c = 0; for (Integer c_ : map.values()) { c = c + c_; } return c; } /** * Count Subsets - returns the number of times labelset 'ysub' exists as a subset in 'Y'. */ public static int countSubsets(LabelSet ysub, Set<LabelSet> Y) { int c = 0; for (LabelSet s : Y) { if (ysub.subsetof(s) > 0) c++; } return c; } /** * Get Subsets - get all subsets of 'y' in the set 'set'. */ public static Set<LabelSet> getSubsets(LabelSet y, Set<LabelSet> set) { Set<LabelSet> subsets = new HashSet<LabelSet>(); for (LabelSet s : set) { // is it a subset? int m = LabelSet.subset(s.indices, y.indices); if (m > 0) { // it is! subsets.add(s); } } return subsets; } /** * Get Sorted Subsets - get all subsets of 'y' in the set 'set'; sorted according to 'cmp'. */ public static SortedSet<LabelSet> getSortedSubsets(LabelSet y, Set<LabelSet> set, Comparator cmp) { SortedSet<LabelSet> subsets = new TreeSet<LabelSet>(cmp); for (LabelSet s : set) { // is it a subset? int m = LabelSet.subset(s.indices, y.indices); if (m > 0) { // it is! subsets.add(s); } } return subsets; } /** * Get Sorted Subsets - get all subsets of 'y' in the set 'set'; sorted according to length, and counts in 'map'. */ public static SortedSet<LabelSet> getSortedSubsets(LabelSet y, HashMap<LabelSet, Integer> map) { return getSortedSubsets(y, map.keySet(), new LabelSetComparator(map)); } /** * Cover - cover 'y' completely (or as best as possible) with sets from 'map'. * @param y a LabelSet, e.g., [0,2,7] * @param map a map of LabelSets to counts e.g., {[0,2,7]:39,...} * @return the sets to cover y (or just y, if it already covers itself). */ public static LabelSet[] cover(LabelSet y, HashMap<LabelSet, Integer> map) { Integer count = map.get(y); if (count != null && count >= 1) { return new LabelSet[] { y }; } else { // Find some matches (i.e., subsets) Comparator cmp = new LabelSetComparator(map); SortedSet<LabelSet> allS = getSortedSubsets(y, map.keySet(), cmp); Set<LabelSet> covS = cover(y, allS, cmp); return covS.toArray(new LabelSet[0]); } } public static Set<LabelSet> cover(LabelSet y, SortedSet<LabelSet> S, Comparator cmp) { LabelSet y_copy = y.deep_copy(); Set<LabelSet> K = new HashSet<LabelSet>(); // While we have more, and not covered, ... while (S.size() > 0 && y_copy.indices.length > 0) { //System.out.println("y = "+y_copy); //System.out.println("S = "+S); LabelSet s_ = S.last(); //System.out.println("s_ = "+s_); K.add(s_); // add s_ to new 'keep' list y_copy.minus(s_); S = getSortedSubsets(y_copy, S, cmp); //System.out.println(""+y_copy); } return K; } /** * GetAllSubsets - Get all frequent subsets of 'y' according to 'map'. * @param y a labelset, e.g., [0,2,7] * @param map a map of labelsets to counts e.g., {[0,2]:39, [2,7]:5, [2,9]:24...} * @return the LabelSets to use to decompose y into, e.g., [[0,2],[2,7]] */ public static LabelSet[] getAllSubsets(LabelSet y, HashMap<LabelSet, Integer> map) { Integer count = map.get(y); if (count != null && count >= 1) { // don't prune return new LabelSet[] { y }; } SortedSet<LabelSet> subsets = getSortedSubsets(y, map.keySet(), new LabelSetComparator(map)); LabelSet s[] = subsets.toArray(new LabelSet[subsets.size()]); return s; } /** * GetTopNSubsets - Don't cover all (like cover(y,map), rather only the top 'n') * @param y a labelset, e.g., [0,2,7] * @param map a map of labelsets to counts e.g., {[0,2]:39, [2,7]:5, [2,9]:24...} * @param n the number of sets to take * @return the LabelSets to use to decompose y into, e.g., [[0,2],[2,7]] */ public static LabelSet[] getTopNSubsets(LabelSet y, HashMap<LabelSet, Integer> map, int n) { LabelSet s[] = getAllSubsets(y, map); return Arrays.copyOfRange(s, Math.max(0, s.length - n), s.length); } public static SortedSet<LabelSet> getTopNSubsetsAsSet(LabelSet y, HashMap<LabelSet, Integer> map, int n) { SortedSet<LabelSet> allSets = getSortedSubsets(y, map); SortedSet<LabelSet> topSets = new TreeSet<LabelSet>(); int n_ = 0; for (LabelSet Y : allSets) { topSets.add(Y); if (++n_ > n) break; } return topSets; } public static LabelSet getTopSubset(LabelSet y, HashMap<LabelSet, Integer> map) { return getTopNSubsets(y, map, 1)[0]; } /** * CountCombinationsSparseSubset - like CountCombinationsSparse, but only interested in 'indices[]' wrt 'D'. * @param D dataset * @param indices indices we are interested in * @return a HashMap where a LabelSet representation of each label combination is associated with an Integer count, e.g., [3,7,14],3 */ public static HashMap<LabelSet, Integer> countCombinationsSparseSubset(Instances D, int indices[]) { HashMap<LabelSet, Integer> map = new HashMap<LabelSet, Integer>(); for (int i = 0; i < D.numInstances(); i++) { LabelSet m = new LabelSet(MLUtils.toSubIndicesSet(D.instance(i), indices)); map.put(m, map.containsKey(m) ? map.get(m) + 1 : 1); } return map; } /** * CountCombinationsSparse - return a mapping of each distinct label combination and its count. * @param D dataset * @param L number of labels * @return a HashMap where a LabelSet representation of each label combination is associated with an Integer count, e.g., [3,7,14],3 */ public static final HashMap<LabelSet, Integer> countCombinationsSparse(Instances D, int L) { HashMap<LabelSet, Integer> map = new HashMap<LabelSet, Integer>(); for (int i = 0; i < D.numInstances(); i++) { LabelSet y = new LabelSet(MLUtils.toSparseIntArray(D.instance(i), L)); Integer c = map.get(y); map.put(y, c == null ? 1 : c + 1); } return map; } /** used by convertDistribution(p,L) */ @Deprecated private static final double[] toDoubleArray(String labelSet, int L) { int set[] = (labelSet.length() <= 2) ? new int[] {} : MLUtils.toIntArray(labelSet); //StringBuffer y = new StringBuffer(L); double y[] = new double[L]; //for(int j = 0; j < L; j++) { // y.append("0"); //} for (int j : set) { //y.setCharAt(j,'1'); y[j] = 1.; } return y; //return y.toString(); } /** * Convert Distribution - Given the posterior across combinations, return the distribution across labels. * <br> * TODO Use recombination!!! * @see PSUtils#recombination(double[],int,LabelSet[]) * @param p the posterior of the super classes (combinations), e.g., P([1,3],[2]) = [1,0] * @param L the number of labels * @return the distribution across labels, e.g., P(1,2,3) = [1,0,1] */ @Deprecated public static double[] convertDistribution(double p[], int L, Instances iTemplate) { double y[] = new double[L]; int i = Utils.maxIndex(p); double d[] = toDoubleArray(iTemplate.classAttribute().value(i), L); for (int j = 0; j < d.length; j++) { if (d[j] > 0.0) y[j] = 1.0; } return y; } /** * Convert Distribution - Given the posterior across combinations, return the distribution across labels. * @param p the posterior of the super classes (combinations), e.g., P([1,3],[2]) = [0.3,0.7] * @param L the number of labels, e.g., L = 3 * @param meta_labels typical mapping, e.g., [13] to [1,3] * @return the distribution across labels, e.g., P(1,2,3) = [0.3,0.7,0.3] */ public static double[] convertDistribution(double p[], int L, LabelSet meta_labels[]) { double y[] = new double[L]; for (int i = 0; i < p.length; i++) { LabelSet Y_i = meta_labels[i]; // e.g., [1,4] for (int j : Y_i.indices) { // j = 1, 4 y[j] += p[i]; // y[1] += p[i] = 0.5 } } return y; } public static final LabelSet[] makeLabelSetMap(Instances T) { int L_ = 4; return new LabelSet[L_]; } // @todo name convertDistribution ? /** * Convert Distribution - Given the posterior across combinations, return the distribution across labels. * @param p the posterior of the super classes (combinations), e.g., P([1,3],[2]) = [1,0] * @param L the number of labels * @return the distribution across labels, e.g., P(1,2,3) = [1,0,1] */ public static final double[] recombination(double p[], int L, LabelSet map[]) { double y[] = new double[L]; int i = Utils.maxIndex(p); LabelSet y_meta = map[i]; for (int j : y_meta.indices) { y[j] = 1.0; } return y; } // @todo name convertDistribution ? /** * Convert Distribution - Given the posterior across combinations, return the distribution across labels. * @param p the posterior of the super classes (combinations), e.g., P([1,3],[2]) = [0.3,0.7] * @param L the number of labels * @return the distribution across labels, e.g., P(1,2,3) = [0.3,0.7,0.3] */ public static final double[] recombination_t(double p[], int L, Instances iTemplate) { double y[] = new double[L]; for (int k = 0; k < p.length; k++) { String d_string = iTemplate.classAttribute().value(k); // e.g. d_string = "[1,3,5]" int d[] = MLUtils.toIntArray(d_string); // e.g. d = [1,3,5] p[k] = 0.5 for (int j : d) { y[j] += p[k]; // e.g., y[0] += d[0] * p[k] = 1 * 0.5 = 0.5 } } return y; } public static final double[] recombination_t(double p[], int L, LabelSet map[]) { double y[] = new double[L]; for (int k = 0; k < p.length; k++) { LabelSet y_meta = map[k]; for (int j : y_meta.indices) { y[j] += p[k]; } } return y; } /** * Convert a multi-label instance into a multi-class instance, according to a template. */ public static Instance convertInstance(Instance x, int L, Instances template) { Instance x_ = (Instance) x.copy(); x_.setDataset(null); for (int i = 0; i < L; i++) x_.deleteAttributeAt(0); x_.insertAttributeAt(0); x_.setDataset(template); return x_; } public static Instances LCTransformation(Instances D) { return LCTransformation(D, D.classIndex()); } public static Instances LCTransformation(Instances D, int L) { return PSTransformation(D, L, "Class", 0, 0); } public static Instances PSTransformation(Instances D, int P, int N) { return PSTransformation(D, D.classIndex(), "Class", P, N); } public static Instances PSTransformation(Instances D, int L, int P, int N) { return PSTransformation(D, L, "Class", P, N); } /** * Transform instances into a multi-class representation. * @param D original dataset * @param L number of labels in the original dataset * @param cname class name for the new dataset (may want to encode the list of indices here for RAkEL-like methods) * @param p pruning value * @param n restoration value * @return transformed dataset */ public static Instances PSTransformation(Instances D, int L, String cname, int p, int n) { D = new Instances(D); // Gather combinations HashMap<LabelSet, Integer> distinctCombinations = PSUtils.countCombinationsSparse(D, L); // Prune combinations if (p > 0) MLUtils.pruneCountHashMap(distinctCombinations, p); // Check there are > 2 if (distinctCombinations.size() <= 1 && p > 0) { // ... or try again if not ... System.err.println("[Warning] You did too much pruning, setting P = P-1"); return PSTransformation(D, L, cname, p - 1, n); } // Create class attribute ArrayList<String> ClassValues = new ArrayList<String>(); for (LabelSet y : distinctCombinations.keySet()) ClassValues.add(y.toString()); Attribute C = new Attribute(cname, ClassValues); // Insert new special attribute (which has all possible combinations of labels) D.insertAttributeAt(C, L); D.setClassIndex(L); //Add class values int N = D.numInstances(); for (int i = 0; i < N; i++) { Instance x = D.instance(i); LabelSet y = new LabelSet(MLUtils.toSparseIntArray(x, L)); String y_string = y.toString(); // add it if (ClassValues.contains(y_string)) //if its class value exists x.setClassValue(y_string); // decomp else if (n > 0) { //String d_subsets[] = getTopNSubsets(comb,distinctCombinations,n); LabelSet d_subsets[] = PSUtils.getTopNSubsets(y, distinctCombinations, n); //LabelSet d_subsets[] = PSUtils.cover(y,distinctCombinations); if (d_subsets.length > 0) { // fast x.setClassValue(d_subsets[0].toString()); // additional if (d_subsets.length > 1) { for (int s_i = 1; s_i < d_subsets.length; s_i++) { Instance x_ = (Instance) (x).copy(); x_.setClassValue(d_subsets[s_i].toString()); D.add(x_); } } } else { x.setClassMissing(); } } } // remove with missing class D.deleteWithMissingClass(); try { D = F.removeLabels(D, L); } catch (Exception e) { // should never happen } D.setClassIndex(0); return D; } /* * This method was used before tighter MOA integration (in Feb 2016). * This method could probably be elimitated if doing so does not cause any problems. public static Instance[] PSTransformation(Instance x, int L, HashMap<LabelSet,Integer> map, int n) { int y_[] = MLUtils.toSparseIntArray(x,L); if (y_.length <= 0) // there can be no transformation if there are no labels! return new Instance[0]; LabelSet y = new LabelSet(y_); if (map.get(y) != null) { Instance x_subsets[] = new Instance[1]; x_subsets[0] = convertInstance(x,L,x.dataset()); x_subsets[0].setClassValue(y.toString()); return x_subsets; } else { LabelSet d_subsets[] = PSUtils.getTopNSubsets(y,map,n); Instance x_subsets[] = new Instance[d_subsets.length]; Instance x_template = convertInstance(x,L,x.dataset()); for(int i = 1; i < d_subsets.length; i++) { x_subsets[i] = (Instance)(x_template).copy(); x_subsets[i].setClassValue(d_subsets[i].toString()); } return x_subsets; } } */ /** * Transform one instance into multi-class representations (an array of possibly multiple single-label instances). * @param x instance * @param L number of labels in the instance * @param map a map of labelsets to their frequencies * @param n restoration value * @return transformed instances */ public static Instance[] PSTransformation(Instance x, int L, HashMap<LabelSet, Integer> map, int n, Instances iTemplate) { int y_[] = MLUtils.toSparseIntArray(x, L); if (y_.length <= 0) // There can be no transformation if there are no labels! return new Instance[0]; LabelSet y = new LabelSet(y_); if (map.get(y) != null) { // The labelset already exists in the map (was observed in the training set) Instance x_subsets[] = new Instance[1]; x_subsets[0] = convertInstance(x, L, iTemplate); x_subsets[0].setClassValue(y.toString()); // problem here! return x_subsets; } else { // The labelset has not been seen before, use thap to construct some instances that fit LabelSet d_subsets[] = PSUtils.getTopNSubsets(y, map, n); Instance x_subsets[] = new Instance[d_subsets.length]; Instance x_template = convertInstance(x, L, iTemplate); for (int i = 1; i < d_subsets.length; i++) { x_subsets[i] = (Instance) (x_template).copy(); x_subsets[i].setClassValue(d_subsets[i].toString()); } return x_subsets; } } /** * Transform instances into a multi-class representation. * @param D original dataset * @param L number of labels in that dataset * @param cname class name for the new dataset (may want to encode the list of indices here for RAkEL-like methods) * @param p pruning value * @param n restoration value * @return transformed dataset */ public static Instances SLTransformation(Instances D, int L, String cname, int p, int n) { D = new Instances(D); // Gather combinations HashMap<LabelSet, Integer> distinctCombinations = PSUtils.countCombinationsSparse(D, L); // Prune combinations if (p > 0) MLUtils.pruneCountHashMap(distinctCombinations, p); // Check there are > 2 if (distinctCombinations.size() <= 1 && p > 0) { // ... or try again if not ... System.err.println("[Warning] You did too much pruning, setting P = P-1"); return PSTransformation(D, L, cname, p - 1, n); } // Create class attribute ArrayList<String> ClassValues = new ArrayList<String>(); for (LabelSet y : distinctCombinations.keySet()) ClassValues.add(y.toString()); Attribute C = new Attribute(cname, ClassValues); // Insert new special attribute (which has all possible combinations of labels) D.insertAttributeAt(C, L); D.setClassIndex(L); //Add class values int N = D.numInstances(); for (int i = 0; i < N; i++) { Instance x = D.instance(i); LabelSet y = new LabelSet(MLUtils.toSparseIntArray(x, L)); String y_string = y.toString(); // add it if (ClassValues.contains(y_string)) //if its class value exists x.setClassValue(y_string); // decomp else if (n > 0) { //String d_subsets[] = getTopNSubsets(comb,distinctCombinations,n); LabelSet d_subsets[] = PSUtils.getTopNSubsets(y, distinctCombinations, n); //LabelSet d_subsets[] = PSUtils.cover(y,distinctCombinations); if (d_subsets.length > 0) { // fast x.setClassValue(d_subsets[0].toString()); // additional if (d_subsets.length > 1) { for (int s_i = 1; s_i < d_subsets.length; s_i++) { Instance x_ = (Instance) (x).copy(); x_.setClassValue(d_subsets[s_i].toString()); D.add(x_); } } } else { x.setClassMissing(); } } } // remove with missing class D.deleteWithMissingClass(); try { D = F.removeLabels(D, L); } catch (Exception e) { // should never happen } D.setClassIndex(0); return D; } /** * Given N labelsets 'sparseY', use a count 'map' to */ public static final LabelSet[] convert(LabelSet[] sparseY, HashMap<LabelSet, Integer> map) { return null; } /** * SaveMap - Save the HashMap 'map' to the file 'filename'. */ public static final void saveMap(String filename, HashMap<LabelSet, Integer> map) throws Exception { MLUtils.saveObject(map, filename); } /** * LoadMap - Load the HashMap stored in 'filename'. */ public static HashMap<LabelSet, Integer> loadMap(String filename) throws Exception { return (HashMap<LabelSet, Integer>) MLUtils.loadObject(filename); } }