Java tutorial
/* Copyright (C) 2012 The Stanford MobiSocial Laboratory Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package edu.stanford.muse.groups; import edu.stanford.muse.util.Pair; import edu.stanford.muse.util.Util; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import java.util.*; /** static methods related to similar groups */ public class SimilarGroupMethods { private static Log log = LogFactory.getLog(SimilarGroupMethods.class); /* returns a map consisting of group -> its freq in the input */ private static <T extends Comparable<? super T>> void computeGroupFrequencies(List<Group<T>> input, Collection<SimilarGroup<T>> candidates) { // brute force, could be made more efficient by keeping a person->group index for (Group<T> b : input) { for (SimilarGroup<T> g : candidates) if (b.contains(g)) g.freq++; } } /** * find intersections till fixed point and compute frequency for each group. * returns a run log and list of similar groups */ private static <T extends Comparable<? super T>> Set<SimilarGroup<T>> intersectGroups( Collection<SimilarGroup<T>> startingGroups, int minSize, GroupAlgorithmStats stats) { boolean fixedPoint = false; Set<SimilarGroup<T>> candidates = new LinkedHashSet<SimilarGroup<T>>(); // add all the actually occurring (exact) recipient sets candidates.addAll(startingGroups); // allGroups will be the master list of all groups that we have (uptil // the previous iteration) Set<SimilarGroup<T>> newGroupsPrevIteration = candidates; // newGroupsPrevIteration will start off as all groups in the first iteration, // but reduce to only the newly derived groups at the end of each iteration. // we do this so as have to check for intersections only between the newly // derived groups and all other groups. trying to intersect each group with // every other group known in each iteration might be too expensive. int iteration = 1; while (!fixedPoint) { // newGroups will be the groups we derive in this iteration Set<SimilarGroup<T>> newGroups = new LinkedHashSet<SimilarGroup<T>>(); // brute force: all to all intersection between existing groups and // groups newly created in prev. iteration can be made more efficient // by maintaining person -> group map and only intersecting those // groups that have at least one person in common. for (SimilarGroup<T> g1 : candidates) for (SimilarGroup<T> g2 : newGroupsPrevIteration) { SimilarGroup<T> newGroup = new SimilarGroup<T>(g1.intersect(g2)); if (newGroup.size() == 0) continue; if (newGroup.size() < minSize) continue; // add to newGroups if we dont already have it if (!candidates.contains(newGroup) && !newGroups.contains(newGroup)) newGroups.add(newGroup); } log.info("Intersection iteration " + iteration + ": " + newGroups.size() + " new sets"); stats.intersectionGroupsAdded.add(new GroupStats(newGroups)); for (SimilarGroup<T> g : newGroups) log.debug("new group: " + g); candidates.addAll(newGroups); iteration++; // reached fixed point when no new groups fixedPoint = (newGroups.size() == 0); newGroupsPrevIteration = newGroups; } return candidates; } /** just return freqs of each item in the given corpus */ @SuppressWarnings("unused") private static <T extends Comparable<? super T>> Map<T, Integer> computeIndivFreqs(List<Group<T>> input) { Map<T, Integer> result = new LinkedHashMap<T, Integer>(); for (Group<T> g : input) { // sometimes same person is present twice on the // same message, in that case, do not double count Set<T> set = new LinkedHashSet<T>(); for (T t : g.elements) { if (set.contains(t)) continue; Integer I = result.get(t); if (I == null) result.put(t, 1); else result.put(t, I + 1); } } return result; } private static <T extends Comparable<? super T>> Set<SimilarGroup<T>> selectGroupsWithMinFreq( Collection<SimilarGroup<T>> groups, int minFreq) { Set<SimilarGroup<T>> result = new LinkedHashSet<SimilarGroup<T>>(); for (SimilarGroup<T> g : groups) { // should we select T ? // yes, if it is frequent and not subsumed // by a superset previously emit if (g.freq >= minFreq) result.add(g); // could also redistribute to first subset } return result; } /** select groups above mincount and not subsumed according to maxerror */ private static <T extends Comparable<? super T>> List<SimilarGroup<T>> selectGroupsNotSubsumed( List<SimilarGroup<T>> groups, float maxError) { // sort the groups by size Collections.sort(groups, new Comparator<SimilarGroup<T>>() { public int compare(SimilarGroup<T> g1, SimilarGroup<T> g2) { return g2.size() - g1.size(); } }); List<SimilarGroup<T>> selectedGroups = new ArrayList<SimilarGroup<T>>(); for (SimilarGroup<T> g : groups) { // is it subsumed by previously selected groups boolean subsumed = false; for (SimilarGroup<T> selected : selectedGroups) { // if a superset exists with smaller than maxError rate, // subsumed is true if (selected.contains(g)) { double error = selected.errorWRT(g); if (error <= maxError) { subsumed = true; log.debug(g + "\nsubsumed with error " + error + " by\n " + selected); break; } } } if (!subsumed) selectedGroups.add(g); } return selectedGroups; } /** select groups above mincount and not subsumed according to maxerror */ private static <T extends Comparable<? super T>> Set<SimilarGroup<T>> selectGroupsWithMinSize( Collection<SimilarGroup<T>> groups, int minSize) { LinkedHashSet<SimilarGroup<T>> selectedGroups = new LinkedHashSet<SimilarGroup<T>>(); for (SimilarGroup<T> g : groups) if (g.size() >= minSize) selectedGroups.add(g); return selectedGroups; } /* private static<T extends Comparable<? super T>> float computeErrorWRT(List<Group<T>> originalGroups, SimilarGroup<T> superGroup, SimilarGroup<T> group_i) { return 0.0f; } */ // compute sims matrix. its symmetric. // note: diagonal entries should always be kept at 0. private static <T extends Comparable<? super T>> Pair<float[][], int[][]> computeSims( List<SimilarGroup<T>> groups) { float sims[][] = new float[groups.size()][groups.size()]; int interSize[][] = new int[groups.size()][groups.size()]; for (int i = 0; i < groups.size(); i++) { SimilarGroup<T> group_i = groups.get(i); for (int j = i + 1; j < groups.size(); j++) { SimilarGroup<T> group_j = groups.get(j); float sim = group_i.jaccardSim(group_j); sims[i][j] = sims[j][i] = sim; int intersectionSize = group_i.intersectionSize(group_j); interSize[i][j] = interSize[j][i] = intersectionSize; } } return new Pair<float[][], int[][]>(sims, interSize); } private static <T extends Comparable<? super T>> Set<SimilarSuperGroup<T>> manufactureSuperGroups( List<Group<T>> originalMessages, Set<SimilarGroup<T>> startingGroups, float maxError, float groupMembersSimThreshold) { // result will contain only the new supergroups that we generate Set<SimilarSuperGroup<T>> result = new LinkedHashSet<SimilarSuperGroup<T>>(); // convert to list, its easier to use indices // select only groups of size > 2 as candidates for merging Set<SimilarGroup<T>> s = selectGroupsWithMinSize(startingGroups, 2); List<SimilarGroup<T>> similarGroupsList = new ArrayList<SimilarGroup<T>>(); similarGroupsList.addAll(s); if (similarGroupsList.size() == 0) return result; // empty result if no groups of size 2 or more // compute sims matrix. its symmetric. // note: diagonal entries should always be kept at 0. Pair<float[][], int[][]> matrix = computeSims(similarGroupsList); float sims[][] = matrix.getFirst(); int interSize[][] = matrix.getSecond(); // merge best non-used pair of groups in whole matrix, till done // heuristic: once a group is used for a merge, it will not be used again. // used[] keeps track of whether a group has been used boolean[] used = new boolean[similarGroupsList.size()]; while (true) { float bestSim = -0.1f; int bestSim_i = -1, bestSim_j = -1; // find best sim in the whole matrix, ignoring used rows and cols for (int i = 0; i < similarGroupsList.size(); i++) { if (used[i]) continue; for (int j = i + 1; j < similarGroupsList.size(); j++) { if (used[j]) continue; if (sims[i][j] > bestSim) { bestSim = sims[i][j]; bestSim_i = i; bestSim_j = j; } } } if (bestSim_i == -1 || bestSim_j == -1) break; if (bestSim < groupMembersSimThreshold && interSize[bestSim_i][bestSim_j] < 3) break; // best* is the best in the whole matrix. mark these two groups used used[bestSim_i] = used[bestSim_j] = true; // create the new supergroup SimilarGroup<T> group_i = similarGroupsList.get(bestSim_i); SimilarGroup<T> group_j = similarGroupsList.get(bestSim_j); log.info("Merging most similar groups in this iteration: " + bestSim + " between G" + bestSim_i + " " + group_i + " + G" + bestSim_j + " " + group_j); SimilarSuperGroup<T> superGroup = new SimilarSuperGroup<T>(group_i, group_j); float u1 = group_i.utility * (superGroup.size() / group_i.size()); float u2 = group_j.utility * (superGroup.size() / group_j.size()); superGroup.utility = Math.max(u1, u2); // ignore if we already generated this supergroup, // or if our starting groups contain it if (result.contains(superGroup) || startingGroups.contains(superGroup)) { continue; } // in theory could also check if the error is reasonable // if (computeErrorWRT(originalGroups, superGroup, group_i) < maxError // && computeErrorWRT(originalGroups, superGroup, group_j) < maxError) // result.add(superGroup); // it's official. we can manufacture a new group. result.add(superGroup); // supergroup could be a superset of other groups too. // heuristic: mark them used up. // another heuristic: let them be for (int i = 0; i < similarGroupsList.size(); i++) { if (superGroup.contains(similarGroupsList.get(i))) used[i] = true; } // for convenience, we'll replace group i with supergroup and // recompute sims for just that row and col sims diagonal will // still be 0 because used[i] is currently true similarGroupsList.set(bestSim_i, superGroup); for (int i = 0; i < similarGroupsList.size(); i++) { if (used[i]) continue; float sim = superGroup.jaccardSim(similarGroupsList.get(i)); sims[bestSim_i][i] = sims[i][bestSim_i] = sim; } used[bestSim_i] = false; // revive entry i, its now the supergroup } log.info("Manufactured " + result.size() + " supergroups"); return result; } private static <T extends Comparable<? super T>> void doDFS(List<SimilarGroup<T>> groups, boolean used[], int lastAddedIdx, List<SimilarGroup<T>> result) { SimilarGroup<T> lastAddedGroup = groups.get(lastAddedIdx); // find all unused groups with non-zero sim with lastadded group List<Pair<Integer, Float>> similarGroupsInfo = new ArrayList<Pair<Integer, Float>>(); for (int j = 0; j < groups.size(); j++) { if (used[j]) continue; float sim = lastAddedGroup.jaccardSim(groups.get(j)); if (sim > 0.0001) similarGroupsInfo.add(new Pair<Integer, Float>(j, sim)); } // now sort sim group idx's according to decreasing similarity Util.sortPairsBySecondElement(similarGroupsInfo); for (Pair<Integer, Float> p : similarGroupsInfo) { int groupIdx = p.getFirst(); // groupIdx could have become used in the meantime in calls to doDFS() if (used[groupIdx]) continue; result.add(groups.get(groupIdx)); used[groupIdx] = true; doDFS(groups, used, groupIdx, result); } } // private static <T extends Comparable<? super T>> void // dumpGroupsForDebug(String title, Set<SimilarGroup<T>> set) // { // if (!log.isDebugEnabled()) // return; // List<SimilarGroup<T>> list = new ArrayList<SimilarGroup<T>>(); // list.addAll(set); // dumpGroupsForDebug(title, list); // } /** This is the alternative group algorithm, described in the IUI-2011 paper */ public static <T extends Comparable<? super T>> GroupHierarchy<T> findContactGroupsIUI(List<Group<T>> input, int MINCOUNT, int MIN_GROUP_SIZE, float MAX_SUBSUMPTION_ERROR, float MIN_MERGE_GROUP_SIM, String utilityType, float UTILITY_MULTIPLIER, GroupAlgorithmStats<T> stats) { log.info( "----------------------------------------------- GROUPER -----------------------------------------------\n"); long startTimeMillis = System.currentTimeMillis(); // copy over the alg. parameters so everything is in one place stats.MIN_GROUP_SIZE = MIN_GROUP_SIZE; stats.MIN_FREQ = MINCOUNT; stats.MAX_SUBSUMPTION_ERROR = MAX_SUBSUMPTION_ERROR; stats.MIN_MERGE_GROUP_SIM = MIN_MERGE_GROUP_SIM; int MAX_EDGES = 1000; Set<T> hypers = Grouper.findHyperConnectedElementsRaw(input, MAX_EDGES); for (Group<T> g : input) { for (Iterator<T> it = g.elements.iterator(); it.hasNext();) { T t = it.next(); if (hypers.contains(t)) it.remove(); if (g.elements.size() == 0) continue; } } List<SimilarGroup<T>> exactGroups = Grouper.convertToSimilarGroups(input); //int nUniqueGroups = exactGroups.size(); stats.startingGroups = new GroupStats<T>(exactGroups); // dumpGroupsForDebug("Starting Groups", exactGroups); Set<SimilarGroup<T>> candidates = selectGroupsWithMinSize(exactGroups, MIN_GROUP_SIZE); stats.groupsWithMinSize = new GroupStats<T>(candidates); // dumpGroupsForDebug("Groups with min size " + MIN_GROUP_SIZE, candidates); log.warn("Intersections are disabled because taking too long for Ken Lay!!"); //candidates // returns (log, result) //= SimilarGroupMethods.intersectGroups(candidates, MIN_GROUP_SIZE, stats); stats.groupsAfterIntersections = new GroupStats<T>(candidates); // dumpGroupsForDebug("Groups after intersections ", candidates); // verify for (SimilarGroup<T> sg : candidates) { Util.softAssert(candidates.contains(sg)); Util.softAssert(sg.size() >= MIN_GROUP_SIZE); } // now filter based on min freq computeGroupFrequencies(input, candidates); candidates = SimilarGroupMethods.selectGroupsWithMinFreq(candidates, MINCOUNT); stats.groupsWithMinFreqAndMinSize = new GroupStats<T>(candidates); // dumpGroupsForDebug("Groups with min. freq. " + MINCOUNT, candidates); // compute utilities // Map<T, Integer> indivFreqs = // SimilarGroupMethods.computeIndivFreqs(input); for (SimilarGroup<T> sg : candidates) { if ("linear".equals(utilityType)) sg.computeLinearUtility(); else if ("square".equals(utilityType)) sg.computeSquareUtility(); else sg.computeExpUtility(UTILITY_MULTIPLIER); // sg.computeZScore(indivFreqs, input.size()); } // convert candidates from set to list now, because we need sorting etc List<SimilarGroup<T>> candidateList = new ArrayList<SimilarGroup<T>>(); candidateList.addAll(candidates); // remove subsumed groups List<SimilarGroup<T>> selectedGroups = SimilarGroupMethods.selectGroupsNotSubsumed(candidateList, MAX_SUBSUMPTION_ERROR); stats.groupsAfterSubsumption = new GroupStats<T>(selectedGroups); // dumpGroupsForDebug("Groups after subsumption with error " // + MAX_SUBSUMPTION_ERROR, selectedGroups); // now compute hierarchy, just to identify the root groups GroupHierarchy<T> hierarchy = new GroupHierarchy<T>(selectedGroups); // Map<SimilarGroup<T>, List<SimilarGroup<T>>> parentToChildGroupMap // = hierarchy.parentToChildrenMap; Set<SimilarGroup<T>> rootGroups = hierarchy.rootGroups; log.info("hierarchy: #root groups = " + rootGroups.size()); // add supergroups. supergroups never subsume other subgroups Set<SimilarSuperGroup<T>> manufacturedGroups = SimilarGroupMethods.manufactureSuperGroups(input, rootGroups, MAX_SUBSUMPTION_ERROR, MIN_MERGE_GROUP_SIM); stats.manufacturedGroups = new GroupStats(manufacturedGroups); selectedGroups.addAll(manufacturedGroups); stats.finalGroups = new GroupStats<T>(selectedGroups); // dumpGroupsForDebug("Final groups " + MAX_SUBSUMPTION_ERROR, selectedGroups); // recompute hierarchy. its easier than // trying to update the existing hierarchy hierarchy = new GroupHierarchy<T>(selectedGroups); rootGroups = hierarchy.rootGroups; stats.finalRootGroups = new GroupStats(rootGroups); long endTimeMillis = System.currentTimeMillis(); stats.executionTimeMillis = endTimeMillis - startTimeMillis; return hierarchy; } /** returns top groups based on freq. */ public static <T extends Comparable<? super T>> List<SimilarGroup<T>> topGroups(GroupHierarchy<T> hierarchy, int nGroups) { List<SimilarGroup<T>> result = new ArrayList<SimilarGroup<T>>(); // annoying set -> list conversion. no good reason List<SimilarGroup<T>> similarGroupsList = new ArrayList<SimilarGroup<T>>(); similarGroupsList.addAll(hierarchy.getAllGroups()); if (log.isDebugEnabled()) log.debug("# similar groups = " + similarGroupsList.size()); // sort the groups by frequency of occurrence Collections.sort(similarGroupsList, new Comparator<SimilarGroup<T>>() { public int compare(SimilarGroup<T> g1, SimilarGroup<T> g2) { if (g2.utility == g1.utility) return 0; else if (g2.utility > g1.utility) return 1; else return -1; } }); // sorts of frequency count of basket int i = 0; for (; i < nGroups && i < similarGroupsList.size(); i++) result.add(similarGroupsList.get(i)); if (i < similarGroupsList.size()) log.info("group utility cutoff is : " + similarGroupsList.get(i).utility); return result; } /** * this is a pure display thing to cluster similar root groups together. * we'll try and cluster root groups together following DFS paths in order * of similarity */ public static <T extends Comparable<? super T>> List<SimilarGroup<T>> orderGroupsBySimilarity( List<SimilarGroup<T>> groups) { // we'll generate a better ordering in result List<SimilarGroup<T>> result = new ArrayList<SimilarGroup<T>>(); // used will track whether the corresponding index in the input groups // has been already used up, i.e. added to result boolean used[] = new boolean[groups.size()]; while (true) { int nextIdx = 0; for (nextIdx = 0; nextIdx < groups.size(); nextIdx++) if (!used[nextIdx]) break; // if all groups are used, next == groups.size(), so we are done if (nextIdx == groups.size()) break; // mark group[next] as done. result.add(groups.get(nextIdx)); used[nextIdx] = true; // invoke DFS doDFS(groups, used, nextIdx, result); } return result; } //compute the exact frequency of the group instead of including its superset's msgs private static <T extends Comparable<? super T>> void computeGroupFrequenciesv2(List<Group<T>> input, Collection<SimilarGroup<T>> candidates) { for (Group<T> b : input) { for (SimilarGroup<T> g : candidates) if (b.equals(g)) g.freq++; } } }