Java tutorial
/* * Copyright 2012 Nabeel Mukhtar * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package net.sf.jtmt.clustering; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * The Class NearestNeighborClusterer. */ public class NearestNeighborClusterer { /** The log. */ private final Log log = LogFactory.getLog(getClass()); /** The num neighbors. */ private int numNeighbors; /** The similarity threshold. */ private double similarityThreshold; /** * Sets the num neighbors. * * @param numNeighbors the new num neighbors */ public void setNumNeighbors(int numNeighbors) { this.numNeighbors = numNeighbors; } /** * Sets the similarity threshold. * * @param similarityThreshold the new similarity threshold */ public void setSimilarityThreshold(double similarityThreshold) { this.similarityThreshold = similarityThreshold; } /** * Cluster. * * @param collection the collection * @return the list */ public List<Cluster> cluster(DocumentCollection collection) { // get neighbors for every document Map<String, Double> similarityMap = collection.getSimilarityMap(); // for (String key : similarityMap.keySet()) { // log.debug("sim(" + key + ") => " + similarityMap.get(key)); // } Map<String, List<String>> neighborMap = new HashMap<String, List<String>>(); for (String documentName : collection.getDocumentNames()) { neighborMap.put(documentName, collection.getNeighbors(documentName, similarityMap, numNeighbors)); } // compute sum of similarities of every document with its numNeighbors Map<String, Double> fitnesses = getFitnesses(collection, similarityMap, neighborMap); List<String> sortedDocNames = new ArrayList<String>(); // sort by sum of similarities descending sortedDocNames.addAll(collection.getDocumentNames()); Collections.sort(sortedDocNames, Collections.reverseOrder(new ByValueComparator<String, Double>(fitnesses))); // for (String sortedDocName : sortedDocNames) { // log.debug(sortedDocName + " => " + fitnesses.get(sortedDocName)); // } List<Cluster> clusters = new ArrayList<Cluster>(); int clusterId = 0; // Loop through the list of documents in descending order of the sum of the // similarities. Map<String, String> documentClusterMap = new HashMap<String, String>(); for (String docName : sortedDocNames) { // skip if document already assigned to cluster if (documentClusterMap.containsKey(docName)) { continue; } // create cluster with current document Cluster cluster = new Cluster("C" + clusterId); cluster.addDocument(docName, collection.getDocument(docName)); documentClusterMap.put(docName, cluster.getId()); // find all neighboring documents to the left and right of the current // document that are not assigned to a cluster, and have a similarity // greater than our threshold. Add these documents to the new cluster List<String> neighbors = neighborMap.get(docName); for (String neighbor : neighbors) { if (documentClusterMap.containsKey(neighbor)) { continue; } double similarity = similarityMap.get(StringUtils.join(new String[] { docName, neighbor }, ":")); if (similarity < similarityThreshold) { continue; } cluster.addDocument(neighbor, collection.getDocument(neighbor)); documentClusterMap.put(neighbor, cluster.getId()); } clusters.add(cluster); clusterId++; } return clusters; } /** * Gets the fitnesses. * * @param collection the collection * @param similarityMap the similarity map * @param neighbors the neighbors * @return the fitnesses */ private Map<String, Double> getFitnesses(DocumentCollection collection, Map<String, Double> similarityMap, Map<String, List<String>> neighbors) { Map<String, Double> fitnesses = new HashMap<String, Double>(); for (String docName : collection.getDocumentNames()) { double fitness = 0.0D; for (String neighborDoc : neighbors.get(docName)) { String key = StringUtils.join(new String[] { docName, neighborDoc }, ":"); fitness += similarityMap.get(key); } fitnesses.put(docName, fitness); } return fitnesses; } }