net.sf.jtmt.clustering.NearestNeighborClusterer.java Source code

Introduction

Here is the source code for net.sf.jtmt.clustering.NearestNeighborClusterer.java
Source

/*
 * Copyright 2012 Nabeel Mukhtar 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 * 
 */
package net.sf.jtmt.clustering;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Groups the documents of a {@link DocumentCollection} into clusters with a
 * greedy nearest-neighbor pass.
 *
 * <p>Every document is first given a fitness: the sum of its similarities to
 * the neighbors returned by
 * {@code DocumentCollection.getNeighbors(name, similarityMap, numNeighbors)}.
 * Documents are then visited in the order produced by the reversed
 * {@code ByValueComparator} over those fitnesses (intended: highest fitness
 * first). Each document that is not yet part of a cluster seeds a new one and
 * pulls in those of its still-unassigned neighbors whose similarity to the
 * seed is at least the configured threshold.
 *
 * <p>Similarities are read from the collection's similarity map using keys of
 * the form {@code "<source>:<target>"}.
 *
 * <p>{@code numNeighbors} and {@code similarityThreshold} keep their Java
 * field defaults ({@code 0} and {@code 0.0}) until the setters are called.
 */
public class NearestNeighborClusterer {

    /** Separator between the two document names in a similarity-map key. */
    private static final String KEY_SEPARATOR = ":";

    /** Class-wide logger; only used for debug output. */
    private static final Log LOG = LogFactory.getLog(NearestNeighborClusterer.class);

    /** Number of neighbors requested from the collection for each document. */
    private int numNeighbors;

    /** Minimum similarity to the seed document for a neighbor to join its cluster. */
    private double similarityThreshold;

    /**
     * Sets how many neighbors are requested for each document.
     *
     * @param numNeighbors the neighbor count passed to the collection's
     *        neighbor lookup
     */
    public void setNumNeighbors(int numNeighbors) {
        this.numNeighbors = numNeighbors;
    }

    /**
     * Sets the similarity threshold.
     *
     * @param similarityThreshold neighbors whose similarity to the seed
     *        document is below this value are not added to the seed's cluster
     */
    public void setSimilarityThreshold(double similarityThreshold) {
        this.similarityThreshold = similarityThreshold;
    }

    /**
     * Clusters all documents of the given collection.
     *
     * @param collection the documents to cluster, together with their
     *        pairwise similarities
     * @return the clusters in creation order, with ids {@code "C0"},
     *         {@code "C1"}, ...; every document ends up in exactly one cluster
     * @throws IllegalStateException if the similarity map has no entry for a
     *         document/neighbor pair that has to be looked up
     */
    public List<Cluster> cluster(DocumentCollection collection) {
        Map<String, Double> similarityMap = collection.getSimilarityMap();

        // Look up the neighbor list of every document once, up front.
        Map<String, List<String>> neighborMap = new HashMap<String, List<String>>();
        for (String documentName : collection.getDocumentNames()) {
            neighborMap.put(documentName, collection.getNeighbors(documentName, similarityMap, numNeighbors));
        }

        // Fitness = sum of similarities between a document and its neighbors.
        Map<String, Double> fitnesses = getFitnesses(collection, similarityMap, neighborMap);

        // Visit the fittest documents first so they become the cluster seeds.
        List<String> sortedDocNames = new ArrayList<String>(collection.getDocumentNames());
        Collections.sort(sortedDocNames,
                Collections.reverseOrder(new ByValueComparator<String, Double>(fitnesses)));
        if (LOG.isDebugEnabled()) {
            for (String sortedDocName : sortedDocNames) {
                LOG.debug(sortedDocName + " => " + fitnesses.get(sortedDocName));
            }
        }

        List<Cluster> clusters = new ArrayList<Cluster>();
        // document name -> id of the cluster it was assigned to
        Map<String, String> documentClusterMap = new HashMap<String, String>();
        for (String docName : sortedDocNames) {
            if (documentClusterMap.containsKey(docName)) {
                // already absorbed into an earlier seed's cluster
                continue;
            }
            // One cluster is appended per seed, so the list size is the next free id.
            Cluster cluster = new Cluster("C" + clusters.size());
            cluster.addDocument(docName, collection.getDocument(docName));
            documentClusterMap.put(docName, cluster.getId());

            // Absorb every unassigned neighbor that is similar enough to the seed.
            for (String neighbor : neighborMap.get(docName)) {
                if (documentClusterMap.containsKey(neighbor)) {
                    continue;
                }
                if (similarityOf(similarityMap, docName, neighbor) < similarityThreshold) {
                    continue;
                }
                cluster.addDocument(neighbor, collection.getDocument(neighbor));
                documentClusterMap.put(neighbor, cluster.getId());
            }
            clusters.add(cluster);
        }
        return clusters;
    }

    /**
     * Computes the fitness of every document: the sum of its similarities to
     * each entry of its neighbor list.
     *
     * @param collection the collection supplying the document names
     * @param similarityMap pairwise similarities keyed by
     *        {@code "<source>:<target>"}
     * @param neighbors neighbor list for each document name
     * @return a map from document name to its fitness
     * @throws IllegalStateException if a document/neighbor pair has no entry
     *         in {@code similarityMap}
     */
    private Map<String, Double> getFitnesses(DocumentCollection collection, Map<String, Double> similarityMap,
            Map<String, List<String>> neighbors) {
        Map<String, Double> fitnesses = new HashMap<String, Double>();
        for (String docName : collection.getDocumentNames()) {
            double fitness = 0.0D;
            for (String neighborDoc : neighbors.get(docName)) {
                fitness += similarityOf(similarityMap, docName, neighborDoc);
            }
            fitnesses.put(docName, fitness);
        }
        return fitnesses;
    }

    /**
     * Reads the similarity of an ordered document pair from the similarity map.
     *
     * <p>Centralises the key format and replaces the bare
     * {@code NullPointerException} that unboxing a missing entry would raise
     * with an exception that names the missing key.
     *
     * @param similarityMap pairwise similarities keyed by
     *        {@code "<source>:<target>"}
     * @param source the first document name of the pair
     * @param target the second document name of the pair
     * @return the similarity stored for the pair
     * @throws IllegalStateException if the map has no value for the pair
     */
    private static double similarityOf(Map<String, Double> similarityMap, String source, String target) {
        String key = StringUtils.join(new String[] { source, target }, KEY_SEPARATOR);
        Double similarity = similarityMap.get(key);
        if (similarity == null) {
            throw new IllegalStateException("No similarity recorded for document pair '" + key + "'");
        }
        return similarity.doubleValue();
    }
}