net.sf.jtmt.clustering.DocumentCollection.java Source code

Java tutorial

Introduction

Here is the source code for net.sf.jtmt.clustering.DocumentCollection.java

Source

/*
 * Copyright 2012 Nabeel Mukhtar 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 * 
 */
package net.sf.jtmt.clustering;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import net.sf.jtmt.similarity.matrix.CosineSimilarity;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.math.linear.RealMatrix;

/**
 * A named view over the columns of a term-document matrix.
 *
 * <p>Each document name supplied at construction time is associated with the
 * single-column sub-matrix found at the same index in the term-document
 * matrix. The collection also keeps an ordered list of document names that
 * can be shuffled; shuffling changes the positional accessors
 * ({@link #getDocumentNameAt(int)}, {@link #getDocumentAt(int)}) but never the
 * name-to-column association.</p>
 */
public class DocumentCollection {

    /** Separator placed between the two document names of a similarity-map key. */
    private static final String KEY_SEPARATOR = ":";

    /** The term-document matrix the collection was built from. */
    private final RealMatrix tdMatrix;

    /** Maps a document name to its single-column sub-matrix of {@link #tdMatrix}. */
    private final Map<String, RealMatrix> documentMap;

    /** Document names in their current (possibly shuffled) order. */
    private final List<String> documentNames;

    /**
     * Document names in matrix column order. Unlike {@link #documentNames}
     * this list is never shuffled, so index {@code i} always names column
     * {@code i} of {@link #tdMatrix}.
     */
    private final List<String> columnNames;

    /**
     * Instantiates a new document collection.
     *
     * <p>The name at index {@code i} of {@code docNames} is bound to column
     * {@code i} of {@code tdMatrix} (all rows). If a name occurs more than
     * once, the later column replaces the earlier one in the name lookup.</p>
     *
     * @param tdMatrix the term-document matrix, one column per document
     * @param docNames the document names, in matrix column order
     */
    public DocumentCollection(RealMatrix tdMatrix, String[] docNames) {
        this.tdMatrix = tdMatrix;
        this.documentMap = new HashMap<String, RealMatrix>();
        this.documentNames = new ArrayList<String>(docNames.length);
        for (int column = 0; column < docNames.length; column++) {
            String documentName = docNames[column];
            documentMap.put(documentName,
                    tdMatrix.getSubMatrix(0, tdMatrix.getRowDimension() - 1, column, column));
            documentNames.add(documentName);
        }
        // Snapshot of the column order, taken before any shuffle() can reorder documentNames.
        this.columnNames = Collections.unmodifiableList(new ArrayList<String>(documentNames));
    }

    /**
     * Returns the number of distinct document names in the collection.
     *
     * @return the number of entries in the name-to-document map
     */
    public int size() {
        return documentMap.size();
    }

    /**
     * Gets the document names in their current order.
     *
     * <p>The returned list is the live internal list, not a copy: it reflects
     * later calls to {@link #shuffle()}, and changes made to it by the caller
     * are visible to this collection.</p>
     *
     * @return the document names
     */
    public List<String> getDocumentNames() {
        return documentNames;
    }

    /**
     * Gets the document name at the given position of the current ordering.
     *
     * @param position the zero-based position
     * @return the document name at that position
     * @throws IndexOutOfBoundsException if the position is out of range
     */
    public String getDocumentNameAt(int position) {
        return documentNames.get(position);
    }

    /**
     * Gets the document at the given position of the current ordering.
     *
     * @param position the zero-based position
     * @return the single-column matrix of the document at that position
     * @throws IndexOutOfBoundsException if the position is out of range
     */
    public RealMatrix getDocumentAt(int position) {
        return documentMap.get(documentNames.get(position));
    }

    /**
     * Gets the document with the given name.
     *
     * @param documentName the document name
     * @return the single-column matrix of the document, or {@code null} if
     *         the name is not part of this collection
     */
    public RealMatrix getDocument(String documentName) {
        return documentMap.get(documentName);
    }

    /**
     * Randomly reorders the document names in place. Only the positional
     * accessors are affected; the term-document matrix and the name lookup
     * are left untouched.
     */
    public void shuffle() {
        Collections.shuffle(documentNames);
    }

    /**
     * Computes the pairwise similarity of all documents.
     *
     * <p>The result maps the key {@code "source:target"} to the entry
     * {@code (i, j)} of the matrix produced by {@link CosineSimilarity} for
     * the term-document matrix, where {@code source} and {@code target} are
     * the names bound to columns {@code i} and {@code j}.</p>
     *
     * @return the similarity map keyed by {@code "source:target"}
     */
    public Map<String, Double> getSimilarityMap() {
        Map<String, Double> similarityMap = new HashMap<String, Double>();
        RealMatrix similarityMatrix = new CosineSimilarity().transform(tdMatrix);
        // NOTE(review): assumes row/column i of the similarity matrix refers to
        // column i of tdMatrix -- confirm against CosineSimilarity. Labels are
        // therefore taken from columnNames, not from the shuffle-able
        // documentNames, so the keys stay correct after shuffle().
        for (int i = 0; i < similarityMatrix.getRowDimension(); i++) {
            String sourceDoc = columnNames.get(i);
            for (int j = 0; j < similarityMatrix.getColumnDimension(); j++) {
                String targetDoc = columnNames.get(j);
                similarityMap.put(similarityKey(sourceDoc, targetDoc), similarityMatrix.getEntry(i, j));
            }
        }
        return similarityMap;
    }

    /**
     * Gets the documents closest to the given document.
     *
     * <p>Every document name in the collection is ranked by
     * {@code |similarity - 1|} with respect to {@code docName} and the first
     * {@code numNeighbors + 1} names of that ranking are returned. The extra
     * slot presumably accounts for {@code docName} itself, whose distance to
     * itself would rank first -- verify against {@code ByValueComparator}'s
     * ordering.</p>
     *
     * @param docName the name of the document whose neighbors are wanted
     * @param similarityMap a map keyed by {@code "source:target"}, as built by
     *        {@link #getSimilarityMap()}
     * @param numNeighbors the number of neighbors, between {@code 0} and
     *        {@code size() - 1} inclusive
     * @return a new list holding the first {@code numNeighbors + 1} ranked names
     * @throws IllegalArgumentException if {@code numNeighbors} is negative or
     *         greater than {@code size() - 1}, or if {@code similarityMap} has
     *         no entry for {@code docName} paired with some document
     */
    public List<String> getNeighbors(String docName, Map<String, Double> similarityMap, int numNeighbors) {
        if (numNeighbors < 0) {
            throw new IllegalArgumentException("numNeighbors must not be negative: " + numNeighbors);
        }
        // numNeighbors + 1 names are returned, so the largest usable value is size() - 1.
        int maxNeighbors = size() - 1;
        if (numNeighbors > maxNeighbors) {
            throw new IllegalArgumentException("numNeighbors too large, max: " + maxNeighbors);
        }
        final Map<String, Double> differenceMap = new HashMap<String, Double>();
        for (String documentName : documentNames) {
            String key = similarityKey(docName, documentName);
            Double similarity = similarityMap.get(key);
            if (similarity == null) {
                // Fail with the offending key instead of an anonymous NullPointerException on unboxing.
                throw new IllegalArgumentException("similarityMap has no entry for: " + key);
            }
            differenceMap.put(documentName, Math.abs(similarity.doubleValue() - 1.0D));
        }
        List<String> neighbors = new ArrayList<String>(documentNames);
        Collections.sort(neighbors, new ByValueComparator<String, Double>(differenceMap));
        // Copy so the result does not keep the full ranked list alive as a backing view.
        return new ArrayList<String>(neighbors.subList(0, numNeighbors + 1));
    }

    /**
     * Builds the similarity-map key for an ordered pair of documents.
     *
     * @param sourceDoc the source document name
     * @param targetDoc the target document name
     * @return the two names joined by {@code ":"}
     */
    private static String similarityKey(String sourceDoc, String targetDoc) {
        return StringUtils.join(new String[] { sourceDoc, targetDoc }, KEY_SEPARATOR);
    }
}