Java tutorial
/* * Copyright 2012 Nabeel Mukhtar * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package net.sf.jtmt.clustering; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import net.sf.jtmt.similarity.matrix.CosineSimilarity; import org.apache.commons.lang.StringUtils; import org.apache.commons.math.linear.RealMatrix; /** * The Class DocumentCollection. */ public class DocumentCollection { /** The td matrix. */ private RealMatrix tdMatrix; /** The document map. */ private Map<String, RealMatrix> documentMap; /** The document names. */ private List<String> documentNames; /** * Instantiates a new document collection. * * @param tdMatrix the td matrix * @param docNames the doc names */ public DocumentCollection(RealMatrix tdMatrix, String[] docNames) { int position = 0; this.tdMatrix = tdMatrix; this.documentMap = new HashMap<String, RealMatrix>(); this.documentNames = new ArrayList<String>(); for (String documentName : docNames) { documentMap.put(documentName, tdMatrix.getSubMatrix(0, tdMatrix.getRowDimension() - 1, position, position)); documentNames.add(documentName); position++; } } /** * Size. * * @return the int */ public int size() { return documentMap.keySet().size(); } /** * Gets the document names. * * @return the document names */ public List<String> getDocumentNames() { return documentNames; } /** * Gets the document name at. * * @param position the position * @return the document name at */ public String getDocumentNameAt(int position) { return documentNames.get(position); } /** * Gets the document at. * * @param position the position * @return the document at */ public RealMatrix getDocumentAt(int position) { return documentMap.get(documentNames.get(position)); } /** * Gets the document. * * @param documentName the document name * @return the document */ public RealMatrix getDocument(String documentName) { return documentMap.get(documentName); } /** * Shuffle. */ public void shuffle() { Collections.shuffle(documentNames); } /** * Gets the similarity map. * * @return the similarity map */ public Map<String, Double> getSimilarityMap() { Map<String, Double> similarityMap = new HashMap<String, Double>(); CosineSimilarity similarity = new CosineSimilarity(); RealMatrix similarityMatrix = similarity.transform(tdMatrix); for (int i = 0; i < similarityMatrix.getRowDimension(); i++) { for (int j = 0; j < similarityMatrix.getColumnDimension(); j++) { String sourceDoc = getDocumentNameAt(i); String targetDoc = getDocumentNameAt(j); similarityMap.put(StringUtils.join(new String[] { sourceDoc, targetDoc }, ":"), similarityMatrix.getEntry(i, j)); } } return similarityMap; } /** * Gets the neighbors. * * @param docName the doc name * @param similarityMap the similarity map * @param numNeighbors the num neighbors * @return the neighbors */ public List<String> getNeighbors(String docName, Map<String, Double> similarityMap, int numNeighbors) { if (numNeighbors > size()) { throw new IllegalArgumentException("numNeighbors too large, max: " + size()); } final Map<String, Double> differenceMap = new HashMap<String, Double>(); List<String> neighbors = new ArrayList<String>(); neighbors.addAll(documentNames); for (String documentName : documentNames) { String key = StringUtils.join(new String[] { docName, documentName }, ":"); double difference = Math.abs(similarityMap.get(key) - 1.0D); differenceMap.put(documentName, difference); } Collections.sort(neighbors, new ByValueComparator<String, Double>(differenceMap)); return neighbors.subList(0, numNeighbors + 1); } }