// Java tutorial
/* * Copyright (C) 2007-2009 Institute for Computational Biomedicine, * Weill Medical College of Cornell University * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package edu.cornell.med.icb.clustering; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.NullReader; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import org.junit.Test; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * Basic tests of the {@link MCLClusterer}. */ public class TestMCLClusterer { /** * Used to log debug and informational messages. */ private static final Logger LOGGER = Logger.getLogger(TestMCLClusterer.class); /** * Validate that reading from an MCL output file produces the proper number of clusters. * @throws IOException if the {@link edu.cornell.med.icb.clustering.MCLClusterer} * cannot access the reader passed to it. 
*/ @Test public void testMCLOutputFile() throws IOException { // assume that mcl was executed in the following way: // mcl inputfile.txt --abc -o outputfile.txt // where the input file was: // 0 2 0.05 // 1 3 0.30 // // this should produce two clusters [1, 3] and [2, 4] // so we're testing that the mcl output file translates correctly final int[][] expected = { { 0, 2 }, { 1, 3 } }; final Reader reader = new StringReader(expected[0][0] + "\t" + expected[0][1] + IOUtils.LINE_SEPARATOR + expected[1][0] + "\t" + expected[1][1] + IOUtils.LINE_SEPARATOR); final Clusterer clusterer = new MCLClusterer(reader); final List<int[]> clusters = clusterer.getClusters(); assertEquals("there should be 2 clusters", 2, clusters.size()); assertArrayEquals(expected[0], clusters.get(0)); assertArrayEquals(expected[1], clusters.get(1)); } /** * Validate that reading from an empty MCL output file does not cause errors. * @throws IOException if the {@link edu.cornell.med.icb.clustering.MCLClusterer} * cannot access the reader passed to it. */ @Test public void testEmptyMCLOutputFile() throws IOException { Reader reader = null; try { reader = new NullReader(0); final Clusterer clusterer = new MCLClusterer(reader); final List<int[]> clusters = clusterer.getClusters(); assertEquals("there should be 0 clusters", 0, clusters.size()); } finally { IOUtils.closeQuietly(reader); } } /** * Default delta to use when comparing floating point values. */ private static final double DELTA = 0.00001; /** * This test validates that each instance will be placed into it's own * cluster when there is no overlap between them. 
*/ @Test public void oneInstancePerCluster() { // put one instance in each cluster, total two instances final Clusterer clusterer = new MCLClusterer(2); final SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int instanceIndex, final int otherInstanceIndex) { if (instanceIndex != otherInstanceIndex) { return 100; } else { return 0; } } }; assertEquals(100d, distanceCalculator.distance(0, 1), DELTA); assertEquals(100d, distanceCalculator.distance(1, 0), DELTA); assertEquals(0d, distanceCalculator.distance(0, 0), DELTA); assertEquals(0d, distanceCalculator.distance(1, 1), DELTA); final List<int[]> clusters = clusterer.cluster(distanceCalculator, 2); assertNotNull(clusters); assertEquals(2, clusters.size()); assertEquals(1, clusters.get(0).length); assertEquals(1, clusters.get(1).length); assertEquals(0, clusters.get(0)[0]); assertEquals(1, clusters.get(1)[0]); } @Test public void fourInstanceClusteringInOneCluster() { // put one instance in each cluster, total two instances final Clusterer clusterer = new MCLClusterer(4); final SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int i, final int j) { // instances 0 and 1 belong to same cluster if (i == 0 && j == 1 || i == 1 && j == 0) { return 0; } else { return 10; } } }; // instances 0,1,2,3 go to cluster 1 (distance(0,1)=0; distance(2,0)=10<=threshold) assertEquals(0d, distanceCalculator.distance(0, 1), DELTA); assertEquals(0d, distanceCalculator.distance(1, 0), DELTA); assertEquals(10d, distanceCalculator.distance(0, 0), DELTA); assertEquals(10d, distanceCalculator.distance(1, 1), DELTA); assertEquals(10d, distanceCalculator.distance(0, 2), DELTA); assertEquals(10d, distanceCalculator.distance(2, 0), DELTA); assertEquals(10d, distanceCalculator.distance(2, 3), DELTA); final List<int[]> clusters = clusterer.cluster(distanceCalculator, 11); assertNotNull(clusters); assertEquals("Expected 
one cluster", 1, clusters.size()); final int[] cluster = clusters.get(0); assertEquals("First cluster must have size 4", 4, cluster.length); assertTrue("Instance 0 in cluster 0", ArrayUtils.contains(cluster, 0)); assertTrue("Instance 1 in cluster 0", ArrayUtils.contains(cluster, 1)); assertTrue("Instance 2 in cluster 0", ArrayUtils.contains(cluster, 2)); assertTrue("Instance 3 in cluster 0", ArrayUtils.contains(cluster, 3)); } @Test public void fourInstanceClusteringInThreeClusters() { // put one instance in each cluster, total two instances final Clusterer clusterer = new MCLClusterer(4); final SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int i, final int j) { // instances 0 and 1 belong to same cluster if (i == 0 && j == 1 || i == 1 && j == 0) { return 0; } else { return 11; } } }; assertEquals(0d, distanceCalculator.distance(0, 1), DELTA); assertEquals(0d, distanceCalculator.distance(1, 0), DELTA); assertEquals(11d, distanceCalculator.distance(0, 2), DELTA); assertEquals(11d, distanceCalculator.distance(2, 0), DELTA); assertEquals(11d, distanceCalculator.distance(2, 3), DELTA); final List<int[]> clusters = clusterer.cluster(distanceCalculator, 10); assertNotNull(clusters); assertEquals("Incorrect number of clusters", 3, clusters.size()); assertEquals("First cluster must have size 2", 2, clusters.get(0).length); assertEquals("Second cluster must have size 1", 1, clusters.get(1).length); assertEquals("Third cluster must have size 1", 1, clusters.get(2).length); assertEquals("Instance 0 in cluster 0", 0, clusters.get(0)[0]); assertEquals("Instance 1 in cluster 0", 1, clusters.get(0)[1]); assertEquals("Instance 2 in cluster 1", 2, clusters.get(1)[0]); assertEquals("Instance 3 in cluster 2", 3, clusters.get(2)[0]); } @Test public void fourInstanceClusteringInFourClusters() { // put one instance in each cluster, total two instances final Clusterer clusterer = new MCLClusterer(4); final 
SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int i, final int j) { // instances 0 and 1 belong to same cluster if (i == 0 && j == 1 || i == 1 && j == 0) { return 0; } else { return 10; } } }; final List<int[]> clusters = clusterer.cluster(distanceCalculator, 2); assertNotNull(clusters); assertEquals("Incorrect number of clusters", 3, clusters.size()); assertEquals("First cluster must have size 2", 2, clusters.get(0).length); assertEquals("Second cluster must have size 1", 1, clusters.get(1).length); assertEquals("Third cluster must have size 1", 1, clusters.get(2).length); assertEquals("Instance 0 in cluster 0", 0, clusters.get(0)[0]); assertEquals("Instance 1 in cluster 0", 1, clusters.get(0)[1]); assertEquals("Instance 2 in cluster 2", 2, clusters.get(1)[0]); assertEquals("Instance 3 in cluster 3", 3, clusters.get(2)[0]); } @Test public void zeroDistanceCalculator() { final Clusterer clusterer = new MCLClusterer(4); final SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int i, final int j) { return 0; // instances 0-3 belong to the same cluster } }; final List<int[]> clusters = clusterer.cluster(distanceCalculator, 2); assertNotNull(clusters); assertEquals("Expected one cluster", 1, clusters.size()); final int[] cluster = clusters.get(0); assertEquals("First cluster must have size 4", 4, cluster.length); assertTrue("Instance 0 in cluster 0", ArrayUtils.contains(cluster, 0)); assertTrue("Instance 1 in cluster 0", ArrayUtils.contains(cluster, 1)); assertTrue("Instance 2 in cluster 0", ArrayUtils.contains(cluster, 2)); assertTrue("Instance 3 in cluster 0", ArrayUtils.contains(cluster, 3)); } /** * This test validates that the clusterer will not throw any errors when * passed zero instances. 
*/ @Test public void zeroInstances() { final Clusterer clusterer = new MCLClusterer(0); final SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int instanceIndex, final int otherInstanceIndex) { return Math.abs(instanceIndex - otherInstanceIndex); } }; final List<int[]> result = clusterer.cluster(distanceCalculator, 0); assertNotNull(result); assertEquals(0, result.size()); } /** * This test validates that the clusterer will not not allow a negative * instance count. */ @Test(expected = IllegalArgumentException.class) public void illegalInstanceCount() { new MCLClusterer(-1); } /** * This test validates that a dataset is clustered correctly using various * different values of thresholds. */ @Test public void multipleThresholds() { // raw data to test final int[] data = { 1, 2, 3, 3, 2, 1, 42, 43, 4, 6 }; // list of expected results per threshold tested @SuppressWarnings("unchecked") final List<int[]>[] expectedResults = new List[6]; // threshold = 0 ( each instance in it's own cluster ) expectedResults[0] = new ArrayList<int[]>(); // threshold = 0 expectedResults[0] = new ArrayList<int[]>(); expectedResults[0].add(new int[] { 1, 1 }); expectedResults[0].add(new int[] { 2, 2 }); expectedResults[0].add(new int[] { 3, 3 }); expectedResults[0].add(new int[] { 42 }); expectedResults[0].add(new int[] { 43 }); expectedResults[0].add(new int[] { 4 }); expectedResults[0].add(new int[] { 6 }); // threshold = 1 expectedResults[1] = new ArrayList<int[]>(); expectedResults[1].add(new int[] { 1, 2, 2, 1, 3, 3, 4 }); expectedResults[1].add(new int[] { 42, 43 }); expectedResults[1].add(new int[] { 6 }); // threshold = 2 expectedResults[2] = new ArrayList<int[]>(); expectedResults[2].add(new int[] { 1, 2, 3, 3, 2, 1, 4, 6 }); expectedResults[2].add(new int[] { 42, 43 }); final Clusterer clusterer = new MCLClusterer(data.length); // Distance function that returns the difference between instances final 
SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int i, final int j) { return Math.abs(data[i] - data[j]); } }; for (int i = 0; i <= 2; i++) { final List<int[]> clusters = clusterer.cluster(distanceCalculator, i); assertNotNull("Cluster at threshold " + i, clusters); LOGGER.debug("Iterative clusters - threshold = " + i); final List<int[]> expectedCluster = expectedResults[i]; assertEquals("Number of clusters don't match at threshold = " + i, expectedCluster.size(), clusters.size()); int j = 0; for (final int[] cluster : clusters) { // convert instance indexes from the cluster to data final int[] result = new int[cluster.length]; for (int k = 0; k < result.length; k++) { result[k] = data[cluster[k]]; } LOGGER.debug(j + ":" + ArrayUtils.toString(result)); final int[] expectedResult = expectedCluster.get(j); assertArrayEquals("Cluster " + j + " with threshold " + i + " does not match expected", expectedResult, result); j++; } } } /** * A test that uses a clusters words of equal length together. 
*/ @Test public void clusterWordsInAString() { final String text = "Four score and seven years ago our fathers brought forth on this" + " continent a new nation conceived in liberty and dedicated to the proposition" + " that all men are created equal"; final List<String[]> expectedResults = new ArrayList<String[]>(); expectedResults.add(new String[] { "and", "ago", "our", "new", "and", "the", "all", "men", "are" }); expectedResults.add(new String[] { "score", "seven", "years", "forth", "equal" }); expectedResults.add(new String[] { "fathers", "brought", "liberty", "created" }); expectedResults.add(new String[] { "Four", "this", "that" }); expectedResults.add(new String[] { "on", "in", "to" }); expectedResults.add(new String[] { "continent", "conceived", "dedicated" }); expectedResults.add(new String[] { "a" }); expectedResults.add(new String[] { "nation" }); expectedResults.add(new String[] { "proposition" }); // break the text up into an array of individual words final String[] words = text.split(" "); // create a distance calculator that returns the difference in size between the two words final SimilarityDistanceCalculator distanceCalculator = new MaxLinkageDistanceCalculator() { public double distance(final int i, final int j) { return Math.abs(words[i].length() - words[j].length()); } }; // and cluster the words into groups according to their size final Clusterer clusterer = new MCLClusterer(words.length); final List<int[]> clusters = clusterer.cluster(distanceCalculator, 0); assertEquals("Number of clusters don't match", expectedResults.size(), clusters.size()); int j = 0; for (final int[] cluster : clusters) { // convert instance indexes from the cluster to source data final String[] result = new String[cluster.length]; for (int k = 0; k < result.length; k++) { result[k] = words[cluster[k]]; } LOGGER.debug(ArrayUtils.toString(cluster)); LOGGER.debug(ArrayUtils.toString(result)); assertArrayEquals("Cluster " + j + " does not match expected", 
expectedResults.get(j), result); j++; } } private interface Person { } private interface Place { } private interface Thing { } /** * Tests clustering with lists of object types. */ @Test public void clusterObjectCollections() { final List<Object> peoplePlacesAndThings = new ArrayList<Object>(); final Person tom = new Person() { }; final Person dick = new Person() { }; final Person harry = new Person() { }; peoplePlacesAndThings.add(tom); peoplePlacesAndThings.add(dick); peoplePlacesAndThings.add(harry); final Place home = new Place() { }; final Place work = new Place() { }; final Place school = new Place() { }; peoplePlacesAndThings.add(home); peoplePlacesAndThings.add(work); peoplePlacesAndThings.add(school); final Thing pencil = new Thing() { }; final Thing pen = new Thing() { }; final Thing paper = new Thing() { }; final Thing stapler = new Thing() { }; peoplePlacesAndThings.add(pencil); peoplePlacesAndThings.add(pen); peoplePlacesAndThings.add(paper); peoplePlacesAndThings.add(stapler); // put things in a random order just to make things interesting Collections.shuffle(peoplePlacesAndThings); final Clusterer clusterer = new MCLClusterer(peoplePlacesAndThings.size()); final List<int[]> clusters = clusterer.cluster(new MaxLinkageDistanceCalculator() { public double distance(final int i, final int j) { final Object object1 = peoplePlacesAndThings.get(i); final Object object2 = peoplePlacesAndThings.get(j); if (object1 instanceof Person && object2 instanceof Person) { return 0; } else if (object1 instanceof Place && object2 instanceof Place) { return 0; } else if (object1 instanceof Thing && object2 instanceof Thing) { return 0; } else { return 42; } } }, 1.0f); assertNotNull("Cluster should not be null", clusters); assertEquals("There should be 3 clusters", 3, clusters.size()); boolean peopleClustered = false; boolean placesClustered = false; boolean thingsClustered = false; for (final int[] cluster : clusters) { // check the type of the first, so we know what 
we're dealing with final Object object = peoplePlacesAndThings.get(cluster[0]); if (object instanceof Person) { assertEquals("There should be 3 people", 3, cluster.length); assertFalse("There appears to be more than one cluster of people", peopleClustered); peopleClustered = true; for (int i = 1; i < cluster.length; i++) { final Object person = peoplePlacesAndThings.get(cluster[i]); assertTrue("Cluster contains more than people", person instanceof Person); } } else if (object instanceof Place) { assertEquals("There should be 3 places", 3, cluster.length); assertFalse("There appears to be more than one cluster of places", placesClustered); placesClustered = true; for (int i = 1; i < cluster.length; i++) { final Object place = peoplePlacesAndThings.get(cluster[i]); assertTrue("Cluster contains more than places", place instanceof Place); } } else if (object instanceof Thing) { assertEquals("There should be 4 things", 4, cluster.length); assertFalse("There appears to be more than one cluster of things", thingsClustered); thingsClustered = true; for (int i = 1; i < cluster.length; i++) { final Object thing = peoplePlacesAndThings.get(cluster[i]); assertTrue("Cluster contains more than things", thing instanceof Thing); } } else { fail("Cluster contains an unknown object type: " + object.getClass().getName()); } } assertTrue("People should have been clustered", peopleClustered); assertTrue("Places should have been clustered", placesClustered); assertTrue("Things should have been clustered", thingsClustered); } }