org.mitre.ccv.canopy.CcvCanopyCluster.java Source code

Java tutorial

Introduction

Here is the source code for org.mitre.ccv.canopy.CcvCanopyCluster.java

Source

/**
 * Created on 15 December 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you  may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions andlimitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.ccv.canopy;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;

import org.mitre.ccv.CompleteMatrix;
import org.mitre.clustering.canopy.Canopy;
import org.mitre.clustering.canopy.CanopyCluster;
import org.mitre.clustering.canopy.CanopyDistanceMetric;
import org.mitre.spectrum.TH1D;
import org.mitre.spectrum.TSpectrum;

/**
 * Class that implements a Canopy Clustering approach to the generated CCV vectors.
 *
 * @see org.mitre.clustering.canopy.CanopyCluster
 * @author Marc Colosimo
 */
public class CcvCanopyCluster {

    private static final Log LOG = LogFactory.getLog("CcvCanopyCluster");
    private CompleteMatrix completeMatrix;
    private List<Canopy<Integer>> vectorCanopies = null;
    private CanopyDistanceMetric<Integer> cheapMetric = null;
    private float t1;
    private float t2;

    /* Generate list of sample vectors which are unfortunately the columns */
    //private List<RealVector> vectors; // tmp?!?
    public CcvCanopyCluster(CompleteMatrix completeMatrix) {

        this.completeMatrix = completeMatrix;
        //this.cheapMetric = new TanimotoCcvDistanceMetric(completeMatrix);
        this.cheapMetric = new HammingCcvDistanceMetric(completeMatrix);
        this.t1 = 0.0f;
        this.t2 = 0.0f;

        //this.vectors = this.completeMatrix.getVectors();
    }

    /**
     * Sets the thresholds 1 and 2 using MaxLike profile.
     *
     * Issues/Pittfalls:
     * <ol>
     * <ul>t2 might be to small and nothing is removed from the list
     * <ul>t1 might be to large and everything is added to a canopy
     * </ol>
     * @todo: figure out how to select threshold1 (not to big not to small)
     */
    public double[] autoThreshold() throws Exception {
        LOG.info("autoThreshold: Generating distance distribution");
        //SortedMap<Double, Integer> sortMap = new TreeMap<Double, Integer>(new ReverseDoubleComparator());
        SortedMap<Double, Integer> sortMap = new TreeMap<Double, Integer>();
        // generate all the pairwise distances
        final int size = completeMatrix.getMatrix().getColumnDimension();
        for (int i = 0; i < size; ++i) {
            for (int j = i + 1; j < size; ++j) {
                // only calculate one triangle not full!
                Double d = this.cheapMetric.distance(i, j);
                //set.add(this.cheapMetric.distance(i, j));
                if (sortMap.containsKey(d)) {
                    sortMap.put(d, sortMap.get(d) + 1);
                } else {
                    sortMap.put(d, 1);
                }
            }
        }

        /**
        * $gnuplot
        * > set nokey
        * > set xlabel "Pairwise distance"
        * > set ylabel "Number of samples"
        * > plot "output.txt" using 1:2
        */
        /* */
        for (Iterator<Entry<Double, Integer>> i = sortMap.entrySet().iterator(); i.hasNext();) {
            Entry<Double, Integer> entry = i.next();

            //System.out.printf("%f\t%d\n", entry.getKey(), entry.getValue());
        }
        /* */

        /**
         * How many bins per samples do we want?
         * Using the two end cases at lower and upper bounds.
         */
        TH1D hist = new TH1D(completeMatrix.getMatrix().getColumnDimension() * 2, sortMap.firstKey(),
                sortMap.lastKey());
        LOG.info(String.format("autoThreshold: Packing into histogram with %d bins (%f, %f)", hist.getBins().length,
                hist.getLower(), hist.getUpper()));
        hist.pack(sortMap);
        int[] bins = hist.getBins();
        if (LOG.isDebugEnabled()) {
            if (hist.getNumberOverflows() != 0) {
                LOG.debug(
                        String.format("autoThreshold: Have %d overflows in histogram!", hist.getNumberOverflows()));
            }
            if (hist.getNumberUnderflows() != 0) {
                LOG.debug(String.format("autoThreshold: Have %d underflows in histogram!",
                        hist.getNumberUnderflows()));
            }
        }

        // print out histogram bins
        for (int i = 0; i < bins.length; i++) {
            //System.out.printf("%f\t%d\n", hist.getBinCenter(i), hist.getBinContent(i));
        }
        TSpectrum spectrum = new TSpectrum(); // use default values (sigma = 1, threshold = 0.5
        int numFound = spectrum.search(hist);
        LOG.info(String.format("autoThreshold: Found %d peaks", numFound));
        if (numFound == 0) {
            LOG.fatal("autoThreshold: No peaks found in data!");
            throw new Exception();
        }
        double xpeaks[] = spectrum.getPostionX();
        double[] rtn = new double[2]; // t1, t2
        if (numFound == 1) {
            int bin = hist.findBin(xpeaks[0]);
            // is this in the top or bottom half?
            // @todo: must be better way than this hack
            if (bin > 0) {
                bin--;
            }
            rtn[0] = hist.getBinCenter(bin); // threshold1 is only peak
            rtn[1] = (hist.getLower() + rtn[0]) / 2;
            return rtn;
        }

        // more than one peak
        /**
         * Several possible options:
         * - select t1 first than find a good t2
         * - select t2 first than find a good t1
         * 
         * make sure that there is enough samples below t2 and above t1
             
        if (xpeaks[0] > xpeaks[1]) {
        // what about sigma value: how many are between these two
        rtn[0] = xpeaks[0]; // t1
        rtn[1] = xpeaks[1];  //t2
        } else {
        rtn[0] = xpeaks[1];
        rtn[1] = xpeaks[0];
        }
        */

        // find the peak with the smallest this will be the basis for t2
        double minPeakX = hist.getUpper();
        int minPeakI = -1;
        for (int i = 0; i < numFound; i++) {
            final double x = xpeaks[i];
            if (x < minPeakX) {
                minPeakX = x;
                minPeakI = i;
            }
        }
        //System.err.printf("minPeakX=%f (%d)\n", minPeakX, minPeakI);

        // find next peak above the smallest
        // should try using something about the average and standard deviation
        // of the distribution of entries in picking this
        double min2PeakX = hist.getUpper();
        int min2PeakI = -1;
        for (int i = 0; i < numFound; i++) {
            final double x = xpeaks[i];
            if (i != minPeakI && x < min2PeakX) { // should check that it isn't equal or within sigma
                min2PeakX = x;
                min2PeakI = i;
            }
        }
        //System.err.printf("min2PeakX=%f (%d)\n", min2PeakX, min2PeakI);
        /**
        if (minPeakI + 1 < min2PeakI - 1) {
        rtn[0] = hist.getBinCenter(min2PeakI - 1);         // t1
        rtn[1] = hist.getBinCenter(minPeakI + 1);          // t2
        } else {
        // really close not good - these should be the centers
        LOG.info("autoThreshold: t1 and t2 are possbily from adjacent bins!");
        rtn[0] = min2PeakX;
        rtn[1] = minPeakX;
        }
        int t2bin = hist.findBin(minPeakX);
        if (t2bin - 1 > 0 ) {
        rtn[1] = hist.getBinCenter(t2bin - 1); // don't want the first bin?
        } else {
        rtn[1] = minPeakX;
        }
        int t1bin = hist.findBin(min2PeakX);
        if (t1bin + 1 < bins.length - 1) {  // don't want the last bin?
        rtn[0] = hist.getBinCenter(t1bin + 1);
        } else {
        rtn[0] = min2PeakX;
        }*/

        rtn[0] = min2PeakX;
        rtn[1] = minPeakX;

        /*
        double t1 = hist.getUpper();
        double t2 = hist.getLower(); */
        // print out what we found
        for (int p = 0; p < numFound; p++) {
            double xp = xpeaks[p];
            int bin = hist.findBin(xp);
            int yp = hist.getBinContent(bin); // double yp
            System.err.printf("%d\t%f\t%d\n", bin, xp, yp);
            // if(yp- Math.sqrt(yp) < fline.eval(xp)) continue
        }

        return rtn;
    }

    /**
     * Comparator class that sorts doubles from largest to smallest
     */
    public class ReverseDoubleComparator implements Comparator<Double> {

        public int compare(Double d, Double d1) {
            return -1 * Double.compare(d, d1);
        }
    }

    /**
     * Returns the threshold1
     */
    public float getThreshold1() {
        return this.t1;
    }

    /**
     * Returns the threshold2
     */
    public float getThreshold2() {
        return this.t2;
    }

    public void cluster(float threshold1, float threshold2) {
        // need python like range
        ArrayList<Integer> indices = new ArrayList<Integer>();
        for (int n = 0; n < this.completeMatrix.getNames().size(); n++) {
            indices.add(n);
        }
        CanopyCluster<Integer> canopyCluster = new CanopyCluster(threshold1, threshold2, cheapMetric);
        this.vectorCanopies = canopyCluster.cluster(indices, false);
        LOG.info(String.format("Generated %d canopies for %d samples using t1=%f and t2=%f",
                this.vectorCanopies.size(), indices.size(), threshold1, threshold2));
    }

    @Override
    public String toString() {
        // @todo: convert to JSON
        //List<RealVector> vectors = this.completeMatrix.getVectors();
        ArrayList<String> names = this.completeMatrix.getNames();
        StringBuffer sb = new StringBuffer();
        for (Iterator<Canopy<Integer>> ci = this.vectorCanopies.iterator(); ci.hasNext();) {
            Canopy<Integer> v = ci.next();
            sb.append(names.get(v.getFounder()));
            sb.append(": {");
            for (Iterator<Integer> mi = v.iterator(); mi.hasNext();) {
                sb.append(names.get(mi.next()));
                if (mi.hasNext()) {
                    sb.append(", ");
                }
            }
            sb.append(" }");
            if (ci.hasNext()) {
                sb.append(", \n");
            }
        }
        return sb.toString();
    }

    /**
     * Rough main to test code. Loads in json file and outputs canopies
     * @param arg
     * @throws Exception
     */
    @SuppressWarnings("static-access")
    public static void main(String[] argv) throws Exception {
        if (argv.length >= 1) {
            LOG.info("Reading in CompleteCompositionVectors from " + argv[0]);
            // we only save the data not everything that is in the vectorSet
            BufferedReader br = new BufferedReader(new FileReader(argv[0]));
            CompleteMatrix completeMatrix = CompleteMatrix.readJsonCompleteMatrix(br);
            br.close();
            LOG.info(String.format("Loaded in %d samples and %d nmers (features)", completeMatrix.getNames().size(),
                    completeMatrix.getNmers().size()));
            CcvCanopyCluster canopyCluster = new CcvCanopyCluster(completeMatrix);
            float t1 = (float) -1.0;
            float t2 = (float) -1.0;
            if (argv.length >= 2) {
                t1 = Float.parseFloat(argv[1]);
            }
            if (argv.length >= 3) {
                t2 = Float.parseFloat(argv[2]);
            }
            if (t1 < 0.0 || t2 < 0.0) {
                double[] ts = canopyCluster.autoThreshold();
                t1 = (float) ts[0];
                t2 = (float) ts[1];
            }

            canopyCluster.cluster(t1, t2);
            //System.out.println(canopyCluster.toString());
        }
    }
}