us.levk.math.linear.EucledianDistanceClusterer.java Source code

Java tutorial

Introduction

Here is the source code for us.levk.math.linear.EucledianDistanceClusterer.java

Source

/**
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package us.levk.math.linear;

import static java.lang.Double.isInfinite;
import static java.lang.Double.isNaN;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.ToString;
import lombok.experimental.Accessors;
import lombok.extern.log4j.Log4j;

import org.apache.commons.math3.linear.RealMatrix;

/**
 * Agglomerative hierarchical clusterer over the rows of a matrix. Rows are
 * compared by Euclidean distance and clusters are compared by the average of
 * the pairwise distances between their member rows (average linkage).
 *
 * @author levk
 */
@Log4j
public class EucledianDistanceClusterer {

    /**
     * Never-ending supplier of ids for merged clusters. Yields -2, -3, -4, ...
     * so merged cluster ids cannot collide with the non-negative row indices
     * used as leaf ids. If the counter underflows past
     * {@link Integer#MIN_VALUE} it is reset to -1.
     */
    private Iterator<Integer> enumerator = new Iterator<Integer>() {

        private int counter = -1;

        @Override
        public boolean hasNext() {
            return true;
        }

        @Override
        public Integer next() {
            counter--;
            // A positive value can only be the result of int underflow; start over
            if (counter > 0)
                counter = -1;
            return counter;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    };

    /**
     * Node of the resulting cluster tree. A leaf wraps a single matrix row; a
     * merged node references the two clusters it was built from.
     */
    @RequiredArgsConstructor
    @Accessors(fluent = true)
    @ToString
    public class Cluster {
        /** Row index for a leaf, negative generated id for a merged cluster. */
        private @Getter final int id;
        /** Distance at which the children were merged; stays 0.0 unless set. */
        private @Getter @Setter double d = 0.0;
        /** Row indices of all leaves gathered under this cluster. */
        private @Getter final List<Integer> contains = new ArrayList<>();
        /** The two merged clusters, or {@code null} for a leaf. */
        private @Getter final Cluster[] children;

        /**
         * Creates a leaf cluster holding exactly one row.
         *
         * @param id index of the row; also becomes the cluster id
         */
        public Cluster(int id) {
            this.id = id;
            contains().add(id);
            children = null;
        }

        /**
         * Creates a merged cluster with a freshly generated id. The member
         * list is left empty; the caller is expected to fill
         * {@link #contains()}.
         *
         * @param left first child
         * @param right second child
         */
        public Cluster(Cluster left, Cluster right) {
            id = enumerator.next();
            children = new Cluster[] { left, right };
        }
    }

    /**
     * Clusters the rows of the supplied matrix bottom-up: every row starts as
     * its own leaf cluster and the two closest clusters are merged repeatedly
     * until a single root remains.
     *
     * @param original matrix whose rows are the items to cluster; a column is
     *        ignored for a given pair of rows when either value is NaN or
     *        infinite
     * @return root of the cluster tree; for a single-row matrix this is the
     *         leaf for row 0
     * @throws IOException declared for the backing distance matrix resource
     *         (presumably raised when it is created or closed)
     * @throws java.util.NoSuchElementException if the matrix has no rows
     */
    public Cluster eucledian(final RealMatrix original) throws IOException {
        final int rows = original.getRowDimension();
        try (HugeRealMatrix distances = new HugeRealMatrix(rows, rows)) {
            // Live clusters by id; starts out with one leaf per row
            final Map<Integer, Cluster> genehash = new HashMap<>();
            for (int index = rows; --index >= 0;)
                genehash.put(index, new Cluster(index));

            // Candidate merges ordered by distance. Pairs with equal distance
            // share a bucket so that none of them is silently overwritten.
            final TreeMap<Double, List<int[]>> sorted = new TreeMap<>();

            log.debug("Populating distance matrix");
            for (int i = 0; i < rows; i++) {
                for (int j = i + 1; j < rows; j++) {
                    double distance = rowDistance(original, i, j);
                    distances.setEntry(i, j, distance);
                    distances.setEntry(j, i, distance);
                    enqueue(sorted, distance, i, j);
                }
            }
            // Rendering the matrix can be expensive; only do it when it is logged
            if (log.isDebugEnabled())
                log.debug("Initialized distances matrix " + distances);

            while (genehash.size() > 1) {
                Map.Entry<Double, List<int[]>> closest = sorted.firstEntry();
                if (closest == null)
                    throw new IllegalStateException(
                            "No candidate pairs left for " + genehash.size() + " clusters");

                // Take the most recently queued pair at the shortest distance
                List<int[]> bucket = closest.getValue();
                int[] pair = bucket.remove(bucket.size() - 1);
                if (bucket.isEmpty())
                    sorted.pollFirstEntry();

                Cluster left = genehash.get(pair[0]);
                Cluster right = genehash.get(pair[1]);
                // Pairs referring to an already merged cluster are stale; drop them
                if (left == null || right == null)
                    continue;

                Cluster cluster = new Cluster(left, right);
                if (log.isDebugEnabled())
                    log.debug("Generating cluster from " + pair[0] + " and " + pair[1] + " in " + genehash);
                cluster.contains().addAll(left.contains());
                cluster.contains().addAll(right.contains());
                cluster.d(closest.getKey());
                if (log.isDebugEnabled())
                    log.debug("Generated cluster " + cluster);

                genehash.put(cluster.id(), cluster);
                genehash.remove(pair[0]);
                genehash.remove(pair[1]);

                // Queue the average pairwise row distance between the new
                // cluster and every other live cluster.
                for (Cluster other : genehash.values()) {
                    if (other == cluster)
                        continue;

                    double sum = 0;
                    int n = 0;
                    for (int current : other.contains())
                        for (int created : cluster.contains()) {
                            sum += distances.getEntry(current, created);
                            n++;
                        }

                    enqueue(sorted, sum / n, other.id(), cluster.id());
                }
            }

            return genehash.values().iterator().next();
        }
    }

    /**
     * Euclidean distance between two rows, skipping every column in which
     * either value is NaN or infinite.
     */
    private static double rowDistance(RealMatrix matrix, int first, int second) {
        double total = 0;
        for (int k = 0; k < matrix.getColumnDimension(); k++) {
            double left = matrix.getEntry(first, k);
            double right = matrix.getEntry(second, k);
            if (!isNaN(left) && !isNaN(right) && !isInfinite(left) && !isInfinite(right)) {
                double diff = left - right;
                total += diff * diff;
            }
        }
        return Math.sqrt(total);
    }

    /** Adds a candidate pair to the bucket for the given distance. */
    private static void enqueue(TreeMap<Double, List<int[]>> queue, double distance, int first, int second) {
        List<int[]> bucket = queue.get(distance);
        if (bucket == null) {
            bucket = new ArrayList<>();
            queue.put(distance, bucket);
        }
        bucket.add(new int[] { first, second });
    }
}