org.clueminer.clustering.algorithm.DBSCANParamEstim.java Source code

Java tutorial

Introduction

Here is the source code for org.clueminer.clustering.algorithm.DBSCANParamEstim.java

Source

/*
 * Copyright (C) 2011-2015 clueminer.org
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.clueminer.clustering.algorithm;

import java.util.Arrays;
import java.util.Collections;
import org.apache.commons.math3.util.FastMath;
import org.clueminer.dataset.api.Dataset;
import org.clueminer.dataset.api.Instance;
import org.clueminer.knn.LinearSearch;
import org.clueminer.neighbor.KNNSearch;
import org.clueminer.neighbor.Neighbor;
import org.clueminer.utils.Props;

/**
 * Parameter estimator for DBSCAN algorithm. Based on recommendation in the
 * original paper.
 *
 * Estimating the parameters, especially the Eps parameter, is hard without
 * prior knowledge of the dataset. We use the sorted k-NN (k = 4) distances of
 * all data points in order to determine proper settings.
 *
 * @author deric
 * @param <E>
 */
public class DBSCANParamEstim<E extends Instance> {

    /**
     * k-th neighbor used as reference distance for each data point. According
     * to the DBSCAN authors, the estimated Eps won't change much after the 4th
     * neighbor.
     */
    private int k = 4;
    /**
     * Slope of the reference line, set by the last call to {@code estimate}.
     */
    private double slope;
    /**
     * Number of consecutive reference-line points (starting at the candidate
     * index) that are inspected when measuring a point's distance to the line.
     */
    private int localNeighborhood = 10;
    /**
     * k-dist values sorted from max to min; {@code null} until {@code estimate}
     * has been called.
     */
    private Double[] kdist;
    private int knee;
    private double eps;

    /**
     * Estimates the Eps parameter from the k-dist curve of the dataset and
     * stores it in {@code params} under the {@code DBSCAN.EPS} key. The
     * computed k-dist values, slope, knee index and eps are kept in this
     * object and are available through the getters afterwards.
     *
     * @param dataset data to estimate the parameters for, must not be empty
     * @param params  properties that receive the estimated Eps value
     * @throws IllegalArgumentException when the dataset is empty
     */
    public void estimate(Dataset<E> dataset, Props params) {
        if (dataset.size() == 0) {
            // without any point there is no k-dist curve to search a knee on
            throw new IllegalArgumentException(
                    "can not estimate DBSCAN parameters from an empty dataset");
        }
        //compute k-dist for dataset
        kdist(dataset);

        knee = findKnee(kdist);
        //recommended eps value
        eps = kdist[knee];
        params.putDouble(DBSCAN.EPS, eps);
    }

    /**
     * Computes, for every instance, the distance to its k-th neighbor and
     * stores the values sorted in descending order in the {@code kdist} field.
     *
     * @param dataset data to compute the k-dist values for
     * @return k-dist values sorted from max to min
     */
    private Double[] kdist(Dataset<E> dataset) {
        //k-dist graph data
        KNNSearch<Instance> knn = new LinearSearch(dataset);
        int n = dataset.size();
        Double[] dists = new Double[n];
        for (int i = 0; i < n; i++) {
            Neighbor[] neighbors = knn.knn(dataset.get(i), k);
            dists[i] = neighbors[k - 1].distance;
        }
        // sort the WHOLE array: the end index of the ranged Arrays.sort
        // overload is exclusive, so (0, length - 1) left the last value unsorted
        Arrays.sort(dists, Collections.reverseOrder());

        kdist = dists;
        return kdist;
    }

    /**
     * Searches the first half of the k-dist curve for the index whose point
     * lies farthest from the reference line (the line through the first point
     * and the point at {@code maxX}). Also updates the {@code slope} field.
     *
     * @param kdist k-dist values sorted from max to min
     * @return index of the knee point, 0 when no candidate has a positive
     *         distance to the reference line
     */
    private int findKnee(Double[] kdist) {
        int maxX = maxX(kdist);
        slope = slope(kdist, maxX);
        // distances are non-negative, so 0 is a proper lower bound
        // (Double.MIN_VALUE is the smallest POSITIVE double, not the lowest value)
        double max = 0.0;
        int maxIdx = 0;
        for (int i = 1; i < maxX; i++) {
            double dist = localMin(kdist, i, i + localNeighborhood, slope);
            if (dist > max) {
                max = dist;
                maxIdx = i;
            }
        }
        System.out.println("max = " + max + ", at " + maxIdx);
        return maxIdx;
    }

    /**
     * Finds the shortest distance between the curve point at index
     * {@code from} and the reference-line points at indexes
     * {@code from .. to - 1}. Only the reference line is evaluated at those
     * indexes, so {@code to} may exceed the length of {@code kdist}.
     *
     * @param kdist k-dist values sorted from max to min
     * @param from  index of the curve point (inclusive start of the search)
     * @param to    exclusive end index of the inspected reference-line points
     * @param slope slope of the reference line
     * @return the shortest distance found
     */
    private double localMin(Double[] kdist, int from, int to, double slope) {
        double min = Double.MAX_VALUE;
        for (int i = from; i < to; i++) {
            double dist = distance(from, kdist[from], i, ref(kdist, i, slope));
            if (dist < min) {
                min = dist;
            }
        }
        // return the minimum, not the distance computed in the last iteration
        return min;
    }

    /**
     * Euclidean distance between points {@code (x1, y1)} and {@code (x2, y2)}.
     */
    private double distance(double x1, double y1, double x2, double y2) {
        double dist = FastMath.pow(x1 - x2, 2) + FastMath.pow(y1 - y2, 2);
        return Math.sqrt(dist);
    }

    /**
     * Computes the slope of the straight line going through the points
     * {@code (0, kdist[0])} and {@code (maxX, kdist[maxX])}.
     *
     * @param kdist sorted array of data values (from max to min)
     * @param maxX  index of the second point defining the line
     * @return slope of the line, 0 when {@code maxX} is 0 (both points
     *         coincide, so there is no line to measure)
     */
    public double slope(Double[] kdist, int maxX) {
        if (maxX == 0) {
            // avoid 0/0 = NaN for curves with fewer than three points
            return 0.0;
        }
        double y1 = kdist[0]; //max y
        double y2 = kdist[maxX]; //min y - we're looking for elbow/knee in first half
        return (y2 - y1) / maxX;
    }

    /**
     * We're looking for the knee point in the first half of the curve.
     *
     * @param kdist k-dist values
     * @return index in the middle of the array, used as the upper bound of
     *         the knee search
     */
    private int maxX(Double[] kdist) {
        return (kdist.length - 1) / 2;
    }

    /**
     * Value of the reference straight line, which starts at {@code kdist[0]},
     * at position {@code i}.
     *
     * @param kdist sorted array of data values (from max to min)
     * @param i     x coordinate at which the line is evaluated
     * @param slope slope of the line
     * @return y coordinate of the reference line at {@code i}
     */
    public double ref(Double[] kdist, int i, double slope) {
        return kdist[0] + (slope * i);
    }

    public int getK() {
        return k;
    }

    public double getSlope() {
        return slope;
    }

    public Double[] getKdist() {
        return kdist;
    }

    /**
     * X coordinate of the knee (elbow) point on the curve
     *
     * @return index of the knee in the sorted k-dist array
     */
    public int getKnee() {
        return knee;
    }

    public double getEps() {
        return eps;
    }

    /**
     * k-dist value in the middle of the sorted curve, i.e. at the upper bound
     * of the knee search. Requires {@code estimate} to have been called.
     *
     * @return k-dist value at the end of the searched half of the curve
     */
    public double getMinEps() {
        return kdist[maxX(kdist)];
    }

    /**
     * Make sure max eps is in reasonable interval around actual eps. Requires
     * {@code estimate} to have been called.
     *
     * @return estimated reasonable max value
     */
    public double getMaxEps() {
        return eps + (kdist[0] - eps) / 2.0;
    }
}