com.jgaap.distances.PearsonCorrelationDistance.java Source code

Java tutorial

Introduction

Here is the source code for com.jgaap.distances.PearsonCorrelationDistance.java

Source

/*
 * JGAAP -- a graphical program for stylometric authorship attribution
 * Copyright (C) 2009,2011 by Patrick Juola
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
/**
 **/
package com.jgaap.distances;

import java.util.Set;

import com.google.common.collect.Sets;
import com.jgaap.generics.DistanceFunction;
import com.jgaap.util.Event;
import com.jgaap.util.Histogram;

/**
 * PearsonCorrelationDistance : Parametric equivalent of
 * KendallCorrelationDistance, based on Pearson's product-moment correlation 
 * General theory:
 * Pearson's correlation measures how similar frequency rankings are
 * between two rank orderings; +1 is perfect agreement, -1 is perfect
 * disagreement. We subtract from 1 to get a distance measure.  The main
 * difference is that Pearson measures only linear association, so the
 * Kendall correlation between x^2 and x^3 is perfect, but the PPMC is not.
 * 
 * @author Juola
 * @version 5.0.1
 */
public class PearsonCorrelationDistance extends DistanceFunction {
    @Override
    public String displayName() {
        return "Pearson Correlation Distance";
    }

    @Override
    public String tooltipText() {
        return "Pearson Correlation Distance Nearest Neighbor Classifier";
    }

    @Override
    public boolean showInGUI() {
        return true;
    }

    /**
     * Returns PPMCC distance between event sets es1 and es2
     * 
     * @param es1
     *            The first EventSet
     * @param es2
     *            The second EventSet
     * @return the PPMCC distance (1 - Pearson's r) between them.
     *         Returns 0 if sd of both es1 and es2 are zero (point mass),
     *            otherwise returns 1 if only one is 0.  
     */
    @Override
    public double distance(Histogram unknownHistogram, Histogram knownHistogram) {
        Set<Event> s = Sets.union(unknownHistogram.uniqueEvents(), knownHistogram.uniqueEvents());

        int n; // number of elements

        double sigX; // sum of relative frequencies in h1;
        double sigY; // sum of relative frequencies in h2;
        double sigXY; // sum of products of relative frequencies
        double sigX2; // sum of squared relative frequencies in h1;
        double sigY2; // sum of squared relative frequencies in h2;

        double denom1, denom2; // factors of denominator

        double correlation = 0.0;

        //System.out.println(h1.toString());
        //System.out.println(h2.toString());
        //System.out.println(s.toString());

        n = s.size();
        sigX = 0.0;
        sigY = 0.0;
        sigXY = 0.0;
        sigX2 = 0.0;
        sigY2 = 0.0;
        for (Event e : s) {
            double x = unknownHistogram.relativeFrequency(e);
            double y = knownHistogram.relativeFrequency(e);
            sigX += x;
            sigY += y;
            sigX2 += x * x;
            sigY2 += y * y;
            sigXY += x * y;
        }
        ;

        //System.out.println("n = " + n);
        //System.out.println("sigX = " + sigX);
        //System.out.println("sigY = " + sigY);
        //System.out.println("sigXY = " + sigXY);
        //System.out.println("sigX2 = " + sigX2);
        //System.out.println("sigY2 = " + sigY2);

        // formula from http://davidmlane.com/hyperstat/A56626.html
        // as well as lots of other places

        denom1 = sigX2 - (sigX * sigX) / n;
        denom2 = sigY2 - (sigY * sigY) / n;

        //System.out.println("denom1 = " + denom1);
        //System.out.println("denom2 = " + denom2);

        // check for edge cases
        if (Math.abs(denom1) < 0.000001 && Math.abs(denom2) < 0.000001)
            return 0;

        if (Math.abs(denom1) < 0.000001 || Math.abs(denom2) < 0.000001)
            return 1;

        correlation = (sigXY - (sigX * sigY) / n) / Math.sqrt(denom1 * denom2);

        //System.out.println("correlation = "+correlation);
        //System.out.println("---");

        return 1.0 - correlation;

    }
}