entity.DifficultyResamplingManager.java Source code

Java tutorial

Introduction

Here is the source code for entity.DifficultyResamplingManager.java

Source

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package entity;

import java.math.BigDecimal;
import weka.core.Instances;

/**
 * This class manages the resampling method for PR variation in a dataset
 * 
 * @author Edoardo Varasi
 * @email edoardo.varasi@gmail.com
 *
 */
public class DifficultyResamplingManager {

    /**
     * When true, progress messages are printed to standard output.
     * No code in this class ever sets the flag to true.
     */
    private boolean verbose = false;

    /**
     * Computes how many positive and negative instances can be kept from the
     * dataset so that the ceiling of the positive percentage (PR) gets as
     * close as possible to the requested one.
     * <p>
     * An instance is positive when the string value of its class attribute
     * equals {@code Settings.buggyLabel}; every other instance is counted as
     * negative. Only whole percentages are considered: the requested value is
     * read through {@link BigDecimal#intValue()}, which drops any fraction.
     * <p>
     * Positives are removed while the PR is too high and negatives are removed
     * while it is too low. Neither count is ever pushed below zero, so the
     * search always terminates, even for a dataset without positives or for a
     * requested percentage outside 0..100 (in which case the nearest reachable
     * dimensions are returned).
     *
     * @param originalDataset dataset whose class distribution is inspected
     * @param positiveExamplePercentProportion desired positive percentage
     * @return the number of positives and negatives of the largest subdataset found
     */
    public SubdatasetDimensions calculateSubdatasetDimensionsForProportion(Instances originalDataset,
            BigDecimal positiveExamplePercentProportion) {

        final int desiredPp = positiveExamplePercentProportion.intValue();

        // start from the full dataset: p positives, n negatives
        int p = countPositives(originalDataset);
        int n = originalDataset.numInstances() - p;

        // PR of the full dataset
        int pp = calculatePositivePercentCeil(p + n, p);

        if (verbose) {
            System.out.println(
                    "[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] attuale: p=" + p
                            + " n=" + n + " pp = " + pp);
        }

        // the dataset already has the desired PR: keep everything
        if (pp == desiredPp) {
            return new SubdatasetDimensions(p, n);
        }

        // PR too high: drop positives until the PR is no longer above the
        // desired one (or there are no positives left to drop)
        if (pp > desiredPp) {
            while (pp > desiredPp && p > 0) {
                p--;
                pp = calculatePositivePercentCeil(p + n, p);
                if (verbose) {
                    System.out
                            .println("[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] p="
                                    + p + " n=" + n + " pp = " + pp);
                }
            }
            // undo the last removal if the previous PR was strictly nearer to the desired one
            if (isPPPNearerThanPPToDesiredPercent(calculatePositivePercentCeil(p + 1 + n, p + 1), pp, desiredPp)) {
                p++;
                pp = calculatePositivePercentCeil(p + n, p);
            }
        }

        // PR too low (possibly after the adjustment above): drop negatives
        // until the PR is no longer below the desired one. The n > 0 guard
        // stops the search when no negatives are left; without it a dataset
        // with zero positives would loop forever because its PR stays at 0.
        if (pp < desiredPp) {
            while (pp < desiredPp && n > 0) {
                n--;
                pp = calculatePositivePercentCeil(p + n, p);
                if (verbose) {
                    System.out
                            .println("[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] p="
                                    + p + " n=" + n + " pp = " + pp);
                }
            }
            // undo the last removal if the previous PR was strictly nearer to the desired one
            if (isPPPNearerThanPPToDesiredPercent(calculatePositivePercentCeil(p + n + 1, p), pp, desiredPp)) {
                n++;
                pp = calculatePositivePercentCeil(p + n, p);
            }
        }

        if (verbose) {
            System.out
                    .println("[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] finale p="
                            + p + " n=" + n + " pp = " + pp);
        }
        return new SubdatasetDimensions(p, n);
    }

    /**
     * Calculates a single subdataset size usable for every PR level between
     * the given minimum and maximum: the dimensions are computed for both
     * extremes and the smaller total is returned.
     *
     * @param originalDataset dataset whose class distribution is inspected
     * @param maxPositiveExamplePercentProportion highest desired positive percentage
     * @param minPositiveExamplePercentProportion lowest desired positive percentage
     * @return the smaller of the two subdataset sizes
     */
    public int calculateSubdatasetDimensionAmongProportions(Instances originalDataset,
            BigDecimal maxPositiveExamplePercentProportion, BigDecimal minPositiveExamplePercentProportion) {

        SubdatasetDimensions maxPpDimensions = this.calculateSubdatasetDimensionsForProportion(originalDataset,
                maxPositiveExamplePercentProportion);
        SubdatasetDimensions minPpDimensions = this.calculateSubdatasetDimensionsForProportion(originalDataset,
                minPositiveExamplePercentProportion);

        if (verbose) {
            System.out.println(
                    "[DifficultyResamplingManager, calculateSubdatasetDimensionAmongProportions] dimensioni maxPp: "
                            + maxPpDimensions.toString());
            System.out.println(
                    "[DifficultyResamplingManager, calculateSubdatasetDimensionAmongProportions] dimensioni minPp: "
                            + minPpDimensions.toString());
        }

        // choose the smaller of the two sizes; ties go to the min-PR dimensions
        if (minPpDimensions.getTotalNumberOfElements() <= maxPpDimensions.getTotalNumberOfElements()) {
            if (verbose) {
                System.out.println(
                        "[DifficultyResamplingManager, calculateSubdatasetDimensionAmongProportions] chose size for minPp");
            }
            return minPpDimensions.getTotalNumberOfElements();
        }

        if (verbose) {
            System.out.println(
                    "[DifficultyResamplingManager, calculateSubdatasetDimensionAmongProportions] chose size for maxPp");
        }
        return maxPpDimensions.getTotalNumberOfElements();
    }

    /**
     * Generates a subdataset by randomly resampling the original one.
     * <p>
     * The number of positives requested is the ceiling of
     * {@code dimension * pP / 100}; the remainder of {@code dimension} is
     * requested as negatives. The original dataset is shuffled in place on
     * every call, so the resampling can be repeated to obtain different
     * subdatasets.
     *
     * @param originalDataset dataset to sample from; its instance order is randomized
     * @param pP desired positive percentage
     * @param dimension desired total number of instances
     * @return the resampled subdataset
     */
    public Instances generateResampledSubdataset(Instances originalDataset, BigDecimal pP, int dimension) {

        int p = (int) Math.ceil((dimension * pP.doubleValue()) / 100.0);
        int n = dimension - p;
        SubdatasetDimensions subdatasetDimensions = new SubdatasetDimensions(p, n);

        if (verbose) {
            System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] resampling with: p=" + p
                    + ", n=" + n + ", tot=" + dimension + ", pp=" + pP + "%");
        }

        return this.generateResampledSubdataset(originalDataset, subdatasetDimensions);
    }

    /**
     * Shuffles the original dataset and copies instances into a new dataset
     * until the requested numbers of positives and negatives are reached.
     * <p>
     * If the original dataset holds fewer positives or negatives than
     * requested, the result simply contains all that are available.
     *
     * @param originalDataset dataset to sample from; its instance order is randomized
     * @param subdatasetDimensions requested number of positives and negatives
     * @return the resampled subdataset
     */
    private Instances generateResampledSubdataset(Instances originalDataset,
            SubdatasetDimensions subdatasetDimensions) {

        // empty dataset sharing the original header; avoids copying every
        // instance only to delete them all again
        Instances resampledSubdataset = new Instances(originalDataset, 0);

        // randomize dataset instances order
        originalDataset.randomize(RandomizationManager.randomGenerator);

        // number of positives still to insert
        int positivesToInsert = subdatasetDimensions.getP();
        if (verbose) {
            System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] positivesToInsert = "
                    + positivesToInsert);
        }

        // number of negatives still to insert
        int negativesToInsert = subdatasetDimensions.getN();

        // walk the shuffled dataset, stopping as soon as both quotas are filled
        final int size = originalDataset.numInstances();
        for (int i = 0; i < size && (positivesToInsert > 0 || negativesToInsert > 0); i++) {
            final String label = classLabel(originalDataset, i);

            // positive instance and more positives are needed
            if (positivesToInsert > 0 && label.equals(Settings.buggyLabel)) {
                resampledSubdataset.add(originalDataset.instance(i));
                positivesToInsert--;
            }

            // negative instance and more negatives are needed
            if (negativesToInsert > 0 && label.equals(Settings.nonbuggyLabel)) {
                resampledSubdataset.add(originalDataset.instance(i));
                negativesToInsert--;
            }
        }

        if (verbose) {
            System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] resampling terminato: "
                    + this.printDatasetInfo(resampledSubdataset));
        }
        return resampledSubdataset;
    }

    /**
     * Builds a one-line summary of the dataset: total number of instances,
     * number of positive and negative instances, and percentage of positives.
     * Despite its name, this method does not print anything itself.
     *
     * @param dataset dataset to summarize
     * @return the summary text
     */
    public String printDatasetInfo(Instances dataset) {

        int positives = 0;
        int negatives = 0;

        final int size = dataset.numInstances();
        for (int i = 0; i < size; i++) {
            final String label = classLabel(dataset, i);

            if (label.equals(Settings.buggyLabel)) {
                positives++;
            }

            if (label.equals(Settings.nonbuggyLabel)) {
                negatives++;
            }
        }

        // NaN for an empty dataset (0 / 0 in floating point)
        double percent = ((double) positives / (double) size) * 100;
        return "totale istanze: " + size + ", p+n=" + (positives + negatives) + ", p: "
                + positives + ", n: " + negatives + ", %p : " + percent;
    }

    /**
     * Returns the class attribute of the given instance as a string.
     *
     * @param dataset dataset containing the instance
     * @param index position of the instance inside the dataset
     * @return string value of the class attribute
     */
    private static String classLabel(Instances dataset, int index) {
        return dataset.instance(index).stringValue(dataset.classIndex());
    }

    /**
     * Counts the instances whose class label equals {@code Settings.buggyLabel}.
     *
     * @param dataset dataset to scan
     * @return number of positive instances
     */
    private static int countPositives(Instances dataset) {
        int positives = 0;
        final int size = dataset.numInstances();
        for (int i = 0; i < size; i++) {
            if (classLabel(dataset, i).equals(Settings.buggyLabel)) {
                positives++;
            }
        }
        return positives;
    }

    /**
     * Given a set size and its number of positives, returns the positive
     * percentage rounded up to the next integer.
     *
     * @param total size of the set
     * @param p number of positives in the set
     * @return ceiling of {@code p / total * 100}; 0 for an empty set
     */
    private int calculatePositivePercentCeil(int total, int p) {
        // an empty set has no positives; answer 0 explicitly rather than
        // relying on the NaN-to-int conversion of 0.0 / 0.0
        if (total == 0) {
            return 0;
        }
        double percent = (((double) p) / ((double) total)) * 100;
        return (int) Math.ceil(percent);
    }

    /**
     * Tells whether {@code ppp} is strictly nearer to {@code dpp} than
     * {@code pp} is.
     *
     * @param ppp candidate (previous) positive percentage
     * @param pp current positive percentage
     * @param dpp desired positive percentage
     * @return true if {@code |ppp - dpp| < |pp - dpp|}; false otherwise, including on ties
     */
    private boolean isPPPNearerThanPPToDesiredPercent(int ppp, int pp, int dpp) {
        return Math.abs(ppp - dpp) < Math.abs(pp - dpp);
    }
}