cezeri.feature.selection.FeatureSelectionRanker.java Source code


Introduction

Here is the source code for cezeri.feature.selection.FeatureSelectionRanker.java
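
Before the full listing, here is a minimal usage sketch (not part of the original class). It assumes the dataset is loaded with Weka's DataSource, that the class attribute is the last column, and that TMachineLearning defines a CLASSIFICATION constant alongside the REGRESSION constant that appears in the listing; the file name my_dataset.arff is purely illustrative.

import cezeri.feature.selection.FeatureSelectionRanker;
import cezeri.types.TFeatureRank;
import cezeri.types.TMachineLearning;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class FeatureSelectionRankerDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset and mark the last attribute as the class
        Instances data = new DataSource("my_dataset.arff").getDataSet();
        data.setClassIndex(data.numAttributes() - 1);

        // Rank features by Fisher score (binary classification only)
        TFeatureRank[] ranks = FeatureSelectionRanker.fisherDistance(data, TMachineLearning.CLASSIFICATION);
        System.out.println(FeatureSelectionRanker.toString("Fisher ranking", ranks));

        // Alternatively, evaluate every feature subset with a classifier
        // (feasible only for small feature counts), e.g.:
        // FeatureSelectionRanker.wrapperExhaustiveSearch(data, new weka.classifiers.trees.J48(), 10, false, false);
    }
}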

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package cezeri.feature.selection;

import cezeri.evaluater.FactoryEvaluation;
import cezeri.matrix.CMatrix;
import cezeri.utils.CustomComparatorForFeatureRank;
import cezeri.utils.FactoryCombination;
import cezeri.utils.FactoryInstance;
import cezeri.utils.FactoryStatistic;
import cezeri.utils.FactoryUtils;
import cezeri.types.TCorelation;
import cezeri.types.TFeatureRank;
import cezeri.types.TMachineLearning;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.Instances;

/**
 *
 * @author BAP1
 */
public class FeatureSelectionRanker {

    /**
     * Prints the given message to standard output and returns it unchanged.
     *
     * @param msg the message to print
     * @return the same message
     */

    public static String yaz(String msg) {
        System.out.println(msg);
        return msg;
    }

    /**
     * Ranks features by Fisher discrimination score. Use this method only for
     * binary classification problems; the Fisher score is not suitable for
     * regression, so for regression problems use the CRCF metric via
     * rankFeatureRegression instead. There is no need to normalize the dataset
     * before applying the Fisher score.
     *
     * @param data dataset
     * @param type TMachineLearning.CLASSIFICATION (returns null for REGRESSION)
     * @return features ranked by Fisher score, or null if the problem is not suitable
     */
    public static TFeatureRank[] fisherDistance(Instances data, int type) {
        if (type == TMachineLearning.REGRESSION) {
            return null;
        }
        TFeatureRank[] ret = new TFeatureRank[data.numAttributes() - 1];
        String[] attributeNames = FactoryInstance.getAttributeList(data);
        //        FactoryInstance.getMatrix(data).plot();
        Instances[] ins = FactoryInstance.getSpecificInstancesBasedOnClassValue(data,
                FactoryInstance.getDefaultClasses(data));
        if (ins.length < 2) {
            return null;
        }
        double[][] cl_1 = CMatrix.getInstance(FactoryInstance.getData(ins[0])).transpose().get2DArrayDouble();
        double[][] cl_2 = CMatrix.getInstance(FactoryInstance.getData(ins[1])).transpose().get2DArrayDouble();
        //        FactoryMatrix.transpose(FactoryInstance.getData(ins[1]));
        double[] fisher = new double[cl_1.length];
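        // Fisher score for a feature: (mean_1 - mean_2)^2 / (std_1^2 + std_2^2);
        // a larger score means the two classes are better separated along that feature.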
        for (int i = 0; i < cl_1.length - 1; i++) {
            double mean_1 = FactoryUtils.getMean(cl_1[i]);
            double std_1 = FactoryStatistic.getStandardDeviation(cl_1[i]);
            double mean_2 = FactoryUtils.getMean(cl_2[i]);
            double std_2 = FactoryStatistic.getStandardDeviation(cl_2[i]);
            if (Math.pow(std_1, 2) + Math.pow(std_2, 2) == 0.0) {
                fisher[i] = 0.0;
            } else {
                //                double f = Math.abs(mean_1 - mean_2) / (Math.pow(std_1, 2) + Math.pow(std_2, 2));
                double f = Math.pow((mean_1 - mean_2), 2) / (Math.pow(std_1, 2) + Math.pow(std_2, 2));
                fisher[i] = FactoryUtils.formatDouble(f);
            }
            TFeatureRank obj = new TFeatureRank();
            obj.featureName = attributeNames[i];
            obj.index = "" + i;
            obj.value = fisher[i];
            ret[i] = obj;
            //println(i + ".fisher distance:" + fisher[i]);
        }
        ArrayList<TFeatureRank> lst = toArrayList(ret);
        Collections.sort(lst, new CustomComparatorForFeatureRank());
        ret = toArray(lst);
        //        int[] fisherIndex = FactoryUtils.sortArrayAndReturnIndex(fisher, "desc");
        return ret;
    }

    /**
     * Ranks features for regression problems: each feature is correlated with
     * the target attribute using the selected metric, and the features are
     * then sorted in descending order of the (absolute) metric value.
     *
     * @param data dataset
     * @param type correlation metric, e.g. TCorelation.ARE
     * @return features sorted in descending order of the metric
     */
    public static TFeatureRank[] rankFeatureRegression(Instances data, int type) {
        TFeatureRank[] cr = correlation(data, type);
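        // selection-style sort: arrange the ranked features in descending order of the correlation metric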
        for (int i = 0; i < cr.length; i++) {
            for (int j = i; j < cr.length; j++) {
                if (cr[i].value < cr[j].value) {
                    TFeatureRank tmp = new TFeatureRank();
                    tmp.featureName = cr[i].featureName;
                    tmp.index = cr[i].index;
                    tmp.value = cr[i].value;
                    cr[i].featureName = cr[j].featureName;
                    cr[i].index = cr[j].index;
                    cr[i].value = cr[j].value;
                    cr[j].featureName = tmp.featureName;
                    cr[j].index = tmp.index;
                    cr[j].value = tmp.value;
                }
            }
        }
        return cr;
    }

    /**
     * If a full exhaustive search is not feasible due to its computational
     * cost, shrink the search space by fixing the size of the feature subsets
     * to explore.
     *
     * @param nSubset desired subset size, e.g. with 15 features you may only
     * want results for subsets of 9 features
     * @param data train or test data
     * @param model classifier to be used
     * @param nFolds number of cross-validation folds used during learning
     * @param show_text print the output
     * @param show_plot plot the output
     * @return subsets ranked by cross-validated performance
     */
    public static TFeatureRank[] wrapperExhaustiveSearchLimited(int nSubset, Instances data, Classifier model,
            int nFolds, boolean show_text, boolean show_plot) {
        if (nSubset > data.numAttributes() - 1) {
            System.out.println("nSubset must be less than the number of attributes");
            return null;
        }
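
        // Fixing the subset size shrinks the search space considerably: with 15 features,
        // C(15, 9) = 5005 subsets of size 9 versus 2^15 - 1 = 32767 non-empty subsets in the full search.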

        String[] attributeNames = FactoryInstance.getAttributeListExceptClassAttribute(data);
        String[] lstComb = FactoryCombination.getCombination(attributeNames, nSubset);
        FactoryCombination.toString(lstComb);
        TFeatureRank[] ret = computeCombinationPairs(lstComb, data, model, nFolds, show_text, show_plot);
        return ret;
    }

    /**
     * Exhaustive global search over the feature space; feasible only when the
     * number of features is small (roughly 15 or fewer), since every possible
     * subset is evaluated.
     *
     * @param data dataset
     * @param model classifier
     * @param nFolds number of cross-validation folds
     * @param show_text print the output
     * @param show_plot plot the output
     * @return subsets ranked by cross-validated performance
     */
    public static TFeatureRank[] wrapperExhaustiveSearch(Instances data, Classifier model, int nFolds,
            boolean show_text, boolean show_plot) {
        if (data.numAttributes() > 15) {
            System.out.println(
                    "for exhaustive search, more than 15 attributes is not feasible: the computational cost is too high");
            return null;
        }
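        // With n candidate features an exhaustive search evaluates 2^n - 1 non-empty subsets,
        // so the cost doubles with every extra feature (e.g. 16383 subsets for 14 features).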
        String[] attributeNames = FactoryInstance.getAttributeListExceptClassAttribute(data);
        String[] lstComb = FactoryCombination.getAllCombinations(attributeNames);
        TFeatureRank[] ret = computeCombinationPairs(lstComb, data, model, nFolds, show_text, show_plot);
        return ret;
    }

    /**
     * Previous version of Simulated Annealing. Note that this version treats
     * the value returned by computeCombinationFeature as an error to be
     * minimized, whereas the public wrapperSimulatedAnnealing method maximizes
     * it.
     *
     * @param subset size of the feature subsets to explore
     * @param data instances
     * @param model classifier
     * @param folds number of cross-validation folds
     * @param show_text print the output
     * @param show_plot plot the output
     * @return explored subsets ranked by performance
     */
    private static TFeatureRank[] wrapperSimulatedAnnealingV1(int subset, Instances data, Classifier model,
            int folds, boolean show_text, boolean show_plot) {
        String[] attList = FactoryInstance.getAttributeListExceptClassAttribute(data);
        Random rnd = new Random();
        double globalError = 1.0;
        double ret = 0;
        double probability;
        double alpha = 0.9999;
        double temperature = 400000.0;
        double epsilon = 0.001;
        double delta;
        ArrayList<String> combinationList = new ArrayList<String>();
        ArrayList<TFeatureRank> rankList = new ArrayList<TFeatureRank>();
        String[] lstComb = FactoryCombination.getCombination(attList, subset);
        String currCombination = FactoryUtils.getRandomSubset(lstComb);
        String bestCombination = currCombination;
        int q = 0;
        boolean isNeighbor = false;
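        // Classic annealing loop: improving moves (delta < 0) are always accepted; worsening moves
        // are accepted with probability exp(-delta / temperature), and the temperature decays
        // geometrically by the factor alpha after every step.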
        while (temperature > epsilon) {
            q++;
            if (isNeighbor) {
                currCombination = FactoryUtils.getNeighborSubset(currCombination, lstComb);
            } else {
                currCombination = FactoryUtils.getRandomSubset(lstComb);
            }
            if (combinationList.contains(currCombination)) {
                //                    System.out.println("temperature:" + temperature + " during sim annealing redundant subset was found and ignored " + currCombination);
                temperature *= alpha;
                continue;
            } else {
                combinationList.add(currCombination);
            }
            ret = computeCombinationFeature(currCombination, data, folds, model, show_text, show_plot);
            delta = ret - globalError;
            if (delta < 0) {
                globalError = ret;
                bestCombination = currCombination;
            } else {
                probability = rnd.nextDouble();
                if (probability < Math.exp(-delta / temperature)) {
                    globalError = ret;
                    bestCombination = currCombination;
                }
            }
            temperature *= alpha;
            System.out.println(
                    q + ".new subset:" + currCombination + " temperature:" + temperature + " accuracy:" + ret);
            TFeatureRank obj = new TFeatureRank();
            obj.featureName = currCombination;
            obj.index = q + "";
            obj.value = ret;
            rankList.add(obj);

        }
        Collections.sort(rankList, new CustomComparatorForFeatureRank());
        TFeatureRank[] fr = toArray(rankList);
        return fr;
    }

    public static TFeatureRank[] wrapperSimulatedAnnealing(int subset, Instances data, Classifier model, int folds,
            boolean show_text, boolean show_plot) {
        String[] attList = FactoryInstance.getAttributeListExceptClassAttribute(data);
        ArrayList<String> combinationList = new ArrayList<>();
        ArrayList<TFeatureRank> rankList = new ArrayList<>();
        String[] lstComb = FactoryCombination.getCombination(attList, subset);
        System.out.println("Number of candidate subsets:" + lstComb.length);
        int size = lstComb.length;
        String currentCombination = FactoryUtils.getRandomSubset(lstComb);
        String newCombination = "";
        String bestCombination = currentCombination;

        // Set initial temp
        double temp = 1000000;

        // Cooling rate
        double coolingRate = 0.009;
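        // With T0 = 1,000,000 and coolingRate = 0.009 the temperature drops below 1 after
        // roughly 1,500 cooling steps; the m > size check inside the loop can end it earlier.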

        // Set as current and best
        double bestEnergy = computeCombinationFeature(currentCombination, data, folds, model, show_text, show_plot);
        double currentEnergy = bestEnergy;
        // Report the initial solution
        System.out.println("Initial solution accuracy: " + bestEnergy);

        // Loop until system has cooled
        int k = 0;
        int m = 0;
        while (temp > 1) {
            m++;
            if (m > size) {
                temp = 0;
            }
            // Draw a new random candidate subset (not necessarily a neighbour of the current one)
            newCombination = FactoryUtils.getRandomSubset(lstComb);
            if (combinationList.contains(newCombination)) {
                //                temp *= 1 - coolingRate;
                continue;
            } else {
                combinationList.add(newCombination);
            }

            // Get energy of solutions
            double neighbourEnergy = computeCombinationFeature(newCombination, data, folds, model, show_text,
                    show_plot);

            // Decide if we should accept the neighbour
            if (acceptanceProbability(currentEnergy, neighbourEnergy, temp) > Math.random()) {
                currentCombination = newCombination;
                currentEnergy = neighbourEnergy;
            }

            // Keep track of the best solution found
            if (currentEnergy > bestEnergy) {
                bestEnergy = currentEnergy;
                bestCombination = currentCombination;
                TFeatureRank obj = new TFeatureRank();
                obj.featureName = bestCombination;
                obj.index = "" + (k++);
                obj.value = bestEnergy;
                rankList.add(obj);
                System.out.println(obj); // log the newly found best subset
            }
            System.out.println("iteration:" + m + " T=" + temp + " result:" + bestEnergy);
            // Cool system
            temp *= 1 - coolingRate;
        }
        System.out.println("iteration:" + m + " T=" + temp);
        System.out.println("Final solution accuracy: " + bestEnergy);
        System.out.println("Subset: " + bestCombination);
        Collections.sort(rankList, new CustomComparatorForFeatureRank());
        TFeatureRank[] fr = toArray(rankList);
        return fr;
    }

    public static ArrayList<TFeatureRank> toArrayList(TFeatureRank[] d) {
        ArrayList<TFeatureRank> ret = new ArrayList<TFeatureRank>();
        for (int i = 0; i < d.length; i++) {
            ret.add(d[i]);
        }
        return ret;
    }

    public static TFeatureRank[] toArray(ArrayList<TFeatureRank> d) {
        TFeatureRank[] ret = new TFeatureRank[d.size()];
        for (int i = 0; i < d.size(); i++) {
            ret[i] = d.get(i);
        }
        return ret;
    }

    public static String toString(String str, TFeatureRank[] d) {
        String ret = "";
        System.out.println("");
        //        System.out.println(str);
        if (d == null) {
            System.out.println("NULL VALUE, OR REGRESSION PROBLEM: THE FISHER DISCRIMINATION METRIC DOES NOT APPLY!");
            return "";
        }
        for (int i = 0; i < d.length; i++) {
            ret += d[i].toString() + "\n";
        }
        //        System.out.println("------------------------------------------------------");
        //        System.out.println("List Size:" + d.length);
        return ret;
    }

    private static TFeatureRank[] correlation(Instances data, int type) {
        TFeatureRank[] ret = new TFeatureRank[data.numAttributes() - 1];
        String[] attributeNames = FactoryInstance.getAttributeList(data);
        double[] out = data.attributeToDoubleArray(data.classIndex());
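        // Each attribute is scored against the class/target column; absolute values are used
        // so that strong negative relationships rank as high as strong positive ones.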
        for (int i = 0; i < data.numAttributes() - 1; i++) {
            TFeatureRank obj = new TFeatureRank();
            obj.featureName = attributeNames[i];
            obj.index = i + "";
            if (type == TCorelation.ARE) {
                obj.value = Math.abs(FactoryStatistic.ARE(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.CRCF) {
                obj.value = Math.abs(FactoryStatistic.CRCF(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.IOA) {
                obj.value = Math.abs(FactoryStatistic.IOA(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.KENDALL) {
                obj.value = Math.abs(FactoryStatistic.KENDALL(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.MAE) {
                obj.value = Math.abs(FactoryStatistic.MAE(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.MPE) {
                obj.value = Math.abs(FactoryStatistic.MPE(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.MSE) {
                obj.value = Math.abs(FactoryStatistic.MSE(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.NSEC) {
                obj.value = Math.abs(FactoryStatistic.NSEC(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.PEARSON) {
                obj.value = Math.abs(FactoryStatistic.PEARSON(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.R) {
                obj.value = Math.abs(FactoryStatistic.R(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.R2) {
                obj.value = Math.abs(FactoryStatistic.R2(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.RAE) {
                obj.value = Math.abs(FactoryStatistic.RAE(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.RELATIVE_NSEC) {
                obj.value = Math.abs(FactoryStatistic.RELATIVE_NSEC(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.RMSE) {
                obj.value = Math.abs(FactoryStatistic.RMSE(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.RRSE) {
                obj.value = Math.abs(FactoryStatistic.RRSE(data.attributeToDoubleArray(i), out));
            }
            if (type == TCorelation.SPEARMAN) {
                obj.value = Math.abs(FactoryStatistic.SPEARMAN(data.attributeToDoubleArray(i), out));
            }
            //            if (type==FactoryCorrelation.KENDALL) {
            //                obj.value=Math.abs(FactoryCorrelation.rankKendallTauBeta(data.attributeToDoubleArray(i), out));
            //            }
            //            if (type==FactoryCorrelation.PEARSON) {
            //                obj.value=Math.abs(FactoryCorrelation.pearson(data.attributeToDoubleArray(i), out));
            //            }
            //            if (type==FactoryCorrelation.SPEARMAN) {
            //                obj.value=Math.abs(FactoryCorrelation.spearman(data.attributeToDoubleArray(i), out));
            //            }            
            ret[i] = obj;
        }
        ArrayList<TFeatureRank> lst = toArrayList(ret);
        Collections.sort(lst, new CustomComparatorForFeatureRank());
        ret = toArray(lst);
        return ret;
    }

    private static TFeatureRank[] computeCombinationPairs(String[] lstComb, Instances data, Classifier model,
            int nFolds, boolean show_text, boolean show_plot) {
        TFeatureRank[] ret = new TFeatureRank[lstComb.length];
        int m = lstComb.length;
        double q = m * 1.0 / 100;
        int n = 0;
        for (int i = 0; i < m; i++) {
            if (n != (int) Math.round(i / q)) {
                n = (int) Math.round(i / q);
                System.out.println("progress:" + n + "%");
            }
            TFeatureRank obj = new TFeatureRank();
            obj.featureName = lstComb[i];
            obj.index = i + "";
            Instances subsetData = FactoryInstance.getSubsetData(data, lstComb[i].split(","));
            Evaluation eval = FactoryEvaluation.performCrossValidate(model, subsetData, nFolds, show_text,
                    show_plot);
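            // Nominal class attribute -> ranked by accuracy (percent correct); numeric class -> correlation coefficient.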
            try {
                if (data.classAttribute().isNominal()) {
                    obj.value = eval.pctCorrect();
                } else {
                    obj.value = eval.correlationCoefficient();
                }
            } catch (Exception ex) {
                Logger.getLogger(FeatureSelectionRanker.class.getName()).log(Level.SEVERE, null, ex);
            }
            ret[i] = obj;
        }
        ArrayList<TFeatureRank> lst = toArrayList(ret);
        Collections.sort(lst, new CustomComparatorForFeatureRank());
        ret = toArray(lst);
        return ret;
    }

    private static double computeCombinationFeature(String lstComb, Instances data, int folds, Classifier model,
            boolean show_text, boolean show_plot) {
        TFeatureRank obj = new TFeatureRank();
        obj.featureName = lstComb;
        obj.index = "";
        Instances subsetData = FactoryInstance.getSubsetData(data, lstComb.split(","));
        Evaluation eval = FactoryEvaluation.performCrossValidate(model, subsetData, folds, show_text, show_plot);
        try {
            if (data.classAttribute().isNominal()) {
                obj.value = eval.pctCorrect();
            } else {
                obj.value = eval.correlationCoefficient();
            }
        } catch (Exception ex) {
            Logger.getLogger(FeatureSelectionRanker.class.getName()).log(Level.SEVERE, null, ex);
        }
        return obj.value;
    }

    // Calculate the acceptance probability (the energies here are accuracies, so higher is better)
    private static double acceptanceProbability(double energy, double newEnergy, double temperature) {
        // If the new solution is better, accept it unconditionally
        if (newEnergy > energy) {
            return 1.0;
        }
        // If the new solution is worse, accept it with a probability that shrinks as the system cools
        return Math.exp((newEnergy - energy) / temperature);
    }

}