List of usage examples for weka.core.Instances.enumerateAttributes()
public Enumeration<Attribute> enumerateAttributes()
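For orientation before the project examples below, here is a minimal, self-contained sketch of the call. The file name "iris.arff" and the class-index choice are placeholders, not taken from any listing on this page. Note that once a class index is set, enumerateAttributes() skips the class attribute.

import java.util.Enumeration;
import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class EnumerateAttributesDemo {
    public static void main(String[] args) throws Exception {
        // Load any ARFF/CSV file supported by DataSource (path is a placeholder).
        Instances data = new DataSource("iris.arff").getDataSet();
        // After this call, enumerateAttributes() excludes the class attribute.
        data.setClassIndex(data.numAttributes() - 1);

        Enumeration<Attribute> atts = data.enumerateAttributes();
        while (atts.hasMoreElements()) {
            Attribute att = atts.nextElement();
            System.out.println(att.index() + ": " + att.name()
                    + " (" + Attribute.typeToString(att) + ")");
        }
    }
}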
From source file: jmetal.test.survivalanalysis.GenerateSurvivalGraph.java
License: Open Source License
/**
 * Evaluates a solution.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;
    int counterSelectedFeatures;
    DataSource source;
    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    double ArithmeticHarmonicCutScore = Double.MAX_VALUE;
    REXP x;

    variable = (Binary) solution.getDecisionVariables()[0];
    counterSelectedFeatures = 0;

    try {
        // Read the data file.
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();

        // Save the attributes 'T' and 'Censor' (the last two attributes).
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on the chromosome.
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now remove the two last attributes, 'T' and 'Censor'.
        Remove filter = new Remove();
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);
        // Filtering complete.

        // List the selected features/attributes.
        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes/features: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }
        System.out.println();

        // Train the hierarchical clusterer. The link type is one of
        // [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING].
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", this.HC_LinkType });
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false);
        clusterer.buildClusterer(dataClusterer);
        double[][] distanceMatrix = clusterer.getDistanceMatrix();

        // Cluster evaluation:
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);
        if (this.testDataFileName != null) {
            DataSource testSource = new DataSource(this.testDataFileName);
            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);
            // First filter the attributes based on the chromosome.
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evaluation for this solution (" + this.testDataFileName + "): "
                    + eval.clusterResultsToString());
        }

        // First analyze using my library function: save the cluster assignments.
        int[] clusterAssignment = new int[dataClusterer.numInstances()];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
        }
        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // Create arrays with time (event occurrence time) and censor data
        // for use with the jstat LogRankTest.
        double[] time1 = new double[classOneCnt];
        double[] censor1 = new double[classOneCnt];
        double[] time2 = new double[classTwoCnt];
        double[] censor2 = new double[classTwoCnt];
        for (int i = 0, cnt1 = 0, cnt2 = 0; i < dataClusterer.numInstances(); ++i) {
            if (clusterAssignment[i] == 0) {
                time1[cnt1] = data.get(i).value(attTime);
                censor1[cnt1++] = data.get(i).value(attCensor);
            } else if (clusterAssignment[i] == 1) {
                time2[cnt2] = data.get(i).value(attTime);
                censor2[cnt2++] = data.get(i).value(attCensor);
            }
        }

        // Calculate the log rank test statistic and p-value.
        LogRankTest testclass1 = new LogRankTest(time1, time2, censor1, censor2);
        double[] scores = testclass1.logRank();
        testStatistic = scores[0];
        pValue = scores[2];
        ArithmeticHarmonicCutScore = this.getArithmeticHarmonicCutScore(distanceMatrix, clusterAssignment);
        System.out.println("Calculation by myLibrary:\n testStatistic: " + scores[0] + " pValue: " + scores[2]
                + " Arithmetic Harmonic Cut Score: " + ArithmeticHarmonicCutScore);

        // Now analyze by calling R for the log rank test; parallelization not possible.
        String strT = "time <- c(";
        String strC = "censor <- c(";
        String strG = "group <- c(";
        for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
            strT = strT + (int) data.get(i).value(attTime) + ",";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
            strC = strC + (int) data.get(i).value(attCensor) + ",";
        }
        int tmpi = dataClusterer.numInstances() - 1;
        strT = strT + (int) data.get(tmpi).value(attTime) + ")";
        strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
        strC = strC + (int) data.get(tmpi).value(attCensor) + ")";
        this.re.eval(strT);
        this.re.eval(strC);
        this.re.eval(strG);

        // Call survdiff from the 'survival' library (much faster than the
        // alternative, surv_test from the 'coin' library).
        re.eval("library(survival)");
        re.eval("res2 <- survdiff(Surv(time,censor)~group,rho=0)");
        x = re.eval("res2$chisq");
        testStatistic = x.asDouble();
        x = re.eval("pchisq(res2$chisq, df=1, lower.tail = FALSE)");
        pValue = x.asDouble();
        System.out.println("Calculation by R:");
        System.out.println("StatScore: " + testStatistic + " pValue: " + pValue);

        // Plot the survival curves of the two groups and of the whole dataset.
        re.eval("timestrata1.surv <- survfit( Surv(time, censor)~ strata(group), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time, censor)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("\nCluster Assignments:");
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            System.out.println("Instance " + i + ": " + clusterAssignment[i]);
        }
    } catch (Exception e) {
        System.err.println("Can't open the data file.");
        e.printStackTrace();
        System.exit(1);
    }
}
From source file: jmetal.test.survivalanalysis.GenerateSurvivalGraphOld.java
License: Open Source License
/**
 * Evaluates a solution: actually generates the survival graph.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;
    int counterSelectedFeatures;
    DataSource source;
    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    REXP x;

    variable = (Binary) solution.getDecisionVariables()[0];
    counterSelectedFeatures = 0;
    System.out.println("\nSolution ID " + this.SolutionID);

    try {
        // Read the data file.
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();

        // Save the attributes 'T' and 'Censor' (the last two attributes).
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on the chromosome.
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now remove the two last attributes, 'T' and 'Censor'.
        Remove filter = new Remove();
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);
        // Filtering complete.

        // List the selected attributes.
        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }
        System.out.println();

        // Train the hierarchical clusterer with complete-linkage clustering.
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", "COMPLETE" });
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false);
        clusterer.buildClusterer(dataClusterer);

        // Cluster evaluation:
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);
        if (this.testDataFileName != null) {
            DataSource testSource = new DataSource(this.testDataFileName);
            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);
            // First filter the attributes based on the chromosome.
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evaluation for this solution: " + eval.clusterResultsToString());
        }

        // Save (and count) the cluster assignments.
        int[] clusterAssignment = new int[dataClusterer.numInstances()];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
        }
        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // Build the R vectors of times, censor flags and group labels.
        String strT = "time1 <- c(";
        String strC = "censor1 <- c(";
        String strG = "group1 <- c(";
        for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
            strT = strT + (int) data.get(i).value(attTime) + ",";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
            strC = strC + (int) data.get(i).value(attCensor) + ",";
        }
        int tmpi = dataClusterer.numInstances() - 1;
        strT = strT + (int) data.get(tmpi).value(attTime) + ")";
        strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
        strC = strC + (int) data.get(tmpi).value(attCensor) + ")";
        this.re.eval(strT);
        this.re.eval(strC);
        this.re.eval(strG);

        // Build the arrays for MyLogRankTest.
        double[] time1 = new double[classOneCnt];
        double[] time2 = new double[classTwoCnt];
        double[] censor1 = new double[classOneCnt];
        double[] censor2 = new double[classTwoCnt];
        int i1 = 0, i2 = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            if (clusterer.clusterInstance(dataClusterer.get(i)) == 0) {
                time1[i1] = data.get(i).value(attTime);
                censor1[i1] = data.get(i).value(attCensor);
                ++i1;
            } else {
                time2[i2] = data.get(i).value(attTime);
                censor2[i2] = data.get(i).value(attCensor);
                ++i2;
            }
        }

        // Call survdiff from the 'survival' library (much faster than the
        // alternative, surv_test from the 'coin' library).
        re.eval("library(survival)");
        re.eval("res21 <- survdiff(Surv(time1,censor1)~group1,rho=0)");
        x = re.eval("res21$chisq");
        testStatistic = x.asDouble();
        x = re.eval("pchisq(res21$chisq, df=1, lower.tail = FALSE)");
        pValue = x.asDouble();
        System.out.println("Results from R:");
        System.out.println("StatScore: " + testStatistic + " pValue: " + pValue);

        // Plot the survival curves of the two groups and of the whole dataset.
        re.eval("timestrata1.surv <- survfit( Surv(time1, censor1)~ strata(group1), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time1, censor1)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("Results from my code: ");
        LogRankTest lrt = new LogRankTest(time1, time2, censor1, censor2);
        double[] results = lrt.logRank();
        System.out.println("Statistics: " + results[0] + " variance: " + results[1] + " pValue: " + results[2]);
    } catch (Exception e) {
        System.err.println("Can't open the data file.");
        e.printStackTrace();
        System.exit(1);
    }

    /*
     * The current implementation considers two objectives:
     * 1. p-value to be minimized / statistical score to be maximized
     * 2. number of features to be maximized/minimized
     */
}
From source file: lu.lippmann.cdb.common.gui.dataset.InstancesLoaderDialogFactory.java
License: Open Source License
private static Instances showDialog(final Component parent, final boolean setClass) throws Exception {
    final Preferences prefs = Preferences.userRoot().node("CadralDecisionBuild");
    final String path = prefs.get(REG_KEY, WekaDataAccessUtil.DEFAULT_SAMPLE_DIR);
    final JFileChooser fc = new JFileChooser();
    fc.setCurrentDirectory(new File(path));
    final int returnVal = fc.showOpenDialog(parent);
    if (returnVal == JFileChooser.APPROVE_OPTION) {
        final File file = fc.getSelectedFile();
        if (file != null) {
            prefs.put(REG_KEY, file.getPath());
            final Instances ds = WekaDataAccessUtil.loadInstancesFromARFFOrCSVFile(file);
            final Attribute defaultClassAttr = ds.classIndex() >= 0 ? ds.classAttribute() : ds.attribute(0);
            ds.setClassIndex(-1);
            ds.setRelationName(file.getPath());

            // Collect the attribute names for the selection dialog.
            final List<String> attributesNames = new ArrayList<String>();
            final Enumeration<?> e = ds.enumerateAttributes();
            while (e.hasMoreElements()) {
                final Attribute attr = (Attribute) e.nextElement();
                attributesNames.add(attr.name());
            }

            if (setClass) {
                final String s = (String) JOptionPane.showInputDialog(parent,
                        "Select the class attribute for '" + file.getName() + "' (default:'"
                                + defaultClassAttr.name() + "'): ",
                        "Class selection", JOptionPane.QUESTION_MESSAGE, null, // icon
                        attributesNames.toArray(), attributesNames.get(attributesNames.size() - 1));
                if (s != null) {
                    ds.setClass(ds.attribute(s));
                } else {
                    // Otherwise no class is defined and the class attribute is cached,
                    // so no class index would be defined after cancel + retry.
                    ds.setClass(defaultClassAttr);
                    return null;
                }
            } else {
                ds.setClass(defaultClassAttr);
            }
            return ds;
        } else {
            throw new Exception();
        }
    } else {
        return null;
    }
}
From source file: mathematik.Discretisierer.java
public Discretisierer(Instances insta) {
    this.inst = new Instances(insta);
    attwerten = new int[insta.numInstances()];
    Enumeration<Attribute> enu = insta.enumerateAttributes();
    int attindex = 0;
    while (enu.hasMoreElements()) {
        Attribute att = enu.nextElement();
        if (att.type() == Attribute.NUMERIC) {
            // Collect the truncated integer values of this numeric attribute ...
            for (int i = 0; i < attwerten.length; i++) {
                attwerten[i] = (int) inst.instance(i).value(attindex);
            }
            // ... and discretize it.
            changeIndiscretvalues(attindex);
        }
        attindex++;
    }
}
From source file: myclassifier.MyC45.java
/**
 * Method that builds the decision tree.
 *
 * @param data the training data
 * @exception Exception if the decision tree can't be built successfully
 */
private void makeTree(Instances data) throws Exception {
    // Check if no instances have reached this node.
    if (data.numInstances() == 0) {
        m_Attribute = null;
        m_ClassValue = -1;
        m_Distribution = new double[data.numClasses()];
        return;
    }

    // Compute the attribute with maximum gain ratio.
    double[] gainRatios = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        gainRatios[att.index()] = computeGainRatio(data, att);
    }
    m_Attribute = data.attribute(Utils.maxIndex(gainRatios));

    // Make a leaf if the gain ratio is zero; otherwise create successors.
    if (Utils.eq(gainRatios[m_Attribute.index()], 0)) {
        m_Attribute = null;
        m_Distribution = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            m_Distribution[(int) inst.classValue()]++;
        }
        Utils.normalize(m_Distribution);
        m_ClassValue = Utils.maxIndex(m_Distribution);
        m_ClassAttribute = data.classAttribute();
    } else {
        Instances[] splitData = splitData(data, m_Attribute);
        m_Successors = new MyC45[m_Attribute.numValues()];
        for (int j = 0; j < m_Attribute.numValues(); j++) {
            m_Successors[j] = new MyC45();
            m_Successors[j].makeTree(splitData[j]);
        }
    }
}
From source file: myID3.MyId3.java
/**
 * Constructs the tree from the given instances, choosing at each node
 * the attribute that best divides the data.
 *
 * @param data the training instances
 */
public void buildTree(Instances data) {
    if (data.numInstances() > 0) {
        // Find the highest information gain:
        // first compute the information gain of each attribute.
        double IG[] = new double[data.numAttributes()];
        Enumeration enumAttribute = data.enumerateAttributes();
        while (enumAttribute.hasMoreElements()) {
            Attribute attribute = (Attribute) enumAttribute.nextElement();
            IG[attribute.index()] = informationGain(data, attribute);
        }
        // Assign the best attribute to this node.
        currentAttribute = data.attribute(maxIndex(IG));

        // If IG = 0, the current node is a leaf.
        if (Utils.eq(IG[currentAttribute.index()], 0)) {
            // Set the class value to the most frequent class.
            currentAttribute = null;
            classDistribution = new double[data.numClasses()];
            Enumeration enumInstance = data.enumerateInstances();
            while (enumInstance.hasMoreElements()) {
                Instance temp = (Instance) enumInstance.nextElement();
                classDistribution[(int) temp.classValue()]++;
            }
            Utils.normalize(classDistribution);
            classValue = Utils.maxIndex(classDistribution);
            classAttribute = data.classAttribute();
        } else {
            // Otherwise create a child node for each value of the attribute.
            Instances[] splitData = splitDataByAttribute(data, currentAttribute);
            nodes = new MyId3[currentAttribute.numValues()];
            for (int i = 0; i < currentAttribute.numValues(); i++) {
                nodes[i] = new MyId3();
                nodes[i].buildTree(splitData[i]);
            }
        }
    } else {
        classAttribute = null;
        classValue = Utils.missingValue();
        classDistribution = new double[data.numClasses()];
    }
}
From source file: myid3andc45classifier.Model.MyC45.java
@Override
public void buildClassifier(Instances data) throws Exception {
    getCapabilities().testWithFail(data);
    data = new Instances(data);
    data.deleteWithMissingClass();

    Enumeration enumAtt = data.enumerateAttributes();
    while (enumAtt.hasMoreElements()) {
        Attribute attr = (Attribute) enumAtt.nextElement();
        if (attr.isNumeric()) {
            // Split the numeric attribute: insert a binary attribute at each
            // midpoint between consecutive instances of different classes.
            data.sort(attr);
            for (int i = 0; i < data.numInstances() - 1; i++) {
                if (data.instance(i).classValue() != data.instance(i + 1).classValue()) {
                    double mid = (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2;
                    if (data.attribute(attr.name() + " " + mid) == null) {
                        data = convertInstances(data, attr, mid);
                    }
                }
            }

            // Missing-value handling: replace missing values with the mean.
            AttributeStats attributeStats = data.attributeStats(attr.index());
            double mean = attributeStats.numericStats.mean;
            if (Double.isNaN(mean))
                mean = 0;
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), mean);
                }
            }
        } else {
            // Missing-value handling: replace missing values with the modal value.
            AttributeStats attributeStats = data.attributeStats(attr.index());
            int maxIndex = 0;
            for (int i = 1; i < attr.numValues(); i++) {
                if (attributeStats.nominalCounts[maxIndex] < attributeStats.nominalCounts[i]) {
                    maxIndex = i;
                }
            }
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), maxIndex);
                }
            }
        }
    }
    makeMyC45Tree(data);
}
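Aside: the per-attribute missing-value handling above (mean for numeric attributes, modal value for nominal ones) is what Weka's standard unsupervised ReplaceMissingValues filter computes in one pass. A minimal sketch, assuming an already-loaded Instances object named data:

import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

// Replace missing numeric values with means and missing nominal values
// with modes, over the whole dataset in a single pass.
ReplaceMissingValues rmv = new ReplaceMissingValues();
rmv.setInputFormat(data);   // 'data' is assumed to be an Instances object
Instances completed = Filter.useFilter(data, rmv);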
From source file: myid3andc45classifier.Model.MyC45.java
public void makeMyC45Tree(Instances data) throws Exception {
    // Make an empty leaf if no instances have reached this node.
    if (data.numInstances() == 0) {
        attribute = null;
        label = Instance.missingValue();
        return;
    }

    // Compute the gain ratio of each nominal attribute;
    // numeric attributes are excluded from the split selection.
    double[] infoGainRatios = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        if (!att.isNumeric())
            infoGainRatios[att.index()] = computeInfoGainRatio(data, att);
        else
            infoGainRatios[att.index()] = Double.NEGATIVE_INFINITY;
    }

    // Split on the attribute with the highest gain ratio.
    attribute = data.attribute(maxIndex(infoGainRatios));

    // Make a leaf if the gain ratio is (close to) zero;
    // otherwise create successors.
    if (infoGainRatios[maxIndex(infoGainRatios)] <= epsilon
            || Double.isNaN(infoGainRatios[maxIndex(infoGainRatios)])) {
        attribute = null;
        double[] numClasses = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            numClasses[(int) inst.classValue()]++;
        }
        label = maxIndex(numClasses);
        classAttribute = data.classAttribute();
    } else {
        classAttribute = data.classAttribute();
        Instances[] splitData = splitInstancesByAttribute(data, attribute);
        Instances[] distrData = splitInstancesByAttribute(data, data.classAttribute());
        distribution = new double[distrData.length];
        for (int j = 0; j < distribution.length; j++) {
            distribution[j] = distrData[j].numInstances();
        }
        successors = new MyC45[attribute.numValues()];
        for (int j = 0; j < attribute.numValues(); j++) {
            successors[j] = new MyC45();
            successors[j].buildClassifier(splitData[j]);
        }
    }
    // TODO: prune the tree (pruneTree(data) not yet enabled).
}
From source file: myid3andc45classifier.Model.MyID3.java
@Override
public void buildClassifier(Instances data) throws Exception {
    // ID3 requires a nominal class, nominal attributes and no missing values.
    if (!data.classAttribute().isNominal()) {
        throw new Exception("MyID3: nominal class, please.");
    }
    Enumeration enumAtt = data.enumerateAttributes();
    while (enumAtt.hasMoreElements()) {
        Attribute attr = (Attribute) enumAtt.nextElement();
        if (!attr.isNominal()) {
            throw new Exception("MyID3: only nominal attributes, please.");
        }
        Enumeration enumInstance = data.enumerateInstances();
        while (enumInstance.hasMoreElements()) {
            if (((Instance) enumInstance.nextElement()).isMissing(attr)) {
                throw new Exception("MyID3: no missing values, please.");
            }
        }
    }
    data = new Instances(data);
    data.deleteWithMissingClass();
    makeMyID3Tree(data);
}
From source file: myid3andc45classifier.Model.MyID3.java
public void makeMyID3Tree(Instances data) throws Exception {
    // Check whether any instances have reached this node.
    if (data.numInstances() == 0) {
        attribute = null;
        classValue = Instance.missingValue();
        return;
    }

    // Compute the attribute with maximum information gain.
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        infoGains[att.index()] = computeInfoGain(data, att);
    }
    attribute = data.attribute(maxIndex(infoGains));

    // Make a leaf if the information gain is zero;
    // otherwise create successors.
    if (isDoubleEqual(infoGains[attribute.index()], 0)) {
        attribute = null;
        double[] numClasses = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            numClasses[(int) inst.classValue()]++;
        }
        label = maxIndex(numClasses);
        classAttribute = data.classAttribute();
    } else {
        Instances[] splitData = splitInstancesByAttribute(data, attribute);
        successors = new MyID3[attribute.numValues()];
        for (int j = 0; j < attribute.numValues(); j++) {
            successors[j] = new MyID3();
            successors[j].buildClassifier(splitData[j]);
        }
    }
}
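The computeInfoGain(data, att) helper called above is not shown in these listings. The following is a minimal sketch of one standard entropy-based implementation, reusing the splitInstancesByAttribute(...) helper from the listing; the bodies here are an assumption about how such a helper is typically written, not the original author's code:

import java.util.Enumeration;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

// Information gain of splitting 'data' on nominal attribute 'att':
// entropy of the parent minus the weighted entropy of each subset.
private double computeInfoGain(Instances data, Attribute att) throws Exception {
    double infoGain = computeEntropy(data);
    Instances[] splitData = splitInstancesByAttribute(data, att);
    for (int j = 0; j < att.numValues(); j++) {
        if (splitData[j].numInstances() > 0) {
            infoGain -= ((double) splitData[j].numInstances() / (double) data.numInstances())
                    * computeEntropy(splitData[j]);
        }
    }
    return infoGain;
}

// Entropy of the class distribution in 'data', in bits.
private double computeEntropy(Instances data) throws Exception {
    double[] classCounts = new double[data.numClasses()];
    Enumeration<Instance> instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        classCounts[(int) instEnum.nextElement().classValue()]++;
    }
    double entropy = 0;
    for (double count : classCounts) {
        if (count > 0) {
            double p = count / data.numInstances();
            entropy -= p * Utils.log2(p);
        }
    }
    return entropy;
}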