List of usage examples for weka.core.Instances.enumerateAttributes()
public Enumeration<Attribute> enumerateAttributes()
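For orientation before the project examples below, here is a minimal, self-contained sketch of the call. The file name "iris.arff" and the class-index choice are placeholders, not taken from any listing on this page. Note that once a class index is set, enumerateAttributes() skips the class attribute.

import java.util.Enumeration;
import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class EnumerateAttributesDemo {
    public static void main(String[] args) throws Exception {
        // Load any ARFF/CSV file supported by DataSource (path is a placeholder).
        Instances data = new DataSource("iris.arff").getDataSet();
        // After this call, enumerateAttributes() excludes the class attribute.
        data.setClassIndex(data.numAttributes() - 1);

        Enumeration<Attribute> atts = data.enumerateAttributes();
        while (atts.hasMoreElements()) {
            Attribute att = atts.nextElement();
            System.out.println(att.index() + ": " + att.name()
                    + " (" + Attribute.typeToString(att) + ")");
        }
    }
}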
From source file: jmetal.test.survivalanalysis.GenerateSurvivalGraph.java
License: Open Source License
/**
 * Evaluates a solution.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;
    int counterSelectedFeatures;
    DataSource source;
    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    double ArithmeticHarmonicCutScore = Double.MAX_VALUE;
    REXP x;

    variable = (Binary) solution.getDecisionVariables()[0];
    counterSelectedFeatures = 0;

    try {
        // Read the data file.
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();

        // Save the attributes 'T' and 'Censor' (the last two attributes).
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on the chromosome.
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now remove the two last attributes, 'T' and 'Censor'.
        Remove filter = new Remove();
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);
        // Filtering complete.

        // List the selected features/attributes.
        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes/features: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }
        System.out.println();

        // Train the hierarchical clusterer. The link type is one of
        // [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING].
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", this.HC_LinkType });
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false);
        clusterer.buildClusterer(dataClusterer);
        double[][] distanceMatrix = clusterer.getDistanceMatrix();

        // Cluster evaluation:
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);
        if (this.testDataFileName != null) {
            DataSource testSource = new DataSource(this.testDataFileName);
            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);
            // First filter the attributes based on the chromosome.
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evaluation for this solution (" + this.testDataFileName + "): "
                    + eval.clusterResultsToString());
        }

        // First analyze using my library function: save the cluster assignments.
        int[] clusterAssignment = new int[dataClusterer.numInstances()];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
        }
        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // Create arrays with time (event occurrence time) and censor data
        // for use with the jstat LogRankTest.
        double[] time1 = new double[classOneCnt];
        double[] censor1 = new double[classOneCnt];
        double[] time2 = new double[classTwoCnt];
        double[] censor2 = new double[classTwoCnt];
        for (int i = 0, cnt1 = 0, cnt2 = 0; i < dataClusterer.numInstances(); ++i) {
            if (clusterAssignment[i] == 0) {
                time1[cnt1] = data.get(i).value(attTime);
                censor1[cnt1++] = data.get(i).value(attCensor);
            } else if (clusterAssignment[i] == 1) {
                time2[cnt2] = data.get(i).value(attTime);
                censor2[cnt2++] = data.get(i).value(attCensor);
            }
        }

        // Calculate the log rank test statistic and p-value.
        LogRankTest testclass1 = new LogRankTest(time1, time2, censor1, censor2);
        double[] scores = testclass1.logRank();
        testStatistic = scores[0];
        pValue = scores[2];
        ArithmeticHarmonicCutScore = this.getArithmeticHarmonicCutScore(distanceMatrix, clusterAssignment);
        System.out.println("Calculation by myLibrary:\n testStatistic: " + scores[0] + " pValue: " + scores[2]
                + " Arithmetic Harmonic Cut Score: " + ArithmeticHarmonicCutScore);

        // Now analyze by calling R for the log rank test; parallelization not possible.
        String strT = "time <- c(";
        String strC = "censor <- c(";
        String strG = "group <- c(";
        for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
            strT = strT + (int) data.get(i).value(attTime) + ",";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
            strC = strC + (int) data.get(i).value(attCensor) + ",";
        }
        int tmpi = dataClusterer.numInstances() - 1;
        strT = strT + (int) data.get(tmpi).value(attTime) + ")";
        strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
        strC = strC + (int) data.get(tmpi).value(attCensor) + ")";
        this.re.eval(strT);
        this.re.eval(strC);
        this.re.eval(strG);

        // Call survdiff from the 'survival' library (much faster than the
        // alternative, surv_test from the 'coin' library).
        re.eval("library(survival)");
        re.eval("res2 <- survdiff(Surv(time,censor)~group,rho=0)");
        x = re.eval("res2$chisq");
        testStatistic = x.asDouble();
        x = re.eval("pchisq(res2$chisq, df=1, lower.tail = FALSE)");
        pValue = x.asDouble();
        System.out.println("Calculation by R:");
        System.out.println("StatScore: " + testStatistic + " pValue: " + pValue);

        // Plot the survival curves of the two groups and of the whole dataset.
        re.eval("timestrata1.surv <- survfit( Surv(time, censor)~ strata(group), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time, censor)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("\nCluster Assignments:");
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            System.out.println("Instance " + i + ": " + clusterAssignment[i]);
        }
    } catch (Exception e) {
        System.err.println("Can't open the data file.");
        e.printStackTrace();
        System.exit(1);
    }
}
From source file: jmetal.test.survivalanalysis.GenerateSurvivalGraphOld.java
License: Open Source License
/**
 * Evaluates a solution: actually generates the survival graph.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;
    int counterSelectedFeatures;
    DataSource source;
    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    REXP x;

    variable = (Binary) solution.getDecisionVariables()[0];
    counterSelectedFeatures = 0;
    System.out.println("\nSolution ID " + this.SolutionID);

    try {
        // Read the data file.
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();

        // Save the attributes 'T' and 'Censor' (the last two attributes).
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on the chromosome.
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now remove the two last attributes, 'T' and 'Censor'.
        Remove filter = new Remove();
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);
        // Filtering complete.

        // List the selected attributes.
        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }
        System.out.println();

        // Train the hierarchical clusterer with complete-linkage clustering.
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", "COMPLETE" });
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false);
        clusterer.buildClusterer(dataClusterer);

        // Cluster evaluation:
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);
        if (this.testDataFileName != null) {
            DataSource testSource = new DataSource(this.testDataFileName);
            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);
            // First filter the attributes based on the chromosome.
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evaluation for this solution: " + eval.clusterResultsToString());
        }

        // Save (and count) the cluster assignments.
        int[] clusterAssignment = new int[dataClusterer.numInstances()];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
        }
        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // Build the R vectors of times, censor flags and group labels.
        String strT = "time1 <- c(";
        String strC = "censor1 <- c(";
        String strG = "group1 <- c(";
        for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
            strT = strT + (int) data.get(i).value(attTime) + ",";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
            strC = strC + (int) data.get(i).value(attCensor) + ",";
        }
        int tmpi = dataClusterer.numInstances() - 1;
        strT = strT + (int) data.get(tmpi).value(attTime) + ")";
        strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
        strC = strC + (int) data.get(tmpi).value(attCensor) + ")";
        this.re.eval(strT);
        this.re.eval(strC);
        this.re.eval(strG);

        // Build the arrays for MyLogRankTest.
        double[] time1 = new double[classOneCnt];
        double[] time2 = new double[classTwoCnt];
        double[] censor1 = new double[classOneCnt];
        double[] censor2 = new double[classTwoCnt];
        int i1 = 0, i2 = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            if (clusterer.clusterInstance(dataClusterer.get(i)) == 0) {
                time1[i1] = data.get(i).value(attTime);
                censor1[i1] = data.get(i).value(attCensor);
                ++i1;
            } else {
                time2[i2] = data.get(i).value(attTime);
                censor2[i2] = data.get(i).value(attCensor);
                ++i2;
            }
        }

        // Call survdiff from the 'survival' library (much faster than the
        // alternative, surv_test from the 'coin' library).
        re.eval("library(survival)");
        re.eval("res21 <- survdiff(Surv(time1,censor1)~group1,rho=0)");
        x = re.eval("res21$chisq");
        testStatistic = x.asDouble();
        x = re.eval("pchisq(res21$chisq, df=1, lower.tail = FALSE)");
        pValue = x.asDouble();
        System.out.println("Results from R:");
        System.out.println("StatScore: " + testStatistic + " pValue: " + pValue);

        // Plot the survival curves of the two groups and of the whole dataset.
        re.eval("timestrata1.surv <- survfit( Surv(time1, censor1)~ strata(group1), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time1, censor1)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("Results from my code: ");
        LogRankTest lrt = new LogRankTest(time1, time2, censor1, censor2);
        double[] results = lrt.logRank();
        System.out.println("Statistics: " + results[0] + " variance: " + results[1] + " pValue: " + results[2]);
    } catch (Exception e) {
        System.err.println("Can't open the data file.");
        e.printStackTrace();
        System.exit(1);
    }

    /*
     * The current implementation considers two objectives:
     * 1. p-value to be minimized / statistical score to be maximized
     * 2. number of features to be maximized/minimized
     */
}
From source file: lu.lippmann.cdb.common.gui.dataset.InstancesLoaderDialogFactory.java
License: Open Source License
private static Instances showDialog(final Component parent, final boolean setClass) throws Exception {
    final Preferences prefs = Preferences.userRoot().node("CadralDecisionBuild");
    final String path = prefs.get(REG_KEY, WekaDataAccessUtil.DEFAULT_SAMPLE_DIR);
    final JFileChooser fc = new JFileChooser();
    fc.setCurrentDirectory(new File(path));
    final int returnVal = fc.showOpenDialog(parent);
    if (returnVal == JFileChooser.APPROVE_OPTION) {
        final File file = fc.getSelectedFile();
        if (file != null) {
            prefs.put(REG_KEY, file.getPath());
            final Instances ds = WekaDataAccessUtil.loadInstancesFromARFFOrCSVFile(file);
            final Attribute defaultClassAttr = ds.classIndex() >= 0 ? ds.classAttribute() : ds.attribute(0);
            ds.setClassIndex(-1);
            ds.setRelationName(file.getPath());

            // Collect the attribute names for the selection dialog.
            final List<String> attributesNames = new ArrayList<String>();
            final Enumeration<?> e = ds.enumerateAttributes();
            while (e.hasMoreElements()) {
                final Attribute attr = (Attribute) e.nextElement();
                attributesNames.add(attr.name());
            }

            if (setClass) {
                final String s = (String) JOptionPane.showInputDialog(parent,
                        "Select the class attribute for '" + file.getName() + "' (default:'"
                                + defaultClassAttr.name() + "'): ",
                        "Class selection", JOptionPane.QUESTION_MESSAGE, null, // icon
                        attributesNames.toArray(), attributesNames.get(attributesNames.size() - 1));
                if (s != null) {
                    ds.setClass(ds.attribute(s));
                } else {
                    // Otherwise no class is defined and the class attribute is cached,
                    // so no class index would be defined after cancel + retry.
                    ds.setClass(defaultClassAttr);
                    return null;
                }
            } else {
                ds.setClass(defaultClassAttr);
            }
            return ds;
        } else {
            throw new Exception();
        }
    } else {
        return null;
    }
}
From source file: mathematik.Discretisierer.java
public Discretisierer(Instances insta) {
    this.inst = new Instances(insta);
    attwerten = new int[insta.numInstances()];
    Enumeration<Attribute> enu = insta.enumerateAttributes();
    int attindex = 0;
    while (enu.hasMoreElements()) {
        Attribute att = enu.nextElement();
        if (att.type() == Attribute.NUMERIC) {
            // Collect the truncated integer values of this numeric attribute ...
            for (int i = 0; i < attwerten.length; i++) {
                attwerten[i] = (int) inst.instance(i).value(attindex);
            }
            // ... and discretize it.
            changeIndiscretvalues(attindex);
        }
        attindex++;
    }
}
From source file: myclassifier.MyC45.java
/**
 * Method that builds the decision tree.
 *
 * @param data the training data
 * @exception Exception if the decision tree can't be built successfully
 */
private void makeTree(Instances data) throws Exception {
    // Check if no instances have reached this node.
    if (data.numInstances() == 0) {
        m_Attribute = null;
        m_ClassValue = -1;
        m_Distribution = new double[data.numClasses()];
        return;
    }

    // Compute the attribute with maximum gain ratio.
    double[] gainRatios = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        gainRatios[att.index()] = computeGainRatio(data, att);
    }
    m_Attribute = data.attribute(Utils.maxIndex(gainRatios));

    // Make a leaf if the gain ratio is zero; otherwise create successors.
    if (Utils.eq(gainRatios[m_Attribute.index()], 0)) {
        m_Attribute = null;
        m_Distribution = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            m_Distribution[(int) inst.classValue()]++;
        }
        Utils.normalize(m_Distribution);
        m_ClassValue = Utils.maxIndex(m_Distribution);
        m_ClassAttribute = data.classAttribute();
    } else {
        Instances[] splitData = splitData(data, m_Attribute);
        m_Successors = new MyC45[m_Attribute.numValues()];
        for (int j = 0; j < m_Attribute.numValues(); j++) {
            m_Successors[j] = new MyC45();
            m_Successors[j].makeTree(splitData[j]);
        }
    }
}
From source file: myID3.MyId3.java
/**
 * Constructs the tree from the given instances, choosing at each node
 * the attribute that best divides the data.
 *
 * @param data the training instances
 */
public void buildTree(Instances data) {
    if (data.numInstances() > 0) {
        // Find the highest information gain:
        // first compute the information gain of each attribute.
        double IG[] = new double[data.numAttributes()];
        Enumeration enumAttribute = data.enumerateAttributes();
        while (enumAttribute.hasMoreElements()) {
            Attribute attribute = (Attribute) enumAttribute.nextElement();
            IG[attribute.index()] = informationGain(data, attribute);
        }
        // Assign the best attribute to this node.
        currentAttribute = data.attribute(maxIndex(IG));

        // If IG = 0, the current node is a leaf.
        if (Utils.eq(IG[currentAttribute.index()], 0)) {
            // Set the class value to the most frequent class.
            currentAttribute = null;
            classDistribution = new double[data.numClasses()];
            Enumeration enumInstance = data.enumerateInstances();
            while (enumInstance.hasMoreElements()) {
                Instance temp = (Instance) enumInstance.nextElement();
                classDistribution[(int) temp.classValue()]++;
            }
            Utils.normalize(classDistribution);
            classValue = Utils.maxIndex(classDistribution);
            classAttribute = data.classAttribute();
        } else {
            // Otherwise create a child node for each value of the attribute.
            Instances[] splitData = splitDataByAttribute(data, currentAttribute);
            nodes = new MyId3[currentAttribute.numValues()];
            for (int i = 0; i < currentAttribute.numValues(); i++) {
                nodes[i] = new MyId3();
                nodes[i].buildTree(splitData[i]);
            }
        }
    } else {
        classAttribute = null;
        classValue = Utils.missingValue();
        classDistribution = new double[data.numClasses()];
    }
}
From source file: myid3andc45classifier.Model.MyC45.java
@Override
public void buildClassifier(Instances data) throws Exception {
    getCapabilities().testWithFail(data);
    data = new Instances(data);
    data.deleteWithMissingClass();

    Enumeration enumAtt = data.enumerateAttributes();
    while (enumAtt.hasMoreElements()) {
        Attribute attr = (Attribute) enumAtt.nextElement();
        if (attr.isNumeric()) {
            // Split the numeric attribute: insert a binary attribute at each
            // midpoint between consecutive instances of different classes.
            data.sort(attr);
            for (int i = 0; i < data.numInstances() - 1; i++) {
                if (data.instance(i).classValue() != data.instance(i + 1).classValue()) {
                    double mid = (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2;
                    if (data.attribute(attr.name() + " " + mid) == null) {
                        data = convertInstances(data, attr, mid);
                    }
                }
            }

            // Missing-value handling: replace missing values with the mean.
            AttributeStats attributeStats = data.attributeStats(attr.index());
            double mean = attributeStats.numericStats.mean;
            if (Double.isNaN(mean))
                mean = 0;
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), mean);
                }
            }
        } else {
            // Missing-value handling: replace missing values with the modal value.
            AttributeStats attributeStats = data.attributeStats(attr.index());
            int maxIndex = 0;
            for (int i = 1; i < attr.numValues(); i++) {
                if (attributeStats.nominalCounts[maxIndex] < attributeStats.nominalCounts[i]) {
                    maxIndex = i;
                }
            }
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), maxIndex);
                }
            }
        }
    }
    makeMyC45Tree(data);
}
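Aside: the per-attribute missing-value handling above (mean for numeric attributes, modal value for nominal ones) is what Weka's standard unsupervised ReplaceMissingValues filter computes in one pass. A minimal sketch, assuming an already-loaded Instances object named data:

import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

// Replace missing numeric values with means and missing nominal values
// with modes, over the whole dataset in a single pass.
ReplaceMissingValues rmv = new ReplaceMissingValues();
rmv.setInputFormat(data);   // 'data' is assumed to be an Instances object
Instances completed = Filter.useFilter(data, rmv);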
From source file: myid3andc45classifier.Model.MyC45.java
public void makeMyC45Tree(Instances data) throws Exception {
    // Make an empty leaf if no instances have reached this node.
    if (data.numInstances() == 0) {
        attribute = null;
        label = Instance.missingValue();
        return;
    }

    // Compute the gain ratio of each nominal attribute;
    // numeric attributes are excluded from the split selection.
    double[] infoGainRatios = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        if (!att.isNumeric())
            infoGainRatios[att.index()] = computeInfoGainRatio(data, att);
        else
            infoGainRatios[att.index()] = Double.NEGATIVE_INFINITY;
    }

    // Split on the attribute with the highest gain ratio.
    attribute = data.attribute(maxIndex(infoGainRatios));

    // Make a leaf if the gain ratio is (close to) zero;
    // otherwise create successors.
    if (infoGainRatios[maxIndex(infoGainRatios)] <= epsilon
            || Double.isNaN(infoGainRatios[maxIndex(infoGainRatios)])) {
        attribute = null;
        double[] numClasses = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            numClasses[(int) inst.classValue()]++;
        }
        label = maxIndex(numClasses);
        classAttribute = data.classAttribute();
    } else {
        classAttribute = data.classAttribute();
        Instances[] splitData = splitInstancesByAttribute(data, attribute);
        Instances[] distrData = splitInstancesByAttribute(data, data.classAttribute());
        distribution = new double[distrData.length];
        for (int j = 0; j < distribution.length; j++) {
            distribution[j] = distrData[j].numInstances();
        }
        successors = new MyC45[attribute.numValues()];
        for (int j = 0; j < attribute.numValues(); j++) {
            successors[j] = new MyC45();
            successors[j].buildClassifier(splitData[j]);
        }
    }
    // TODO: prune the tree (pruneTree(data) not yet enabled).
}
From source file: myid3andc45classifier.Model.MyID3.java
@Override
public void buildClassifier(Instances data) throws Exception {
    // ID3 requires a nominal class, nominal attributes and no missing values.
    if (!data.classAttribute().isNominal()) {
        throw new Exception("MyID3: nominal class, please.");
    }
    Enumeration enumAtt = data.enumerateAttributes();
    while (enumAtt.hasMoreElements()) {
        Attribute attr = (Attribute) enumAtt.nextElement();
        if (!attr.isNominal()) {
            throw new Exception("MyID3: only nominal attributes, please.");
        }
        Enumeration enumInstance = data.enumerateInstances();
        while (enumInstance.hasMoreElements()) {
            if (((Instance) enumInstance.nextElement()).isMissing(attr)) {
                throw new Exception("MyID3: no missing values, please.");
            }
        }
    }
    data = new Instances(data);
    data.deleteWithMissingClass();
    makeMyID3Tree(data);
}
From source file: myid3andc45classifier.Model.MyID3.java
public void makeMyID3Tree(Instances data) throws Exception {
    // Check whether any instances have reached this node.
    if (data.numInstances() == 0) {
        attribute = null;
        classValue = Instance.missingValue();
        return;
    }

    // Compute the attribute with maximum information gain.
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        infoGains[att.index()] = computeInfoGain(data, att);
    }
    attribute = data.attribute(maxIndex(infoGains));

    // Make a leaf if the information gain is zero;
    // otherwise create successors.
    if (isDoubleEqual(infoGains[attribute.index()], 0)) {
        attribute = null;
        double[] numClasses = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            numClasses[(int) inst.classValue()]++;
        }
        label = maxIndex(numClasses);
        classAttribute = data.classAttribute();
    } else {
        Instances[] splitData = splitInstancesByAttribute(data, attribute);
        successors = new MyID3[attribute.numValues()];
        for (int j = 0; j < attribute.numValues(); j++) {
            successors[j] = new MyID3();
            successors[j].buildClassifier(splitData[j]);
        }
    }
}
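The computeInfoGain(data, att) helper called above is not shown in these listings. The following is a minimal sketch of one standard entropy-based implementation, reusing the splitInstancesByAttribute(...) helper from the listing; the bodies here are an assumption about how such a helper is typically written, not the original author's code:

import java.util.Enumeration;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

// Information gain of splitting 'data' on nominal attribute 'att':
// entropy of the parent minus the weighted entropy of each subset.
private double computeInfoGain(Instances data, Attribute att) throws Exception {
    double infoGain = computeEntropy(data);
    Instances[] splitData = splitInstancesByAttribute(data, att);
    for (int j = 0; j < att.numValues(); j++) {
        if (splitData[j].numInstances() > 0) {
            infoGain -= ((double) splitData[j].numInstances() / (double) data.numInstances())
                    * computeEntropy(splitData[j]);
        }
    }
    return infoGain;
}

// Entropy of the class distribution in 'data', in bits.
private double computeEntropy(Instances data) throws Exception {
    double[] classCounts = new double[data.numClasses()];
    Enumeration<Instance> instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        classCounts[(int) instEnum.nextElement().classValue()]++;
    }
    double entropy = 0;
    for (double count : classCounts) {
        if (count > 0) {
            double p = count / data.numInstances();
            entropy -= p * Utils.log2(p);
        }
    }
    return entropy;
}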