Usage examples for the weka.core.Instances method classIndex()
public int classIndex()
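Most of the examples below follow the same pattern: load a dataset, make sure a class attribute is assigned, and then use classIndex() to locate the class column. A minimal sketch of that pattern (the file path and class name are placeholders, not taken from any example below):

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class ClassIndexDemo {
    public static void main(String[] args) throws Exception {
        // Load any ARFF/CSV file; the path is a placeholder.
        Instances data = DataSource.read("data/iris.arff");
        // classIndex() returns -1 until a class attribute has been assigned.
        if (data.classIndex() == -1) {
            data.setClassIndex(data.numAttributes() - 1); // common convention: last attribute is the class
        }
        System.out.println("Class attribute: " + data.classAttribute().name()
                + " at index " + data.classIndex());
    }
}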
From source file: cezeri.evaluater.FactoryEvaluation.java

public static Evaluation performCrossValidate(Classifier model, Instances datax, int folds,
        boolean show_text, boolean show_plot, TFigureAttribute attr) {
    Random rand = new Random(1);
    Instances randData = new Instances(datax);
    randData.randomize(rand);
    if (randData.classAttribute().isNominal()) {
        randData.stratify(folds);
    }
    Evaluation eval = null;
    try {
        // perform cross-validation
        eval = new Evaluation(randData);
        for (int n = 0; n < folds; n++) {
            Instances train = randData.trainCV(folds, n, rand);
            Instances validation = randData.testCV(folds, n);
            // build and evaluate classifier
            Classifier clsCopy = Classifier.makeCopy(model);
            clsCopy.buildClassifier(train);
            // simulated and observed are accumulated across folds (declared outside this method);
            // the observed values are the class column, located via classIndex()
            simulated = FactoryUtils.concatenate(simulated, eval.evaluateModel(clsCopy, validation));
            observed = FactoryUtils.concatenate(observed,
                    validation.attributeToDoubleArray(validation.classIndex()));
        }
        if (show_plot) {
            double[][] d = new double[2][simulated.length];
            d[0] = observed;
            d[1] = simulated;
            CMatrix f1 = CMatrix.getInstance(d);
            attr.figureCaption = "overall performance";
            f1.transpose().plot(attr);
        }
        if (show_text) {
            // output evaluation
            System.out.println();
            System.out.println("=== Setup for Overall Cross Validation===");
            System.out.println("Classifier: " + model.getClass().getName() + " "
                    + Utils.joinOptions(model.getOptions()));
            System.out.println("Dataset: " + randData.relationName());
            System.out.println("Folds: " + folds);
            System.out.println("Seed: " + 1);
            System.out.println();
            System.out.println(eval.toSummaryString("=== " + folds + "-fold Cross-validation ===", false));
        }
    } catch (Exception ex) {
        Logger.getLogger(FactoryEvaluation.class.getName()).log(Level.SEVERE, null, ex);
    }
    return eval;
}
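The hand-rolled fold loop above mirrors what Weka's Evaluation.crossValidateModel does in a single call. A minimal sketch (the dataset path and the choice of J48 are illustrative, not from the source above):

import java.util.Random;
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CrossValidateSketch {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/dataset.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);          // class must be set before evaluating
        Evaluation eval = new Evaluation(data);
        // randomizes and, for nominal classes, stratifies the folds internally
        eval.crossValidateModel(new J48(), data, 10, new Random(1));
        System.out.println(eval.toSummaryString("=== 10-fold Cross-validation ===", false));
    }
}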
From source file: cezeri.evaluater.FactoryEvaluation.java

public static Evaluation performCrossValidateTestAlso(Classifier model, Instances datax, Instances test,
        boolean show_text, boolean show_plot) {
    TFigureAttribute attr = new TFigureAttribute();
    Random rand = new Random(1);
    Instances randData = new Instances(datax);
    randData.randomize(rand);
    Evaluation eval = null;
    int folds = randData.numInstances(); // one fold per instance, i.e. leave-one-out cross-validation
    try {
        eval = new Evaluation(randData);
        for (int n = 0; n < folds; n++) {
            Instances train = randData.trainCV(folds, n);
            Classifier clsCopy = Classifier.makeCopy(model);
            clsCopy.buildClassifier(train);
            Instances validation = randData.testCV(folds, n);
            simulated = FactoryUtils.concatenate(simulated, eval.evaluateModel(clsCopy, validation));
            observed = FactoryUtils.concatenate(observed,
                    validation.attributeToDoubleArray(validation.classIndex()));
        }
        if (show_plot) {
            double[][] d = new double[2][simulated.length];
            d[0] = observed;
            d[1] = simulated;
            CMatrix f1 = CMatrix.getInstance(d);
            attr.figureCaption = "overall performance";
            f1.transpose().plot(attr);
        }
        if (show_text) {
            // output evaluation
            System.out.println();
            System.out.println("=== Setup for Overall Cross Validation===");
            System.out.println("Classifier: " + model.getClass().getName() + " "
                    + Utils.joinOptions(model.getOptions()));
            System.out.println("Dataset: " + randData.relationName());
            System.out.println("Folds: " + folds);
            System.out.println("Seed: " + 1);
            System.out.println();
            System.out.println(eval.toSummaryString("=== " + folds + "-fold Cross-validation ===", false));
        }
    } catch (Exception ex) {
        Logger.getLogger(FactoryEvaluation.class.getName()).log(Level.SEVERE, null, ex);
    }
    return eval;
}
From source file: cezeri.evaluater.FactoryEvaluation.java

private static Evaluation doTest(boolean isTrained, Classifier model, Instances train, Instances test,
        boolean show_text, boolean show_plot, TFigureAttribute attr) {
    Instances data = new Instances(train);
    Random rand = new Random(1);
    data.randomize(rand);
    Evaluation eval = null;
    try {
        eval = new Evaluation(train);
        if (isTrained) {
            simulated = eval.evaluateModel(model, test);
        } else {
            Classifier clsCopy = Classifier.makeCopy(model);
            clsCopy.buildClassifier(train);
            simulated = eval.evaluateModel(clsCopy, test);
        }
        if (show_plot) {
            // the observed values are read straight from the class column of the test set
            observed = test.attributeToDoubleArray(test.classIndex());
            double[][] d = new double[2][simulated.length];
            d[0] = observed;
            d[1] = simulated;
            CMatrix f1 = CMatrix.getInstance(d);
            String[] items = { "Observed", "Simulated" };
            attr.items = items;
            attr.figureCaption = model.getClass().getCanonicalName();
            f1.transpose().plot(attr);
        }
        if (show_text) {
            System.out.println();
            System.out.println("=== Setup for Test ===");
            System.out.println("Classifier: " + model.getClass().getName() + " "
                    + Utils.joinOptions(model.getOptions()));
            System.out.println("Dataset: " + test.relationName());
            System.out.println();
            System.out.println(eval.toSummaryString("=== Test Results ===", false));
        }
    } catch (Exception ex) {
        Logger.getLogger(FactoryEvaluation.class.getName()).log(Level.SEVERE, null, ex);
    }
    return eval;
}
From source file: cezeri.feature.selection.FeatureSelectionRanker.java

private static TFeatureRank[] correlation(Instances data, int type) {
    TFeatureRank[] ret = new TFeatureRank[data.numAttributes() - 1];
    String[] attributeNames = FactoryInstance.getAttributeList(data);
    // the class column, located via classIndex(), is the reference series for every metric
    double[] out = data.attributeToDoubleArray(data.classIndex());
    for (int i = 0; i < data.numAttributes() - 1; i++) {
        TFeatureRank obj = new TFeatureRank();
        obj.featureName = attributeNames[i];
        obj.index = i + "";
        if (type == TCorelation.ARE) {
            obj.value = Math.abs(FactoryStatistic.ARE(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.CRCF) {
            obj.value = Math.abs(FactoryStatistic.CRCF(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.IOA) {
            obj.value = Math.abs(FactoryStatistic.IOA(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.KENDALL) {
            obj.value = Math.abs(FactoryStatistic.KENDALL(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.MAE) {
            obj.value = Math.abs(FactoryStatistic.MAE(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.MPE) {
            obj.value = Math.abs(FactoryStatistic.MPE(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.MSE) {
            obj.value = Math.abs(FactoryStatistic.MSE(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.NSEC) {
            obj.value = Math.abs(FactoryStatistic.NSEC(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.PEARSON) {
            obj.value = Math.abs(FactoryStatistic.PEARSON(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.R) {
            obj.value = Math.abs(FactoryStatistic.R(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.R2) {
            obj.value = Math.abs(FactoryStatistic.R2(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.RAE) {
            obj.value = Math.abs(FactoryStatistic.RAE(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.RELATIVE_NSEC) {
            obj.value = Math.abs(FactoryStatistic.RELATIVE_NSEC(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.RMSE) {
            obj.value = Math.abs(FactoryStatistic.RMSE(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.RRSE) {
            obj.value = Math.abs(FactoryStatistic.RRSE(data.attributeToDoubleArray(i), out));
        }
        if (type == TCorelation.SPEARMAN) {
            obj.value = Math.abs(FactoryStatistic.SPEARMAN(data.attributeToDoubleArray(i), out));
        }
        ret[i] = obj;
    }
    ArrayList<TFeatureRank> lst = toArrayList(ret);
    Collections.sort(lst, new CustomComparatorForFeatureRank());
    ret = toArray(lst);
    return ret;
}
From source file: cezeri.utils.FactoryInstance.java

public static String[] getOriginalClasses(Instances data) {
    Attribute att = data.attribute(data.classIndex());
    String[] ret = new String[data.numClasses()];
    Enumeration enu = att.enumerateValues();
    int q = 0;
    while (enu.hasMoreElements()) {
        ret[q++] = (String) enu.nextElement();
    }
    return ret;
}
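The same labels can be read without an Enumeration by indexing the class attribute directly. A minimal sketch (the class and method names are illustrative, not part of FactoryInstance):

import weka.core.Attribute;
import weka.core.Instances;

public class ClassLabels {
    // Illustrative helper: collect the nominal values of the class attribute.
    public static String[] collectClassLabels(Instances data) {
        Attribute classAtt = data.classAttribute(); // equivalent to data.attribute(data.classIndex())
        String[] labels = new String[classAtt.numValues()];
        for (int i = 0; i < classAtt.numValues(); i++) {
            labels[i] = classAtt.value(i); // label stored at position i
        }
        return labels;
    }
}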
From source file: cezeri.utils.FactoryInstance.java

public static Instances getSubsetData(Instances data, String[] attList) {
    Instances temp = new Instances(data);
    // drop every attribute except the class
    for (int i = 0; i < data.numAttributes(); i++) {
        if (!temp.attribute(0).equals(temp.classAttribute())) {
            temp.deleteAttributeAt(0);
        }
    }
    double[][] m = new double[attList.length + 1][data.numInstances()];
    for (int i = 0; i < attList.length; i++) {
        int n = attList.length - 1 - i;
        String str = attList[n];
        Attribute t = data.attribute(str);
        double[] d = data.attributeToDoubleArray(t.index());
        m[n] = d;
        temp.insertAttributeAt(t, 0);
    }
    // append the class column, located via classIndex()
    m[attList.length] = data.attributeToDoubleArray(data.classIndex());
    m = CMatrix.getInstance(m).transpose().get2DArrayDouble();
    FastVector att = new FastVector();
    for (int i = 0; i < temp.numAttributes(); i++) {
        att.addElement(temp.attribute(i));
    }
    Instances ret = new Instances(temp.relationName(), att, m.length);
    for (int i = 0; i < m.length; i++) {
        Instance ins = new Instance(m[0].length);
        for (int j = 0; j < m[0].length; j++) {
            ins.setValue(j, m[i][j]);
        }
        ret.add(ins);
    }
    ret.setClassIndex(temp.classIndex());
    return ret;
}
From source file: CGLSMethod.LinearRegression.java
License: Open Source License

/**
 * Builds a regression model for the given data.
 *
 * @param data the training data to be used for generating the
 *             linear regression function
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {
    // Preprocess instances
    if (!m_checksTurnedOff) {
        m_TransformFilter = new NominalToBinary();
        m_TransformFilter.setInputFormat(data);
        data = Filter.useFilter(data, m_TransformFilter);
        m_MissingFilter = new ReplaceMissingValues();
        m_MissingFilter.setInputFormat(data);
        data = Filter.useFilter(data, m_MissingFilter);
        data.deleteWithMissingClass();
    } else {
        m_TransformFilter = null;
        m_MissingFilter = null;
    }

    m_ClassIndex = data.classIndex();
    m_TransformedData = data;

    // Turn all attributes on for a start
    m_SelectedAttributes = new boolean[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != m_ClassIndex) {
            m_SelectedAttributes[i] = true;
        }
    }
    m_Coefficients = null;

    // Compute means and standard deviations
    m_Means = new double[data.numAttributes()];
    m_StdDevs = new double[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            m_Means[j] = data.meanOrMode(j);
            m_StdDevs[j] = Math.sqrt(data.variance(j));
            if (m_StdDevs[j] == 0) {
                m_SelectedAttributes[j] = false;
            }
        }
    }
    m_ClassStdDev = Math.sqrt(data.variance(m_TransformedData.classIndex()));
    m_ClassMean = data.meanOrMode(m_TransformedData.classIndex());

    // Perform the regression
    findBestModel();

    // Save memory
    m_TransformedData = new Instances(data, 0);
}
From source file: ChiSquare.ChiSquaredAttributeEval.java
License: Open Source License

/**
 * Initializes a chi-squared attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst.classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required
    if (m_missing_merge) {
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute chi-squared values
    m_ChiSquareds = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != classIndex) {
            m_ChiSquareds[i] = ContingencyTables.chiVal(ContingencyTables.reduceMatrix(counts[i]), false);
        }
    }
}
From source file: classifier.SellerClassifier.java

private Instances loadData(String dataset) throws Exception {
    DataSource data = new DataSource(dataset);
    Instances instances = data.getDataSet();
    // loaders leave the class undefined (classIndex() == -1); default to the last attribute
    if (instances.classIndex() == -1) {
        instances.setClassIndex(instances.numAttributes() - 1);
    }
    return instances;
}
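If the class attribute is known by name rather than by position, it can be assigned via setClass(Attribute) instead of setClassIndex(int). A minimal sketch (the file path and the attribute name "label" are placeholders):

import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class SetClassByName {
    public static void main(String[] args) throws Exception {
        Instances instances = DataSource.read("data/sellers.arff"); // placeholder path
        Attribute classAtt = instances.attribute("label");          // placeholder attribute name
        if (classAtt != null) {
            instances.setClass(classAtt);                            // sets the class by attribute
        } else {
            instances.setClassIndex(instances.numAttributes() - 1);  // fall back to the usual convention
        }
        System.out.println("classIndex() now returns " + instances.classIndex());
    }
}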
From source file: classifier.SentenceBasedTextDirectoryLoader.java
License: Open Source License

/**
 * Return the full data set. If the structure hasn't yet been determined by
 * a call to getStructure, then the method should do so before processing
 * the rest of the data set.
 *
 * @return the full data set as Instances
 * @throws IOException if there is no source or parsing fails
 */
@Override
public Instances getDataSet() throws IOException {
    if (getDirectory() == null)
        throw new IOException("No directory/source has been specified");

    String directoryPath = getDirectory().getAbsolutePath();
    ArrayList<String> classes = new ArrayList<String>();
    ArrayList<String> filenames = new ArrayList<String>();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements())
        classes.add((String) enm.nextElement());

    Instances data = getStructure();
    int fileCount = 0;

    // each class is actually the filename - this is preserved around weka,
    // so it is useful for tracking associations later and using as an "index"
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.get(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug())
                    System.err.println("processing " + fileCount + " : " + files[j]);

                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);
                filenames.add(files[j]);
                BufferedInputStream is = new BufferedInputStream(new FileInputStream(txt));
                StringBuffer txtStr = new StringBuffer();
                int c;
                while ((c = is.read()) != -1) {
                    txtStr.append((char) c);
                }

                // Extension to TextDirectoryLoader: split the raw text into sentences
                String regexSentenceSplit = "(\\n)";
                String rawtext = txtStr.toString();
                rawtext = rawtext.toLowerCase();
                rawtext = rawtext.trim();
                String[] sentences = rawtext.split(regexSentenceSplit);

                for (String sentence : sentences) {
                    double[] newInst = null;
                    if (m_OutputFilename)
                        newInst = new double[3];
                    else
                        newInst = new double[2];
                    newInst[0] = (double) data.attribute(0).addStringValue(sentence + "\n");
                    if (m_OutputFilename)
                        newInst[1] = (double) data.attribute(1)
                                .addStringValue(subdirPath + File.separator + files[j]);
                    // the class value goes into the slot reported by classIndex()
                    newInst[data.classIndex()] = (double) k;
                    data.add(new DenseInstance(1.0, newInst));
                }
                writeFilenames(directoryPath, filenames);
            } catch (Exception e) {
                System.err.println("failed to convert file: " + directoryPath + File.separator + files[j]);
            }
        }
    }
    return data;
}