List of usage examples for weka.core Instances classIndex
public int classIndex()
From source file:gate.plugin.learningframework.data.CorpusRepresentationWeka.java
/**
 * Converts a Mallet instance into a Weka {@code SparseInstance} compatible with the
 * given Weka dataset.
 *
 * <p>The Mallet feature values are copied over directly. One extra slot is always
 * allocated for the class attribute, because Weka Random Forest throws an exception
 * when the class attribute is missing from the instance. When the Mallet instance has
 * no target, the class value is set to missing (NaN).
 *
 * @param wekaDataset the Weka dataset providing the attribute/class structure
 * @param malletInstance the Mallet instance whose data is a {@code FeatureVector}
 * @return a new sparse Weka instance associated with {@code wekaDataset}
 */
public static weka.core.Instance wekaInstanceFromMalletInstance(Instances wekaDataset,
        cc.mallet.types.Instance malletInstance) {
    FeatureVector fv = (FeatureVector) malletInstance.getData();
    int size = fv.numLocations();
    int wekaTargetIndex = wekaDataset.classIndex();
    // Always allocate space for the class attribute (see note above).
    // TODO: certain missing-value cases may need separate handling.
    int[] indices = new int[size + 1];
    double[] values = new double[size + 1];
    for (int i = 0; i < size; i++) {
        indices[i] = fv.indexAtLocation(i);
        values[i] = fv.valueAtLocation(i);
    }
    // The last slot is the class attribute in every case; reuse the cached index
    // rather than calling wekaDataset.classIndex() a second time.
    indices[size] = wekaTargetIndex;
    Object malletValue = malletInstance.getTarget();
    if (malletValue != null) {
        if (malletInstance.getTargetAlphabet() == null) {
            // No target alphabet: the target is expected to be a numeric value.
            values[size] = (double) malletInstance.getTarget();
        } else {
            // Nominal target: map the Mallet label string to the Weka class value index.
            Label malletLabel = (Label) malletInstance.getTarget();
            int targetIndex = malletLabel.getIndex();
            String targetString = malletLabel.toString();
            int wekaIndex = wekaDataset.classAttribute().indexOfValue(targetString);
            values[size] = (double) wekaIndex;
            if (targetIndex != wekaIndex) {
                System.err.println("DEBUG ASSERTION FAILED: malletIndex for target is not equal to wekaIndex");
            }
        }
    } else {
        // No target value: mark the Weka class value as missing.
        values[size] = Double.NaN;
    }
    weka.core.SparseInstance wekaInstance =
            new weka.core.SparseInstance(1.0, values, indices, values.length);
    // Associate the instance with the dataset so it can resolve attribute metadata.
    // NOTE(review): setDataset() should not modify wekaDataset itself — confirm.
    wekaInstance.setDataset(wekaDataset);
    return wekaInstance;
}
From source file:gr.auth.ee.lcs.ArffTrainTestLoader.java
License:Open Source License
/** * Load instances into the global train store and create test set. * //from w w w.ja v a 2 s. co m * @param filename * the .arff filename to be used * @param generateTestSet * true if a test set is going to be generated * @throws IOException * if the input file is not found */ public final void loadInstances(final String filename, final boolean generateTestSet) throws IOException { // Open .arff final Instances set = InstancesUtility.openInstance(filename); if (set.classIndex() < 0) { set.setClassIndex(set.numAttributes() - 1); } set.randomize(new Random()); if (generateTestSet) { final int numOfFolds = (int) SettingsLoader.getNumericSetting("NumberOfFolds", 10); final int fold = (int) Math.floor(Math.random() * numOfFolds); trainSet = set.trainCV(numOfFolds, fold); testSet = set.testCV(numOfFolds, fold); } else { trainSet = set; } myLcs.instances = InstancesUtility.convertIntancesToDouble(trainSet); myLcs.labelCardinality = InstancesUtility.getLabelCardinality(trainSet); }
From source file:gr.auth.ee.lcs.ArffTrainTestLoader.java
License:Open Source License
/** * Load instances into the global train store and create test set. * // w w w. j a v a2 s . com * @param filename * the .arff filename to be used * @param testFile * the test file to be loaded * @throws IOException * if the input file is not found */ public final void loadInstancesWithTest(final String filename, final String testFile) throws IOException { // Open .arff final Instances set = InstancesUtility.openInstance(filename); if (set.classIndex() < 0) set.setClassIndex(set.numAttributes() - 1); set.randomize(new Random()); trainSet = set; myLcs.instances = InstancesUtility.convertIntancesToDouble(trainSet); myLcs.labelCardinality = InstancesUtility.getLabelCardinality(trainSet); testSet = InstancesUtility.openInstance(testFile); myLcs.trainSet = trainSet; myLcs.testSet = testSet; myLcs.testInstances = InstancesUtility.convertIntancesToDouble(testSet); System.out.println("Label cardinality: " + myLcs.labelCardinality); }
From source file:gr.auth.ee.lcs.data.representations.complex.SingleClassRepresentation.java
License:Open Source License
@Override protected void createClassRepresentation(final Instances instances) { if (instances.classIndex() < 0) instances.setClassIndex(instances.numAttributes() - 1); // Rule Consequents final Enumeration<?> classNames = instances.classAttribute().enumerateValues(); final String[] ruleConsequents = new String[instances.numClasses()]; this.ruleConsequents = ruleConsequents; for (int i = 0; i < instances.numClasses(); i++) ruleConsequents[i] = (String) classNames.nextElement(); attributeList[attributeList.length - 1] = new UniLabel(chromosomeSize, "class", ruleConsequents); }
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/** * Splits the .arff input dataset to |number-of-distinct-label-combinations| Instances which are stored in the partitions[] array. * Called by initializePopulation() as a preparatory step to clustering. * @throws Exception //from www .ja v a 2 s. c o m * * */ public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final String filename) throws Exception { // Open .arff final Instances set = InstancesUtility.openInstance(filename); if (set.classIndex() < 0) { set.setClassIndex(set.numAttributes() - 1); } //set.randomize(new Random()); int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1); // the partitions vector holds the indices String stringsArray[] = new String[lcs.instances.length]; int indicesArray[] = new int[lcs.instances.length]; // convert each instance's labelset into a string and store it in the stringsArray array for (int i = 0; i < set.numInstances(); i++) { stringsArray[i] = ""; indicesArray[i] = i; for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) { stringsArray[i] += (int) set.instance(i).value(j); } } // contains the indicesVector(s) Vector<Vector> mothershipVector = new Vector<Vector>(); String baseString = ""; for (int i = 0; i < set.numInstances(); i++) { baseString = stringsArray[i]; if (baseString.equals("")) continue; Vector<Integer> indicesVector = new Vector<Integer>(); for (int j = 0; j < set.numInstances(); j++) { if (baseString.equals(stringsArray[j])) { stringsArray[j] = ""; indicesVector.add(j); } } mothershipVector.add(indicesVector); } Instances[] partitions = new Instances[mothershipVector.size()]; for (int i = 0; i < mothershipVector.size(); i++) { partitions[i] = new Instances(set, mothershipVector.elementAt(i).size()); for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) { Instance instanceToAdd = set.instance((Integer) mothershipVector.elementAt(i).elementAt(j)); partitions[i].add(instanceToAdd); } } /* * up to here, the 
partitions array has been formed. it contains the split dataset by label combinations * it holds both the attributes and the labels, but for clustering the input should only be the attributes, * so we need to delete the labels. this is taken care of by initializePopulation() */ return partitions; }
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final Instances trainSet) throws Exception { // Open .arff final Instances set = trainSet; if (set.classIndex() < 0) { set.setClassIndex(set.numAttributes() - 1); }// w w w .j av a 2 s . c om //set.randomize(new Random()); int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1); // the partitions vector holds the indices String stringsArray[] = new String[trainSet.numInstances()]; int indicesArray[] = new int[trainSet.numInstances()]; // convert each instance's labelset into a string and store it in the stringsArray array for (int i = 0; i < set.numInstances(); i++) { stringsArray[i] = ""; indicesArray[i] = i; for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) { stringsArray[i] += (int) set.instance(i).value(j); } } // contains the indicesVector(s) Vector<Vector> mothershipVector = new Vector<Vector>(); String baseString = ""; for (int i = 0; i < set.numInstances(); i++) { baseString = stringsArray[i]; if (baseString.equals("")) continue; Vector<Integer> indicesVector = new Vector<Integer>(); for (int j = 0; j < set.numInstances(); j++) { if (baseString.equals(stringsArray[j])) { stringsArray[j] = ""; indicesVector.add(j); } } mothershipVector.add(indicesVector); } Instances[] partitions = new Instances[mothershipVector.size()]; for (int i = 0; i < mothershipVector.size(); i++) { partitions[i] = new Instances(set, mothershipVector.elementAt(i).size()); for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) { Instance instanceToAdd = set.instance((Integer) mothershipVector.elementAt(i).elementAt(j)); partitions[i].add(instanceToAdd); } } /* * up to here, the partitions array has been formed. it contains the split dataset by label combinations * it holds both the attributes and the labels, but for clustering the input should only be the attributes, * so we need to delete the labels. 
this is taken care of by initializePopulation() */ return partitions; }
From source file:gyc.OverBoostM1.java
License:Open Source License
/** * /* w ww . j a v a2 s. c o m*/ * nMajnMin * @param data * @param i * @return */ protected Instances randomSampling(Instances copia, int majC, int minC, int nMaj, int nMin, Random simplingRandom) { int[] majExamples = new int[copia.numInstances()]; int[] minExamples = new int[copia.numInstances()]; int majCount = 0, minCount = 0; // First, we copy the examples from the minority class and save the indexes of the majority // the new data-set contains samples_min + samples_min * N / 100 int size = nMaj + nMin; //selected = new int[size]; // we store the selected examples indexes String majClassName = copia.attribute(copia.classIndex()).value(majC); Instances myDataset = new Instances(copia, 0); int nData = 0; for (int i = 0; i < copia.numInstances(); i++) { if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) { // save index majExamples[majCount] = i; majCount++; } else { minExamples[minCount] = i; minCount++; } } if (minCount <= 0) return copia; /* random undersampling of the majority */ //boolean[] taken = new boolean[copia.numInstances()]; int r; if (nMaj == majCount) { //System.out.println("#equal"); for (int i = 0; i < nMaj; i++) { myDataset.add(copia.instance(majExamples[i])); } } else { for (int i = 0; i < nMaj; i++) { r = simplingRandom.nextInt(majCount); //selected[nData] = majExamples[r]; myDataset.add(copia.instance(majExamples[r])); //taken[majExamples[r]] = true; } } for (int i = 0; i < nMin; i++) { r = simplingRandom.nextInt(minCount); //System.out.print("_"+r); //selected[nData] = minExamples[r]; myDataset.add(copia.instance(minExamples[r])); //taken[minExamples[r]] = true; } //System.out.println(); //System.out.println("minC="+minCount+"; majC="+majCount); myDataset.randomize(simplingRandom); return myDataset; }
From source file:gyc.SMOTEBagging.java
License:Open Source License
/**
 * Resamples {@code copia} for one SMOTEBagging round: keeps all majority-class
 * examples, draws (Nmaj * a / 100) minority-class examples with replacement,
 * then rebalances — via the Resample filter when exactly one minority example
 * was drawn, or via SMOTE when more than one.
 *
 * @param copia the source dataset
 * @param majC index of the majority class value
 * @param minC index of the minority class value (not read here — TODO confirm intent)
 * @param a resampling rate in percent applied to the majority count
 * @param simplingRandom random source for sampling, shuffling, and filter seeds
 * @return the resampled (and possibly SMOTE-augmented) dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a,
        Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // Number of minority examples to draw: resample min at rate (Nmaj/Nmin)*a%.
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100;
    // Majority class name, used to classify each instance below.
    String majClassName = copia.attribute(copia.classIndex()).value(majC);
    // Partition instance indices into majority/minority by class name.
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }
    Instances myDataset = new Instances(copia, 0);
    int r;
    // Copy 100% of the majority-class examples.
    for (int i = 0; i < majCount; i++) {
        myDataset.add(copia.instance(majExamples[i]));
    }
    // No minority examples at all: nothing to oversample.
    if (minCount == 0)
        return myDataset;
    // Draw 'size' minority examples with replacement.
    for (int i = 0; i < size; i++) {
        r = simplingRandom.nextInt(minCount);
        myDataset.add(copia.instance(minExamples[r]));
    }
    myDataset.randomize(simplingRandom);
    if (size == 1) {
        // SMOTE needs neighbors; with a single minority example fall back to
        // uniform-class resampling instead.
        try {
            Resample filter = new Resample();
            filter.setInputFormat(myDataset);
            filter.setBiasToUniformClass(1.0);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            // NOTE(review): failure is logged and the unfiltered dataset is
            // returned — presumably intentional best-effort behavior.
            e.printStackTrace();
        }
    }
    if (size > 1) {
        try {
            SMOTE filter = new SMOTE();
            filter.setInputFormat(myDataset); // filter capabilities are checked here
            // SMOTE percentage needed to bring the minority count up to majCount.
            double value = 100.0 * majCount / size - 100; // Percentage
            filter.setPercentage(value);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            // NOTE(review): same best-effort handling as above.
            e.printStackTrace();
        }
    }
    return myDataset;
}
From source file:gyc.SMOTEBagging.java
License:Open Source License
/**
 * Bagging method: builds each sub-classifier on a dataset produced by
 * randomSampling with a resampling rate that grows by 10 per iteration,
 * then (optionally) computes the out-of-bag error.
 *
 * @param data the training data to be used for generating the bagged classifier
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // Work on a copy; remove instances with missing class.
    data = new Instances(data);
    data.deleteWithMissingClass();

    super.buildClassifier(data);

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException("Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    // NOTE(review): bagSize is computed but never used — the bag construction
    // below was replaced by randomSampling.
    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    // NOTE(review): inBag rows are never allocated (the resampleWithWeights
    // code that filled them is commented out), so the out-of-bag loop below
    // dereferences inBag[j][i] on a null row and will NPE when
    // getCalcOutOfBag() is true — confirm and fix upstream.
    boolean[][] inBag = null;
    if (m_CalcOutOfBag)
        inBag = new boolean[m_Classifiers.length][];

    // Resampling rate, increased by 10 for each classifier.
    int b = 0;
    for (int j = 0; j < m_Classifiers.length; j++) {
        // Determine minority/majority class (assumes a binary nominal class —
        // only indices 0 and 1 are inspected).
        int classNum[] = data.attributeStats(data.classIndex()).nominalCounts;
        int minC, nMin = classNum[0];
        int majC, nMaj = classNum[1];
        if (nMin < nMaj) {
            minC = 0;
            majC = 1;
        } else {
            minC = 1;
            majC = 0;
            nMin = classNum[1];
            nMaj = classNum[0];
        }
        b = b + 10;
        Instances bagData = randomSampling(data, majC, minC, b, random);

        // build the classifier
        m_Classifiers[j].buildClassifier(bagData);
    }

    // calc OOB error?
    if (getCalcOutOfBag()) {
        double outOfBagCount = 0.0;
        double errorSum = 0.0;
        boolean numeric = data.classAttribute().isNumeric();

        for (int i = 0; i < data.numInstances(); i++) {
            double vote;
            double[] votes;
            if (numeric)
                votes = new double[1];
            else
                votes = new double[data.numClasses()];

            // determine predictions for instance, skipping classifiers whose
            // bag contained it (see NPE note above).
            int voteCount = 0;
            for (int j = 0; j < m_Classifiers.length; j++) {
                if (inBag[j][i])
                    continue;
                voteCount++;
                double pred = m_Classifiers[j].classifyInstance(data.instance(i));
                if (numeric)
                    votes[0] += pred;
                else
                    votes[(int) pred]++;
            }

            // "vote": average for numeric targets, majority otherwise.
            if (numeric) {
                vote = votes[0];
                if (voteCount > 0) {
                    vote /= voteCount; // average
                }
            } else {
                vote = Utils.maxIndex(votes); // majority vote
            }

            // weighted error for this instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }
        }

        m_OutOfBagError = errorSum / outOfBagCount;
    } else {
        m_OutOfBagError = 0;
    }
}
From source file:gyc.UnderOverBoostM1.java
License:Open Source License
/** * /*from w w w . ja v a 2s. co m*/ * nMajnMin * @param data * @param i * @return */ protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) { int[] majExamples = new int[copia.numInstances()]; int[] minExamples = new int[copia.numInstances()]; int majCount = 0, minCount = 0; // First, we copy the examples from the minority class and save the indexes of the majority // the new data-set contains samples_min + samples_min * N / 100 int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100 * 2; // class name String majClassName = copia.attribute(copia.classIndex()).value(majC); for (int i = 0; i < copia.numInstances(); i++) { if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) { // save index majExamples[majCount] = i; majCount++; } else { minExamples[minCount] = i; minCount++; } } /* random undersampling of the majority */ Instances myDataset = new Instances(copia, 0); int r; for (int i = 0; i < size / 2; i++) { r = simplingRandom.nextInt(majCount); myDataset.add(copia.instance(majExamples[r])); if (minCount > 0) { r = simplingRandom.nextInt(minCount); myDataset.add(copia.instance(minExamples[r])); } } myDataset.randomize(simplingRandom); return myDataset; }