Example usage for weka.core Instances classIndex

List of usage examples for weka.core Instances classIndex

Introduction

In this page you can find the example usage for weka.core Instances classIndex.

Prototype


public int classIndex() 

Source Link

Document

Returns the class attribute's index.

Usage

From source file:gate.plugin.learningframework.data.CorpusRepresentationWeka.java

/**
 * Converts a Mallet instance into a Weka SparseInstance compatible with the given dataset.
 * <p>
 * The feature values are copied over directly. The slot for the class attribute is always
 * allocated, even without a target, because Weka's RandomForest threw an exception about a
 * missing class otherwise. If the Mallet instance carries a target, it is stored at the
 * dataset's class index — either as a raw double (no target alphabet) or as the index of
 * the corresponding Weka class label.
 *
 * @param wekaDataset    dataset providing the attribute/class layout for the new instance
 * @param malletInstance source instance; its data must be a Mallet {@code FeatureVector}
 * @return a new SparseInstance whose dataset reference is set to {@code wekaDataset}
 */
public static weka.core.Instance wekaInstanceFromMalletInstance(Instances wekaDataset,
        cc.mallet.types.Instance malletInstance) {
    FeatureVector fv = (FeatureVector) malletInstance.getData();
    int size = fv.numLocations();
    int wekaTargetIndex = wekaDataset.classIndex();
    // TODO: for now we just directly copy over the mallet values to the weka values.
    // We may need to handle certain cases with missing values separately!

    // Always allocate one extra slot for the class attribute, whether or not we have a target.
    int indices[] = new int[size + 1];
    double values[] = new double[size + 1];
    for (int i = 0; i < size; i++) {
        indices[i] = fv.indexAtLocation(i);
        values[i] = fv.valueAtLocation(i);
    }
    // Now set the target, if we have one.
    Object malletValue = malletInstance.getTarget();
    if (malletValue != null) { // target present: could be a class label or a numeric value
        indices[size] = wekaTargetIndex;
        // If we have a target alphabet, convert the label to a class index, otherwise expect
        // a double value directly.
        if (malletInstance.getTargetAlphabet() == null) {
            values[size] = (double) malletInstance.getTarget();
        } else {
            Label malletLabel = (Label) malletInstance.getTarget();
            int targetIndex = malletLabel.getIndex();
            String targetString = malletLabel.toString();
            int wekaIndex = wekaDataset.classAttribute().indexOfValue(targetString);
            // BUG FIX: indexOfValue() returns -1 for a label unknown to the Weka class
            // attribute; storing -1.0 as a nominal class value is invalid, so record a
            // missing class (NaN) instead.
            values[size] = (wekaIndex < 0) ? Double.NaN : (double) wekaIndex;
            if (targetIndex != wekaIndex) {
                System.err.println("DEBUG ASSERTION FAILED: malletIndex for target is not equal to wekaIndex");
            }
        }
    } else { // no target value: create a missing-value target for Weka (NaN == missing)
        indices[size] = wekaDataset.classIndex();
        values[size] = Double.NaN;
    }
    weka.core.SparseInstance wekaInstance = new weka.core.SparseInstance(1.0, values, indices, values.length);
    // setDataset() only associates the instance with the dataset's header (attribute
    // definitions); it does not add or modify anything in wekaDataset itself.
    wekaInstance.setDataset(wekaDataset);
    return wekaInstance;
}

From source file:gr.auth.ee.lcs.ArffTrainTestLoader.java

License:Open Source License

/**
 * Load instances into the global train store and create test set.
 *
 * @param filename
 *            the .arff filename to be used
 * @param generateTestSet
 *            true if a test set is going to be generated
 * @throws IOException
 *             if the input file is not found
 */
public final void loadInstances(final String filename, final boolean generateTestSet) throws IOException {
    // Open .arff
    final Instances set = InstancesUtility.openInstance(filename);
    if (set.classIndex() < 0) {
        // Default to the last attribute as the class, as is conventional for .arff data.
        set.setClassIndex(set.numAttributes() - 1);
    }
    // FIX: use one RNG for both the shuffle and the fold choice instead of mixing an
    // unseeded Random with Math.random(); nextInt(n) replaces floor(random()*n).
    final Random rng = new Random();
    set.randomize(rng);

    if (generateTestSet) {
        final int numOfFolds = (int) SettingsLoader.getNumericSetting("NumberOfFolds", 10);
        final int fold = rng.nextInt(numOfFolds);
        trainSet = set.trainCV(numOfFolds, fold);
        testSet = set.testCV(numOfFolds, fold);
    } else {
        trainSet = set;
    }

    myLcs.instances = InstancesUtility.convertIntancesToDouble(trainSet);
    myLcs.labelCardinality = InstancesUtility.getLabelCardinality(trainSet);

}

From source file:gr.auth.ee.lcs.ArffTrainTestLoader.java

License:Open Source License

/**
 * Loads a training set and a separate test set into the global stores, converts both
 * to the LCS double representation, and prints the label cardinality of the train set.
 *
 * @param filename
 *            the .arff filename holding the training data
 * @param testFile
 *            the .arff filename holding the test data
 * @throws IOException
 *             if an input file is not found
 */
public final void loadInstancesWithTest(final String filename, final String testFile) throws IOException {

    // Load and shuffle the training data; default the class to the last attribute.
    final Instances trainingData = InstancesUtility.openInstance(filename);
    if (trainingData.classIndex() < 0) {
        trainingData.setClassIndex(trainingData.numAttributes() - 1);
    }
    trainingData.randomize(new Random());
    trainSet = trainingData;

    myLcs.instances = InstancesUtility.convertIntancesToDouble(trainSet);
    myLcs.labelCardinality = InstancesUtility.getLabelCardinality(trainSet);
    testSet = InstancesUtility.openInstance(testFile);

    // Publish both sets to the LCS before converting the test set.
    myLcs.trainSet = trainSet;
    myLcs.testSet = testSet;

    myLcs.testInstances = InstancesUtility.convertIntancesToDouble(testSet);

    System.out.println("Label cardinality: " + myLcs.labelCardinality);

}

From source file:gr.auth.ee.lcs.data.representations.complex.SingleClassRepresentation.java

License:Open Source License

@Override
protected void createClassRepresentation(final Instances instances) {

    // Default the class attribute to the last attribute when none has been set.
    if (instances.classIndex() < 0) {
        instances.setClassIndex(instances.numAttributes() - 1);
    }

    // Collect the class label names in declaration order; they become the rule consequents.
    final Enumeration<?> labelNames = instances.classAttribute().enumerateValues();
    final int numClasses = instances.numClasses();
    final String[] consequents = new String[numClasses];
    for (int label = 0; label < numClasses; label++) {
        consequents[label] = (String) labelNames.nextElement();
    }
    this.ruleConsequents = consequents;

    // The last attribute slot represents the class as a single-label gene.
    attributeList[attributeList.length - 1] = new UniLabel(chromosomeSize, "class", consequents);

}

From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java

License:Open Source License

/**
 * Splits the .arff input dataset into |number-of-distinct-label-combinations| Instances
 * which are stored in the partitions[] array. Called by initializePopulation() as a
 * preparatory step to clustering.
 *
 * @param lcs the learning classifier system (kept for interface compatibility)
 * @param filename the .arff filename to be partitioned
 * @return one Instances object per distinct label combination
 * @throws Exception if the input file cannot be opened
 */
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final String filename)
        throws Exception {

    // Open .arff
    final Instances set = InstancesUtility.openInstance(filename);
    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    //set.randomize(new Random());
    int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1);

    // BUG FIX: size the array by the number of instances actually iterated below
    // (set.numInstances()) instead of lcs.instances.length, which can differ and
    // risked an ArrayIndexOutOfBoundsException; this also matches the sibling
    // overload. The unused indicesArray was removed.
    String stringsArray[] = new String[set.numInstances()];

    // Convert each instance's labelset (the last numberOfLabels attributes) into a string.
    for (int i = 0; i < set.numInstances(); i++) {
        stringsArray[i] = "";
        for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
            stringsArray[i] += (int) set.instance(i).value(j);
        }
    }

    // Each inner vector collects the indices of all instances sharing one label combination.
    Vector<Vector<Integer>> mothershipVector = new Vector<Vector<Integer>>();

    for (int i = 0; i < set.numInstances(); i++) {
        String baseString = stringsArray[i];
        if (baseString.equals(""))
            continue; // already claimed by an earlier combination
        Vector<Integer> indicesVector = new Vector<Integer>();
        for (int j = 0; j < set.numInstances(); j++) {
            if (baseString.equals(stringsArray[j])) {
                stringsArray[j] = ""; // mark as consumed
                indicesVector.add(j);
            }
        }
        mothershipVector.add(indicesVector);
    }

    // Materialize one Instances partition per label combination.
    Instances[] partitions = new Instances[mothershipVector.size()];
    for (int i = 0; i < mothershipVector.size(); i++) {
        partitions[i] = new Instances(set, mothershipVector.elementAt(i).size());
        for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) {
            Instance instanceToAdd = set.instance(mothershipVector.elementAt(i).elementAt(j));
            partitions[i].add(instanceToAdd);
        }
    }
    /*
     * up to here, the partitions array has been formed. it contains the split dataset by label combinations
     * it holds both the attributes and the labels, but for clustering the input should only be the attributes,
     * so we need to delete the labels. this is taken care of by initializePopulation()
     */
    return partitions;
}

From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java

License:Open Source License

/**
 * Splits an already-loaded training set into |number-of-distinct-label-combinations|
 * partitions, one Instances object per distinct label combination.
 *
 * @param lcs the learning classifier system (unused here; kept for interface compatibility)
 * @param trainSet the training set to partition
 * @return one Instances object per distinct label combination
 * @throws Exception declared for interface compatibility with the filename overload
 */
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs,
        final Instances trainSet) throws Exception {

    final Instances set = trainSet;
    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    //set.randomize(new Random());
    int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1);

    // One labelset string per instance. The unused indicesArray was removed.
    String stringsArray[] = new String[trainSet.numInstances()];

    // Convert each instance's labelset (the last numberOfLabels attributes) into a string.
    for (int i = 0; i < set.numInstances(); i++) {
        stringsArray[i] = "";
        for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
            stringsArray[i] += (int) set.instance(i).value(j);
        }
    }

    // Each inner vector collects the indices of all instances sharing one label combination.
    // FIX: parameterized the previously raw Vector<Vector>, removing the cast below.
    Vector<Vector<Integer>> mothershipVector = new Vector<Vector<Integer>>();

    for (int i = 0; i < set.numInstances(); i++) {
        String baseString = stringsArray[i];
        if (baseString.equals(""))
            continue; // already claimed by an earlier combination
        Vector<Integer> indicesVector = new Vector<Integer>();
        for (int j = 0; j < set.numInstances(); j++) {
            if (baseString.equals(stringsArray[j])) {
                stringsArray[j] = ""; // mark as consumed
                indicesVector.add(j);
            }
        }
        mothershipVector.add(indicesVector);
    }

    // Materialize one Instances partition per label combination.
    Instances[] partitions = new Instances[mothershipVector.size()];
    for (int i = 0; i < mothershipVector.size(); i++) {
        partitions[i] = new Instances(set, mothershipVector.elementAt(i).size());
        for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) {
            Instance instanceToAdd = set.instance(mothershipVector.elementAt(i).elementAt(j));
            partitions[i].add(instanceToAdd);
        }
    }
    /*
     * up to here, the partitions array has been formed. it contains the split dataset by label combinations
     * it holds both the attributes and the labels, but for clustering the input should only be the attributes,
     * so we need to delete the labels. this is taken care of by initializePopulation()
     */
    return partitions;
}

From source file:gyc.OverBoostM1.java

License:Open Source License

/**
 * /*  w ww  .  j a  v a2  s.  c o  m*/
 * nMajnMin
 * @param data
 * @param i
 * @return
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int nMaj, int nMin,
        Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // First, we copy the examples from the minority class and save the indexes of the majority
    // the new data-set contains samples_min + samples_min * N / 100
    int size = nMaj + nMin;
    //selected = new int[size]; // we store the selected examples indexes

    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    Instances myDataset = new Instances(copia, 0);
    int nData = 0;
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }
    if (minCount <= 0)
        return copia;
    /* random undersampling of the majority */
    //boolean[] taken = new boolean[copia.numInstances()];
    int r;
    if (nMaj == majCount) {
        //System.out.println("#equal");
        for (int i = 0; i < nMaj; i++) {
            myDataset.add(copia.instance(majExamples[i]));
        }
    } else {
        for (int i = 0; i < nMaj; i++) {
            r = simplingRandom.nextInt(majCount);
            //selected[nData] = majExamples[r];
            myDataset.add(copia.instance(majExamples[r]));
            //taken[majExamples[r]] = true;
        }
    }
    for (int i = 0; i < nMin; i++) {
        r = simplingRandom.nextInt(minCount);
        //System.out.print("_"+r);

        //selected[nData] = minExamples[r];
        myDataset.add(copia.instance(minExamples[r]));
        //taken[minExamples[r]] = true;
    }

    //System.out.println();
    //System.out.println("minC="+minCount+"; majC="+majCount);

    myDataset.randomize(simplingRandom);
    return myDataset;
}

From source file:gyc.SMOTEBagging.java

License:Open Source License

/**
 * Resamples {@code copia} for one SMOTEBagging iteration: keeps 100% of the
 * majority-class examples, draws (majority-count * a / 100) minority-class examples
 * with replacement, then balances the result with either the Resample filter
 * (when only one minority draw was requested) or SMOTE.
 *
 * @param copia dataset to resample (not modified)
 * @param majC index of the majority class value
 * @param minC index of the minority class value (not used in the body)
 * @param a resampling percentage applied to the majority-class count
 * @param simplingRandom RNG used for sampling and for seeding the filters
 * @return the resampled, shuffled dataset
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // Number of minority examples to draw: resample min at rate (Nmaj * a / 100).
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100;
    // Majority class name, used to tell the two classes apart below.
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    // Partition instance indexes into majority and minority lists.
    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* keep all of the majority, then oversample the minority */
    Instances myDataset = new Instances(copia, 0);
    int r;
    // Copy 100% of the majority-class examples (without replacement).
    for (int i = 0; i < majCount; i++) {
        myDataset.add(copia.instance(majExamples[i]));
    }
    if (minCount == 0)
        return myDataset;
    // Draw 'size' minority-class examples with replacement.
    for (int i = 0; i < size; i++) {
        r = simplingRandom.nextInt(minCount);
        myDataset.add(copia.instance(minExamples[r]));
    }
    myDataset.randomize(simplingRandom);

    if (size == 1) {
        // Only one minority draw: too few samples for SMOTE's nearest neighbors,
        // so fall back to a uniform-class Resample filter.
        try {
            Resample filter = new Resample();
            filter.setInputFormat(myDataset);
            filter.setBiasToUniformClass(1.0);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            // NOTE(review): failures are only logged; the unfiltered dataset is returned.
            e.printStackTrace();
        }
    }
    if (size > 1) {
        try {
            SMOTE filter = new SMOTE();
            filter.setInputFormat(myDataset); // filter capabilities are checked here
            // Oversampling percentage needed to bring the minority count up to majCount.
            double value = 100.0 * majCount / size - 100;
            filter.setPercentage(value);
            //if (nMin<5) filter.setNearestNeighbors(nMin);
            filter.setRandomSeed(simplingRandom.nextInt());
            myDataset = Filter.useFilter(myDataset, filter);
        } catch (Exception e) {
            // NOTE(review): failures are only logged; the pre-SMOTE dataset is returned.
            e.printStackTrace();
        }
    }

    return myDataset;
}

From source file:gyc.SMOTEBagging.java

License:Open Source License

/**
 * Bagging method: trains each ensemble member on a bag produced by randomSampling()
 * with a resampling rate that grows by 10 per iteration, then optionally computes
 * the out-of-bag error.
 *
 * @param data the training data to be used for generating the
 * bagged classifier.
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class (work on a copy so the caller's data is untouched)
    data = new Instances(data);
    data.deleteWithMissingClass();

    super.buildClassifier(data);

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    // NOTE(review): bagSize is computed but never used below — the actual bag size is
    // determined inside randomSampling().
    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    boolean[][] inBag = null;
    if (m_CalcOutOfBag)
        inBag = new boolean[m_Classifiers.length][];
    int b = 0;
    for (int j = 0; j < m_Classifiers.length; j++) {

        // Determine which class value is the minority/majority; only nominalCounts[0]
        // and [1] are consulted, i.e. a two-class problem is assumed here.
        int classNum[] = data.attributeStats(data.classIndex()).nominalCounts;
        int minC, nMin = classNum[0];
        int majC, nMaj = classNum[1];
        if (nMin < nMaj) {
            minC = 0;
            majC = 1;
        } else {
            minC = 1;
            majC = 0;
            nMin = classNum[1];
            nMaj = classNum[0];
        }

        // Resampling rate grows by 10 percentage points per ensemble member.
        b = b + 10;
        Instances bagData = randomSampling(data, majC, minC, b, random);

        /*      // create the in-bag dataset
              if (m_CalcOutOfBag) {
           inBag[j] = new boolean[data.numInstances()];
           bagData = resampleWithWeights(data, random, inBag[j]);
              } else {
           bagData = data.resampleWithWeights(random);
           if (bagSize < data.numInstances()) {
             bagData.randomize(random);
             Instances newBagData = new Instances(bagData, 0, bagSize);
             bagData = newBagData;
           }
              }
                      
              if (m_Classifier instanceof Randomizable) {
           ((Randomizable) m_Classifiers[j]).setSeed(random.nextInt());
              }*/

        // build the classifier
        m_Classifiers[j].buildClassifier(bagData);
    }

    // calc OOB error?
    // NOTE(review): when out-of-bag calculation is enabled, inBag[j] is never populated
    // (the code that filled it is commented out above), so the inBag[j][i] access below
    // will throw a NullPointerException — confirm, and either restore in-bag tracking
    // or disable the OOB path.
    if (getCalcOutOfBag()) {
        double outOfBagCount = 0.0;
        double errorSum = 0.0;
        boolean numeric = data.classAttribute().isNumeric();

        for (int i = 0; i < data.numInstances(); i++) {
            double vote;
            double[] votes;
            if (numeric)
                votes = new double[1];
            else
                votes = new double[data.numClasses()];

            // determine predictions for instance, skipping classifiers whose bag held it
            int voteCount = 0;
            for (int j = 0; j < m_Classifiers.length; j++) {
                if (inBag[j][i])
                    continue;

                voteCount++;
                double pred = m_Classifiers[j].classifyInstance(data.instance(i));
                if (numeric)
                    votes[0] += pred;
                else
                    votes[(int) pred]++;
            }

            // aggregate the individual predictions into one vote
            if (numeric) {
                vote = votes[0];
                if (voteCount > 0) {
                    vote /= voteCount; // average
                }
            } else {
                vote = Utils.maxIndex(votes); // majority vote
            }

            // accumulate the (weighted) error for this instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }
        }

        m_OutOfBagError = errorSum / outOfBagCount;
    } else {
        m_OutOfBagError = 0;
    }
}

From source file:gyc.UnderOverBoostM1.java

License:Open Source License

/**
 * /*from   w w w .  ja  v  a 2s. co m*/
 * nMajnMin
 * @param data
 * @param i
 * @return
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // First, we copy the examples from the minority class and save the indexes of the majority
    // the new data-set contains samples_min + samples_min * N / 100
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100 * 2;
    // class name
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* random undersampling of the majority */
    Instances myDataset = new Instances(copia, 0);
    int r;
    for (int i = 0; i < size / 2; i++) {
        r = simplingRandom.nextInt(majCount);
        myDataset.add(copia.instance(majExamples[r]));

        if (minCount > 0) {
            r = simplingRandom.nextInt(minCount);
            myDataset.add(copia.instance(minExamples[r]));
        }
    }

    myDataset.randomize(simplingRandom);
    return myDataset;
}