Example usage for weka.core Instances deleteWithMissingClass

List of usage examples for weka.core Instances deleteWithMissingClass

Introduction

On this page you can find an example of usage for weka.core Instances deleteWithMissingClass.

Prototype

public void deleteWithMissingClass() 

Source Link

Document

Removes all instances with a missing class value from the dataset.

Usage

From source file:com.reactivetechnologies.analytics.core.eval.AdaBoostM1WithBuiltClassifiers.java

License:Open Source License

/**
 * Validates the data and dispatches training. Unlike stock AdaBoostM1, this
 * variant relies on the base classifiers that were supplied up front.
 *
 * @param data the training data (copied; instances with a missing class
 *             value are removed from the copy)
 * @throws Exception if the classifier cannot be built
 */
@Override
public void buildClassifier(Instances data) throws Exception {

    // Verify the classifier can handle this dataset.
    getCapabilities().testWithFail(data);

    // Work on a copy so the caller's dataset is untouched, then drop
    // instances whose class value is missing.
    Instances train = new Instances(data);
    train.deleteWithMissingClass();

    // Degenerate dataset: nothing but the class attribute -> fall back to ZeroR.
    if (train.numAttributes() == 1) {
        System.err.println(
                "Cannot build model (only class attribute present in data!), " + "using ZeroR model instead!");
        m_ZeroR = new weka.classifiers.rules.ZeroR();
        m_ZeroR.buildClassifier(train);
        return;
    }
    m_ZeroR = null;

    m_NumClasses = train.numClasses();

    // Weight-capable base classifiers train directly on weighted instances;
    // everything else goes through resampling.
    boolean canUseWeights = !m_UseResampling && m_Classifier instanceof WeightedInstancesHandler;
    if (canUseWeights) {
        buildClassifierWithWeights(train);
    } else {
        buildClassifierUsingResampling(train);
    }
}

From source file:com.reactivetechnologies.analytics.core.eval.BaggingWithBuiltClassifiers.java

License:Open Source License

/**
 * Prepares the training data and, if requested, computes the out-of-bag
 * error for the ensemble. Unlike stock Bagging, the base classifiers in
 * {@code m_Classifiers} are assumed to be built already, so only the bag
 * bookkeeping and the OOB estimate are produced here.
 *
 * @param data the training data (copied; instances with a missing class
 *             value are removed from the copy)
 * @throws Exception if the data cannot be handled or OOB prediction fails
 */
@Override
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class; copy first so the caller's
    // dataset is not modified
    data = new Instances(data);
    data.deleteWithMissingClass();

    /** Changed here: Use supplied classifier */
    //super.buildClassifier(data);
    /** End change */

    // OOB error is only meaningful when bags are full-size resamples,
    // so every instance has a chance to be left out of a bag.
    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    int bagSize = (int) (data.numInstances() * (m_BagSizePercent / 100.0));
    Random random = new Random(m_Seed);

    // inBag[j][i] records whether instance i ended up in classifier j's bag;
    // only needed when the OOB error is to be computed.
    boolean[][] inBag = null;
    if (m_CalcOutOfBag)
        inBag = new boolean[m_Classifiers.length][];

    for (int j = 0; j < m_Classifiers.length; j++) {
        Instances bagData = null;

        // create the in-bag dataset (remembering bag membership when the
        // OOB error will be needed later)
        if (m_CalcOutOfBag) {
            inBag[j] = new boolean[data.numInstances()];
            bagData = data.resampleWithWeights(random, inBag[j]);
        } else {
            bagData = data.resampleWithWeights(random);
            if (bagSize < data.numInstances()) {
                // shrink the bag to the requested size after shuffling
                bagData.randomize(random);
                Instances newBagData = new Instances(bagData, 0, bagSize);
                bagData = newBagData;
            }
        }

        /** Changed here: Use supplied classifier */
        /*if (m_Classifier instanceof Randomizable) {
          ((Randomizable) m_Classifiers[j]).setSeed(random.nextInt());
        }
                
        // build the classifier
        m_Classifiers[j].buildClassifier(bagData);*/
        /** End change */
    }

    // calc OOB error?
    if (getCalcOutOfBag()) {
        double outOfBagCount = 0.0;
        double errorSum = 0.0;
        boolean numeric = data.classAttribute().isNumeric();

        for (int i = 0; i < data.numInstances(); i++) {
            double vote;
            double[] votes;
            if (numeric)
                votes = new double[1]; // single accumulator: sum of predictions
            else
                votes = new double[data.numClasses()]; // per-class probability sums

            // determine predictions for this instance, using only classifiers
            // whose bag did NOT contain it (that makes the estimate out-of-bag)
            int voteCount = 0;
            for (int j = 0; j < m_Classifiers.length; j++) {
                if (inBag[j][i])
                    continue;

                voteCount++;
                // double pred = m_Classifiers[j].classifyInstance(data.instance(i));
                if (numeric) {
                    // votes[0] += pred;
                    votes[0] += m_Classifiers[j].classifyInstance(data.instance(i));
                } else {
                    // votes[(int) pred]++;
                    double[] newProbs = m_Classifiers[j].distributionForInstance(data.instance(i));
                    // average the probability estimates
                    for (int k = 0; k < newProbs.length; k++) {
                        votes[k] += newProbs[k];
                    }
                }
            }

            // "vote": average for regression, argmax of the (normalized)
            // probability sums for classification
            if (numeric) {
                vote = votes[0];
                if (voteCount > 0) {
                    vote /= voteCount; // average
                }
            } else {
                if (Utils.eq(Utils.sum(votes), 0)) {
                    // no OOB classifier produced a non-zero distribution;
                    // leave votes all-zero (maxIndex then picks class 0)
                } else {
                    Utils.normalize(votes);
                }
                vote = Utils.maxIndex(votes); // predicted class
            }

            // weighted error contribution for this instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }
        }

        // NOTE(review): if every instance ends up in every bag, outOfBagCount
        // stays 0 and this division yields NaN — confirm callers handle that.
        m_OutOfBagError = errorSum / outOfBagCount;
    } else {
        m_OutOfBagError = 0;
    }
}

From source file:com.reactivetechnologies.analytics.core.eval.StackingWithBuiltClassifiers.java

License:Open Source License

/**
 * buildClassifier builds the meta-level classifier from cross-validated
 * base-level predictions on the training data.
 *
 * @param data the training data to be used for generating the
 * stacked classifier.
 * @throws Exception if the classifier could not be built successfully
 */
/**
 * Builds only the meta-level model; the base classifiers are assumed to be
 * trained already and are deliberately not rebuilt here.
 *
 * @param data the training data
 * @throws Exception if the meta level cannot be generated
 */
@Override
public void buildClassifier(Instances data) throws Exception {

    if (m_MetaClassifier == null) {
        throw new IllegalArgumentException("No meta classifier has been set");
    }

    // Verify the classifier can handle this dataset.
    getCapabilities().testWithFail(data);

    // Remember the header of the raw data, then work on a copy with all
    // missing-class instances removed.
    m_BaseFormat = new Instances(data, 0);
    Instances training = new Instances(data);
    training.deleteWithMissingClass();

    Random rng = new Random(m_Seed);
    training.randomize(rng);
    boolean nominalClass = training.classAttribute().isNominal();
    if (nominalClass) {
        training.stratify(m_NumFolds);
    }

    // Build the meta level from the base classifiers' predictions.
    generateMetaLevel(training, rng);

    // NOTE: unlike stock Stacking, the base classifiers are NOT rebuilt on
    // the full training data — they were supplied already built.
}

From source file:com.reactivetechnologies.analytics.core.eval.VotingWithBuiltClassifiers.java

License:Open Source License

/**
 * buildClassifier prepares the voting ensemble; the member classifiers are
 * assumed to be built already.
 *
 * @param data the training data to be used for generating the
 * voted classifier.
 * @throws Exception if the classifier could not be built successfully
 */
/**
 * Prepares the ensemble for prediction. The member classifiers are assumed
 * to be built already, so this only validates the data and seeds the
 * random number generator used at prediction time.
 *
 * @param data the training data
 * @throws Exception if the data cannot be handled
 */
@Override
public void buildClassifier(Instances data) throws Exception {

    // Verify the classifier can handle this dataset.
    getCapabilities().testWithFail(data);

    // Copy the data and discard instances without a class value; the copy
    // keeps the caller's dataset intact.
    Instances training = new Instances(data);
    training.deleteWithMissingClass();

    m_Random = new Random(getSeed());

    // NOTE: unlike stock Vote, the member classifiers are NOT rebuilt here —
    // they were supplied already built.
}

From source file:com.tum.classifiertest.FastRandomForest.java

License:Open Source License

/**
 * Builds a classifier for a set of instances.
 *
 * @param data the instances to train the classifier with
 *
 * @throws Exception if something goes wrong
 */
/**
 * Builds the random forest: validates the data, configures the shared tree
 * options on this mother forest, and delegates the actual tree training to
 * a FastRfBagging instance.
 *
 * @param data the instances to train with (copied; missing-class instances
 *             are removed from the copy)
 * @throws Exception if something goes wrong
 */
public void buildClassifier(Instances data) throws Exception {

    // Verify the classifier can handle this dataset.
    getCapabilities().testWithFail(data);

    // Work on a copy stripped of instances lacking a class value.
    data = new Instances(data);
    data.deleteWithMissingClass();

    // Degenerate dataset: only the class attribute -> fall back to ZeroR.
    if (data.numAttributes() == 1) {
        System.err.println(
                "Cannot build model (only class attribute present in data!), " + "using ZeroR model instead!");
        m_ZeroR = new weka.classifiers.rules.ZeroR();
        m_ZeroR.buildClassifier(data);
        return;
    }
    m_ZeroR = null;

    // Keep a header-only copy of the data; FastRfTrees reach it later
    // through their m_MotherForest reference.
    setM_Info(new Instances(data, 0));

    m_bagger = new FastRfBagging();

    // Number of attributes considered per split: clamp to the number of
    // predictors, and default to log2(numAttributes)+1 when unset (< 1).
    int numPredictors = data.numAttributes() - 1;
    m_KValue = m_numFeatures;
    if (m_KValue > numPredictors) {
        m_KValue = numPredictors;
    }
    if (m_KValue < 1) {
        m_KValue = (int) Utils.log2(data.numAttributes()) + 1;
    }

    // Template tree; its mother-forest link lets it look up KValue/MaxDepth.
    FastRandomTree rTree = new FastRandomTree();
    rTree.m_MotherForest = this;

    // Configure the bagger and grow the forest.
    m_bagger.setClassifier(rTree);
    m_bagger.setSeed(m_randomSeed);
    m_bagger.setNumIterations(m_numTrees);
    m_bagger.setCalcOutOfBag(true);
    m_bagger.setComputeImportances(this.getComputeImportances());

    m_bagger.buildClassifier(data, m_NumThreads, this);
}

From source file:com.tum.classifiertest.FastRfBagging.java

License:Open Source License

/**
 * Bagging method. Produces DataCache objects with bootstrap samples of
 * the original data, and feeds them to the base classifier (which can only
 * be a FastRandomTree).
 *
 * @param data         The training set to be used for generating the
 *                     bagged classifier.
 * @param numThreads   The number of simultaneous threads to use for
 *                     computation. Pass zero (0) for autodetection.
 * @param motherForest A reference to the FastRandomForest object that
 *                     invoked this.
 *
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data, int numThreads, FastRandomForest motherForest) throws Exception {

    // can the classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class; copy first so the caller's
    // dataset is not modified
    data = new Instances(data);
    data.deleteWithMissingClass();

    if (!(m_Classifier instanceof FastRandomTree))
        throw new IllegalArgumentException(
                "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier.");

    /* We fill the m_Classifiers array by creating lots of trees with new()
     * because this is much faster than using serialization to deep-copy the
     * one tree in m_Classifier - this is what the super.buildClassifier(data)
     * normally does. */
    m_Classifiers = new Classifier[m_NumIterations];
    for (int i = 0; i < m_Classifiers.length; i++) {
        FastRandomTree curTree = new FastRandomTree();
        // all parameters for training will be looked up in the motherForest (maxDepth, k_Value)
        curTree.m_MotherForest = motherForest;
        // 0.99: reference to these arrays will get passed down all nodes so the array can be re-used 
        // 0.99: this array is of size two as now all splits are binary - even categorical ones
        curTree.tempProps = new double[2];
        curTree.tempDists = new double[2][];
        curTree.tempDists[0] = new double[data.numClasses()];
        curTree.tempDists[1] = new double[data.numClasses()];
        curTree.tempDistsOther = new double[2][];
        curTree.tempDistsOther[0] = new double[data.numClasses()];
        curTree.tempDistsOther[1] = new double[data.numClasses()];
        m_Classifiers[i] = curTree;
    }

    // this was SLOW.. takes approx 1/2 time as training the forest afterwards (!!!)
    // super.buildClassifier(data);

    // OOB error needs full-size bags so every instance can be out-of-bag.
    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    // sorting is performed inside this constructor
    DataCache myData = new DataCache(data);

    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    // inBag[t][i]: was instance i inside tree t's bootstrap sample?
    boolean[][] inBag = new boolean[m_Classifiers.length][];

    // thread management: one fixed pool; numThreads <= 0 means "one per core"
    ExecutorService threadPool = Executors
            .newFixedThreadPool(numThreads > 0 ? numThreads : Runtime.getRuntime().availableProcessors());
    List<Future<?>> futures = new ArrayList<Future<?>>(m_Classifiers.length);

    try {

        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {

            // create the in-bag dataset (and be sure to remember what's in bag)
            // for computing the out-of-bag error later
            DataCache bagData = myData.resample(bagSize, random);
            bagData.reusableRandomGenerator = bagData.getRandomNumberGenerator(random.nextInt());
            inBag[treeIdx] = bagData.inBag; // store later for OOB error calculation

            // submit the tree for training on the pool
            if (m_Classifiers[treeIdx] instanceof FastRandomTree) {

                FastRandomTree aTree = (FastRandomTree) m_Classifiers[treeIdx];
                aTree.data = bagData;

                Future<?> future = threadPool.submit(aTree);
                futures.add(future);

            } else {
                throw new IllegalArgumentException(
                        "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier.");
            }

        }

        // make sure all trees have been trained before proceeding
        // (Future.get() also rethrows any exception raised during training)
        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {
            futures.get(treeIdx).get();

        }

        // calc OOB error?
        if (getCalcOutOfBag() || getComputeImportances()) {
            //m_OutOfBagError = computeOOBError(data, inBag, threadPool);
            m_OutOfBagError = computeOOBError(myData, inBag, threadPool);
        } else {
            m_OutOfBagError = 0;
        }

        // feature importances: scramble one attribute at a time and measure
        // how much the OOB error degrades relative to the baseline
        m_FeatureImportances = null;
        //m_FeatureNames = null;
        if (getComputeImportances()) {
            m_FeatureImportances = new double[data.numAttributes()];
            ///m_FeatureNames = new String[data.numAttributes()];
            //Instances dataCopy = new Instances(data); //To scramble
            //int[] permutation = FastRfUtils.randomPermutation(data.numInstances(), random);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex()) {
                    //double sError = computeOOBError(FastRfUtils.scramble(data, dataCopy, j, permutation), inBag, threadPool);
                    //double sError = computeOOBError(data, inBag, threadPool, j, 0);
                    float[] unscrambled = myData.scrambleOneAttribute(j, random);
                    double sError = computeOOBError(myData, inBag, threadPool);
                    myData.vals[j] = unscrambled; // restore the original state
                    m_FeatureImportances[j] = sError - m_OutOfBagError;
                }
                //m_FeatureNames[j] = data.attribute(j).name();
            }
        }

        threadPool.shutdown();

    } finally {
        // guarantees the pool dies even when an exception escapes; it is a
        // no-op for already-completed tasks after the shutdown() above
        threadPool.shutdownNow();
    }
}

From source file:com.walmart.productgenome.matching.models.EMSRandomForest.java

License:Open Source License

/**
 * Builds a classifier for a set of instances.
 *
 * @param data the instances to train the classifier with
 * @throws Exception if something goes wrong
 */
/**
 * Builds the random forest by configuring a Bagging meta-classifier over
 * RandomTree base learners.
 *
 * @param data the instances to train with (copied; missing-class instances
 *             are removed from the copy)
 * @throws Exception if something goes wrong
 */
public void buildClassifier(Instances data) throws Exception {

    // Verify the classifier can handle this dataset.
    getCapabilities().testWithFail(data);

    // Train on a copy stripped of instances lacking a class value.
    Instances train = new Instances(data);
    train.deleteWithMissingClass();

    m_bagger = new Bagging();
    RandomTree rTree = new RandomTree();

    // Attributes inspected per split; default to log2(numAttributes)+1
    // when unset (< 1).
    m_KValue = m_numFeatures;
    if (m_KValue < 1) {
        m_KValue = (int) Utils.log2(train.numAttributes()) + 1;
    }
    rTree.setKValue(m_KValue);
    rTree.setMaxDepth(getMaxDepth());

    // Configure the bagger and grow the forest.
    m_bagger.setClassifier(rTree);
    m_bagger.setSeed(m_randomSeed);
    m_bagger.setNumIterations(m_numTrees);
    m_bagger.setCalcOutOfBag(true);
    m_bagger.setNumExecutionSlots(m_numExecutionSlots);
    m_bagger.buildClassifier(train);
}

From source file:core.classifier.MyFirstClassifier.java

License:Open Source License

/**
 * Method for building the classifier. Implements a one-against-one
 * wrapper for multi-class problems.
 *
 * @param insts the set of training instances
 * @throws Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances insts) throws Exception {

    if (!m_checksTurnedOff) {
        // can classifier handle the data?
        getCapabilities().testWithFail(insts);

        // remove instances with missing class; copy first so the caller's
        // dataset is not modified
        insts = new Instances(insts);
        insts.deleteWithMissingClass();

        /* Removes all the instances with weight equal to 0.
         MUST be done since condition (8) of Keerthi's paper
         is made with the assertion Ci > 0 (See equation (3a). */
        Instances data = new Instances(insts, insts.numInstances());
        for (int i = 0; i < insts.numInstances(); i++) {
            if (insts.instance(i).weight() > 0)
                data.add(insts.instance(i));
        }
        if (data.numInstances() == 0) {
            throw new Exception("No training instances left after removing " + "instances with weight 0!");
        }
        insts = data;
    }

    // Replace missing attribute values (skipped entirely when checks are off).
    if (!m_checksTurnedOff) {
        m_Missing = new ReplaceMissingValues();
        m_Missing.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Missing);
    } else {
        m_Missing = null;
    }

    // Convert nominal attributes to binary ones unless the data is already
    // purely numeric. With checks off the scan is skipped, so onlyNumeric
    // stays true and no conversion is applied.
    if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
        boolean onlyNumeric = true;
        if (!m_checksTurnedOff) {
            for (int i = 0; i < insts.numAttributes(); i++) {
                if (i != insts.classIndex()) {
                    if (!insts.attribute(i).isNumeric()) {
                        onlyNumeric = false;
                        break;
                    }
                }
            }
        }

        if (!onlyNumeric) {
            m_NominalToBinary = new NominalToBinary();
            m_NominalToBinary.setInputFormat(insts);
            insts = Filter.useFilter(insts, m_NominalToBinary);
        } else {
            m_NominalToBinary = null;
        }
    } else {
        m_NominalToBinary = null;
    }

    // Optional attribute scaling: standardize, normalize, or leave as-is.
    if (m_filterType == FILTER_STANDARDIZE) {
        m_Filter = new Standardize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else if (m_filterType == FILTER_NORMALIZE) {
        m_Filter = new Normalize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else {
        m_Filter = null;
    }

    m_classIndex = insts.classIndex();
    m_classAttribute = insts.classAttribute();
    // A polynomial kernel with exponent 1 behaves as a linear kernel.
    m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0);

    // Generate subsets representing each class
    Instances[] subsets = new Instances[insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i] = new Instances(insts, insts.numInstances());
    }
    for (int j = 0; j < insts.numInstances(); j++) {
        Instance inst = insts.instance(j);
        subsets[(int) inst.classValue()].add(inst);
    }
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i].compactify();
    }

    // Build one binary SMO classifier per unordered pair of classes
    // (one-against-one decomposition of the multi-class problem).
    Random rand = new Random(m_randomSeed);
    m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        for (int j = i + 1; j < insts.numClasses(); j++) {
            m_classifiers[i][j] = new BinarySMO();
            m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
            // Pool the instances of classes i and j only.
            Instances data = new Instances(insts, insts.numInstances());
            for (int k = 0; k < subsets[i].numInstances(); k++) {
                data.add(subsets[i].instance(k));
            }
            for (int k = 0; k < subsets[j].numInstances(); k++) {
                data.add(subsets[j].instance(k));
            }
            data.compactify();
            data.randomize(rand);
            m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed);
        }
    }
}

From source file:decisiontree.MyC45.java

/**
 * Builds the C4.5 decision tree: validates the data, drops instances with
 * a missing class value, imputes missing attribute values, and grows the
 * tree.
 *
 * @param instances the training data (copied; the caller's dataset is not
 *                  modified)
 * @throws Exception if the data cannot be handled or tree induction fails
 */
public void buildClassifier(Instances instances) throws Exception {
    // Verify the classifier can handle this dataset.
    getCapabilities().testWithFail(instances);

    // Copy the data, then discard instances lacking a class value.
    Instances training = new Instances(instances);
    training.deleteWithMissingClass();

    // Impute remaining missing attribute values before induction.
    makeTree(handleMissingValues(training));
}

From source file:decisiontree.MyID3.java

/**
 * Builds the ID3 decision tree from the given training data.
 *
 * @param data the training data (copied; missing-class instances are
 *             removed from the copy)
 * @throws Exception if the data cannot be handled or tree induction fails
 */
@Override
public void buildClassifier(Instances data) throws Exception {
    // Verify the classifier can handle this dataset.
    getCapabilities().testWithFail(data);

    // Work on a copy stripped of instances without a class value.
    Instances training = new Instances(data);
    training.deleteWithMissingClass();

    makeTree(training);
}