Example usage for weka.core Instances numInstances

Introduction

In this page you can find the example usage for weka.core Instances numInstances.

Prototype


publicint numInstances()

Source Link

Document

Returns the number of instances in the dataset.

Usage

From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java

License:Open Source License

/**
 * builds the classifier//  w w  w  .j a v a  2 s.  co m
 * 
 * @param data the training instances
 * @throws Exception if something goes wrong
 */
@Override
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // save original header (needed for clusters to classes output)
    m_OriginalHeader = data.stringFreeStructure();

    // remove class attribute for clusterer
    Instances clusterData = new Instances(data);
    clusterData.setClassIndex(-1);
    clusterData.deleteAttributeAt(data.classIndex());
    m_ClusteringHeader = clusterData.stringFreeStructure();

    if (m_ClusteringHeader.numAttributes() == 0) {
        System.err.println("Data contains only class attribute, defaulting to ZeroR model.");
        m_ZeroR = new ZeroR();
        m_ZeroR.buildClassifier(data);
    } else {
        m_ZeroR = null;

        // build clusterer
        m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
        m_ActualClusterer.buildClusterer(clusterData);

        if (!getLabelAllClusters()) {

            // determine classes-to-clusters mapping
            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(m_ActualClusterer);
            eval.evaluateClusterer(clusterData);
            double[] clusterAssignments = eval.getClusterAssignments();
            int[][] counts = new int[eval.getNumClusters()][m_OriginalHeader.numClasses()];
            int[] clusterTotals = new int[eval.getNumClusters()];
            double[] best = new double[eval.getNumClusters() + 1];
            double[] current = new double[eval.getNumClusters() + 1];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance instance = data.instance(i);
                if (!instance.classIsMissing()) {
                    counts[(int) clusterAssignments[i]][(int) instance.classValue()]++;
                    clusterTotals[(int) clusterAssignments[i]]++;
                }
            }
            best[eval.getNumClusters()] = Double.MAX_VALUE;
            ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0);
            m_ClustersToClasses = new double[best.length];
            System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length);
        } else {
            m_ClusterClassProbs = new double[m_ActualClusterer.numberOfClusters()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance clusterInstance = clusterData.instance(i);
                Instance originalInstance = data.instance(i);
                if (!originalInstance.classIsMissing()) {
                    double[] probs = m_ActualClusterer.distributionForInstance(clusterInstance);
                    for (int j = 0; j < probs.length; j++) {
                        m_ClusterClassProbs[j][(int) originalInstance.classValue()] += probs[j];
                    }
                }
            }
            for (int i = 0; i < m_ClusterClassProbs.length; i++) {
                Utils.normalize(m_ClusterClassProbs[i]);
            }
        }
    }
}

From source file:com.tum.classifiertest.DataCache.java

License:Open Source License

/**
 * Creates a DataCache by copying data from a weka.core.Instances object.
 *//* w ww  .ja v  a2 s.  c o m*/
public DataCache(Instances origData) throws Exception {

    classIndex = origData.classIndex();
    numAttributes = origData.numAttributes();
    numClasses = origData.numClasses();
    numInstances = origData.numInstances();

    attNumVals = new int[origData.numAttributes()];
    for (int i = 0; i < attNumVals.length; i++) {
        if (origData.attribute(i).isNumeric()) {
            attNumVals[i] = 0;
        } else if (origData.attribute(i).isNominal()) {
            attNumVals[i] = origData.attribute(i).numValues();
        } else
            throw new Exception("Only numeric and nominal attributes are supported.");
    }

    /* Array is indexed by attribute first, to speed access in RF splitting. */
    vals = new float[numAttributes][numInstances];
    for (int a = 0; a < numAttributes; a++) {
        for (int i = 0; i < numInstances; i++) {
            if (origData.instance(i).isMissing(a))
                vals[a][i] = Float.MAX_VALUE; // to make sure missing values go to the end
            else
                vals[a][i] = (float) origData.instance(i).value(a); // deep copy
        }
    }

    instWeights = new double[numInstances];
    instClassValues = new int[numInstances];
    for (int i = 0; i < numInstances; i++) {
        instWeights[i] = origData.instance(i).weight();
        instClassValues[i] = (int) origData.instance(i).classValue();
    }

    /* compute the sortedInstances for the whole dataset */

    sortedIndices = new int[numAttributes][];

    for (int a = 0; a < numAttributes; a++) { // ================= attr by attr

        if (a == classIndex)
            continue;

        if (attNumVals[a] > 0) { // ------------------------------------- nominal

            // Handling nominal attributes: as of FastRF 0.99, they're sorted as well
            // missing values are coded as Float.MAX_VALUE and go to the end

            sortedIndices[a] = new int[numInstances];
            //int count = 0;

            sortedIndices[a] = FastRfUtils.sort(vals[a]);

            /*for (int i = 0; i < numInstances; i++) {
              if ( !this.isValueMissing(a, i) ) {
                sortedIndices[a][count] = i;
                count++;
              }
            }
                    
            for (int i = 0; i < numInstances; i++) {
              if ( this.isValueMissing(a, i) ) {
                sortedIndices[a][count] = i;
                count++;
              }
            }*/

        } else { // ----------------------------------------------------- numeric

            // Sorted indices are computed for numeric attributes
            // missing values are coded as Float.MAX_VALUE and go to the end
            sortedIndices[a] = FastRfUtils.sort(vals[a]);

        } // ---------------------------------------------------------- attr kind

    } // ========================================================= attr by attr

    // System.out.println(" Done.");

}

From source file:com.tum.classifiertest.FastRfBagging.java

License:Open Source License

/**
 * Bagging method. Produces DataCache objects with bootstrap samples of
 * the original data, and feeds them to the base classifier (which can only
 * be a FastRandomTree)./*  w  w w . java  2  s .co  m*/
 *
 * @param data         The training set to be used for generating the
 *                     bagged classifier.
 * @param numThreads   The number of simultaneous threads to use for
 *                     computation. Pass zero (0) for autodetection.
 * @param motherForest A reference to the FastRandomForest object that
 *                     invoked this.
 *
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data, int numThreads, FastRandomForest motherForest) throws Exception {

    // can classifier handle the vals?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    if (!(m_Classifier instanceof FastRandomTree))
        throw new IllegalArgumentException(
                "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier.");

    /* We fill the m_Classifiers array by creating lots of trees with new()
     * because this is much faster than using serialization to deep-copy the
     * one tree in m_Classifier - this is what the super.buildClassifier(data)
     * normally does. */
    m_Classifiers = new Classifier[m_NumIterations];
    for (int i = 0; i < m_Classifiers.length; i++) {
        FastRandomTree curTree = new FastRandomTree();
        // all parameters for training will be looked up in the motherForest (maxDepth, k_Value)
        curTree.m_MotherForest = motherForest;
        // 0.99: reference to these arrays will get passed down all nodes so the array can be re-used 
        // 0.99: this array is of size two as now all splits are binary - even categorical ones
        curTree.tempProps = new double[2];
        curTree.tempDists = new double[2][];
        curTree.tempDists[0] = new double[data.numClasses()];
        curTree.tempDists[1] = new double[data.numClasses()];
        curTree.tempDistsOther = new double[2][];
        curTree.tempDistsOther[0] = new double[data.numClasses()];
        curTree.tempDistsOther[1] = new double[data.numClasses()];
        m_Classifiers[i] = curTree;
    }

    // this was SLOW.. takes approx 1/2 time as training the forest afterwards (!!!)
    // super.buildClassifier(data);

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    // sorting is performed inside this constructor
    DataCache myData = new DataCache(data);

    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    boolean[][] inBag = new boolean[m_Classifiers.length][];

    // thread management
    ExecutorService threadPool = Executors
            .newFixedThreadPool(numThreads > 0 ? numThreads : Runtime.getRuntime().availableProcessors());
    List<Future<?>> futures = new ArrayList<Future<?>>(m_Classifiers.length);

    try {

        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {

            // create the in-bag dataset (and be sure to remember what's in bag)
            // for computing the out-of-bag error later
            DataCache bagData = myData.resample(bagSize, random);
            bagData.reusableRandomGenerator = bagData.getRandomNumberGenerator(random.nextInt());
            inBag[treeIdx] = bagData.inBag; // store later for OOB error calculation

            // build the classifier
            if (m_Classifiers[treeIdx] instanceof FastRandomTree) {

                FastRandomTree aTree = (FastRandomTree) m_Classifiers[treeIdx];
                aTree.data = bagData;

                Future<?> future = threadPool.submit(aTree);
                futures.add(future);

            } else {
                throw new IllegalArgumentException(
                        "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier.");
            }

        }

        // make sure all trees have been trained before proceeding
        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {
            futures.get(treeIdx).get();

        }

        // calc OOB error?
        if (getCalcOutOfBag() || getComputeImportances()) {
            //m_OutOfBagError = computeOOBError(data, inBag, threadPool);
            m_OutOfBagError = computeOOBError(myData, inBag, threadPool);
        } else {
            m_OutOfBagError = 0;
        }

        //calc feature importances
        m_FeatureImportances = null;
        //m_FeatureNames = null;
        if (getComputeImportances()) {
            m_FeatureImportances = new double[data.numAttributes()];
            ///m_FeatureNames = new String[data.numAttributes()];
            //Instances dataCopy = new Instances(data); //To scramble
            //int[] permutation = FastRfUtils.randomPermutation(data.numInstances(), random);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex()) {
                    //double sError = computeOOBError(FastRfUtils.scramble(data, dataCopy, j, permutation), inBag, threadPool);
                    //double sError = computeOOBError(data, inBag, threadPool, j, 0);
                    float[] unscrambled = myData.scrambleOneAttribute(j, random);
                    double sError = computeOOBError(myData, inBag, threadPool);
                    myData.vals[j] = unscrambled; // restore the original state
                    m_FeatureImportances[j] = sError - m_OutOfBagError;
                }
                //m_FeatureNames[j] = data.attribute(j).name();
            }
        }

        threadPool.shutdown();

    } finally {
        threadPool.shutdownNow();
    }
}

From source file:com.tum.classifiertest.FastRfBagging.java

License:Open Source License

/**
 * Compute the out-of-bag error for a set of instances.
 *
 * @param data       the instances/*from  w  ww  . ja v  a2s .  co  m*/
 * @param inBag      numTrees x numInstances indicating out-of-bag instances
 * @param threadPool the pool of threads
 *
 * @return the oob error
 */
private double computeOOBError(Instances data, boolean[][] inBag, ExecutorService threadPool)
        throws InterruptedException, ExecutionException {

    boolean numeric = data.classAttribute().isNumeric();

    List<Future<Double>> votes = new ArrayList<Future<Double>>(data.numInstances());
    for (int i = 0; i < data.numInstances(); i++) {
        VotesCollector aCollector = new VotesCollector(m_Classifiers, i, data, inBag);
        votes.add(threadPool.submit(aCollector));
    }

    double outOfBagCount = 0.0;
    double errorSum = 0.0;

    for (int i = 0; i < data.numInstances(); i++) {

        double vote = votes.get(i).get();

        // error for instance
        outOfBagCount += data.instance(i).weight();
        if (numeric) {
            errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
        } else {
            if (vote != data.instance(i).classValue())
                errorSum += data.instance(i).weight();
        }

    }

    return errorSum / outOfBagCount;
}

From source file:com.tum.classifiertest.FastRfUtils.java

License:Open Source License

/**
 * Produces a random permutation of the values of an attribute in a dataset using Knuth shuffle.
 * <p/>// w ww.  ja va2  s  .  co m
 * Copies back the current values of the previously scrambled attribute and uses the given permutation
 * to scramble the values of the new attribute all by copying from the original dataset.
 *
 * @param src      the source dataset
 * @param dst      the scrambled dataset
 * @param attIndex the attribute index
 * @param perm     the random permutation
 *
 * @return fluent
 */
public static Instances scramble(Instances src, Instances dst, final int attIndex, int[] perm) {

    for (int i = 0; i < src.numInstances(); i++) {

        Instance scrambled = dst.instance(i);

        if (attIndex > 0)
            scrambled.setValue(attIndex - 1, src.instance(i).value(attIndex - 1));
        scrambled.setValue(attIndex, src.instance(perm[i]).value(attIndex));
    }

    return dst;
}

From source file:com.yahoo.labs.samoa.instances.WekaToSamoaInstanceConverter.java

License:Apache License

/**
* Samoa instances from weka instances.//from  w  w  w  .j ava2s .  c o  m
*
* @param instances the instances
* @return the instances
*/
public Instances samoaInstances(weka.core.Instances instances) {
    Instances samoaInstances = samoaInstancesInformation(instances);
    //We assume that we have only one samoaInstanceInformation for WekaToSamoaInstanceConverter
    this.samoaInstanceInformation = samoaInstances;
    for (int i = 0; i < instances.numInstances(); i++) {
        samoaInstances.add(samoaInstance(instances.instance(i)));
    }
    return samoaInstances;
}

From source file:com.zooclassifier.Model.FileLoader.java

public FileLoader(String filename) throws FileNotFoundException, IOException {
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    ArffLoader.ArffReader arff = new ArffLoader.ArffReader(reader);
    Instances data = arff.getData();
    data.setClassIndex(data.numAttributes() - 1);

    attributes = new String[data.numInstances()][data.numAttributes() - 1];
    labels = new String[data.numInstances()];

    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < instance.numAttributes() - 1; j++) {
            attributes[i][j] = instance.stringValue(j);
        }/*from  w  w w. j  a  v a 2s.  c om*/
        labels[i] = instance.stringValue(instance.numAttributes() - 1);
    }

    attributesLegalValues = new String[data.numAttributes() - 1][];
    for (int i = 0; i < data.numAttributes() - 1; i++) {
        attributesLegalValues[i] = (String[]) Collections.list(data.attribute(i).enumerateValues())
                .toArray(new String[data.attribute(i).numValues()]);
    }

    labelsLegalValues = (String[]) Collections.list(data.attribute(data.numAttributes() - 1).enumerateValues())
            .toArray(new String[data.attribute(data.numAttributes() - 1).numValues()]);
}

From source file:core.classifier.MyFirstClassifier.java

License:Open Source License

/**
 * Method for building the classifier. Implements a one-against-one
 * wrapper for multi-class problems.//from  w  w w. ja  va2s .c om
 *
 * @param insts the set of training instances
 * @throws Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances insts) throws Exception {

    if (!m_checksTurnedOff) {
        // can classifier handle the data?
        getCapabilities().testWithFail(insts);

        // remove instances with missing class
        insts = new Instances(insts);
        insts.deleteWithMissingClass();

        /* Removes all the instances with weight equal to 0.
         MUST be done since condition (8) of Keerthi's paper
         is made with the assertion Ci > 0 (See equation (3a). */
        Instances data = new Instances(insts, insts.numInstances());
        for (int i = 0; i < insts.numInstances(); i++) {
            if (insts.instance(i).weight() > 0)
                data.add(insts.instance(i));
        }
        if (data.numInstances() == 0) {
            throw new Exception("No training instances left after removing " + "instances with weight 0!");
        }
        insts = data;
    }

    if (!m_checksTurnedOff) {
        m_Missing = new ReplaceMissingValues();
        m_Missing.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Missing);
    } else {
        m_Missing = null;
    }

    if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
        boolean onlyNumeric = true;
        if (!m_checksTurnedOff) {
            for (int i = 0; i < insts.numAttributes(); i++) {
                if (i != insts.classIndex()) {
                    if (!insts.attribute(i).isNumeric()) {
                        onlyNumeric = false;
                        break;
                    }
                }
            }
        }

        if (!onlyNumeric) {
            m_NominalToBinary = new NominalToBinary();
            m_NominalToBinary.setInputFormat(insts);
            insts = Filter.useFilter(insts, m_NominalToBinary);
        } else {
            m_NominalToBinary = null;
        }
    } else {
        m_NominalToBinary = null;
    }

    if (m_filterType == FILTER_STANDARDIZE) {
        m_Filter = new Standardize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else if (m_filterType == FILTER_NORMALIZE) {
        m_Filter = new Normalize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else {
        m_Filter = null;
    }

    m_classIndex = insts.classIndex();
    m_classAttribute = insts.classAttribute();
    m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0);

    // Generate subsets representing each class
    Instances[] subsets = new Instances[insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i] = new Instances(insts, insts.numInstances());
    }
    for (int j = 0; j < insts.numInstances(); j++) {
        Instance inst = insts.instance(j);
        subsets[(int) inst.classValue()].add(inst);
    }
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i].compactify();
    }

    // Build the binary classifiers
    Random rand = new Random(m_randomSeed);
    m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        for (int j = i + 1; j < insts.numClasses(); j++) {
            m_classifiers[i][j] = new BinarySMO();
            m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
            Instances data = new Instances(insts, insts.numInstances());
            for (int k = 0; k < subsets[i].numInstances(); k++) {
                data.add(subsets[i].instance(k));
            }
            for (int k = 0; k < subsets[j].numInstances(); k++) {
                data.add(subsets[j].instance(k));
            }
            data.compactify();
            data.randomize(rand);
            m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed);
        }
    }
}

From source file:core.ClusterEvaluationEX.java

License:Open Source License

public Instances DeleteNoise(Instances data) {
    noise = data.stringFreeStructure();//  w ww  . j a v  a 2s.  c  om
    for (int i = 0; i < data.numInstances(); i++) {
        if (data.instance(i).value(1) == -1) {
            noise.add(data.instance(i));
            data.delete(i);
            i--;
        }
    }
    return data;
}

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Perform a cross-validation for DensityBasedClusterer on a set of instances.
 *
 * @param clusterer the clusterer to use
 * @param data the training data//from www .j  ava  2s . co m
 * @param numFolds number of folds of cross validation to perform
 * @param random random number seed for cross-validation
 * @return the cross-validated log-likelihood
 * @throws Exception if an error occurs
 */
public static double crossValidateModel(DensityBasedClusterer clusterer, Instances data, int numFolds,
        Random random) throws Exception {
    Instances train, test;
    double foldAv = 0;
    ;
    data = new Instances(data);
    data.randomize(random);
    //    double sumOW = 0;
    for (int i = 0; i < numFolds; i++) {
        // Build and test clusterer
        train = data.trainCV(numFolds, i, random);

        clusterer.buildClusterer(train);

        test = data.testCV(numFolds, i);

        for (int j = 0; j < test.numInstances(); j++) {
            try {
                foldAv += ((DensityBasedClusterer) clusterer).logDensityForInstance(test.instance(j));
                //     sumOW += test.instance(j).weight();
                //   double temp = Utils.sum(tempDist);
            } catch (Exception ex) {
                // unclustered instances
            }
        }
    }

    //    return foldAv / sumOW;
    return foldAv / data.numInstances();
}