Example usage for weka.core Instances Instances

List of usage examples for weka.core Instances Instances

Introduction

This page collects example usages of the weka.core Instances(Instances) copy constructor.

Prototype

public Instances(Instances dataset) 

Document

Constructor copying all instances and references to the header information from the given set of instances.
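
The copy is independent at the instance level but shares the header (attribute) metadata by reference, so it is the standard way to take a defensive copy of a dataset before reordering or filtering it. A minimal sketch, assuming an ARFF file at a placeholder path:

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Random;

import weka.core.Instances;

public class CopyConstructorSketch {
    public static void main(String[] args) throws Exception {
        // Load a dataset ("iris.arff" is a placeholder path).
        Instances original = new Instances(new BufferedReader(new FileReader("iris.arff")));
        original.setClassIndex(original.numAttributes() - 1);

        // Copy all instances; header information is shared by reference.
        Instances copy = new Instances(original);

        // Mutating the copy's ordering leaves the original untouched.
        copy.randomize(new Random(42));
        System.out.println("original: " + original.numInstances() + ", copy: " + copy.numInstances());
    }
}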

Usage

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Performs a cross-validation
 * for a DensityBasedClusterer clusterer on a set of instances.
 *
 * @param clustererString a string naming the class of the clusterer
 * @param data the data on which the cross-validation is to be 
 * performed 
 * @param numFolds the number of folds for the cross-validation
 * @param options the options to the clusterer
 * @param random a random number generator
 * @return a string containing the cross-validated log likelihood
 * @throws Exception if a clusterer could not be generated 
 */
public static String crossValidateModel(String clustererString, Instances data, int numFolds, String[] options,
        Random random) throws Exception {
    Clusterer clusterer = null;
    String[] savedOptions = null;
    double CvAv = 0.0;
    StringBuffer CvString = new StringBuffer();

    if (options != null) {
        savedOptions = new String[options.length];
    }

    data = new Instances(data);

    // create clusterer
    try {
        clusterer = (Clusterer) Class.forName(clustererString).newInstance();
    } catch (Exception e) {
        throw new Exception("Can't find class with name " + clustererString + '.');
    }

    if (!(clusterer instanceof DensityBasedClusterer)) {
        throw new Exception(clustererString + " must be a distribution clusterer.");
    }

    // Save options
    if (options != null) {
        System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    // Parse options
    if (clusterer instanceof OptionHandler) {
        try {
            ((OptionHandler) clusterer).setOptions(savedOptions);
            Utils.checkForRemainingOptions(savedOptions);
        } catch (Exception e) {
            throw new Exception("Can't parse given options in " + "cross-validation!");
        }
    }
    CvAv = crossValidateModel((DensityBasedClusterer) clusterer, data, numFolds, random);

    CvString.append("\n" + numFolds + " fold CV Log Likelihood: " + Utils.doubleToString(CvAv, 6, 4) + "\n");
    return CvString.toString();
}
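
A hypothetical invocation of the helper above, assuming weka.clusterers.EM is on the classpath; the dataset path and the -I (maximum iterations) option are illustrative:

Instances data = new Instances(new BufferedReader(new FileReader("dataset.arff"))); // placeholder path
String report = ClusterEvaluationEX.crossValidateModel("weka.clusterers.EM", data,
        10, new String[] { "-I", "100" }, new Random(1));
System.out.println(report);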

From source file:cotraining.copy.Evaluation_D.java

License:Open Source License

/**
 * Performs a (stratified if class is nominal) cross-validation 
 * for a classifier on a set of instances. Now performs
 * a deep copy of the classifier before each call to 
 * buildClassifier() (just in case the classifier is not
 * initialized properly).
 *
 * @param classifier the classifier with any options set.
 * @param data the data on which the cross-validation is to be 
 * performed 
 * @param numFolds the number of folds for the cross-validation
 * @param random random number generator for randomization 
 * @param forPredictionsPrinting varargs parameter that, if supplied, is
 * expected to hold a StringBuffer to print predictions to, 
 * a Range of attributes to output and a Boolean (true if the distribution
 * is to be printed)
 * @throws Exception if a classifier could not be generated 
 * successfully or the class is not defined
 */
public void crossValidateModel(Classifier classifier, Instances data, int numFolds, Random random,
        Object... forPredictionsPrinting) throws Exception {

    // Make a copy of the data we can reorder
    data = new Instances(data);
    data.randomize(random);
    if (data.classAttribute().isNominal()) {
        data.stratify(numFolds);
    }

    // We assume that the first element is a StringBuffer, the second a Range (attributes
    // to output) and the third a Boolean (whether or not to output a distribution instead
    // of just a classification)
    if (forPredictionsPrinting.length > 0) {
        // print the header first
        StringBuffer buff = (StringBuffer) forPredictionsPrinting[0];
        Range attsToOutput = (Range) forPredictionsPrinting[1];
        boolean printDist = ((Boolean) forPredictionsPrinting[2]).booleanValue();
        printClassificationsHeader(data, attsToOutput, printDist, buff);
    }

    // Do the folds
    for (int i = 0; i < numFolds; i++) {
        Instances train = data.trainCV(numFolds, i, random);
        setPriors(train);
        Classifier copiedClassifier = Classifier.makeCopy(classifier);
        copiedClassifier.buildClassifier(train);
        Instances test = data.testCV(numFolds, i);
        evaluateModel(copiedClassifier, test, forPredictionsPrinting);
    }
    m_NumFolds = numFolds;
}
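
The three trailing varargs must be supplied together, in the documented order. A sketch of a call that captures per-instance predictions, assuming Evaluation_D keeps weka's Evaluation(Instances) constructor (the other names are illustrative):

Evaluation_D eval = new Evaluation_D(data);
StringBuffer predictions = new StringBuffer();
Range attsToOutput = new Range("first");          // echo the first attribute value with each prediction
eval.crossValidateModel(new J48(), data, 10, new Random(1),
        predictions, attsToOutput, Boolean.TRUE); // TRUE: print class distributions, not just labels
System.out.println(predictions);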

From source file:cs.man.ac.uk.classifiers.GetAUC.java

License:Open Source License

/**
 * Loads the data set stored at the path stored in dataPath.
 * @return true if the data set was successfully loaded, else false.
 */
private static boolean getData() {
    // load data
    try {
        data = new Instances(new BufferedReader(new FileReader(dataPath)));
        data.setClassIndex(data.numAttributes() - 1);
        System.out.println("Data set loaded from: " + dataPath);
        return true;
    } catch (FileNotFoundException e) {
        System.out.println("Could not load data set! No data set file at: " + dataPath);
        return false;
    } catch (IOException e) {
        System.out.println("Could not load data set! IOException reading file at: " + dataPath);
        return false;
    }
}

From source file:cs.man.ac.uk.classifiers.GetAUC.java

License:Open Source License

/**
 * Computes the AUC for the supplied stream learner.
 * @return the AUC as a double value.
 */
private static double validate5x2CVStream() {
    try {
        // Other options
        int runs = 5;
        int folds = 2;
        double AUC_SUM = 0;

        // perform cross-validation
        for (int i = 0; i < runs; i++) {
            // randomize data
            int seed = i + 1;
            Random rand = new Random(seed);
            Instances randData = new Instances(data);
            randData.randomize(rand);

            if (randData.classAttribute().isNominal()) {
                System.out.println("Stratifying...");
                randData.stratify(folds);
            }

            for (int n = 0; n < folds; n++) {
                Instances train = randData.trainCV(folds, n);
                Instances test = randData.testCV(folds, n);

                Distribution testDistribution = new Distribution(test);

                ArffSaver trainSaver = new ArffSaver();
                trainSaver.setInstances(train);
                trainSaver.setFile(new File(trainPath));
                trainSaver.writeBatch();

                ArffSaver testSaver = new ArffSaver();
                testSaver.setInstances(test);

                double[][] dist = testDistribution.matrix();
                int negativeClassSize = (int) dist[0][0];
                int positiveClassSize = (int) dist[0][1];
                double balance = (double) positiveClassSize / (double) negativeClassSize;

                String tempTestPath = testPath.replace(".arff",
                        "_" + positiveClassSize + "_" + negativeClassSize + "_" + balance + "_1.0.arff");// [Test-n-Set-n]_[+]_[-]_[K]_[L];
                testSaver.setFile(new File(tempTestPath));
                testSaver.writeBatch();

                ARFFFile file = new ARFFFile(tempTestPath, CLASS_INDEX, new DebugLogger(false));
                file.createMetaData();

                HoeffdingTreeTester streamClassifier = new HoeffdingTreeTester(trainPath, tempTestPath,
                        CLASS_INDEX, new String[] { "0", "1" }, new DebugLogger(true));

                streamClassifier.train();

                // Accumulate the AUC for this fold using the held-out class distribution.
                AUC_SUM += streamClassifier.getROCExternalData("", (int) testDistribution.perClass(1),
                        (int) testDistribution.perClass(0));
                streamClassifier.testStatic(homeDirectory + "/FuckSakeTest.txt");

                String[] files = Common.getFilePaths(scratch);
                for (int j = 0; j < files.length; j++)
                    Common.fileDelete(files[j]);
            }
        }

        return AUC_SUM / ((double) runs * (double) folds);
    } catch (Exception e) {
        System.out.println("Exception validating data!");
        e.printStackTrace();
        return 0;
    }
}

From source file:cs.man.ac.uk.classifiers.GetAUC.java

License:Open Source License

/**
 * Computes the AUC for the supplied learner.
 * @return the AUC as a double value.
 */
@SuppressWarnings("unused")
private static double validate5x2CV() {
    try {
        // other options
        int runs = 5;
        int folds = 2;
        double AUC_SUM = 0;

        // perform cross-validation
        for (int i = 0; i < runs; i++) {
            // randomize data
            int seed = i + 1;
            Random rand = new Random(seed);
            Instances randData = new Instances(data);
            randData.randomize(rand);

            if (randData.classAttribute().isNominal()) {
                System.out.println("Stratifying...");
                randData.stratify(folds);
            }

            Evaluation eval = new Evaluation(randData);

            for (int n = 0; n < folds; n++) {
                Instances train = randData.trainCV(folds, n);
                Instances test = randData.testCV(folds, n);

                // the above code is used by the StratifiedRemoveFolds filter, the
                // code below by the Explorer/Experimenter:
                // Instances train = randData.trainCV(folds, n, rand);

                // build and evaluate classifier
                String[] options = { "-U", "-A" };
                J48 classifier = new J48();
                //HTree classifier = new HTree();

                classifier.setOptions(options);
                classifier.buildClassifier(train);
                eval.evaluateModel(classifier, test);

                // generate curve
                ThresholdCurve tc = new ThresholdCurve();
                int classIndex = 0;
                Instances result = tc.getCurve(eval.predictions(), classIndex);

                // plot curve
                vmc = new ThresholdVisualizePanel();
                AUC_SUM += ThresholdCurve.getROCArea(result);
                System.out.println("AUC: " + ThresholdCurve.getROCArea(result) + " \tAUC SUM: " + AUC_SUM);
            }
        }

        return AUC_SUM / ((double) runs * (double) folds);
    } catch (Exception e) {
        System.out.println("Exception validating data!");
        return 0;
    }
}
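
Note that vmc is assigned a fresh ThresholdVisualizePanel on every fold but is never populated. The usual weka idiom for actually rendering the curve, as a sketch that would sit after the AUC line inside the existing try block (weka.gui.visualize and javax.swing imports assumed), is:

PlotData2D plot = new PlotData2D(result);
plot.setPlotName(result.relationName());
vmc.setROCString("(Area under ROC = "
        + Utils.doubleToString(ThresholdCurve.getROCArea(result), 4) + ")");
vmc.addPlot(plot);

JFrame frame = new JFrame("ROC curve");
frame.getContentPane().add(vmc);
frame.setSize(500, 400);
frame.setVisible(true);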

From source file:cs.man.ac.uk.predict.Predictor.java

License:Open Source License

public static void makePredictionsEnsembleNew(String trainPath, String testPath, String resultPath) {
    System.out.println("Training set: " + trainPath);
    System.out.println("Test set: " + testPath);

    /**
     * The ensemble classifiers. This is a heterogeneous ensemble.
     */
    J48 learner1 = new J48();
    SMO learner2 = new SMO();
    NaiveBayes learner3 = new NaiveBayes();
    MultilayerPerceptron learner5 = new MultilayerPerceptron();

    System.out.println("Training Ensemble.");
    long startTime = System.nanoTime();
    try {
        BufferedReader reader = new BufferedReader(new FileReader(trainPath));
        Instances data = new Instances(reader);
        data.setClassIndex(data.numAttributes() - 1);
        System.out.println("Training data length: " + data.numInstances());

        learner1.buildClassifier(data);
        learner2.buildClassifier(data);
        learner3.buildClassifier(data);
        learner5.buildClassifier(data);

        long endTime = System.nanoTime();
        long nanoseconds = endTime - startTime;
        double seconds = (double) nanoseconds / 1000000000.0;
        System.out.println("Training Ensemble completed in " + nanoseconds + " (ns) or " + seconds + " (s).");
    } catch (IOException e) {
        System.out.println("Could not train Ensemble classifier: IOException on training data file.");
    } catch (Exception e) {
        System.out.println("Could not train Ensemble classifier: Exception building model.");
    }

    try {
        String line = "";

        // Read the file and display it line by line. 
        BufferedReader in = null;

        // Read in and store each positive prediction in the tree map.
        try {
            //open stream to file
            in = new BufferedReader(new FileReader(testPath));

            while ((line = in.readLine()) != null) {
                if (line.toLowerCase().contains("@data"))
                    break;
            }
        } catch (Exception e) {
            // Ignored: if the ARFF header cannot be read here, the incremental loader below fails instead.
        }

        // A different ARFF loader is used here (compared to above) because the
        // ARFF file may be extremely large, in which case the whole file cannot
        // be read into memory and must instead be read incrementally.
        ArffLoader loader = new ArffLoader();
        loader.setFile(new File(testPath));

        Instances data = loader.getStructure();
        data.setClassIndex(data.numAttributes() - 1);

        System.out.println("Ensemble Classifier is ready.");
        System.out.println("Testing on all instances available.");

        startTime = System.nanoTime();

        int instanceNumber = 0;

        // label instances
        Instance current;

        while ((current = loader.getNextInstance(data)) != null) {
            instanceNumber += 1;
            line = in.readLine();

            double classification1 = learner1.classifyInstance(current);
            double classification2 = learner2.classifyInstance(current);
            double classification3 = learner3.classifyInstance(current);
            double classification5 = learner5.classifyInstance(current);

            // All classifiers must agree. This is a very primitive ensemble strategy!
            if (classification1 == 1 && classification2 == 1 && classification3 == 1 && classification5 == 1) {
                if (line != null) {
                    //System.out.println("Instance: "+instanceNumber+"\t"+line);
                    //System.in.read();
                }
                Writer.append(resultPath, instanceNumber + "\n");
            }
        }

        in.close();

        System.out.println("Test set instances: " + instanceNumber);

        long endTime = System.nanoTime();
        long duration = endTime - startTime;
        double seconds = (double) duration / 1000000000.0;

        System.out.println("Testing Ensemble completed in " + duration + " (ns) or " + seconds + " (s).");
    } catch (Exception e) {
        System.out.println("Could not test Ensemble classifier due to an error.");
    }
}
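
The unanimous-vote rule above is deliberately conservative: it minimizes false positives at the cost of recall. A sketch of the obvious relaxation to majority voting over the same four classifications (illustrative, not in the original source):

// Majority vote over the four base classifiers (a tie favours the negative class).
int votes = 0;
if (classification1 == 1) votes++;
if (classification2 == 1) votes++;
if (classification3 == 1) votes++;
if (classification5 == 1) votes++;
if (votes >= 3) {
    Writer.append(resultPath, instanceNumber + "\n");
}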

From source file:cs.man.ac.uk.predict.Predictor.java

License:Open Source License

public static void makePredictionsEnsembleStream(String trainPath, String testPath, String resultPath) {
    System.out.println("Training set: " + trainPath);
    System.out.println("Test set: " + testPath);

    /**
     * The ensemble classifiers. This is a heterogeneous ensemble.
     */
    J48 learner1 = new J48();
    SMO learner2 = new SMO();
    NaiveBayes learner3 = new NaiveBayes();
    MultilayerPerceptron learner5 = new MultilayerPerceptron();

    System.out.println("Training Ensemble.");
    long startTime = System.nanoTime();
    try {
        BufferedReader reader = new BufferedReader(new FileReader(trainPath));
        Instances data = new Instances(reader);
        data.setClassIndex(data.numAttributes() - 1);
        System.out.println("Training data length: " + data.numInstances());

        learner1.buildClassifier(data);
        learner2.buildClassifier(data);
        learner3.buildClassifier(data);
        learner5.buildClassifier(data);

        long endTime = System.nanoTime();
        long nanoseconds = endTime - startTime;
        double seconds = (double) nanoseconds / 1000000000.0;
        System.out.println("Training Ensemble completed in " + nanoseconds + " (ns) or " + seconds + " (s).");
    } catch (IOException e) {
        System.out.println("Could not train Ensemble classifier: IOException on training data file.");
    } catch (Exception e) {
        System.out.println("Could not train Ensemble classifier: Exception building model.");
    }

    try {
        // A different ARFF loader is used here (compared to above) because the
        // ARFF file may be extremely large, in which case the whole file cannot
        // be read into memory and must instead be read incrementally.
        ArffLoader loader = new ArffLoader();
        loader.setFile(new File(testPath));

        Instances data = loader.getStructure();
        data.setClassIndex(data.numAttributes() - 1);

        System.out.println("Ensemble Classifier is ready.");
        System.out.println("Testing on all instances available.");

        startTime = System.nanoTime();

        int instanceNumber = 0;

        // label instances
        Instance current;

        while ((current = loader.getNextInstance(data)) != null) {
            instanceNumber += 1;

            double classification1 = learner1.classifyInstance(current);
            double classification2 = learner2.classifyInstance(current);
            double classification3 = learner3.classifyInstance(current);
            double classification5 = learner5.classifyInstance(current);

            // All classifiers must agree. This is a very primitive ensemble strategy!
            if (classification1 == 1 && classification2 == 1 && classification3 == 1 && classification5 == 1) {
                Writer.append(resultPath, instanceNumber + "\n");
            }
        }

        System.out.println("Test set instances: " + instanceNumber);

        long endTime = System.nanoTime();
        long duration = endTime - startTime;
        double seconds = (double) duration / 1000000000.0;

        System.out.println("Testing Ensemble completed in " + duration + " (ns) or " + seconds + " (s).");
    } catch (Exception e) {
        System.out.println("Could not test Ensemble classifier due to an error.");
    }
}

From source file:cs.man.ac.uk.predict.Predictor.java

License:Open Source License

public static void makePredictionsJ48(String trainPath, String testPath, String resultPath) {
    /**
     * The decision tree classifier.
     */
    J48 learner = new J48();

    System.out.println("Training set: " + trainPath);
    System.out.println("Test set: " + testPath);

    System.out.println("Training J48");
    long startTime = System.nanoTime();
    try {
        BufferedReader reader = new BufferedReader(new FileReader(trainPath));
        Instances data = new Instances(reader);
        data.setClassIndex(data.numAttributes() - 1);
        System.out.println("Training data length: " + data.numInstances());
        learner.buildClassifier(data);

        long endTime = System.nanoTime();
        long nanoseconds = endTime - startTime;
        double seconds = (double) nanoseconds / 1000000000.0;
        System.out.println("Training J48 completed in " + nanoseconds + " (ns) or " + seconds + " (s)");
    } catch (IOException e) {
        System.out.println("Could not train J48 classifier: IOException on training data file");
    } catch (Exception e) {
        System.out.println("Could not train J48 classifier: Exception building model");
    }

    try {
        // Prepare data for testing
        //BufferedReader reader = new BufferedReader( new FileReader(testPath));
        //Instances data = new Instances(reader);
        //data.setClassIndex(data.numAttributes() - 1);

        ArffLoader loader = new ArffLoader();
        loader.setFile(new File(testPath));
        Instances data = loader.getStructure();
        data.setClassIndex(data.numAttributes() - 1);

        System.out.println("J48 Classifier is ready.");
        System.out.println("Testing on all instances available.");
        System.out.println("Test set instances: " + data.numInstances());

        startTime = System.nanoTime();

        int instanceNumber = 0;

        // label instances
        Instance current;

        //for (int i = 0; i < data.numInstances(); i++) 
        while ((current = loader.getNextInstance(data)) != null) {
            instanceNumber += 1;

            //double classification = learner.classifyInstance(data.instance(i));
            double classification = learner.classifyInstance(current);
            //String instanceClass= Double.toString(data.instance(i).classValue());

            if (classification == 1)// Predicted positive, actually negative
            {
                Writer.append(resultPath, instanceNumber + "\n");
            }
        }

        long endTime = System.nanoTime();
        long duration = endTime - startTime;
        double seconds = (double) duration / 1000000000.0;

        System.out.println("Testing J48 completed in " + duration + " (ns) or " + seconds + " (s)");
    } catch (Exception e) {
        System.out.println("Could not test J48 classifier due to an error");
    }
}

From source file:cz.vse.fis.keg.entityclassifier.core.salience.EntitySaliencer.java

License:Open Source License

private void trainModel() {

    BufferedReader reader = null;

    try {

        URL fileURL = THDController.getInstance().getClass().getResource(Settings.SALIENCE_DATASET);
        File arrfFile = new File(fileURL.getFile());

        reader = new BufferedReader(new FileReader(arrfFile));
        Instances data = new Instances(reader);
        data.setClassIndex(data.numAttributes() - 1);

        //            classifier = new NaiveBayes();
        classifier = new RandomForest();

        // Train the classifer.
        classifier.buildClassifier(data);

    } catch (FileNotFoundException ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } catch (IOException ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            System.out.println("Model was successfully trained.");
        } catch (IOException ex) {
            Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}

From source file:data.generation.target.utils.PrincipalComponents.java

License:Open Source License

private void buildAttributeConstructor(Instances data) throws Exception {
    m_eigenvalues = null;
    m_outputNumAtts = -1;
    m_attributeFilter = null;
    m_nominalToBinFilter = null;
    m_sumOfEigenValues = 0.0;
    m_trainInstances = new Instances(data);

    // make a copy of the training data so that we can get the class
    // column to append to the transformed data (if necessary)
    m_trainHeader = new Instances(m_trainInstances, 0);

    m_replaceMissingFilter = new ReplaceMissingValues();
    m_replaceMissingFilter.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances, m_replaceMissingFilter);

    /*if (m_normalize) {
      m_normalizeFilter = new Normalize();
      m_normalizeFilter.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_normalizeFilter);
    } */

    m_nominalToBinFilter = new NominalToBinary();
    m_nominalToBinFilter.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances, m_nominalToBinFilter);

    // delete any attributes with only one distinct value or are all missing
    Vector deleteCols = new Vector();
    for (int i = 0; i < m_trainInstances.numAttributes(); i++) {
        if (m_trainInstances.numDistinctValues(i) <= 1) {
            deleteCols.addElement(new Integer(i));
        }
    }

    if (m_trainInstances.classIndex() >= 0) {
        // get rid of the class column
        m_hasClass = true;
        m_classIndex = m_trainInstances.classIndex();
        deleteCols.addElement(new Integer(m_classIndex));
    }

    // remove columns from the data if necessary
    if (deleteCols.size() > 0) {
        m_attributeFilter = new Remove();
        int[] todelete = new int[deleteCols.size()];
        for (int i = 0; i < deleteCols.size(); i++) {
            todelete[i] = ((Integer) (deleteCols.elementAt(i))).intValue();
        }
        m_attributeFilter.setAttributeIndicesArray(todelete);
        m_attributeFilter.setInvertSelection(false);
        m_attributeFilter.setInputFormat(m_trainInstances);
        m_trainInstances = Filter.useFilter(m_trainInstances, m_attributeFilter);
    }

    // can evaluator handle the processed data ? e.g., enough attributes?
    getCapabilities().testWithFail(m_trainInstances);

    m_numInstances = m_trainInstances.numInstances();
    m_numAttribs = m_trainInstances.numAttributes();

    //fillCorrelation();
    fillCovariance();

    double[] d = new double[m_numAttribs];
    double[][] v = new double[m_numAttribs][m_numAttribs];

    Matrix corr = new Matrix(m_correlation);
    corr.eigenvalueDecomposition(v, d);
    m_eigenvectors = (double[][]) v.clone();
    m_eigenvalues = (double[]) d.clone();

    /*for (int i = 0; i < m_numAttribs; i++) {
      for (int j = 0; j < m_numAttribs; j++) {
        System.err.println(v[i][j] + " ");
      }
      System.err.println(d[i]);
    } */

    // any eigenvalues less than 0 are not worth anything --- change to 0
    for (int i = 0; i < m_eigenvalues.length; i++) {
        if (m_eigenvalues[i] < 0) {
            m_eigenvalues[i] = 0.0;
        }
    }
    m_sortedEigens = Utils.sort(m_eigenvalues);
    m_sumOfEigenValues = Utils.sum(m_eigenvalues);

    m_transformedFormat = setOutputFormat();
    if (m_transBackToOriginal) {
        m_originalSpaceFormat = setOutputFormatOriginal();

        // new ordered eigenvector matrix
        int numVectors = (m_transformedFormat.classIndex() < 0) ? m_transformedFormat.numAttributes()
                : m_transformedFormat.numAttributes() - 1;

        double[][] orderedVectors = new double[m_eigenvectors.length][numVectors + 1];

        // try converting back to the original space
        for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
            for (int j = 0; j < m_numAttribs; j++) {
                orderedVectors[j][m_numAttribs - i] = m_eigenvectors[j][m_sortedEigens[i]];
            }
        }

        // transpose the matrix
        int nr = orderedVectors.length;
        int nc = orderedVectors[0].length;
        m_eTranspose = new double[nc][nr];
        for (int i = 0; i < nc; i++) {
            for (int j = 0; j < nr; j++) {
                m_eTranspose[i][j] = orderedVectors[j][i];
            }
        }
    }
}
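
Since the method keeps both the eigenvalue array and the ascending index ordering from Utils.sort(), the proportion of variance retained by the top k components follows directly; a hypothetical helper, not part of the original class:

// Hypothetical helper: fraction of total variance captured by the k largest
// eigenvalues. Utils.sort() returns indices in ascending order, so the largest
// eigenvalues sit at the end of the index array.
private static double varianceCovered(double[] eigenvalues, int[] sortedIndices, int k) {
    double covered = 0.0;
    for (int i = sortedIndices.length - 1; i >= sortedIndices.length - k; i--) {
        covered += eigenvalues[sortedIndices[i]];
    }
    return covered / weka.core.Utils.sum(eigenvalues);
}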