Example usage for weka.core Instances delete

List of usage examples for weka.core Instances delete

Introduction

On this page you can find example usages of weka.core Instances.delete.

Prototype


public void delete(int index) 

Source Link

Document

Removes an instance at the given position from the set.

Usage

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Moves all noise instances from the given data set into the {@code noise} member
 * and returns the cleaned data set. An instance counts as noise when its attribute
 * at index 1 (the cluster assignment) has the value -1.
 *
 * @param data the instances to clean; modified in place
 * @return the same {@code data} reference with the noise instances removed
 */
public Instances DeleteNoise(Instances data) {
    noise = data.stringFreeStructure();
    int index = 0;
    while (index < data.numInstances()) {
        if (data.instance(index).value(1) == -1) {
            // move the instance into the noise set; do not advance, because
            // deleting shifts the following instances down by one position
            noise.add(data.instance(index));
            data.delete(index);
        } else {
            index++;
        }
    }
    return data;
}

From source file:de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java

License:Apache License

/**
 * <p>/*w ww  .ja va2s.  c  om*/
 * Applies the CLAMI processor to the data. The test data is also required, in order to
 * guarantee a consistent metric set.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {

    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }
    // now determine cluster number for each instance
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of cluster number
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now we filter the metrics
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy"
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                } else {
                    // "not buggy"
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator();

    int violationCutoff = violationCountInterator.next();
    // now we filter the data;
    // this is first tried with the metrics with fewest violations. if no buggy/bugfree
    // instances remain, this is repeated with the next metrics with second fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    boolean[] cleanInstances = new boolean[data.numInstances()];
    int numCleanBuggyInstances = 0;
    int numCleanBugfreeInstances = 0;
    do {
        violationCutoff = violationCountInterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy"
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    } else {
                        // "not buggy"
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                } else {
                    numCleanBugfreeInstances++;
                }
            } else {
                cleanInstances[i] = false;
            }
        }
    } while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances
    // drop the metrics (also from the testdata)
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }
    // drop the unclean instances
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        } else {
            // set the classification
            if (clusterNumber[i] > medianClusterNumber) {
                data.get(i).setClassValue(1.0d);
            } else {
                data.get(i).setClassValue(0.0d);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java

License:Apache License

@Override
public void apply(Instances testdata, Instances traindata) {
    int confidenceAttrIndex = -1;

    // Locate the attribute with the configured name; if it occurs several times,
    // the last occurrence wins.
    for (int attr = 0; attr < traindata.numAttributes(); attr++) {
        if (traindata.attribute(attr).name().equals(nominalAttributeName)) {
            confidenceAttrIndex = attr;
        }
    }

    // Nothing to filter when the attribute is not present.
    if (confidenceAttrIndex == -1) {
        return;
    }

    // Collect the internal indices of the nominal values that shall be filtered out.
    Attribute confidenceAttribute = traindata.attribute(confidenceAttrIndex);
    ArrayList<Object> nominalValues = Collections.list(confidenceAttribute.enumerateValues());
    ArrayList<Double> filteredValueIndices = new ArrayList<Double>();

    for (int valueIdx = 0; valueIdx < nominalValues.size(); valueIdx++) {
        for (String filterValue : nominalAttributeValues) {
            if (((String) nominalValues.get(valueIdx)).equals(filterValue)) {
                filteredValueIndices.add((double) valueIdx);
            }
        }
    }

    // Remove every training instance whose nominal value matches one of the filtered
    // indices; iterate backwards so deletion does not shift the pending indices.
    for (int idx = traindata.numInstances() - 1; idx >= 0; idx--) {
        Instance candidate = traindata.get(idx);
        if (filteredValueIndices.contains(candidate.value(confidenceAttrIndex))) {
            traindata.delete(idx);
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java

License:Apache License

/**
 * <p>/*  w  w w .java2  s . co  m*/
 * Applies the synonym outlier removal.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed.
 */
public void applySynonymRemoval(Instances traindata) {
    double minDistance[][] = new double[traindata.size()][traindata.numAttributes() - 1];
    double minDistanceAttribute[] = new double[traindata.numAttributes() - 1];
    double distance;
    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }
    for (int i1 = traindata.size() - 1; i1 < traindata.size(); i1++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        for (int j = 0; !hasClosest && j < traindata.numAttributes(); j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}

From source file:de.ugoe.cs.cpdp.execution.WithinProjectOrderedSplitExperiment.java

License:Apache License

/**
 * Executes the experiment with the steps as described in the class comment.
 *
 * @see Runnable#run()
 */
@Override
public void run() {
    final List<SoftwareVersion> versions = new LinkedList<>();

    // load all software versions from the configured data loaders
    for (IVersionLoader loader : config.getLoaders()) {
        versions.addAll(loader.load());
    }

    // drop versions rejected by the configured version filters
    for (IVersionFilter filter : config.getVersionFilters()) {
        filter.apply(versions);
    }
    boolean writeHeader = true; // the CSV header is only written for the first evaluated version
    int versionCount = 1;
    int testVersionCount = 0;
    int numTrainers = 0;

    // count the versions that will serve as test versions (used for progress logging only)
    for (SoftwareVersion testVersion : versions) {
        if (isVersion(testVersion, config.getTestVersionFilters())) {
            testVersionCount++;
        }
    }

    // total number of configured trainers; used below to detect already-complete results
    numTrainers += config.getSetWiseTrainers().size();
    numTrainers += config.getSetWiseTestdataAwareTrainers().size();
    numTrainers += config.getTrainers().size();
    numTrainers += config.getTestAwareTrainers().size();

    // sort versions
    Collections.sort(versions);

    for (SoftwareVersion testVersion : versions) {
        if (isVersion(testVersion, config.getTestVersionFilters())) {
            Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting",
                    config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion()));
            // skip this version if results for all trainers and repetitions already exist
            int numResultsAvailable = resultsAvailable(testVersion);
            if (numResultsAvailable >= numTrainers * config.getRepetitions()) {
                Console.traceln(Level.INFO,
                        String.format("[%s] [%02d/%02d] %s: results already available; skipped",
                                config.getExperimentName(), versionCount, testVersionCount,
                                testVersion.getVersion()));
                versionCount++;
                continue;
            }

            // Setup testdata and training data
            Instances testdata = testVersion.getInstances();
            List<Double> efforts = testVersion.getEfforts();

            // now split data into parts; the split ratio may be overridden through the
            // execution strategy parameter (default: 0.5)
            double percentage = 0.5; // 0.5 as default value
            String param = config.getExecutionStrategyParameters();
            if (config.getExecutionStrategyParameters() != null) {
                try {
                    percentage = Double.parseDouble(param);
                } catch (NumberFormatException e) {
                    throw new RuntimeException(
                            "invalid execution strategy parameter, must be numeric: " + param);
                }
            }
            // ordered split: the first fraction (by index) is removed from the test data and
            // kept as training data, the rest stays test data; iterate backwards so the
            // deletions do not shift the indices still to be processed
            int initialTestSize = testdata.size();
            Instances traindata = new Instances(testdata);
            for (int i = initialTestSize - 1; i >= 0; i--) {
                if ((((double) i) / initialTestSize) < percentage) {
                    testdata.delete(i);
                    if (efforts != null) {
                        // List.remove(int) removes by index here, keeping the efforts list
                        // parallel to the remaining test instances
                        efforts.remove(i);
                    }
                } else {
                    traindata.delete(i);
                }
            }

            // apply preprocessors, pointwise selection, postprocessors, trainers, and
            // evaluators in the configured order
            for (IProcessesingStrategy processor : config.getPreProcessors()) {
                Console.traceln(Level.FINE,
                        String.format("[%s] [%02d/%02d] %s: applying preprocessor %s",
                                config.getExperimentName(), versionCount, testVersionCount,
                                testVersion.getVersion(), processor.getClass().getName()));
                processor.apply(testdata, traindata);
            }
            for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) {
                Console.traceln(Level.FINE,
                        String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s",
                                config.getExperimentName(), versionCount, testVersionCount,
                                testVersion.getVersion(), dataselector.getClass().getName()));
                traindata = dataselector.apply(testdata, traindata);
            }
            for (IProcessesingStrategy processor : config.getPostProcessors()) {
                Console.traceln(Level.FINE,
                        String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s",
                                config.getExperimentName(), versionCount, testVersionCount,
                                testVersion.getVersion(), processor.getClass().getName()));
                processor.apply(testdata, traindata);
            }
            for (ITrainingStrategy trainer : config.getTrainers()) {
                Console.traceln(Level.FINE,
                        String.format("[%s] [%02d/%02d] %s: applying trainer %s", config.getExperimentName(),
                                versionCount, testVersionCount, testVersion.getVersion(), trainer.getName()));
                trainer.apply(traindata);
            }
            for (ITestAwareTrainingStrategy trainer : config.getTestAwareTrainers()) {
                Console.traceln(Level.FINE,
                        String.format("[%s] [%02d/%02d] %s: applying trainer %s", config.getExperimentName(),
                                versionCount, testVersionCount, testVersion.getVersion(), trainer.getName()));
                trainer.apply(testdata, traindata);
            }
            // make sure the results directory exists before the evaluators write to it
            File resultsDir = new File(config.getResultsPath());
            if (!resultsDir.exists()) {
                resultsDir.mkdir();
            }
            for (IEvaluationStrategy evaluator : config.getEvaluators()) {
                Console.traceln(Level.FINE,
                        String.format("[%s] [%02d/%02d] %s: applying evaluator %s", config.getExperimentName(),
                                versionCount, testVersionCount, testVersion.getVersion(),
                                evaluator.getClass().getName()));
                // the evaluators receive all trainers so they can evaluate each trained model
                List<ITrainer> allTrainers = new LinkedList<>();
                for (ITrainingStrategy trainer : config.getTrainers()) {
                    allTrainers.add(trainer);
                }
                for (ITestAwareTrainingStrategy trainer : config.getTestAwareTrainers()) {
                    allTrainers.add(trainer);
                }
                if (writeHeader) {
                    evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName() + ".csv");
                }
                evaluator.apply(testdata, traindata, allTrainers, efforts, writeHeader,
                        config.getResultStorages());
                writeHeader = false;
            }
            Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished",
                    config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion()));
            versionCount++;
        }
    }
}

From source file:de.unidue.langtech.grading.tc.LearningCurveTask.java

License:Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    // this task only supports the single-label setup
    boolean isMultiLabel = false;

    for (Integer trainSize : NUMBER_OF_TRAINING_INSTANCES) {
        for (int run = 0; run < ITERATIONS; run++) {
            File trainArff = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY)
                            .getPath() + "/" + TRAINING_DATA_FILENAME);
            File testArff = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY).getPath()
                            + "/" + TRAINING_DATA_FILENAME);

            Instances trainData = TaskUtils.getInstances(trainArff, isMultiLabel);
            Instances testData = TaskUtils.getInstances(testArff, isMultiLabel);

            // skip configurations that request more training instances than are available
            if (trainSize > trainData.size()) {
                continue;
            }

            Classifier classifier = AbstractClassifier.forName(classificationArguments.get(0),
                    classificationArguments.subList(1, classificationArguments.size()).toArray(new String[0]));

            // keep a copy with outcome ids so they can be re-attached after prediction
            Instances testDataWithIds = new Instances(testData);
            trainData = WekaUtils.removeOutcomeId(trainData, isMultiLabel);
            testData = WekaUtils.removeOutcomeId(testData, isMultiLabel);

            // shuffle the training data with a time-based seed
            Random generator = new Random(System.nanoTime());
            trainData.randomize(generator);

            // remove the fraction of the training data that should not be used for
            // training, keeping only the first trainSize instances
            for (int i = trainData.size() - 1; i >= trainSize; i--) {
                trainData.delete(i);
            }

            // file to hold the prediction results of this configuration and iteration
            File evalOutput = new File(
                    aContext.getStorageLocation(TEST_TASK_OUTPUT_KEY, AccessMode.READWRITE).getPath() + "/"
                            + EVALUATION_DATA_FILENAME + "_" + trainSize + "_" + run);

            // train the classifier on the (possibly reduced) training split
            classifier.buildClassifier(trainData);

            // persist the evaluation and re-attach the outcome ids to the predictions
            weka.core.SerializationHelper.write(evalOutput.getAbsolutePath(),
                    WekaUtils.getEvaluationSinglelabel(classifier, trainData, testData));
            testData = WekaUtils.getPredictionInstancesSingleLabel(testData, classifier);
            testData = WekaUtils.addOutcomeId(testData, testDataWithIds, false);
        }
    }
}

From source file:edu.umbc.cs.maple.utils.WekaUtils.java

License:Open Source License

/** Take a certain number of a set of instances.
 * @param instances the source instances (left unmodified)
 * @param numInstances the number of instances to keep
 * @return a reduced set of instances according to the given number to keep
 */
public static Instances trimInstances(Instances instances, int numInstances) {
    Instances trimmed = new Instances(instances);
    // repeatedly drop the last instance until only numInstances remain
    while (trimmed.numInstances() > numInstances) {
        trimmed.delete(trimmed.numInstances() - 1);
    }
    return trimmed;
}

From source file:edu.umbc.cs.maple.utils.WekaUtils.java

License:Open Source License

/** Extract a particular subset of the instances.
 * @param instances the source instances (left unmodified)
 * @param startIdx the start instance index
 * @param numInstancesToRetrieve the number of instances to retrieve
 * @return the specified subset of the instances.
 * @throws IllegalArgumentException if more instances are requested than remain
 *         after {@code startIdx}
 */
public static Instances subsetInstances(Instances instances, int startIdx, int numInstancesToRetrieve) {
    // BUGFIX: the count was previously stored in a double, which rendered the
    // exception message as e.g. "more than 5.0 instances"
    int possibleNumInstancesToRetrieve = instances.numInstances() - startIdx;
    if (numInstancesToRetrieve > possibleNumInstancesToRetrieve) {
        throw new IllegalArgumentException(
                "Cannot retrieve more than " + possibleNumInstancesToRetrieve + " instances.");
    }

    int endIdx = startIdx + numInstancesToRetrieve - 1;

    // delete all instance indices outside of [startIdx, endIdx]; iterate backwards so
    // deletion does not shift the indices still to be processed
    Instances subset = new Instances(instances);
    for (int i = subset.numInstances() - 1; i >= 0; i--) {
        if (i < startIdx || i > endIdx)
            subset.delete(i);
    }

    return subset;
}

From source file:GClass.EvaluationInternal.java

License:Open Source License

/**
 * Evaluates a classifier with the options given in an array of
 * strings. <p>/*w  w  w  .ja  v a 2 s .com*/
 *
 * Valid options are: <p>
 *
 * -t name of training file <br>
 * Name of the file with the training data. (required) <p>
 *
 * -T name of test file <br>
 * Name of the file with the test data. If missing a cross-validation
 * is performed. <p>
 *
 * -c class index <br>
 * Index of the class attribute (1, 2, ...; default: last). <p>
 *
 * -x number of folds <br>
 * The number of folds for the cross-validation (default: 10). <p>
 *
 * -s random number seed <br>
 * Random number seed for the cross-validation (default: 1). <p>
 *
 * -m file with cost matrix <br>
 * The name of a file containing a cost matrix. <p>
 *
 * -l name of model input file <br>
 * Loads classifier from the given file. <p>
 *
 * -d name of model output file <br>
 * Saves classifier built from the training data into the given file. <p>
 *
 * -v <br>
 * Outputs no statistics for the training data. <p>
 *
 * -o <br>
 * Outputs statistics only, not the classifier. <p>
 *
 * -i <br>
 * Outputs detailed information-retrieval statistics per class. <p>
 *
 * -k <br>
 * Outputs information-theoretic statistics. <p>
 *
 * -p <br>
 * Outputs predictions for test instances (and nothing else). <p>
 *
 * -r <br>
 * Outputs cumulative margin distribution (and nothing else). <p>
 *
 * -g <br>
 * Only for classifiers that implement "Graphable." Outputs
 * the graph representation of the classifier (and nothing
 * else). <p>
 *
 * @param classifier machine learning classifier
 * @param options the array of string containing the options
 * @exception Exception if model could not be evaluated successfully
 * @return a string describing the results */
public static String[] evaluateModel(Classifier classifier, String trainFileName, String objectOutputFileName)
        throws Exception {

    Instances train = null, tempTrain, test = null, template = null;
    int seed = 1, folds = 10, classIndex = -1;
    String testFileName, sourceClass, classIndexString, seedString, foldsString, objectInputFileName,
            attributeRangeString;
    boolean IRstatistics = false, noOutput = false, printClassifications = false, trainStatistics = true,
            printMargins = false, printComplexityStatistics = false, printGraph = false,
            classStatistics = false, printSource = false;
    StringBuffer text = new StringBuffer();
    BufferedReader trainReader = null, testReader = null;
    ObjectInputStream objectInputStream = null;
    CostMatrix costMatrix = null;
    StringBuffer schemeOptionsText = null;
    Range attributesToOutput = null;
    long trainTimeStart = 0, trainTimeElapsed = 0, testTimeStart = 0, testTimeElapsed = 0;

    try {

        String[] options = null;

        // Get basic options (options the same for all schemes)
        classIndexString = Utils.getOption('c', options);
        if (classIndexString.length() != 0) {
            classIndex = Integer.parseInt(classIndexString);
        }
        //  trainFileName = Utils.getOption('t', options);

        objectInputFileName = Utils.getOption('l', options);
        //   objectOutputFileName = Utils.getOption('d', options);
        testFileName = Utils.getOption('T', options);
        if (trainFileName.length() == 0) {
            if (objectInputFileName.length() == 0) {
                throw new Exception("No training file and no object " + "input file given.");
            }
            if (testFileName.length() == 0) {
                throw new Exception("No training file and no test " + "file given.");
            }
        } else if ((objectInputFileName.length() != 0)
                && ((!(classifier instanceof UpdateableClassifier)) || (testFileName.length() == 0))) {
            throw new Exception("Classifier not incremental, or no " + "test file provided: can't "
                    + "use both train and model file.");
        }
        try {
            if (trainFileName.length() != 0) {
                trainReader = new BufferedReader(new FileReader(trainFileName));
            }
            if (testFileName.length() != 0) {
                testReader = new BufferedReader(new FileReader(testFileName));
            }
            if (objectInputFileName.length() != 0) {
                InputStream is = new FileInputStream(objectInputFileName);
                if (objectInputFileName.endsWith(".gz")) {
                    is = new GZIPInputStream(is);
                }
                objectInputStream = new ObjectInputStream(is);
            }
        } catch (Exception e) {
            throw new Exception("Can't open file " + e.getMessage() + '.');
        }
        if (testFileName.length() != 0) {
            template = test = new Instances(testReader, 1);
            if (classIndex != -1) {
                test.setClassIndex(classIndex - 1);
            } else {
                test.setClassIndex(test.numAttributes() - 1);
            }
            if (classIndex > test.numAttributes()) {
                throw new Exception("Index of class attribute too large.");
            }
        }
        if (trainFileName.length() != 0) {
            if ((classifier instanceof UpdateableClassifier) && (testFileName.length() != 0)) {
                train = new Instances(trainReader, 1);
            } else {
                train = new Instances(trainReader);
            }
            template = train;
            if (classIndex != -1) {
                train.setClassIndex(classIndex - 1);
            } else {
                train.setClassIndex(train.numAttributes() - 1);
            }
            if ((testFileName.length() != 0) && !test.equalHeaders(train)) {
                throw new IllegalArgumentException("Train and test file not compatible!");
            }
            if (classIndex > train.numAttributes()) {
                throw new Exception("Index of class attribute too large.");
            }
            //train = new Instances(train);
        }
        if (template == null) {
            throw new Exception("No actual dataset provided to use as template");
        }
        seedString = Utils.getOption('s', options);
        if (seedString.length() != 0) {
            seed = Integer.parseInt(seedString);
        }
        foldsString = Utils.getOption('x', options);
        if (foldsString.length() != 0) {
            folds = Integer.parseInt(foldsString);
        }
        costMatrix = handleCostOption(Utils.getOption('m', options), template.numClasses());

        classStatistics = Utils.getFlag('i', options);
        noOutput = Utils.getFlag('o', options);
        trainStatistics = !Utils.getFlag('v', options);
        printComplexityStatistics = Utils.getFlag('k', options);
        printMargins = Utils.getFlag('r', options);
        printGraph = Utils.getFlag('g', options);
        sourceClass = Utils.getOption('z', options);
        printSource = (sourceClass.length() != 0);

        // Check -p option
        try {
            attributeRangeString = Utils.getOption('p', options);
        } catch (Exception e) {
            throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. "
                    + "It now expects a parameter specifying a range of attributes "
                    + "to list with the predictions. Use '-p 0' for none.");
        }
        if (attributeRangeString.length() != 0) {
            printClassifications = true;
            if (!attributeRangeString.equals("0")) {
                attributesToOutput = new Range(attributeRangeString);
            }
        }

        // If a model file is given, we can't process
        // scheme-specific options
        if (objectInputFileName.length() != 0) {
            Utils.checkForRemainingOptions(options);
        } else {

            // Set options for classifier
            if (classifier instanceof OptionHandler) {
                /* for (int i = 0; i < options.length; i++) {
                if (options[i].length() != 0) {
                    if (schemeOptionsText == null) {
                        schemeOptionsText = new StringBuffer();
                    }
                    if (options[i].indexOf(' ') != -1) {
                        schemeOptionsText.append('"' + options[i] + "\" ");
                    } else {
                        schemeOptionsText.append(options[i] + " ");
                    }
                }
                 }
                 */
                ((OptionHandler) classifier).setOptions(options);
            }
        }
        Utils.checkForRemainingOptions(options);

    } catch (Exception e) {
        throw new Exception("\nWeka exception: " + e.getMessage() + makeOptionString(classifier));
    }

    // Setup up evaluation objects
    EvaluationInternal trainingEvaluation = new EvaluationInternal(new Instances(template, 0), costMatrix);
    EvaluationInternal testingEvaluation = new EvaluationInternal(new Instances(template, 0), costMatrix);

    if (objectInputFileName.length() != 0) {

        // Load classifier from file
        classifier = (Classifier) objectInputStream.readObject();
        objectInputStream.close();
    }

    // Build the classifier if no object file provided
    if ((classifier instanceof UpdateableClassifier) && (testFileName.length() != 0) && (costMatrix == null)
            && (trainFileName.length() != 0)) {

        // Build classifier incrementally
        trainingEvaluation.setPriors(train);
        testingEvaluation.setPriors(train);
        trainTimeStart = System.currentTimeMillis();
        if (objectInputFileName.length() == 0) {
            classifier.buildClassifier(train);
        }
        while (train.readInstance(trainReader)) {

            trainingEvaluation.updatePriors(train.instance(0));
            testingEvaluation.updatePriors(train.instance(0));
            ((UpdateableClassifier) classifier).updateClassifier(train.instance(0));
            train.delete(0);
        }
        trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
        trainReader.close();
    } else if (objectInputFileName.length() == 0) {

        // Build classifier in one go
        tempTrain = new Instances(train);
        trainingEvaluation.setPriors(tempTrain);
        testingEvaluation.setPriors(tempTrain);
        trainTimeStart = System.currentTimeMillis();
        classifier.buildClassifier(tempTrain);
        trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
    }

    // Save the classifier if an object output file is provided
    if (objectOutputFileName.length() != 0) {
        OutputStream os = new FileOutputStream(objectOutputFileName);
        if (objectOutputFileName.endsWith(".gz")) {
            os = new GZIPOutputStream(os);
        }
        ObjectOutputStream objectOutputStream = new ObjectOutputStream(os);
        objectOutputStream.writeObject(classifier);
        objectOutputStream.flush();
        objectOutputStream.close();
    }

    /*   // If classifier is drawable output string describing graph
       if ((classifier instanceof Drawable)
    && (printGraph)) {
    return ((Drawable) classifier).graph();
       }
            
       // Output the classifier as equivalent source
       if ((classifier instanceof Sourcable)
    && (printSource)) {
    return wekaStaticWrapper((Sourcable) classifier, sourceClass);
       }
            
       // Output test instance predictions only
       if (printClassifications) {
    return printClassifications(classifier, new Instances(template, 0),
                                testFileName, classIndex, attributesToOutput);
       }
       */

    // Output model
    if (!(noOutput || printMargins)) {
        if (classifier instanceof OptionHandler) {
            if (schemeOptionsText != null) {
                text.append("\nOptions: " + schemeOptionsText);
                text.append("\n");
            }
        }
        text.append("\n" + classifier.toString() + "\n");
    }

    if (!printMargins && (costMatrix != null)) {
        text.append("\n=== Evaluation Cost Matrix ===\n\n").append(costMatrix.toString());
    }

    // Compute error estimate from training data
    if ((trainStatistics) && (trainFileName.length() != 0)) {

        if ((classifier instanceof UpdateableClassifier) && (testFileName.length() != 0)
                && (costMatrix == null)) {

            // Classifier was trained incrementally, so we have to
            // reopen the training data in order to test on it.
            trainReader = new BufferedReader(new FileReader(trainFileName));

            // Incremental testing
            train = new Instances(trainReader, 1);
            if (classIndex != -1) {
                train.setClassIndex(classIndex - 1);
            } else {
                train.setClassIndex(train.numAttributes() - 1);
            }
            testTimeStart = System.currentTimeMillis();
            while (train.readInstance(trainReader)) {

                trainingEvaluation.evaluateModelOnce((Classifier) classifier, train.instance(0));
                train.delete(0);
            }
            testTimeElapsed = System.currentTimeMillis() - testTimeStart;
            trainReader.close();
        } else {
            testTimeStart = System.currentTimeMillis();
            trainingEvaluation.evaluateModel(classifier, train);
            testTimeElapsed = System.currentTimeMillis() - testTimeStart;
        }

        // Print the results of the training evaluation
        //  if (printMargins) {
        //      return trainingEvaluation.toCumulativeMarginDistributionString();
        //   } else {
        text.append("\nTime taken to build model: " + Utils.doubleToString(trainTimeElapsed / 1000.0, 2)
                + " seconds");
        text.append("\nTime taken to test model on training data: "
                + Utils.doubleToString(testTimeElapsed / 1000.0, 2) + " seconds");
        text.append(trainingEvaluation.toSummaryString("\n\n=== Error on training" + " data ===\n",
                printComplexityStatistics));
        if (template.classAttribute().isNominal()) {
            if (classStatistics) {
                text.append("\n\n" + trainingEvaluation.toClassDetailsString());
            }
            text.append("\n\n" + trainingEvaluation.toMatrixString());
        }

        //  }
    }

    // Compute proper error estimates
    if (testFileName.length() != 0) {

        // Testing is on the supplied test data
        while (test.readInstance(testReader)) {

            testingEvaluation.evaluateModelOnce((Classifier) classifier, test.instance(0));
            test.delete(0);
        }
        testReader.close();

        text.append("\n\n"
                + testingEvaluation.toSummaryString("=== Error on test data ===\n", printComplexityStatistics));
    } else if (trainFileName.length() != 0) {

        // Testing is via cross-validation on training data
        Random random = new Random(seed);
        testingEvaluation.crossValidateModel(classifier, train, folds, random);
        if (template.classAttribute().isNumeric()) {
            text.append("\n\n\n" + testingEvaluation.toSummaryString("=== Cross-validation ===\n",
                    printComplexityStatistics));
        } else {
            text.append("\n\n\n" + testingEvaluation
                    .toSummaryString("=== Stratified " + "cross-validation ===\n", printComplexityStatistics));
        }
    }
    if (template.classAttribute().isNominal()) {
        if (classStatistics) {
            text.append("\n\n" + testingEvaluation.toClassDetailsString());
        }
        text.append("\n\n" + testingEvaluation.toMatrixString());
    }

    String result = "\t" + Utils.doubleToString(trainingEvaluation.pctCorrect(), 12, 4) + " %";
    result += "       " + Utils.doubleToString(testingEvaluation.pctCorrect(), 12, 4) + " %";

    String[] returnString = { text.toString(), result };
    return returnString;
}

From source file: GClass.EvaluationInternal.java

License:Open Source License

/**
 * Prints the predictions the given classifier makes for every instance in the
 * supplied test file, one line per instance, and returns them as a String.
 *
 * <p>For a numeric class each line has the form
 * {@code <index> <predicted> <actual> <attributes>}; for a nominal class it has
 * the form {@code <index> <predictedLabel> <confidence> <actualLabel> <attributes>},
 * where missing predictions/actuals are printed as {@code "missing"}.
 *
 * @param classifier         a trained classifier used to predict each instance
 * @param train              training header; currently unused, kept for
 *                           signature compatibility with existing callers
 * @param testFileName       name of the ARFF test file; if empty, an empty
 *                           String is returned
 * @param classIndex         1-based class attribute index, or -1 to use the
 *                           last attribute as the class
 * @param attributesToOutput range of attribute values to append to each line
 * @return the predictions as a newline-separated String
 * @throws Exception if the test file cannot be opened or a prediction fails
 */
protected static String printClassifications(Classifier classifier, Instances train, String testFileName,
        int classIndex, Range attributesToOutput) throws Exception {

    // StringBuilder: single-threaded local accumulation, no need for StringBuffer's locking.
    StringBuilder text = new StringBuilder();
    if (testFileName.length() != 0) {
        BufferedReader testReader = null;
        try {
            testReader = new BufferedReader(new FileReader(testFileName));
        } catch (Exception e) {
            throw new Exception("Can't open file " + e.getMessage() + '.');
        }
        // try/finally guarantees the reader is closed even if reading or
        // classification throws part-way through (the original leaked it).
        try {
            // Read the header plus one instance at a time (capacity 1) so
            // arbitrarily large test files are streamed, not loaded whole.
            Instances test = new Instances(testReader, 1);
            if (classIndex != -1) {
                test.setClassIndex(classIndex - 1);
            } else {
                test.setClassIndex(test.numAttributes() - 1);
            }
            int i = 0;
            while (test.readInstance(testReader)) {
                Instance instance = test.instance(0);
                // Copy so the prediction cannot disturb the stored instance;
                // re-attach the copy to the dataset for attribute metadata.
                Instance withMissing = (Instance) instance.copy();
                withMissing.setDataset(test);
                double predValue = classifier.classifyInstance(withMissing);
                if (test.classAttribute().isNumeric()) {
                    // Numeric class: "<index> <predicted> <actual> <attrs>"
                    if (Instance.isMissingValue(predValue)) {
                        text.append(i + " missing ");
                    } else {
                        text.append(i + " " + predValue + " ");
                    }
                    if (instance.classIsMissing()) {
                        text.append("missing");
                    } else {
                        text.append(instance.classValue());
                    }
                    text.append(" " + attributeValuesString(withMissing, attributesToOutput) + "\n");
                } else {
                    // Nominal class: "<index> <label> <confidence> <actual> <attrs>"
                    if (Instance.isMissingValue(predValue)) {
                        text.append(i + " missing ");
                    } else {
                        text.append(i + " " + test.classAttribute().value((int) predValue) + " ");
                    }
                    if (Instance.isMissingValue(predValue)) {
                        text.append("missing ");
                    } else {
                        text.append(classifier.distributionForInstance(withMissing)[(int) predValue] + " ");
                    }
                    text.append(instance.toString(instance.classIndex()) + " "
                            + attributeValuesString(withMissing, attributesToOutput) + "\n");
                }
                // Drop the consumed instance so the dataset never holds more than one.
                test.delete(0);
                i++;
            }
        } finally {
            testReader.close();
        }
    }
    return text.toString();
}