List of usage examples for weka.core.Instances.randomize
public void randomize(Random random)
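randomize shuffles the order of the instances in place, using the supplied java.util.Random. Before the project-specific examples below, here is a minimal sketch of the core pattern; the dataset path is a placeholder, and the fixed seed is only there to make the shuffle reproducible:

import java.util.Random;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class RandomizeDemo {
    public static void main(String[] args) throws Exception {
        Instances data = new DataSource("data/iris.arff").getDataSet(); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        // Shuffles the instance order in place; a fixed seed makes it reproducible.
        data.randomize(new Random(42));
        System.out.println("First instance after shuffle: " + data.instance(0));
    }
}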
From source file:entity.NoiseInjectionManager.java
License:Open Source License
/**
 * Increments fp and fn by the specified percentages.
 * Randomizes the order of the instances and modifies them until the noise quota is reached,
 * then randomizes the instances again.
 * NOTE: it modifies the given dataset, because it is passed by reference.
 *
 * @param origDataset
 * @param fpPercentage
 * @param fnPercentage
 * @return Instances noisyDataset
 */
public Instances addNoiseToDataset(Instances origDataset, BigDecimal fpPercentage, BigDecimal fnPercentage) {
    // exit if no noise must be added
    if (fnPercentage.equals(BigDecimal.ZERO) && fpPercentage.equals(BigDecimal.ZERO)) {
        if (verbose)
            System.out.println("[NoiseManager , addNoiseToDataset] no noise to add");
        return origDataset;
    }

    // total instances in dataset
    int numInstances = origDataset.numInstances();

    // count positive (buggy) and negative (non-buggy) instances
    int numOfPositives = 0;
    int numOfNegatives = 0;
    for (int j = 0; j < numInstances; j++) {
        if (origDataset.instance(j).stringValue(origDataset.classIndex()).equals(Settings.buggyLabel)) {
            numOfPositives++;
        }
        // this is a redundant check, but better safe than sorry
        else if (origDataset.instance(j).stringValue(origDataset.classIndex()).equals(Settings.nonbuggyLabel)) {
            numOfNegatives++;
        }
    }

    // calculate the number of false positives to insert
    int fpToInsert = (int) Math.round(numOfNegatives * fpPercentage.doubleValue() / 100);
    int fpInserted = 0;
    if (verbose)
        System.out.println("\n\n[NoiseManager , addNoiseToDataset] fpToInsert= " + fpToInsert + ", totInstances= "
                + origDataset.numInstances() + " true negatives= " + numOfNegatives + " %fp= " + fpPercentage);

    // calculate the number of false negatives to insert
    int fnToInsert = (int) Math.round(numOfPositives * fnPercentage.doubleValue() / 100);
    int fnInserted = 0;
    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] fnToInsert= " + fnToInsert + ", totInstances= "
                + origDataset.numInstances() + " true positives= " + numOfPositives + " %fn= " + fnPercentage);
    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] buggy label: " + Settings.buggyLabel
                + " - nonbuggy label: " + Settings.nonbuggyLabel);

    // randomize the order of the instances
    origDataset.randomize(RandomizationManager.randomGenerator);

    for (int i = 0; i < origDataset.numInstances(); i++) {
        if (verbose)
            System.out.print("\nORIGINAL VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));

        // get the class attribute (it HAS to be the last one)
        Attribute att = origDataset.instance(i).attribute(origDataset.classIndex());

        // if there are fn left to add and this is a positive instance, turn it into a negative, making it a fn
        if ((fnInserted < fnToInsert)
                && (origDataset.instance(i).stringValue(origDataset.classIndex()).equals(Settings.buggyLabel))) {
            origDataset.instance(i).setValue(att, Settings.nonbuggyLabel);
            fnInserted++;
            if (verbose)
                System.out.print(" - added FN, added " + fnInserted + " of " + fnToInsert + " ");
        }
        // if there are fp left to add and this is a negative instance, turn it into a positive, making it a fp
        else if ((fpInserted < fpToInsert)
                && (origDataset.instance(i).stringValue(origDataset.classIndex()).equals(Settings.nonbuggyLabel))) {
            origDataset.instance(i).setValue(att, Settings.buggyLabel);
            fpInserted++;
            if (verbose)
                System.out.print(" - added FP, added " + fpInserted + " of " + fpToInsert + " ");
        }

        if (verbose)
            System.out.print(" FINAL ELEMENT VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));
    }

    // randomize the order of the instances again
    origDataset.randomize(RandomizationManager.randomGenerator);
    return origDataset;
}
From source file:entity.NoiseInjectionManager.java
License:Open Source License
/**
 * Increments fp and fn in combination by a specified percentage.
 * Randomizes the order of the instances and modifies them until the noise quota is reached,
 * then randomizes the instances again.
 * NOTE: it modifies the given dataset, because it is passed by reference.
 *
 * @param origDataset
 * @param combinedFpFnPercentage
 * @return noisyData
 */
public Instances addNoiseToDataset(Instances origDataset, BigDecimal combinedFpFnPercentage) {
    // exit if no noise must be added
    if (combinedFpFnPercentage.equals(BigDecimal.ZERO)) {
        if (verbose)
            System.out.println("[NoiseManager , addNoiseToDataset] no noise to add");
        return origDataset;
    }

    // total instances in dataset
    int numInstances = origDataset.numInstances();

    // calculate the combined number of false positives and false negatives to insert
    int fpAndFnToInsert = (int) Math.round(numInstances * combinedFpFnPercentage.doubleValue() / 100);
    int fpAndFnInserted = 0;
    if (verbose)
        System.out.println("\n\n[NoiseManager , addNoiseToDataset] fpAndFnToInsert= " + fpAndFnToInsert
                + ", totInstances= " + origDataset.numInstances());
    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] buggy label: " + Settings.buggyLabel
                + " - nonbuggy label: " + Settings.nonbuggyLabel);

    // randomize the order of the instances
    origDataset.randomize(RandomizationManager.randomGenerator);

    for (int i = 0; i < origDataset.numInstances(); i++) {
        if (verbose)
            System.out.print("\nORIGINAL VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));

        // get the class attribute (it HAS to be the last one)
        Attribute att = origDataset.instance(i).attribute(origDataset.classIndex());

        // if there are fn or fp left to add
        if (fpAndFnInserted < fpAndFnToInsert) {
            // if this is a positive instance, turn it into a negative, making it a fn
            if (origDataset.instance(i).stringValue(origDataset.classIndex()).equals(Settings.buggyLabel)) {
                if (verbose)
                    System.out.print(" - added FN, added " + fpAndFnInserted + " of " + fpAndFnToInsert + " ");
                origDataset.instance(i).setValue(att, Settings.nonbuggyLabel);
                fpAndFnInserted++;
            }
            // if this is a negative instance, turn it into a positive, making it a fp
            else if (origDataset.instance(i).stringValue(origDataset.classIndex())
                    .equals(Settings.nonbuggyLabel)) {
                if (verbose)
                    System.out.print(" - added FP, added " + fpAndFnInserted + " of " + fpAndFnToInsert + " ");
                origDataset.instance(i).setValue(att, Settings.buggyLabel);
                fpAndFnInserted++;
            }
        }

        if (verbose)
            System.out.print(" FINAL ELEMENT VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));
    }

    // randomize the order of the instances again
    origDataset.randomize(RandomizationManager.randomGenerator);
    return origDataset;
}
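Both overloads above draw from a shared RandomizationManager.randomGenerator, so seeding that generator once makes every internal randomize call reproducible. A minimal sketch of a caller, assuming RandomizationManager exposes its generator as an assignable static field and NoiseInjectionManager has a no-arg constructor (neither is shown in the snippets above; the dataset path is a placeholder):

import java.math.BigDecimal;
import java.util.Random;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NoiseInjectionDemo {
    public static void main(String[] args) throws Exception {
        // Assumption: randomGenerator is a public static field; seed it for reproducibility.
        RandomizationManager.randomGenerator = new Random(42);

        Instances data = new DataSource("defects.arff").getDataSet(); // placeholder path
        data.setClassIndex(data.numAttributes() - 1); // class label must be the last attribute

        NoiseInjectionManager manager = new NoiseInjectionManager();
        // Flip 5% of negatives into false positives and 10% of positives into false negatives.
        Instances noisy = manager.addNoiseToDataset(data, new BigDecimal("5"), new BigDecimal("10"));
        System.out.println("Instances after injection: " + noisy.numInstances());
    }
}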
From source file:es.jarias.FMC.RunFMC.java
License:Open Source License
public static void main(String[] args) {
    try {
        String classifierName = Utils.getOption("classifier", args);
        String discType = Utils.getOption("disc", args);
        String fss = Utils.getOption("fss", args);
        String prune = Utils.getOption("prune", args);
        String arffFilename = Utils.getOption("arff", args);
        String xmlFilename = Utils.getOption("xml", args);
        String outPath = Utils.getOption("outpath", args);
        String cvString = Utils.getOption("cv", args);
        String seedString = Utils.getOption("seed", args);

        // Check parameters: the arff and xml input files and the output path are all required
        if (arffFilename.equals("") || xmlFilename.equals("") || outPath.equals(""))
            throw new Exception("Please provide valid input and output files.");

        // Set defaults:
        if (classifierName.equals(""))
            classifierName = "bayes.NaiveBayes";
        if (discType.equals(""))
            discType = "supervised";
        if (fss.equals(""))
            fss = "CFS";
        if (prune.equals(""))
            prune = "full";
        int cv_folds = 10;
        if (!cvString.equals(""))
            cv_folds = Integer.parseInt(cvString);
        int seed = 1990;
        if (!seedString.equals(""))
            seed = Integer.parseInt(seedString);

        MultiLabelInstances original = null;
        try {
            original = new MultiLabelInstances(arffFilename, xmlFilename);
        } catch (InvalidDataFormatException e) {
            System.out.println("Please provide valid multilabel arff+xml mulan files");
            System.exit(-1);
        }

        MultiLabelInstances dataset = original.clone();
        Instances aux = dataset.getDataSet();
        aux.randomize(new Random(seed));
        dataset = dataset.reintegrateModifiedDataSet(aux);

        System.out.println("--------------------------------------------");
        System.out.println("FMC multi-label classifier experiment");
        System.out.println("-Pruning strategy: " + prune);
        System.out.println("-Base Classifier: " + classifierName);
        System.out.println("-Discretization: " + discType);
        System.out.println("-Feature Selection: " + fss);
        System.out.println("-Folds: " + cv_folds);
        System.out.println("-Seed: " + seed);

        // Perform CV or holdout
        if (cv_folds != 0) {
            for (int fold = 0; fold < cv_folds; fold++) {
                MultiLabelInstances trainData = original
                        .reintegrateModifiedDataSet(dataset.getDataSet().trainCV(cv_folds, fold));
                MultiLabelInstances testData = original
                        .reintegrateModifiedDataSet(dataset.getDataSet().testCV(cv_folds, fold));
                FMC.buildModel(trainData, testData, fold, classifierName, discType, fss, outPath, prune);
            }
        } else {
            double HOLDOUT_PERCENTAGE = 0.6;
            int trainSize = (int) Math.floor(dataset.getNumInstances() * HOLDOUT_PERCENTAGE);
            int testSize = dataset.getNumInstances() - trainSize;
            MultiLabelInstances trainData = dataset
                    .reintegrateModifiedDataSet(new Instances(dataset.getDataSet(), 0, trainSize));
            MultiLabelInstances testData = dataset
                    .reintegrateModifiedDataSet(new Instances(dataset.getDataSet(), trainSize, testSize));
            FMC.buildModel(trainData, testData, 0, classifierName, discType, fss, outPath, prune);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:es.upm.dit.gsi.barmas.dataset.utils.DatasetSplitter.java
License:Open Source License
/**
 * @param folds
 * @param minAgents
 * @param maxAgents
 * @param originalDatasetPath
 * @param outputDir
 * @param scenario
 * @param logger
 */
public void splitDataset(int folds, int minAgents, int maxAgents, String originalDatasetPath, String outputDir,
        String scenario, Logger logger) {

    int ratioint = (int) ((1 / (double) folds) * 100);
    double roundedratio = ((double) ratioint) / 100;

    // Look for essentials
    List<String[]> essentials = this.getEssentials(originalDatasetPath, logger);

    for (int fold = 0; fold < folds; fold++) {
        String outputDirWithRatio = outputDir + "/" + roundedratio + "testRatio/iteration-" + fold;
        File dir = new File(outputDirWithRatio);
        if (!dir.exists() || !dir.isDirectory()) {
            dir.mkdirs();
        }

        logger.finer("--> splitDataset()");
        logger.fine("Creating experiment.info...");

        try {
            Instances originalData = this.getDataFromCSV(originalDatasetPath);

            originalData.randomize(new Random());
            originalData.stratify(folds);

            // Test dataset
            Instances testData = originalData.testCV(folds, fold);
            CSVSaver saver = new CSVSaver();
            ArffSaver arffsaver = new ArffSaver();
            File file = new File(outputDirWithRatio + File.separator + "test-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(testData);
                saver.setFile(file);
                saver.writeBatch();
            }

            file = new File(outputDirWithRatio + File.separator + "test-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(testData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
            }

            // BayesCentral dataset
            Instances trainData = originalData.trainCV(folds, fold);
            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(trainData);
                saver.setFile(file);
                saver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file,
                        new File(outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.csv"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }
            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(trainData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file,
                        new File(outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.arff"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }

            // Agent datasets
            CsvReader csvreader = new CsvReader(new FileReader(new File(originalDatasetPath)));
            csvreader.readHeaders();
            String[] headers = csvreader.getHeaders();
            csvreader.close();

            for (int agents = minAgents; agents <= maxAgents; agents++) {
                this.createExperimentInfoFile(folds, agents, originalDatasetPath, outputDirWithRatio, scenario,
                        logger);
                HashMap<String, CsvWriter> writers = new HashMap<String, CsvWriter>();
                String agentsDatasetsDir = outputDirWithRatio + File.separator + agents + "agents";
                HashMap<String, CsvWriter> arffWriters = new HashMap<String, CsvWriter>();
                File f = new File(agentsDatasetsDir);
                if (!f.isDirectory()) {
                    f.mkdirs();
                }
                Instances copy = new Instances(trainData);
                copy.delete();
                for (int i = 0; i < agents; i++) {
                    String fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.csv";
                    file = new File(fileName);
                    if (!file.exists()) {
                        CsvWriter writer = new CsvWriter(new FileWriter(fileName), ',');
                        writer.writeRecord(headers);
                        writers.put("AGENT" + i, writer);
                    }
                    fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.arff";
                    file = new File(fileName);
                    if (!file.exists()) {
                        arffsaver.resetOptions();
                        arffsaver.setInstances(copy);
                        arffsaver.setFile(new File(fileName));
                        arffsaver.writeBatch();
                        CsvWriter arffwriter = new CsvWriter(new FileWriter(fileName, true), ',');
                        arffWriters.put("AGENT" + i, arffwriter);
                    }
                    logger.fine("AGENT" + i + " dataset created in csv and arff formats.");
                }

                // Append essentials to all
                for (String[] essential : essentials) {
                    for (CsvWriter wr : writers.values()) {
                        wr.writeRecord(essential);
                    }
                    for (CsvWriter arffwr : arffWriters.values()) {
                        arffwr.writeRecord(essential);
                    }
                }

                int agentCounter = 0;
                for (int j = 0; j < trainData.numInstances(); j++) {
                    Instance instance = trainData.instance(j);
                    CsvWriter writer = writers.get("AGENT" + agentCounter);
                    CsvWriter arffwriter = arffWriters.get("AGENT" + agentCounter);
                    String[] row = new String[instance.numAttributes()];
                    for (int a = 0; a < instance.numAttributes(); a++) {
                        row[a] = instance.stringValue(a);
                    }
                    if (writer != null) {
                        writer.writeRecord(row);
                    }
                    if (arffwriter != null) {
                        arffwriter.writeRecord(row);
                    }
                    agentCounter++;
                    if (agentCounter == agents) {
                        agentCounter = 0;
                    }
                }

                for (CsvWriter wr : writers.values()) {
                    wr.close();
                }
                for (CsvWriter arffwr : arffWriters.values()) {
                    arffwr.close();
                }
            }
        } catch (Exception e) {
            logger.severe("Exception while splitting dataset. ->");
            logger.severe(e.getMessage());
            System.exit(1);
        }
        logger.finest("Dataset for fold " + fold + " created.");
    }
    logger.finer("<-- splitDataset()");
}
From source file:experimentalclassifier.ExperimentalClassifier.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws Exception {
    DataSource source = new DataSource("data/iris.csv");
    Instances data = source.getDataSet();

    if (data.classIndex() == -1) {
        data.setClassIndex(data.numAttributes() - 1);
    }

    data.randomize(new Random());

    // Remove 30% of the instances, leaving 70% for training
    String[] options = weka.core.Utils.splitOptions("-P 30");
    RemovePercentage remove = new RemovePercentage();
    remove.setOptions(options);
    remove.setInputFormat(data);
    Instances train = Filter.useFilter(data, remove);

    // Invert the selection to obtain the remaining 30% as the test set
    remove.setInvertSelection(true);
    remove.setInputFormat(data);
    Instances test = Filter.useFilter(data, remove);

    Classifier classifier = new HardCodedClassifier();
    classifier.buildClassifier(train); // Currently, this does nothing

    Evaluation eval = new Evaluation(train);
    eval.evaluateModel(classifier, test);
    System.out.println(eval.toSummaryString("\nResults\n======\n", false));
}
From source file:expshell.ExpShell.java
/**
 * @param args the command line arguments
 * @throws java.lang.Exception
 */
public static void main(String[] args) throws Exception {
    String file = "C:\\Users\\YH Jonathan Kwok\\Documents\\NetBeansProjects\\ExpShell\\src\\expshell\\iris.csv";
    DataSource source = new DataSource(file);
    Instances data = source.getDataSet();
    if (data.classIndex() == -1)
        data.setClassIndex(data.numAttributes() - 1);

    // Randomize it
    data.randomize(new Random(1));

    RemovePercentage rp = new RemovePercentage();
    rp.setPercentage(70);
    rp.setInputFormat(data);
    Instances training = Filter.useFilter(data, rp);

    rp.setInvertSelection(true);
    rp.setInputFormat(data);
    Instances test = Filter.useFilter(data, rp);

    // Standardize the data
    Standardize filter = new Standardize();
    filter.setInputFormat(training);
    Instances newTest = Filter.useFilter(test, filter);
    Instances newTraining = Filter.useFilter(training, filter);

    // Part 5 - now it's a knn
    Classifier knn = new NeuralClassifier();
    knn.buildClassifier(newTraining);

    Evaluation eval = new Evaluation(newTraining);
    eval.evaluateModel(knn, newTest);
    System.out.println(eval.toSummaryString("***** Overall results: *****", false));
}
From source file:gate.plugin.learningframework.engines.EngineWeka.java
@Override
public Object evaluateHoldout(InstanceList instances, double portion, int repeats, String parms) {
    // Get the parameters
    // -s/-seed <int>   : seed, default 0
    // -S/-nostratify   : switch off stratification if we evaluate classification
    Parms opts = new Parms(parms, "s:seed:i", "S:nostratify:b");
    int seed = (int) opts.getValueOrElse("seed", 0);
    if (algorithm instanceof AlgorithmRegression) {
        throw new UnsupportedOperationException("Weka holdout eval for regression not supported yet.");
    } else {
        // must be a classification algorithm then!
        weka.core.Instances all = new CorpusRepresentationWeka(corpusRepresentationMallet)
                .getRepresentationWeka();
        boolean noStratify = (boolean) opts.getValueOrElse("nostratify", 0);
        Random rand = new Random(seed);
        all.randomize(rand);
        boolean stratified = !noStratify;
        // TODO: not sure if/how we can do stratification for holdout evaluation
        // TODO: there must be a better way to do the splitting too!
        // TODO: if there is no better way to split, maybe do it outside
        // TODO: how to implement repeats?
        if (repeats != 1) {
            throw new GateRuntimeException("Only repeats == 1 supported yet");
        }
        // both regression and classification?
        int trainSize = (int) Math.round(all.numInstances() * portion);
        int testSize = all.numInstances() - trainSize;
        Instances train = new Instances(all, 0, trainSize);
        Instances test = new Instances(all, trainSize, testSize);
        Classifier classifier = (Classifier) trainer;
        try {
            classifier.buildClassifier(train);
        } catch (Exception ex) {
            throw new GateRuntimeException("Error during training of Weka classifier", ex);
        }
        Evaluation eval = null;
        try {
            eval = new Evaluation(train);
        } catch (Exception ex) {
            throw new GateRuntimeException("Could not create Evaluation object", ex);
        }
        try {
            eval.evaluateModel(classifier, test);
        } catch (Exception ex) {
            throw new GateRuntimeException("Error evaluating the classifier", ex);
        }
        System.out.println("Evaluation result:\n" + eval);
        return eval;
    }
}
From source file:GClass.EvaluationInternal.java
License:Open Source License
/**
 * Performs a (stratified if class is nominal) cross-validation
 * for a classifier on a set of instances.
 *
 * @param classifier the classifier with any options set.
 * @param data the data on which the cross-validation is to be performed
 * @param numFolds the number of folds for the cross-validation
 * @param random random number generator for randomization
 * @exception Exception if a classifier could not be generated
 *            successfully or the class is not defined
 */
public void crossValidateModel(Classifier classifier, Instances data, int numFolds, Random random)
        throws Exception {

    // Make a copy of the data so we can reorder it
    data = new Instances(data);
    data.randomize(random);
    if (data.classAttribute().isNominal()) {
        data.stratify(numFolds);
    }
    // Do the folds
    for (int i = 0; i < numFolds; i++) {
        Instances train = data.trainCV(numFolds, i, random);
        setPriors(train);
        Classifier copiedClassifier = Classifier.makeCopy(classifier);
        copiedClassifier.buildClassifier(train);
        Instances test = data.testCV(numFolds, i);
        evaluateModel(copiedClassifier, test);
    }
    m_NumFolds = numFolds;
}
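The stock weka.classifiers.Evaluation class implements this same copy/randomize/stratify/fold idiom, so when no custom bookkeeping (such as the setPriors call above) is needed, cross-validation can be invoked directly; a minimal sketch with a placeholder dataset path:

import java.util.Random;
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CrossValidationDemo {
    public static void main(String[] args) throws Exception {
        Instances data = new DataSource("data/iris.arff").getDataSet(); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);

        // crossValidateModel copies and randomizes the data internally before
        // stratifying and folding, mirroring the method shown above.
        Evaluation eval = new Evaluation(data);
        eval.crossValidateModel(new NaiveBayes(), data, 10, new Random(1));
        System.out.println(eval.toSummaryString("\n10-fold CV results\n", false));
    }
}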
From source file:general.Util.java
/**
 * Show learning statistics obtained with a percentage split.
 * @param data training data
 * @param trainPercent percentage of the training data
 * @param Classifier model
 */
public static void PercentageSplit(Instances data, double trainPercent, String Classifier) {
    try {
        int trainSize = (int) Math.round(data.numInstances() * trainPercent / 100);
        int testSize = data.numInstances() - trainSize;

        data.randomize(new Random(1));
        Instances train = new Instances(data, 0, trainSize);
        Instances test = new Instances(data, trainSize, testSize);
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);

        switch (Classifier.toLowerCase()) {
        case "naivebayes":
            classifier = new NaiveBayes();
            break;
        case "j48-prune":
            classifier = new MyJ48(true, 0.25f);
            break;
        case "j48-unprune":
            classifier = new MyJ48(false, 0f);
            break;
        case "id3":
            classifier = new MyID3();
            break;
        default:
            break;
        }
        classifier.buildClassifier(train);

        for (int i = 0; i < test.numInstances(); i++) {
            try {
                double pred = classifier.classifyInstance(test.instance(i));
                System.out.print("ID: " + test.instance(i));
                System.out.print(", actual: " + test.classAttribute().value((int) test.instance(i).classValue()));
                System.out.println(", predicted: " + test.classAttribute().value((int) pred));
            } catch (Exception ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        // Evaluate the model on the test instances and print the results
        try {
            Evaluation eval = new Evaluation(train);
            eval.evaluateModel(classifier, test);
            System.out.println(eval.toSummaryString("\nResults\n\n", false));
        } catch (Exception e) {
            e.printStackTrace();
        }
    } catch (Exception ex) {
        Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:gr.auth.ee.lcs.ArffTrainTestLoader.java
License:Open Source License
/**
 * Load instances into the global train store and create a test set.
 *
 * @param filename
 *            the .arff filename to be used
 * @param generateTestSet
 *            true if a test set is going to be generated
 * @throws IOException
 *             if the input file is not found
 */
public final void loadInstances(final String filename, final boolean generateTestSet) throws IOException {
    // Open the .arff file
    final Instances set = InstancesUtility.openInstance(filename);

    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    set.randomize(new Random());

    if (generateTestSet) {
        final int numOfFolds = (int) SettingsLoader.getNumericSetting("NumberOfFolds", 10);
        final int fold = (int) Math.floor(Math.random() * numOfFolds);
        trainSet = set.trainCV(numOfFolds, fold);
        testSet = set.testCV(numOfFolds, fold);
    } else {
        trainSet = set;
    }

    myLcs.instances = InstancesUtility.convertIntancesToDouble(trainSet);
    myLcs.labelCardinality = InstancesUtility.getLabelCardinality(trainSet);
}