List of usage examples for weka.core Instances classIndex
public int classIndex()
From source file:old.CFS.java
/** * takes a dataset as first argument//from w w w . java 2 s . c o m * * @param args the commandline arguments * @throws Exception if something goes wrong */ public static void main(String[] args) throws Exception { // load data System.out.println("\n0. Loading data"); DataSource source = new DataSource("D:\\ALL\\imdb_grid_size=1000_MIN=50_genres=5.arff"); Instances data = source.getDataSet(); data.setClass(data.attribute("Horror")); if (data.classIndex() == -1) data.setClassIndex(data.numAttributes() - 1); // // 1. meta-classifier // useClassifier(data); // // // 2. filter // useFilter(data); // 3. low-level useLowLevel(data); }
From source file:org.esa.nest.gpf.SGD.java
/**
 * Builds the SGD classifier from the supplied training data.
 *
 * <p>Pipeline, in order: reset state, validate capabilities, copy the data,
 * drop instances with a missing class, then optionally apply
 * ReplaceMissingValues, NominalToBinary and Normalize filters (each filter
 * object is retained in an instance field so the same transformation can be
 * replayed at prediction time), and finally randomize and train.
 *
 * @param data the set of training instances.
 * @throws Exception if the classifier can't be built successfully.
 */
@Override
public void buildClassifier(Instances data) throws Exception {
    reset();

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // work on a copy so the caller's Instances object is not mutated
    data = new Instances(data);
    data.deleteWithMissingClass();

    // impute missing attribute values unless explicitly disabled
    if (data.numInstances() > 0 && !m_dontReplaceMissing) {
        m_replaceMissing = new ReplaceMissingValues();
        m_replaceMissing.setInputFormat(data);
        data = Filter.useFilter(data, m_replaceMissing);
    }

    // check for only numeric attributes (class attribute excluded)
    boolean onlyNumeric = true;
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != data.classIndex()) {
            if (!data.attribute(i).isNumeric()) {
                onlyNumeric = false;
                break;
            }
        }
    }

    // binarize nominal attributes; the supervised variant needs at least one
    // instance, hence the unsupervised fallback for empty data
    if (!onlyNumeric) {
        if (data.numInstances() > 0) {
            m_nominalToBinary = new weka.filters.supervised.attribute.NominalToBinary();
        } else {
            m_nominalToBinary = new weka.filters.unsupervised.attribute.NominalToBinary();
        }
        m_nominalToBinary.setInputFormat(data);
        data = Filter.useFilter(data, m_nominalToBinary);
    }

    // scale attributes to [0,1] unless normalization is disabled
    if (!m_dontNormalize && data.numInstances() > 0) {
        m_normalize = new Normalize();
        m_normalize.setInputFormat(data);
        data = Filter.useFilter(data, m_normalize);
    }

    m_numInstances = data.numInstances();

    // one weight per (filtered) attribute plus the bias term
    m_weights = new double[data.numAttributes() + 1];

    // keep an empty header copy for checking prediction-time compatibility
    m_data = new Instances(data, 0);

    if (data.numInstances() > 0) {
        data.randomize(new Random(getSeed())); // randomize the data
        train(data);
    }
}
From source file:org.ml.classifier.TextDirectoryToArff.java
License:Open Source License
public static void main(String[] args) { // if (args.length == 2) { TextDirectoryToArff tdta = new TextDirectoryToArff(); try {/*from www . j a v a 2s . c om*/ // Instances trainData = tdta.createDataset(TRAINING_FILES); // LOGGER.debug(trainData.toString()); Instances testData = tdta.createDataset(TESTING_FILES); // LOGGER.debug(testData.toString()); // System.out.println(testData); // System.exit(0); // apply the StringToWordVector in a batch mode // (see the source code of setOptions(String[]) method of the filter // if you want to know which command-line option corresponds to which // bean property) // StringToWordVector strToWordFilter = new StringToWordVector(); // strToWordFilter.setInputFormat(trainData); // strToWordFilter.setOutputWordCounts(true); // strToWordFilter.setTFTransform(true); // strToWordFilter.setIDFTransform(true); // trainData = Filter.useFilter(trainData, strToWordFilter); // testData = Filter.useFilter(testData, strToWordFilter); //transform to non-sparse format // SparseToNonSparse spFilter = new SparseToNonSparse(); // spFilter.setInputFormat(trainData); // trainData = Filter.useFilter(trainData, spFilter); // testData = Filter.useFilter(testData, spFilter); // Standardize standardizeFilter = new Standardize(); // standardizeFilter.setInputFormat(trainData); // // Instances newTrainData = Filter.useFilter(trainData, standardizeFilter); // Instances newTestData = Filter.useFilter(testData, standardizeFilter); // NaiveBayesMultinomial cl = null; // // train classifier // cl = new NaiveBayesMultinomial(); // // further options... 
// cl.buildClassifier(trainData); // FilteredClassifier fcl = new FilteredClassifier(); // fcl.setFilter(strToWordFilter); // fcl.setClassifier(cl); // // fcl.buildClassifier(trainData); // SerializationHelper.write(MODEL, fcl); // read the model from the file FilteredClassifier fcl = (FilteredClassifier) SerializationHelper.read(MODEL); // System.out.println("Training finished!"); // System.exit(0); // Evaluation eTest = new Evaluation(trainData); // eTest.evaluateModel(cl, trainData); // String strSummary = eTest.toSummaryString(); // LOGGER.debug(strSummary); // eTest.evaluateModel(cl, testData); // strSummary = eTest.toSummaryString(); // LOGGER.debug(strSummary); // Get the confusion matrix // double[][] cmMatrix = eTest.confusionMatrix(); // LOGGER.debug(cmMatrix); int[] myLst = { 5, 7, 9, 100, 345, 1000, 1500, 7500 }; for (int i = 0; i < myLst.length; i++) { int idx = myLst[i]; System.out.println("Actual: " + testData.instance(idx).stringValue(testData.classIndex())); long start = System.currentTimeMillis(); System.out.println(fcl.classifyInstance(testData.instance(idx))); long end = System.currentTimeMillis(); System.out.println("\n Time: " + (end - start) + " ms"); } } catch (Exception e) { LOGGER.error(e.getMessage()); e.printStackTrace(); } // } else { // System.out.println("Usage: java TextDirectoryToArff <directory name>"); // } }
From source file:org.openml.webapplication.fantail.dc.DCUntils.java
License:Open Source License
public static double[] computeAttributeEntropy(Instances data) { List<Double> attributeEntropy = new ArrayList<Double>(); for (int attIndex = 0; attIndex < data.numAttributes(); attIndex++) { if (data.attribute(attIndex).isNominal() && (data.classIndex() != attIndex)) { double[] attValueCounts = new double[data.numDistinctValues(attIndex)]; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); attValueCounts[(int) inst.value(attIndex)]++; }//from ww w. j av a2 s .c o m double attEntropy = 0; for (int c = 0; c < data.attribute(attIndex).numValues(); c++) { if (attValueCounts[c] > 0) { double prob_c = attValueCounts[c] / data.numInstances(); attEntropy += prob_c * (Utils.log2(prob_c)); } } attEntropy = attEntropy * -1.0; attributeEntropy.add(attEntropy); } } return ArrayUtils.toPrimitive(attributeEntropy.toArray(new Double[attributeEntropy.size()])); }
From source file:org.openml.webapplication.fantail.dc.DCUntils.java
License:Open Source License
public static double[] computeMutualInformation(Instances data) { List<Double> mutualInformation = new ArrayList<Double>(); for (int attIndex = 0; attIndex < data.numAttributes(); attIndex++) { if (data.attribute(attIndex).isNominal() && (data.classIndex() != attIndex)) { // System.out.println(data.attribute(attIndex)); double infoGain = computeInfoGain(data, data.attribute(attIndex)); infoGain = Math.round(infoGain * Math.pow(10, 14)) / Math.pow(10, 14); mutualInformation.add(infoGain); }// w ww .j a v a 2s . c o m } return ArrayUtils.toPrimitive(mutualInformation.toArray(new Double[mutualInformation.size()])); }
From source file:org.openml.webapplication.fantail.dc.statistical.AttributeEntropy.java
License:Open Source License
/**
 * Computes entropy-based data-set qualities (class entropy, attribute
 * entropy, mutual information and derived statistics) and returns them keyed
 * by the identifiers in {@code ids}.
 *
 * <p>For a numeric class attribute every quality is set to the sentinel
 * {@code -1.0}, since these measures are only defined for a nominal class.
 *
 * @param data the instances to characterize; the class index must be set
 * @return a map from quality id to its value
 */
@Override
public Map<String, Double> characterize(Instances data) {
    // number of nominal attributes, excluding the class attribute
    int nominal_count = 0;
    for (int i = 0; i < data.numAttributes(); ++i) {
        if (data.attribute(i).isNominal() && data.classIndex() != i) {
            nominal_count += 1;
        }
    }
    Map<String, Double> qualities = new HashMap<String, Double>();
    if (data.classAttribute().isNominal()) {
        double classEntropy = DCUntils.computeClassEntropy(data);
        double[] attEntropy = DCUntils.computeAttributeEntropy(data);
        double[] mutualInformation = DCUntils.computeMutualInformation(data);

        double meanMI = StatUtils.mean(mutualInformation);
        // sentinel -1 when there are no nominal attributes to average over
        double meanAttEntropy = nominal_count > 0 ? StatUtils.mean(attEntropy) : -1;

        // ena = equivalent number of attributes; both derived measures use -1
        // as the "undefined" sentinel when the mean mutual information is <= 0
        double noiseSignalRatio;
        double ena = 0;
        if (meanMI <= 0) {
            ena = -1;
            noiseSignalRatio = -1;
        } else {
            ena = classEntropy / meanMI;
            noiseSignalRatio = (meanAttEntropy - meanMI) / meanMI;
        }
        // NOTE(review): when nominal_count == 0, attEntropy/mutualInformation
        // are empty and the min/max/percentile qualities below come out as
        // NaN (Commons Math StatUtils returns NaN for empty arrays) — confirm
        // that NaN rather than the -1 sentinel is intended here.
        qualities.put(ids[0], classEntropy);
        qualities.put(ids[1], meanAttEntropy);
        qualities.put(ids[2], meanMI);
        qualities.put(ids[3], ena);
        qualities.put(ids[4], noiseSignalRatio);
        qualities.put(ids[5], StatUtils.min(attEntropy));
        qualities.put(ids[6], StatUtils.min(mutualInformation));
        qualities.put(ids[7], StatUtils.max(attEntropy));
        qualities.put(ids[8], StatUtils.max(mutualInformation));
        qualities.put(ids[9], StatUtils.percentile(attEntropy, 25));
        qualities.put(ids[10], StatUtils.percentile(mutualInformation, 25));
        qualities.put(ids[11], StatUtils.percentile(attEntropy, 50));
        qualities.put(ids[12], StatUtils.percentile(mutualInformation, 50));
        qualities.put(ids[13], StatUtils.percentile(attEntropy, 75));
        qualities.put(ids[14], StatUtils.percentile(mutualInformation, 75));
    } else { // numeric target
        for (int i = 0; i < ids.length; ++i) {
            qualities.put(ids[i], -1.0);
        }
    }
    return qualities;
}
From source file:org.openml.webapplication.features.ExtractFeatures.java
License:Open Source License
/**
 * Extracts per-attribute feature metadata (type, counts, numeric statistics
 * and class distribution) from a dataset.
 *
 * <p>Cleanup of the original: a duplicated {@code numberOfMissingValues}
 * assignment was removed, and the magic-number comparisons against
 * {@code att.type()} were replaced by the named {@code Attribute} type
 * constants.
 *
 * @param dataset      the instances to describe; its class index is set here
 * @param defaultClass name of the class attribute, or {@code null} to use the
 *                     last attribute as the class
 * @return one {@link Feature} per attribute, in attribute order
 */
public static List<Feature> getFeatures(Instances dataset, String defaultClass) {
    if (defaultClass != null) {
        dataset.setClass(dataset.attribute(defaultClass));
    } else {
        dataset.setClassIndex(dataset.numAttributes() - 1);
    }
    final ArrayList<Feature> resultFeatures = new ArrayList<Feature>();
    for (int i = 0; i < dataset.numAttributes(); i++) {
        Attribute att = dataset.attribute(i);

        // per-class statistics only make sense for a nominal class
        int numValues = dataset.classAttribute().isNominal() ? dataset.classAttribute().numValues() : 0;
        AttributeStatistics attributeStats = new AttributeStatistics(dataset.attribute(i), numValues);
        for (int j = 0; j < dataset.numInstances(); ++j) {
            attributeStats.addValue(dataset.get(j).value(i), dataset.get(j).classValue());
        }

        String data_type = null;
        Integer numberOfDistinctValues = null;
        Integer numberOfUniqueValues = null;
        Integer numberOfMissingValues = null;
        Integer numberOfIntegerValues = null;
        Integer numberOfRealValues = null;
        Integer numberOfNominalValues = null;
        Integer numberOfValues = null;
        Double maximumValue = null;
        Double minimumValue = null;
        Double meanValue = null;
        Double standardDeviation = null;

        AttributeStats as = dataset.attributeStats(i);
        numberOfDistinctValues = as.distinctCount;
        numberOfUniqueValues = as.uniqueCount;
        numberOfMissingValues = as.missingCount;
        numberOfIntegerValues = as.intCount;
        numberOfRealValues = as.realCount;

        if (att.isNominal()) {
            numberOfNominalValues = att.numValues();
        }
        numberOfValues = attributeStats.getTotalObservations();

        if (att.isNumeric()) {
            maximumValue = attributeStats.getMaximum();
            minimumValue = attributeStats.getMinimum();
            meanValue = attributeStats.getMean();
            // keep 0.0 as the fallback when the std-dev computation fails
            standardDeviation = 0.0;
            try {
                standardDeviation = attributeStats.getStandardDeviation();
            } catch (Exception e) {
                Conversion.log("WARNING", "StdDev", "Could not compute standard deviation of feature "
                        + att.name() + ": " + e.getMessage());
            }
        }

        // map the weka attribute type to its string label
        switch (att.type()) {
        case Attribute.NUMERIC:
            data_type = "numeric";
            break;
        case Attribute.NOMINAL:
            data_type = "nominal";
            break;
        case Attribute.STRING:
            data_type = "string";
            break;
        default:
            data_type = "unknown";
            break;
        }

        resultFeatures.add(new Feature(att.index(), att.name(), data_type,
                att.index() == dataset.classIndex(), numberOfDistinctValues, numberOfUniqueValues,
                numberOfMissingValues, numberOfIntegerValues, numberOfRealValues, numberOfNominalValues,
                numberOfValues, maximumValue, minimumValue, meanValue, standardDeviation,
                attributeStats.getClassDistribution()));
    }
    return resultFeatures;
}
From source file:org.openml.webapplication.features.FantailConnector.java
License:Open Source License
private List<Quality> datasetCharacteristics(Instances fulldata, Integer start, Integer interval_size, List<String> qualitiesAvailable) throws Exception { List<Quality> result = new ArrayList<DataQuality.Quality>(); Instances intervalData;/* ww w.j ava 2 s .c o m*/ // Be careful changing this! if (interval_size != null) { intervalData = new Instances(fulldata, start, Math.min(interval_size, fulldata.numInstances() - start)); intervalData = applyFilter(intervalData, new StringToNominal(), "-R first-last"); intervalData.setClassIndex(fulldata.classIndex()); } else { intervalData = fulldata; // todo: use StringToNominal filter? might be to expensive } for (Characterizer dc : batchCharacterizers) { if (qualitiesAvailable != null && qualitiesAvailable.containsAll(Arrays.asList(dc.getIDs())) == false) { Conversion.log("OK", "Extract Batch Features", dc.getClass().getName() + ": " + Arrays.toString(dc.getIDs())); Map<String, Double> qualities = dc.characterize(intervalData); result.addAll(hashMaptoList(qualities, start, interval_size)); } else { Conversion.log("OK", "Extract Batch Features", dc.getClass().getName() + " - already in database"); } } return result; }
From source file:org.openscience.cdk.applications.taverna.io.ARFFFileReaderActivity.java
License:Open Source License
@Override public void work() throws Exception { // Get input//from w w w. ja va2 s. c om List<File> files = this.getInputAsFileList(this.INPUT_PORTS[0]); // Do work List<Instances> datasets = new LinkedList<Instances>(); for (File file : files) { try { Instances instances = DataSource.read(file.getPath()); int lastAttr = instances.numAttributes() - 1; if (instances.classIndex() == -1 && instances.attribute(lastAttr).name().equals("Class")) { instances.setClassIndex(lastAttr); } datasets.add(instances); } catch (Exception e) { ErrorLogger.getInstance().writeError(CDKTavernaException.READ_FILE_ERROR + file, this.getActivityName(), e); } } // Set output this.setOutputAsObjectList(datasets, this.OUTPUT_PORTS[0]); }
From source file:org.opentox.jaqpot3.qsar.trainer.MlrRegression.java
License:Open Source License
@Override public Model train(Instances data) throws JaqpotException { try {// www . jav a 2 s.co m getTask().getMeta().addComment( "Dataset successfully retrieved and converted " + "into a weka.core.Instances object"); UpdateTask firstTaskUpdater = new UpdateTask(getTask()); firstTaskUpdater.setUpdateMeta(true); firstTaskUpdater.setUpdateTaskStatus(true);//TODO: Is this necessary? try { firstTaskUpdater.update(); } catch (DbException ex) { throw new JaqpotException(ex); } finally { try { firstTaskUpdater.close(); } catch (DbException ex) { throw new JaqpotException(ex); } } Instances trainingSet = data; getTask().getMeta().addComment("The downloaded dataset is now preprocessed"); firstTaskUpdater = new UpdateTask(getTask()); firstTaskUpdater.setUpdateMeta(true); firstTaskUpdater.setUpdateTaskStatus(true);//TODO: Is this necessary? try { firstTaskUpdater.update(); } catch (DbException ex) { throw new JaqpotException(ex); } finally { try { firstTaskUpdater.close(); } catch (DbException ex) { throw new JaqpotException(ex); } } /* SET CLASS ATTRIBUTE */ Attribute target = trainingSet.attribute(targetUri.toString()); if (target == null) { throw new BadParameterException("The prediction feature you provided was not found in the dataset"); } else { if (!target.isNumeric()) { throw new QSARException("The prediction feature you provided is not numeric."); } } trainingSet.setClass(target); /* Very important: place the target feature at the end! 
(target = last)*/ int numAttributes = trainingSet.numAttributes(); int classIndex = trainingSet.classIndex(); Instances orderedTrainingSet = null; List<String> properOrder = new ArrayList<String>(numAttributes); for (int j = 0; j < numAttributes; j++) { if (j != classIndex) { properOrder.add(trainingSet.attribute(j).name()); } } properOrder.add(trainingSet.attribute(classIndex).name()); try { orderedTrainingSet = InstancesUtil.sortByFeatureAttrList(properOrder, trainingSet, -1); } catch (JaqpotException ex) { logger.error("Improper dataset - training will stop", ex); throw ex; } orderedTrainingSet.setClass(orderedTrainingSet.attribute(targetUri.toString())); /* START CONSTRUCTION OF MODEL */ Model m = new Model(Configuration.getBaseUri().augment("model", getUuid().toString())); m.setAlgorithm(getAlgorithm()); m.setCreatedBy(getTask().getCreatedBy()); m.setDataset(datasetUri); m.addDependentFeatures(dependentFeature); try { dependentFeature.loadFromRemote(); } catch (ServiceInvocationException ex) { Logger.getLogger(MlrRegression.class.getName()).log(Level.SEVERE, null, ex); } Set<LiteralValue> depFeatTitles = null; if (dependentFeature.getMeta() != null) { depFeatTitles = dependentFeature.getMeta().getTitles(); } String depFeatTitle = dependentFeature.getUri().toString(); if (depFeatTitles != null) { depFeatTitle = depFeatTitles.iterator().next().getValueAsString(); m.getMeta().addTitle("MLR model for " + depFeatTitle) .addDescription("MLR model for the prediction of " + depFeatTitle + " (uri: " + dependentFeature.getUri() + " )."); } else { m.getMeta().addTitle("MLR model for the prediction of the feature with URI " + depFeatTitle) .addComment("No name was found for the feature " + depFeatTitle); } /* * COMPILE THE LIST OF INDEPENDENT FEATURES with the exact order in which * these appear in the Instances object (training set). 
*/ m.setIndependentFeatures(independentFeatures); /* CREATE PREDICTED FEATURE AND POST IT TO REMOTE SERVER */ String predictionFeatureUri = null; Feature predictedFeature = publishFeature(m, dependentFeature.getUnits(), "Predicted " + depFeatTitle + " by MLR model", datasetUri, featureService); m.addPredictedFeatures(predictedFeature); predictionFeatureUri = predictedFeature.getUri().toString(); getTask().getMeta().addComment("Prediction feature " + predictionFeatureUri + " was created."); firstTaskUpdater = new UpdateTask(getTask()); firstTaskUpdater.setUpdateMeta(true); firstTaskUpdater.setUpdateTaskStatus(true);//TODO: Is this necessary? try { firstTaskUpdater.update(); } catch (DbException ex) { throw new JaqpotException(ex); } finally { try { firstTaskUpdater.close(); } catch (DbException ex) { throw new JaqpotException(ex); } } /* ACTUAL TRAINING OF THE MODEL USING WEKA */ LinearRegression linreg = new LinearRegression(); String[] linRegOptions = { "-S", "1", "-C" }; try { linreg.setOptions(linRegOptions); linreg.buildClassifier(orderedTrainingSet); } catch (final Exception ex) {// illegal options or could not build the classifier! String message = "MLR Model could not be trained"; logger.error(message, ex); throw new JaqpotException(message, ex); } try { // evaluate classifier and print some statistics Evaluation eval = new Evaluation(orderedTrainingSet); eval.evaluateModel(linreg, orderedTrainingSet); String stats = eval.toSummaryString("\nResults\n======\n", false); ActualModel am = new ActualModel(linreg); am.setStatistics(stats); m.setActualModel(am); } catch (NotSerializableException ex) { String message = "Model is not serializable"; logger.error(message, ex); throw new JaqpotException(message, ex); } catch (final Exception ex) {// illegal options or could not build the classifier! 
String message = "MLR Model could not be trained"; logger.error(message, ex); throw new JaqpotException(message, ex); } m.getMeta().addPublisher("OpenTox").addComment("This is a Multiple Linear Regression Model"); //save the instances being predicted to abstract trainer for calculating DoA predictedInstances = orderedTrainingSet; excludeAttributesDoA.add(dependentFeature.getUri().toString()); return m; } catch (QSARException ex) { String message = "QSAR Exception: cannot train MLR model"; logger.error(message, ex); throw new JaqpotException(message, ex); } }