List of usage examples for weka.core.Instances.classIndex()
public int classIndex()
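Before the examples, a minimal sketch of the classIndex() contract: it returns -1 until a class attribute has been assigned, and afterwards the zero-based index of that attribute. The attribute names below are made up for illustration.

    import java.util.ArrayList;
    import java.util.Arrays;
    import weka.core.Attribute;
    import weka.core.Instances;

    public class ClassIndexDemo {
        public static void main(String[] args) {
            ArrayList<Attribute> attributes = new ArrayList<>();
            attributes.add(new Attribute("feature1"));                          // numeric
            attributes.add(new Attribute("feature2"));                          // numeric
            attributes.add(new Attribute("label", Arrays.asList("no", "yes"))); // nominal
            Instances data = new Instances("demo", attributes, 0);

            System.out.println(data.classIndex()); // -1: no class attribute set yet

            // common convention: the class is the last attribute
            data.setClassIndex(data.numAttributes() - 1);
            System.out.println(data.classIndex()); // 2 (zero-based)
        }
    }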
From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];
    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    // fraction of defective instances; the flattened snippet had this ratio inverted,
    // which would yield a "probability" greater than 1
    final double probDefect = counts[1] / (double) data.numInstances();
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);
            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                } else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)
                        / (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }
    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            // skip the class column: its power entry is never set and would zero the product
            if (data.attribute(j) != data.classAttribute()) {
                powerEntity[i] *= powerAttributes[i][j];
            }
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];
    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
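The central classIndex() idiom in this example is feeding the class index into attributeStats to obtain the class distribution. A minimal standalone sketch of that idiom, assuming a binary nominal class whose value at index 1 marks the defective label:

    import weka.core.Instances;

    public class ClassDistribution {
        /** Fraction of instances whose nominal class equals the value at index 1. */
        static double classFraction(Instances data) {
            // nominalCounts holds one count per value of the (nominal) class attribute
            int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
            return counts[1] / (double) data.numInstances();
        }
    }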
From source file: de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java
License: Apache License
/**
 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public Instances apply(Instances testdata, Instances traindata) {
    Instances filteredTraindata = new Instances(traindata);
    filteredTraindata.clear();

    double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1];
    int classIndex = testdata.classIndex();
    for (int i = 0; i < testdata.size(); i++) {
        int k = 0;
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i][k] = testdata.get(i).value(j);
                k++;
            }
        }
    }
    for (int i = 0; i < traindata.size(); i++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i + testdata.size()][k] = traindata.get(i).value(j);
                k++;
            }
        }
    }

    DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
    Database db = new StaticArrayDatabase(dbc, null);
    db.initialize();
    DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10);
    Clustering<Model> clusterer = dbscan.run(db);
    Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
    int firstInternalIndex = rel.iterDBIDs().internalGetIndex();

    for (Cluster<Model> cluster : clusterer.getAllClusters()) {
        // check if the cluster contains any test data
        DBIDIter iter = rel.iterDBIDs();
        boolean noMatch = true;
        for (int i = 0; noMatch && i < testdata.size(); i++) {
            noMatch = !cluster.getIDs().contains(iter);
            iter.advance();
        }
        if (!noMatch) {
            // cluster contains test data; keep its training instances
            for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) {
                int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex;
                if (internalIndex >= 0) {
                    // index belongs to a training instance
                    filteredTraindata.add(traindata.get(internalIndex));
                }
            }
        }
    }
    return filteredTraindata;
}
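This filter, like several others on this page, copies every attribute except the class into a plain double matrix, using classIndex() to skip the class column. A standalone distillation of that idiom (assuming the class attribute has been set):

    import weka.core.Instances;

    public class FeatureMatrix {
        /** Copies all non-class attribute values into a dense matrix, skipping the class column. */
        static double[][] toFeatureMatrix(Instances data) {
            int classIndex = data.classIndex(); // assumed != -1
            double[][] matrix = new double[data.numInstances()][data.numAttributes() - 1];
            for (int i = 0; i < data.numInstances(); i++) {
                int k = 0;
                for (int j = 0; j < data.numAttributes(); j++) {
                    if (j != classIndex) {
                        matrix[i][k++] = data.instance(i).value(j);
                    }
                }
            }
            return matrix;
        }
    }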
From source file: de.ugoe.cs.cpdp.dataselection.MahalanobisOutlierRemoval.java
License: Apache License
/**
 * <p>
 * Removes all instances whose Mahalanobis distance to the mean of the data is greater than
 * epsilon.
 * </p>
 *
 * @param data
 *            data from which the outliers are removed
 */
private void applyMahalanobisDistancesRemoval(Instances data) {
    RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1);
    for (int i = 0; i < data.size(); i++) {
        values.setRow(i, WekaUtils.instanceValues(data.get(i)));
    }
    RealMatrix inverseCovariance;
    try {
        inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver()
                .getInverse();
    } catch (SingularMatrixException e) {
        Console.traceln(Level.WARNING,
                "could not perform Mahalanobis outlier removal due to singular covariance matrix");
        return;
    }
    // create mean vector, skipping the class attribute
    double[] meanValues = new double[data.numAttributes() - 1];
    int k = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            meanValues[k] = data.attributeStats(j).numericStats.mean;
            k++;
        }
    }
    // iterate backwards so removal does not shift the indices still to be checked
    for (int i = data.size() - 1; i >= 0; i--) {
        double distance = mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)),
                meanValues);
        if (distance > epsilon) {
            data.remove(i);
        }
    }
}
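The mahalanobisDistance helper is not part of this excerpt; a plausible sketch using the same Commons Math types might look as follows (an assumption, not the project's actual implementation):

    import org.apache.commons.math3.linear.ArrayRealVector;
    import org.apache.commons.math3.linear.RealMatrix;
    import org.apache.commons.math3.linear.RealVector;

    public class MahalanobisSketch {
        /** sqrt((x - mean)^T * S^-1 * (x - mean)), where S^-1 is the inverse covariance matrix. */
        static double mahalanobisDistance(RealMatrix inverseCovariance, double[] x, double[] mean) {
            RealVector diff = new ArrayRealVector(x).subtract(new ArrayRealVector(mean));
            return Math.sqrt(diff.dotProduct(inverseCovariance.operate(diff)));
        }
    }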
From source file: de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java
License: Apache License
/**
 * <p>
 * Applies the synonym outlier removal.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed.
 */
public void applySynonymRemoval(Instances traindata) {
    double minDistance[][] = new double[traindata.size()][traindata.numAttributes() - 1];
    double minDistanceAttribute[] = new double[traindata.numAttributes() - 1];
    double distance;
    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }
    // the flattened snippet started this loop at traindata.size() - 1, which would only
    // process the last instance; iterating over all instances is clearly intended
    for (int i1 = 0; i1 < traindata.size(); i1++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        // bound by the row length (non-class attributes only) to avoid running past the array
        for (int j = 0; !hasClosest && j < minDistance[i].length; j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}
From source file: de.ugoe.cs.cpdp.loader.DecentDataLoader.java
License: Apache License
/**
 * Loads the given decent file and transforms it from decent->arffx->arff
 *
 * @return Instances in WEKA format
 */
@Override
public Instances load(File file) {
    // Set attributeFilter
    setAttributeFilter();

    // Register MetaModels
    try {
        registerMetaModels();
    } catch (Exception e1) {
        Console.printerrln("Metamodels cannot be registered!");
        e1.printStackTrace();
    }

    // Set location of decent and arffx model
    String decentModelLocation = file.getAbsolutePath();
    String pathToDecentModelFolder = decentModelLocation.substring(0,
            decentModelLocation.lastIndexOf(File.separator));
    String arffxModelLocation = pathToDecentModelFolder + "/model.arffx";
    String logModelLocation = pathToDecentModelFolder + "/model.log";
    String arffLocation = pathToDecentModelFolder + "/model.arff";

    // If the arff file exists, load from it!
    if (new File(arffLocation).exists()) {
        System.out.println("Loading arff File...");
        BufferedReader reader;
        Instances data = null;
        try {
            reader = new BufferedReader(new FileReader(arffLocation));
            data = new Instances(reader);
            reader.close();
        } catch (FileNotFoundException e) {
            Console.printerrln("File with path: " + arffLocation + " was not found.");
            throw new RuntimeException(e);
        } catch (IOException e) {
            Console.printerrln("File with path: " + arffLocation + " cannot be read.");
            throw new RuntimeException(e);
        }
        // Set class attribute if not set
        if (data.classIndex() == -1) {
            Attribute classAttribute = data.attribute(classAttributeName);
            data.setClass(classAttribute);
        }
        return data;
    }

    // Location of EOL scripts
    String preprocess = "./decent/epsilon/query/preprocess.eol";
    String arffxToArffSource = "./decent/epsilon/query/addLabels.eol";

    // Set log properties
    System.setProperty("epsilon.logLevel", logLevel);
    System.setProperty("epsilon.logToFile", logToFile);
    System.setProperty("epsilon.logFileAvailable", "false");

    // Set decent2arffx properties
    System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false");
    System.setProperty("epsilon.transformation.decent2arffx.type", "code");

    // Preprocess data, transform from decent2arffx
    try {
        IEolExecutableModule preProcessModule = loadModule(preprocess);
        IModel preProcessDecentModel = modelHandler.getDECENTModel(decentModelLocation, true, true);
        IModel preProcessArffxarffxModel = modelHandler.getARFFxModel(arffxModelLocation, false, true);
        preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel);
        preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel);
        execute(preProcessModule, logModelLocation);
        preProcessDecentModel.dispose();
        preProcessArffxarffxModel.dispose();
        preProcessModule.reset();
    } catch (URISyntaxException e) {
        Console.printerrln("URI Syntax for decent or arffx model is wrong.");
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Transform to arff, for label and confidence attributes
    try {
        IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource);
        IModel arffxToArffArffxModel = modelHandler.getARFFxModel(arffxModelLocation, true, true);
        arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel);
        execute(arffxToArffModule, logModelLocation);
        arffxToArffArffxModel.dispose(); // can be stored and retained alternatively
        arffxToArffModule.reset();
    } catch (URISyntaxException e) {
        Console.printerrln("URI Syntax for arffx model is wrong.");
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Unregister MetaModels, otherwise the cast will fail
    HashMap<String, Object> metaModelCache = new HashMap<>();
    for (String key : EPackage.Registry.INSTANCE.keySet()) {
        metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key));
    }
    for (String key : metaModelCache.keySet()) {
        EPackage.Registry.INSTANCE.remove(key);
    }

    // Workaround to generate a usable URI. An absolute path is not
    // possible, therefore we need to construct a relative path
    URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation();
    String basePath = location.getFile();
    // Location is the bin folder, so we need to delete the last 4 characters
    basePath = basePath.substring(0, basePath.length() - 4);
    String relativePath = new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath();

    // Load arffx file and create WEKA Instances
    ARFFxResourceTool tool = new ARFFxResourceTool();
    Resource resource = tool.loadResourceFromXMI(relativePath, "arffx");
    Instances dataSet = null;
    for (EObject o : resource.getContents()) {
        Model m = (Model) o;
        dataSet = createWekaDataFormat(m);
        for (Instance i : m.getData()) {
            createWekaInstance(dataSet, i);
        }
    }

    // Set class attribute
    Attribute classAttribute = dataSet.attribute(classAttributeName);
    dataSet.setClass(classAttribute);

    // Save as ARFF
    save(dataSet, arffLocation);
    return dataSet;
}
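The classIndex() == -1 guard in the branch above is the standard check that a freshly loaded ARFF file carries no class information yet. A minimal standalone version of that load-then-set pattern (path and attribute name are placeholders):

    import java.io.BufferedReader;
    import java.io.FileReader;
    import weka.core.Instances;

    public class ArffClassSetup {
        static Instances loadWithClass(String arffPath, String classAttributeName) throws Exception {
            Instances data;
            try (BufferedReader reader = new BufferedReader(new FileReader(arffPath))) {
                data = new Instances(reader);
            }
            if (data.classIndex() == -1) {                          // plain ARFF stores no class marker
                data.setClass(data.attribute(classAttributeName));  // select the class by name
            }
            return data;
        }
    }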
From source file: de.ugoe.cs.cpdp.util.WekaUtils.java
License: Apache License
/**
 * <p>
 * Calculates the distributional characteristics of the distances the instances within a data
 * set have to each other.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @return characteristics
 */
public static DistChar datasetDistance(Instances data) {
    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    double max = Double.MIN_VALUE;
    int numCmp = 0;
    int l = 0;
    double[] inst1 = new double[data.numAttributes() - 1];
    double[] inst2 = new double[data.numAttributes() - 1];
    // Commons Math distance on raw arrays (Weka's EuclideanDistance operates on Instances)
    EuclideanDistance euclideanDistance = new EuclideanDistance();
    for (int i = 0; i < data.numInstances(); i++) {
        l = 0;
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != data.classIndex()) {
                inst1[l] = data.instance(i).value(k);
                l++; // advance the target index (missing in the flattened snippet)
            }
        }
        for (int j = 0; j < data.numInstances(); j++) {
            if (j != i) {
                l = 0;
                for (int k = 0; k < data.numAttributes(); k++) {
                    if (k != data.classIndex()) {
                        inst2[l] = data.instance(j).value(k);
                        l++;
                    }
                }
                distance = euclideanDistance.compute(inst1, inst2);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }
    double mean = sumAll / numCmp;
    double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
    return new DistChar(mean, std, min, max, data.numInstances());
}
From source file: de.unidue.langtech.grading.tc.ClusterExemplarTask.java
License: Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    //        Add addFilter = new Add();
    //        addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    //        addFilter.setNominalLabels("0,1");
    //        addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    //        addFilter.setInputFormat(testData);

    trainData.clear();
    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();
        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();
            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }
        // add selected instance to instances
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}
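Note the + 1 when configuring Remove: classIndex() is zero-based, while Weka's attribute-range strings are one-based. A minimal sketch of this strip-the-class idiom:

    import weka.core.Instances;
    import weka.filters.Filter;
    import weka.filters.unsupervised.attribute.Remove;

    public class StripClass {
        /** Returns a copy of data without the class attribute (Remove uses 1-based indices). */
        static Instances withoutClass(Instances data) throws Exception {
            Remove remove = new Remove();
            remove.setAttributeIndices("" + (data.classIndex() + 1)); // shift 0-based to 1-based
            remove.setInputFormat(data);
            return Filter.useFilter(data, remove);
        }
    }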
From source file: de.unidue.langtech.grading.tc.ClusteringTask.java
License: Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments =
            new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {
            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);
            Double classOffset = new Double(instance.value(copyTrainData.classAttribute()));
            String label = (String) trainOutcomeValues.get(classOffset.intValue());
            clusterAssignments.addSample(clusterId, label);
            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}
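getClusterMap is not shown in this excerpt; a plausible sketch maps each instance offset to the cluster Weka assigns it via clusterInstance (an assumption about the helper's behavior):

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;
    import weka.clusterers.Clusterer;
    import weka.core.Instances;

    public class ClusterMapSketch {
        /** Maps cluster IDs to the offsets of the instances assigned to them. */
        static Map<Integer, Set<Integer>> getClusterMap(Instances data, Clusterer clusterer) throws Exception {
            Map<Integer, Set<Integer>> clusterMap = new HashMap<>();
            for (int offset = 0; offset < data.numInstances(); offset++) {
                int clusterId = clusterer.clusterInstance(data.instance(offset));
                clusterMap.computeIfAbsent(clusterId, id -> new HashSet<>()).add(offset);
            }
            return clusterMap;
        }
    }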
From source file: de.unidue.langtech.grading.tc.ClusterTrainTask.java
License: Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd =
            getClusterCfd(clusterMap, copyTrainData, trainOutcomeValues);

    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // attention - cannot simply use RMSE here - as smaller values are better unlike with purity
        // double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster
    double avgPurity = 0.0;
    int n = 0;
    for (Integer clusterId : sortedClusters.keySet()) {
        // we need to take as many clusters until we have seen at least each class once
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //      // do not use clusters of single responses, as they always have purity of 1
        //      if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //          continue;
        //      }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}
From source file: de.upb.timok.oneclassclassifier.WekaSvmClassifier.java
License: Open Source License
@Override
public void train(List<double[]> trainingSamples) {
    Instances data = DatasetTransformationUtils.trainingSetToInstances(trainingSamples);
    // setting class attribute if the data format does not provide this information
    // For example, the XRFF format saves the class attribute information as well
    try {
        if (filter != null) {
            filter.setInputFormat(data);
            data = Filter.useFilter(data, filter);
        }
        if (data.classIndex() == -1) {
            data.setClassIndex(data.numAttributes() - 1);
        }
        wekaSvm.buildClassifier(data);
    } catch (final Exception e) {
        logger.error("Unexpected exception", e);
    }
}
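DatasetTransformationUtils.trainingSetToInstances is project-specific and not shown; building Instances from raw double arrays generally follows the shape below (a sketch with generated attribute names, not the project's helper):

    import java.util.ArrayList;
    import java.util.List;
    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instances;

    public class RawDataToInstances {
        static Instances toInstances(List<double[]> samples) {
            int width = samples.get(0).length;
            ArrayList<Attribute> attributes = new ArrayList<>();
            for (int j = 0; j < width; j++) {
                attributes.add(new Attribute("attr" + j)); // numeric attributes with generated names
            }
            Instances data = new Instances("trainingSet", attributes, samples.size());
            for (double[] sample : samples) {
                data.add(new DenseInstance(1.0, sample)); // weight 1.0 per instance
            }
            // classIndex() is still -1 here; the caller decides the class column,
            // e.g. data.setClassIndex(data.numAttributes() - 1);
            return data;
        }
    }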