List of usage examples for weka.core Instances get
@Override
public Instance get(int index)
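Before the project examples, here is a minimal, self-contained sketch of the call itself: Instances.get(int) returns the Instance at the given row position. The file name below is a placeholder, not part of the examples that follow.

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class InstancesGetDemo {
    public static void main(String[] args) throws Exception {
        // "data.arff" is an illustrative path
        Instances data = DataSource.read("data.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // Instances.get(int) gives index-based access to each row
        for (int i = 0; i < data.size(); i++) {
            Instance inst = data.get(i);
            System.out.println(i + ": " + inst);
        }
    }
}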
From source file: de.ugoe.cs.cpdp.dataselection.MahalanobisOutlierRemoval.java
License: Apache License

/**
 * <p>
 * Removes all instances whose Mahalanobis distance to the mean of the data is greater than
 * epsilon.
 * </p>
 *
 * @param data
 *            data where the outliers are removed
 */
private void applyMahalanobisDistancesRemoval(Instances data) {
    RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1);
    for (int i = 0; i < data.size(); i++) {
        values.setRow(i, WekaUtils.instanceValues(data.get(i)));
    }
    RealMatrix inverseCovariance;
    try {
        inverseCovariance =
            new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver().getInverse();
    }
    catch (SingularMatrixException e) {
        Console.traceln(Level.WARNING,
                        "could not perform Mahalanobis outlier removal due to singular covariance matrix");
        return;
    }
    // create the mean vector over all non-class attributes
    double[] meanValues = new double[data.numAttributes() - 1];
    int k = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            meanValues[k] = data.attributeStats(j).numericStats.mean;
            k++;
        }
    }
    // iterate backwards so removals do not shift the remaining indices
    for (int i = data.size() - 1; i >= 0; i--) {
        double distance =
            mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)), meanValues);
        if (distance > epsilon) {
            data.remove(i);
        }
    }
}
From source file: de.ugoe.cs.cpdp.dataselection.NeighborhoodFilter.java
License: Apache License

/**
 * <p>
 * Applies the relevancy filter after Ryu et al.
 * </p>
 *
 * @param testdata
 *            test data
 * @param traindata
 *            training data
 * @return filtered training data
 */
private Instances applyNeighborhoodFilter(Instances testdata, Instances traindata) {
    TreeSet<Integer> selectedInstances = new TreeSet<>();
    for (int i = 0; i < testdata.size(); i++) {
        // first pass: find the minimal Hamming distance to any training instance
        double minHam = Double.MAX_VALUE;
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance < minHam) {
                minHam = distance;
            }
        }
        // second pass: select all training instances at that minimal distance
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance <= minHam) {
                selectedInstances.add(j);
            }
        }
    }
    Instances selectedTraindata = new Instances(testdata);
    selectedTraindata.clear();
    for (Integer index : selectedInstances) {
        selectedTraindata.add(traindata.instance(index));
    }
    return selectedTraindata;
}
From source file: de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java
License: Apache License

/**
 * <p>
 * Applies the synonym outlier removal.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed.
 */
public void applySynonymRemoval(Instances traindata) {
    double[][] minDistance = new double[traindata.size()][traindata.numAttributes() - 1];
    double[] minDistanceAttribute = new double[traindata.numAttributes() - 1];
    double distance;
    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }
    // for each instance and each non-class attribute, compute the minimal distance
    // to any other instance, as well as the global minimum per attribute
    for (int i1 = 0; i1 < traindata.size(); i1++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }
    // remove instances that do not achieve the minimal distance for any attribute;
    // iterate backwards so deletion keeps the remaining indices valid
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        for (int j = 0; !hasClosest && j < traindata.numAttributes() - 1; j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}
From source file: de.ugoe.cs.cpdp.util.WekaUtils.java
License: Apache License

/**
 * <p>
 * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
 * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
 * </p>
 *
 * @param traindata
 *            data from which the attribute is upscaled.
 * @param attributeIndex
 *            index of the attribute
 * @return data with upscaled attribute
 */
public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
    Instances traindataCopy = new Instances(traindata);
    for (int i = 0; i < traindata.size(); i++) {
        traindataCopy.get(i).setValue(attributeIndex, traindata.get(i).value(attributeIndex) * SCALER);
    }
    return traindataCopy;
}
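For reference, a call site might look like the following; the attribute index and variable names are illustrative, and SCALER is a constant defined inside WekaUtils:

// returns a scaled copy; traindata itself is not modified
Instances upscaled = WekaUtils.upscaleAttribute(traindata, 2);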
From source file: de.unidue.langtech.grading.tc.ClusterExemplarTask.java
License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    //        Add addFilter = new Add();
    //        addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    //        addFilter.setNominalLabels("0,1");
    //        addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    //        addFilter.setInputFormat(testData);

    trainData.clear();
    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();
            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}
From source file: de.unidue.langtech.grading.tc.ClusteringTask.java
License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments =
            new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get the class label and instance ID from the instance
            Instance instance = copyTrainData.get(offset);
            Double classOffset = new Double(instance.value(copyTrainData.classAttribute()));
            String label = (String) trainOutcomeValues.get(classOffset.intValue());

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t"
                + rmseString);
    }
    System.out.println();
}
From source file: de.unidue.langtech.grading.tc.ClusterTrainTask.java
License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd =
            getClusterCfd(clusterMap, copyTrainData, trainOutcomeValues);

    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // attention - cannot simply use RMSE here - as smaller values are better unlike with purity
        // double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster
    double avgPurity = 0.0;
    int n = 0;
    for (Integer clusterId : sortedClusters.keySet()) {
        // we need to take as many clusters until we have seen at least each class once
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //        // do not use clusters of single responses, as they always have purity of 1
        //        if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //            continue;
        //        }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}
From source file: de.unidue.langtech.grading.tc.ClusterTrainTask.java
License: Open Source License

private ConditionalFrequencyDistribution<Integer, String> getClusterCfd(Map<Integer, Set<Integer>> clusterMap,
        Instances data, List<String> outcomeValues) {
    ConditionalFrequencyDistribution<Integer, String> clusterAssignments =
            new ConditionalFrequencyDistribution<Integer, String>();

    for (Integer clusterId : clusterMap.keySet()) {
        for (Integer offset : clusterMap.get(clusterId)) {

            // get the class label of the instance at this offset
            Instance instance = data.get(offset);
            Double classOffset = new Double(instance.value(data.classAttribute()));
            String label = outcomeValues.get(classOffset.intValue());

            clusterAssignments.addSample(clusterId, label);
        }
    }

    return clusterAssignments;
}
From source file: de.upb.timok.utils.DatasetTransformationUtils.java
License: Open Source License

public static List<double[]> instancesToDoubles(Instances instances, boolean chopClassAttribute) {
    final List<double[]> result = new ArrayList<>();
    for (int i = 0; i < instances.size(); i++) {
        final Instance instance = instances.get(i);
        double[] temp = instance.toDoubleArray();
        if (chopClassAttribute) {
            // drop the last entry (assumes the class attribute is the last attribute)
            temp = Arrays.copyOfRange(temp, 0, temp.length - 1);
        }
        result.add(temp);
    }
    return result;
}
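A hypothetical call, assuming data is an Instances object whose last attribute is the class:

// true chops the trailing class attribute from each row
List<double[]> rows = DatasetTransformationUtils.instancesToDoubles(data, true);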
From source file: edu.oregonstate.eecs.mcplan.abstraction.EvaluateSimilarityFunction.java
License: Open Source License

public static Instances transformInstances(final Instances src, final CoordinateTransform transform) {
    final ArrayList<Attribute> out_attributes = new ArrayList<Attribute>();
    for (int i = 0; i < transform.outDimension(); ++i) {
        out_attributes.add(new Attribute("x" + i));
    }
    out_attributes.add((Attribute) src.classAttribute().copy());
    final Instances out = new Instances(src.relationName() + "_" + transform.name(), out_attributes, 0);
    for (int i = 0; i < src.size(); ++i) {
        final Instance inst = src.get(i);
        final RealVector flat = new ArrayRealVector(WekaUtil.unlabeledFeatures(inst));
        final RealVector transformed_vector = transform.encode(flat).x;
        // copy the transformed coordinates and append the class value
        final double[] transformed = new double[transformed_vector.getDimension() + 1];
        for (int j = 0; j < transformed_vector.getDimension(); ++j) {
            transformed[j] = transformed_vector.getEntry(j);
        }
        transformed[transformed.length - 1] = inst.classValue();
        final Instance transformed_instance = new DenseInstance(inst.weight(), transformed);
        out.add(transformed_instance);
        transformed_instance.setDataset(out);
    }
    out.setClassIndex(out.numAttributes() - 1);
    return out;
}