List of usage examples for weka.core Instances get
@Override
public Instance get(int index)
From source file:de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License:Apache License
/**
 * <p>
 * Applies MORPH to the data by calling {@code morphInstance} once for every instance, in index
 * order.
 * </p>
 *
 * @param data
 *            data to which the processor is applied
 */
public void applyMORPH(Instances data) {
    int position = 0;
    // the size is re-read on every pass, exactly like a classic index loop
    while (position < data.numInstances()) {
        morphInstance(data.get(position), data);
        position++;
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java
License:Apache License
@Override public void apply(Instances testdata, Instances traindata) { int indexOfConfidenceAttribute = -1; // Find index of the named confidence attribute to filter for for (int i = 0; i < traindata.numAttributes(); i++) { if (traindata.attribute(i).name().equals(nominalAttributeName)) { indexOfConfidenceAttribute = i; }/*from w w w . j av a 2 s . c om*/ } // if it was not found return if (indexOfConfidenceAttribute == -1) { return; } // Find index of nominal values Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute); ArrayList<Object> nominalValuesOfConfidenceAttribute = Collections .list(confidenceAttribute.enumerateValues()); ArrayList<Double> indexOfnominalAttributeValues = new ArrayList<Double>(); for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) { for (String attributeValue : nominalAttributeValues) { if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) { indexOfnominalAttributeValues.add((double) k); } } } // Go through all instances and check if nominal attribute equals for (int j = traindata.numInstances() - 1; j >= 0; j--) { Instance wekaInstance = traindata.get(j); // delete all instances where nominal attribute has the value of one of the parameter if (indexOfnominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) { traindata.delete(j); } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.Oversampling.java
License:Apache License
/**
 * Oversamples the instances with class value 1.0 if the nominal count at index 1 of the class
 * attribute is smaller than the count at index 0. The training data is rebuilt in place from the
 * instances that do not have class value 1.0 followed by the resampled remainder; the test data
 * is not used.
 *
 * @param testdata
 *            the test data (not modified here)
 * @param traindata
 *            the training data that is oversampled in place
 */
@Override
public void apply(Instances testdata, Instances traindata) {
    final int[] classCounts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
    if (classCounts[1] >= classCounts[0]) {
        // class 1 is not the minority: leave the data alone
        return;
    }

    // copy without the instances of class 1.0 (backwards, so removals do not shift pending indices)
    Instances negatives = new Instances(traindata);
    for (int idx = traindata.size() - 1; idx >= 0; idx--) {
        if (Double.compare(1.0, negatives.get(idx).classValue()) == 0) {
            negatives.remove(idx);
        }
    }

    // copy without the instances of class 0.0
    Instances positives = new Instances(traindata);
    for (int idx = traindata.size() - 1; idx >= 0; idx--) {
        if (Double.compare(0.0, positives.get(idx).classValue()) == 0) {
            positives.remove(idx);
        }
    }

    // sample size is chosen as the ratio of the two class counts, in percent
    Resample resample = new Resample();
    resample.setSampleSizePercent((100.0 * classCounts[0]) / classCounts[1]);
    try {
        resample.setInputFormat(traindata);
        positives = Filter.useFilter(positives, resample);
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }

    traindata.clear();
    int pos = 0;
    while (pos < negatives.size()) {
        traindata.add(negatives.get(pos));
        pos++;
    }
    pos = 0;
    while (pos < positives.size()) {
        traindata.add(positives.get(pos));
        pos++;
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.Resampling.java
License:Apache License
@Override public void apply(Instances testdata, Instances traindata) { Resample resample = new Resample(); resample.setSampleSizePercent(100);//from w w w .j a v a2 s .c o m resample.setBiasToUniformClass(1.0); Instances traindataSample; try { resample.setInputFormat(traindata); traindataSample = Filter.useFilter(traindata, resample); } catch (Exception e) { throw new RuntimeException(e); } traindata.clear(); for (int i = 0; i < traindataSample.size(); i++) { traindata.add(traindataSample.get(i)); } }
From source file:de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java
License:Apache License
/** * <p>/*from w w w .java 2s.c o m*/ * Applies the synonym pruning based on the training data. * </p> * * @param testdata * the test data * @param traindata * the training data */ private void applySynonymPruning(Instances testdata, Instances traindata) { double distance; for (int j = traindata.numAttributes() - 1; j >= 0; j--) { if (j != traindata.classIndex()) { boolean hasClosest = false; for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) { for (int i2 = 0; !hasClosest && i2 < traindata.size(); i2++) { if (i1 != i2) { double minVal = Double.MAX_VALUE; double distanceJ = Double.MAX_VALUE; for (int k = 0; k < traindata.numAttributes(); k++) { distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k)); if (distance < minVal) { minVal = distance; } if (k == j) { distanceJ = distance; } } hasClosest = distanceJ <= minVal; } } } if (!hasClosest) { testdata.deleteAttributeAt(j); traindata.deleteAttributeAt(j); } } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java
License:Apache License
private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet) throws Exception { Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1]; IntStream.range(0, counts.length).forEach(val -> counts[val] = 0); for (Instances traindata : traindataSet) { J48 decisionTree = new J48(); decisionTree.buildClassifier(traindata); int k = 0; for (int j = 0; j < traindata.numAttributes(); j++) { if (j != traindata.classIndex()) { if (decisionTree.toString().contains(traindata.attribute(j).name())) { counts[k] = counts[k] + 1; }/*from w w w . ja v a 2 s. com*/ k++; } } } int[] topkIndex = new int[counts.length]; IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val); SortUtils.quicksort(counts, topkIndex, true); // get CFSs for each training set List<Set<Integer>> cfsSets = new LinkedList<>(); for (Instances traindata : traindataSet) { boolean selectionSuccessful = false; boolean secondAttempt = false; Instances traindataCopy = null; do { try { if (secondAttempt) { AttributeSelection attsel = new AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); attsel.setEvaluator(eval); attsel.setSearch(search); attsel.SelectAttributes(traindataCopy); Set<Integer> cfsSet = new HashSet<>(); for (int attr : attsel.selectedAttributes()) { cfsSet.add(attr); } cfsSets.add(cfsSet); selectionSuccessful = true; } else { AttributeSelection attsel = new AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); attsel.setEvaluator(eval); attsel.setSearch(search); attsel.SelectAttributes(traindata); Set<Integer> cfsSet = new HashSet<>(); for (int attr : attsel.selectedAttributes()) { cfsSet.add(attr); } cfsSets.add(cfsSet); selectionSuccessful = true; } } catch (IllegalArgumentException e) { String regex = "A nominal attribute \\((.*)\\) cannot have duplicate 
labels.*"; Pattern p = Pattern.compile(regex); Matcher m = p.matcher(e.getMessage()); if (!m.find()) { // cannot treat problem, rethrow exception throw e; } String attributeName = m.group(1); int attrIndex = traindata.attribute(attributeName).index(); if (secondAttempt) { traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex); } else { traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex); } Console.traceln(Level.FINE, "upscaled attribute " + attributeName + "; restarting training"); secondAttempt = true; continue; } } while (!selectionSuccessful); // dummy loop for internal continue } double[] coverages = new double[topkIndex.length]; for (Set<Integer> cfsSet : cfsSets) { Set<Integer> topkSet = new HashSet<>(); for (int k = 0; k < topkIndex.length; k++) { topkSet.add(topkIndex[k]); coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size()); } } double bestCoverageValue = Double.MIN_VALUE; int bestCoverageIndex = 0; for (int i = 0; i < coverages.length; i++) { if (coverages[i] > bestCoverageValue) { bestCoverageValue = coverages[i]; bestCoverageIndex = i; } } // build correlation matrix SpearmansCorrelation corr = new SpearmansCorrelation(); double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex]; for (Instances traindata : traindataSet) { double[][] vectors = new double[bestCoverageIndex][traindata.size()]; for (int i = 0; i < traindata.size(); i++) { for (int j = 0; j < bestCoverageIndex; j++) { vectors[j][i] = traindata.get(i).value(topkIndex[j]); } } for (int j = 0; j < bestCoverageIndex; j++) { for (int k = j + 1; k < bestCoverageIndex; k++) { correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k])); } } } Set<Integer> topkSetIndexSet = new TreeSet<>(); // j<30 ensures that the computational time does not explode since the powerset is 2^n in // complexity for (int j = 0; j < bestCoverageIndex && j < 30; j++) { topkSetIndexSet.add(j); } Set<Set<Integer>> allCombinations = 
Sets.powerSet(topkSetIndexSet); double bestOptCoverage = Double.MIN_VALUE; Set<Integer> opttopkSetIndexSet = null; for (Set<Integer> combination : allCombinations) { if (isUncorrelated(correlationMatrix, combination)) { double currentCoverage = 0.0; Set<Integer> topkCombination = new TreeSet<>(); for (Integer index : combination) { topkCombination.add(topkIndex[index]); } for (Set<Integer> cfsSet : cfsSets) { currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size()); } if (currentCoverage > bestOptCoverage) { bestOptCoverage = currentCoverage; opttopkSetIndexSet = combination; } } } Set<Integer> opttopkIndex = new TreeSet<>(); for (Integer index : opttopkSetIndexSet) { opttopkIndex.add(topkIndex[index]); } Console.traceln(Level.FINE, "selected the following metrics:"); for (Integer index : opttopkIndex) { Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name()); } // finally remove attributes for (int j = testdata.numAttributes() - 1; j >= 0; j--) { if (j != testdata.classIndex() && !opttopkIndex.contains(j)) { testdata.deleteAttributeAt(j); for (Instances traindata : traindataSet) { traindata.deleteAttributeAt(j); } } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License:Apache License
/** * <p>/*from ww w. ja va 2 s . c o m*/ * Creates the kernel matrix of the test and training data * </p> * * @param testdata * the test data * @param traindata * the training data * @return kernel matrix */ private PrimitiveMatrix buildKernel(Instances testdata, Instances traindata) { final int kernelDim = traindata.numInstances() + testdata.numInstances(); Builder<PrimitiveMatrix> kernelBuilder = PrimitiveMatrix.getBuilder(kernelDim, kernelDim); // built upper left quadrant (source, source) for (int i = 0; i < traindata.numInstances(); i++) { for (int j = 0; j < traindata.numInstances(); j++) { kernelBuilder.set(i, j, linearKernel(traindata.get(i), traindata.get(j))); } } // built upper right quadrant (source, target) for (int i = 0; i < traindata.numInstances(); i++) { for (int j = 0; j < testdata.numInstances(); j++) { kernelBuilder.set(i, j + traindata.numInstances(), linearKernel(traindata.get(i), testdata.get(j))); } } // built lower left quadrant (target, source) for (int i = 0; i < testdata.numInstances(); i++) { for (int j = 0; j < traindata.numInstances(); j++) { kernelBuilder.set(i + traindata.numInstances(), j, linearKernel(testdata.get(i), traindata.get(j))); } } // built lower right quadrant (target, target) for (int i = 0; i < testdata.numInstances(); i++) { for (int j = 0; j < testdata.numInstances(); j++) { kernelBuilder.set(i + traindata.numInstances(), j + traindata.numInstances(), linearKernel(testdata.get(i), testdata.get(j))); } } return kernelBuilder.build(); }
From source file:de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java
License:Apache License
/** * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, * weka.core.Instances)/*from ww w. j a v a 2s . c o m*/ */ @Override public Instances apply(Instances testdata, Instances traindata) { Instances filteredTraindata = new Instances(traindata); filteredTraindata.clear(); double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1]; int classIndex = testdata.classIndex(); for (int i = 0; i < testdata.size(); i++) { int k = 0; for (int j = 0; j < testdata.numAttributes(); j++) { if (j != classIndex) { data[i][k] = testdata.get(i).value(j); k++; } } } for (int i = 0; i < traindata.size(); i++) { int k = 0; for (int j = 0; j < traindata.numAttributes(); j++) { if (j != classIndex) { data[i + testdata.size()][k] = traindata.get(i).value(j); k++; } } } DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data); Database db = new StaticArrayDatabase(dbc, null); db.initialize(); DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10); Clustering<Model> clusterer = dbscan.run(db); Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD); int firstInternalIndex = rel.iterDBIDs().internalGetIndex(); for (Cluster<Model> cluster : clusterer.getAllClusters()) { // check if cluster has any training data DBIDIter iter = rel.iterDBIDs(); boolean noMatch = true; for (int i = 0; noMatch && i < testdata.size(); i++) { noMatch = !cluster.getIDs().contains(iter); iter.advance(); } if (!noMatch) { // cluster contains test data for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) { int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex; if (internalIndex >= 0) { // index belongs to a training instance filteredTraindata.add(traindata.get(internalIndex)); } } } } return filteredTraindata; }
From source file:de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java
License:Apache License
@Override public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { final Instances data = characteristicInstances(testdata, traindataSet); final ArrayList<String> attVals = new ArrayList<String>(); attVals.add("same"); attVals.add("more"); attVals.add("less"); final ArrayList<Attribute> atts = new ArrayList<Attribute>(); for (int j = 0; j < data.numAttributes(); j++) { atts.add(new Attribute(data.attribute(j).name(), attVals)); }// ww w . ja va 2 s. c om atts.add(new Attribute("score")); Instances similarityData = new Instances("similarity", atts, 0); similarityData.setClassIndex(similarityData.numAttributes() - 1); try { Classifier classifier = new J48(); for (int i = 0; i < traindataSet.size(); i++) { classifier.buildClassifier(traindataSet.get(i)); for (int j = 0; j < traindataSet.size(); j++) { if (i != j) { double[] similarity = new double[data.numAttributes() + 1]; for (int k = 0; k < data.numAttributes(); k++) { if (0.9 * data.get(i + 1).value(k) > data.get(j + 1).value(k)) { similarity[k] = 2.0; } else if (1.1 * data.get(i + 1).value(k) < data.get(j + 1).value(k)) { similarity[k] = 1.0; } else { similarity[k] = 0.0; } } Evaluation eval = new Evaluation(traindataSet.get(j)); eval.evaluateModel(classifier, traindataSet.get(j)); similarity[data.numAttributes()] = eval.fMeasure(1); similarityData.add(new DenseInstance(1.0, similarity)); } } } REPTree repTree = new REPTree(); if (repTree.getNumFolds() > similarityData.size()) { repTree.setNumFolds(similarityData.size()); } repTree.setNumFolds(2); repTree.buildClassifier(similarityData); Instances testTrainSimilarity = new Instances(similarityData); testTrainSimilarity.clear(); for (int i = 0; i < traindataSet.size(); i++) { double[] similarity = new double[data.numAttributes() + 1]; for (int k = 0; k < data.numAttributes(); k++) { if (0.9 * data.get(0).value(k) > data.get(i + 1).value(k)) { similarity[k] = 2.0; } else if (1.1 * data.get(0).value(k) < data.get(i + 1).value(k)) { 
similarity[k] = 1.0; } else { similarity[k] = 0.0; } } testTrainSimilarity.add(new DenseInstance(1.0, similarity)); } int bestScoringProductIndex = -1; double maxScore = Double.MIN_VALUE; for (int i = 0; i < traindataSet.size(); i++) { double score = repTree.classifyInstance(testTrainSimilarity.get(i)); if (score > maxScore) { maxScore = score; bestScoringProductIndex = i; } } Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex); traindataSet.clear(); traindataSet.add(bestScoringProduct); } catch (Exception e) { Console.printerr("failure during DecisionTreeSelection: " + e.getMessage()); throw new RuntimeException(e); } }
From source file:de.ugoe.cs.cpdp.dataselection.LACE2.java
License:Apache License
@Override public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { Instances selectedData = new Instances(testdata); selectedData.clear();/*from w ww . ja v a 2 s . co m*/ LinkedList<Instances> traindataCopy = new LinkedList<>(traindataSet); Collections.shuffle(traindataCopy); CLIFF cliff = new CLIFF(); cliff.setParameter(Double.toString(percentage)); MORPH morph = new MORPH(); Median median = new Median(); double minDist = Double.MIN_VALUE; for (Instances traindata : traindataCopy) { Instances cliffedData = cliff.applyCLIFF(traindata); if (minDist == Double.MIN_VALUE) { // determine distance for leader-follower algorithm Instances sample; if (traindata.size() > 100) { Resample resample = new Resample(); resample.setSampleSizePercent(100.0 / traindata.size() * 100.0); resample.setBiasToUniformClass(0.0); resample.setNoReplacement(true); try { resample.setInputFormat(traindata); sample = Filter.useFilter(traindata, resample); } catch (Exception e) { throw new RuntimeException(e); } } else { sample = new Instances(traindata); } double[] distances = new double[sample.size()]; for (int i = 0; i < sample.size(); i++) { Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(sample.get(i), sample); distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)), WekaUtils.instanceValues(unlikeNeighbor)); } minDist = median.evaluate(distances); } for (int i = 0; i < cliffedData.size(); i++) { Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(cliffedData.get(i), selectedData); if (unlikeNeighbor == null) { selectedData.add(cliffedData.get(i)); } else { double distance = MathArrays.distance(WekaUtils.instanceValues(cliffedData.get(i)), WekaUtils.instanceValues(unlikeNeighbor)); if (distance > minDist) { morph.morphInstance(cliffedData.get(i), cliffedData); selectedData.add(cliffedData.get(i)); } } } } }