Example usage for weka.core Instances get

Introduction

On this page you can find example usages of the weka.core Instances get method.

Prototype



@Override
public Instance get(int index)

Source Link

Document

Returns the instance at the given position.

Usage

From source file:de.ugoe.cs.cpdp.dataprocessing.MORPH.java

License:Apache License

/**
 * <p>
 * Calls {@code morphInstance} once for every instance of the given data, in index order.
 * </p>
 *
 * @param data
 *            data to which the processor is applied
 */
public void applyMORPH(Instances data) {
    int position = 0;
    // the instance count is re-read on every pass, just like an index-based for loop would do
    while (position < data.numInstances()) {
        morphInstance(data.get(position), data);
        position++;
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java

License:Apache License

/**
 * Deletes every training instance whose value for the attribute named
 * {@code nominalAttributeName} matches one of the labels in {@code nominalAttributeValues}.
 * Nothing happens if the training data has no attribute with that name. The test data is not
 * accessed.
 *
 * @param testdata
 *            the test data (unused)
 * @param traindata
 *            the training data that is filtered in place
 */
@Override
public void apply(Instances testdata, Instances traindata) {
    // look up the attribute by name; if several attributes carry the name, the last one wins
    int filterAttributeIndex = -1;
    for (int a = 0; a < traindata.numAttributes(); a++) {
        if (traindata.attribute(a).name().equals(nominalAttributeName)) {
            filterAttributeIndex = a;
        }
    }
    if (filterAttributeIndex < 0) {
        // no such attribute, nothing to filter
        return;
    }

    // translate the configured labels into the numeric indices Weka stores for nominal values
    Attribute filterAttribute = traindata.attribute(filterAttributeIndex);
    ArrayList<Object> labels = Collections.list(filterAttribute.enumerateValues());
    ArrayList<Double> bannedValueIndices = new ArrayList<Double>();
    int labelIndex = 0;
    for (Object label : labels) {
        for (String bannedLabel : nominalAttributeValues) {
            if (((String) label).equals(bannedLabel)) {
                bannedValueIndices.add((double) labelIndex);
            }
        }
        labelIndex++;
    }

    // walk backwards so that deleting a row never shifts a row that is still to be visited
    int row = traindata.numInstances();
    while (--row >= 0) {
        Instance candidate = traindata.get(row);
        if (bannedValueIndices.contains(candidate.value(filterAttributeIndex))) {
            traindata.delete(row);
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.Oversampling.java

License:Apache License

/**
 * Oversamples the training data when the class with nominal index 1 has fewer instances than
 * the class with nominal index 0. In that case the training data is replaced by all instances
 * whose class value is not 1.0, followed by a resampled version of all instances whose class
 * value is not 0.0 (sample size: {@code 100 * counts[0] / counts[1]} percent). Otherwise the
 * training data is left untouched. The test data is not accessed.
 *
 * @param testdata
 *            the test data (unused)
 * @param traindata
 *            the training data that is modified in place
 */
@Override
public void apply(Instances testdata, Instances traindata) {
    final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
    if (counts[1] >= counts[0]) {
        // class 1 is not the smaller class, nothing to do
        return;
    }

    Instances classZeroPart = new Instances(traindata);
    Instances classOnePart = new Instances(traindata);

    // iterate from the end: removing row i never moves the rows below i in either copy
    int row = traindata.size();
    while (--row >= 0) {
        if (Double.compare(1.0, classZeroPart.get(row).classValue()) == 0) {
            classZeroPart.remove(row);
        }
        if (Double.compare(0.0, classOnePart.get(row).classValue()) == 0) {
            classOnePart.remove(row);
        }
    }

    Resample oversampler = new Resample();
    oversampler.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
    try {
        oversampler.setInputFormat(traindata);
        classOnePart = Filter.useFilter(classOnePart, oversampler);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    // rebuild the training data: untouched part first, oversampled part second
    traindata.clear();
    for (Instances part : new Instances[] { classZeroPart, classOnePart }) {
        for (int i = 0; i < part.size(); i++) {
            traindata.add(part.get(i));
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.Resampling.java

License:Apache License

/**
 * Replaces the content of the training data with a resampled version of itself. The Weka
 * {@code Resample} filter is configured with a sample size of 100 percent and a bias to
 * uniform class of 1.0. The test data is not accessed.
 *
 * @param testdata
 *            the test data (unused)
 * @param traindata
 *            the training data that is modified in place
 */
@Override
public void apply(Instances testdata, Instances traindata) {
    final Resample uniformResampler = new Resample();
    uniformResampler.setBiasToUniformClass(1.0);
    uniformResampler.setSampleSizePercent(100);

    final Instances resampled;
    try {
        uniformResampler.setInputFormat(traindata);
        resampled = Filter.useFilter(traindata, uniformResampler);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    // swap the original rows for the resampled ones
    traindata.clear();
    int next = 0;
    while (next < resampled.size()) {
        traindata.add(resampled.get(next));
        next++;
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.SynonymAttributePruning.java

License:Apache License

/**
 * <p>/*from w  w w .java  2s.c o m*/
 * Applies the synonym pruning based on the training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applySynonymPruning(Instances testdata, Instances traindata) {
    double distance;
    for (int j = traindata.numAttributes() - 1; j >= 0; j--) {
        if (j != traindata.classIndex()) {
            boolean hasClosest = false;
            for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) {
                for (int i2 = 0; !hasClosest && i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        double minVal = Double.MAX_VALUE;
                        double distanceJ = Double.MAX_VALUE;
                        for (int k = 0; k < traindata.numAttributes(); k++) {
                            distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k));
                            if (distance < minVal) {
                                minVal = distance;
                            }
                            if (k == j) {
                                distanceJ = distance;
                            }
                        }
                        hasClosest = distanceJ <= minVal;
                    }
                }
            }
            if (!hasClosest) {
                testdata.deleteAttributeAt(j);
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java

License:Apache License

/**
 * <p>
 * Selects a subset of attributes and removes all other non-class attributes from the test data
 * and from every training data set.
 * </p>
 * <p>
 * The steps are:
 * </p>
 * <ol>
 * <li>For each training set a J48 tree is built and it is counted, per non-class attribute, in
 * how many trees the attribute name occurs in the textual tree representation.</li>
 * <li>The counts and the matching attribute positions are sorted together by
 * {@code SortUtils.quicksort}.</li>
 * <li>For each training set a CFS subset (backwards greedy stepwise search) is determined. If
 * Weka rejects the data because a nominal attribute has duplicate labels, the attribute is
 * upscaled with {@code WekaUtils.upscaleAttribute} and the selection is repeated.</li>
 * <li>The prefix of the sorted attributes with the highest average coverage of the CFS sets is
 * determined, and among its (at most 30) members the uncorrelated combination with the highest
 * average coverage is chosen.</li>
 * </ol>
 *
 * @param testdata
 *            the test data
 * @param traindataSet
 *            the training data sets
 * @throws Exception
 *             forwarded from building the decision trees or from the attribute selection
 * @throws IllegalStateException
 *             if no uncorrelated attribute combination could be determined
 */
private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
        throws Exception {
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        // render the tree once instead of once per attribute
        final String treeDescription = decisionTree.toString();
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (treeDescription.contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get CFSs for each training set
    final Pattern duplicateLabelPattern =
        Pattern.compile("A nominal attribute \\((.*)\\) cannot have duplicate labels.*");
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        // data the selection runs on; replaced by an upscaled copy after each failed attempt
        Instances selectionData = traindata;
        while (!selectionSuccessful) {
            try {
                AttributeSelection attsel = new AttributeSelection();
                CfsSubsetEval eval = new CfsSubsetEval();
                GreedyStepwise search = new GreedyStepwise();
                search.setSearchBackwards(true);
                attsel.setEvaluator(eval);
                attsel.setSearch(search);
                attsel.SelectAttributes(selectionData);
                Set<Integer> cfsSet = new HashSet<>();
                for (int attr : attsel.selectedAttributes()) {
                    cfsSet.add(attr);
                }
                cfsSets.add(cfsSet);
                selectionSuccessful = true;
            } catch (IllegalArgumentException e) {
                final String message = e.getMessage();
                final Matcher m = (message == null) ? null : duplicateLabelPattern.matcher(message);
                if (m == null || !m.find()) {
                    // cannot treat problem, rethrow exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                selectionData = WekaUtils.upscaleAttribute(selectionData, attrIndex);
                Console.traceln(Level.FINE, "upscaled attribute " + attributeName + "; restarting training");
            }
        }
    }

    // coverages[k] is the average coverage of the CFS sets by the first k+1 sorted attributes
    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    // NEGATIVE_INFINITY is the correct start for a maximum search; Double.MIN_VALUE is the
    // smallest positive double and would reject a coverage of 0.0
    double bestCoverageValue = Double.NEGATIVE_INFINITY;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }
    // build correlation matrix
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        // only the upper triangle is filled; each training set overwrites the previous values
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }
    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j<30 ensures that the computational time does not explode since the powerset is 2^n in
    // complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.NEGATIVE_INFINITY;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    if (opttopkSetIndexSet == null) {
        // fail with a clear message instead of a NullPointerException further down
        throw new IllegalStateException("could not determine an uncorrelated attribute combination");
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }
    // finally remove attributes
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java

License:Apache License

/**
 * <p>/*from   ww w. ja  va  2 s  . c  o m*/
 * Creates the kernel matrix of the test and training data
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 * @return kernel matrix
 */
private PrimitiveMatrix buildKernel(Instances testdata, Instances traindata) {
    final int kernelDim = traindata.numInstances() + testdata.numInstances();

    Builder<PrimitiveMatrix> kernelBuilder = PrimitiveMatrix.getBuilder(kernelDim, kernelDim);
    // built upper left quadrant (source, source)
    for (int i = 0; i < traindata.numInstances(); i++) {
        for (int j = 0; j < traindata.numInstances(); j++) {
            kernelBuilder.set(i, j, linearKernel(traindata.get(i), traindata.get(j)));
        }
    }

    // built upper right quadrant (source, target)
    for (int i = 0; i < traindata.numInstances(); i++) {
        for (int j = 0; j < testdata.numInstances(); j++) {
            kernelBuilder.set(i, j + traindata.numInstances(), linearKernel(traindata.get(i), testdata.get(j)));
        }
    }

    // built lower left quadrant (target, source)
    for (int i = 0; i < testdata.numInstances(); i++) {
        for (int j = 0; j < traindata.numInstances(); j++) {
            kernelBuilder.set(i + traindata.numInstances(), j, linearKernel(testdata.get(i), traindata.get(j)));
        }
    }

    // built lower right quadrant (target, target)
    for (int i = 0; i < testdata.numInstances(); i++) {
        for (int j = 0; j < testdata.numInstances(); j++) {
            kernelBuilder.set(i + traindata.numInstances(), j + traindata.numInstances(),
                    linearKernel(testdata.get(i), testdata.get(j)));
        }
    }
    return kernelBuilder.build();
}

From source file:de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java

License:Apache License

/**
 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)/*from ww  w. j  a v  a 2s  . c o  m*/
 */
@Override
public Instances apply(Instances testdata, Instances traindata) {
    Instances filteredTraindata = new Instances(traindata);
    filteredTraindata.clear();

    double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1];
    int classIndex = testdata.classIndex();
    for (int i = 0; i < testdata.size(); i++) {
        int k = 0;
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i][k] = testdata.get(i).value(j);
                k++;
            }
        }
    }
    for (int i = 0; i < traindata.size(); i++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i + testdata.size()][k] = traindata.get(i).value(j);
                k++;
            }
        }
    }
    DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
    Database db = new StaticArrayDatabase(dbc, null);
    db.initialize();
    DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10);
    Clustering<Model> clusterer = dbscan.run(db);
    Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
    int firstInternalIndex = rel.iterDBIDs().internalGetIndex();

    for (Cluster<Model> cluster : clusterer.getAllClusters()) {
        // check if cluster has any training data
        DBIDIter iter = rel.iterDBIDs();
        boolean noMatch = true;
        for (int i = 0; noMatch && i < testdata.size(); i++) {
            noMatch = !cluster.getIDs().contains(iter);
            iter.advance();
        }
        if (!noMatch) {
            // cluster contains test data
            for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) {
                int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex;
                if (internalIndex >= 0) {
                    // index belongs to a training instance
                    filteredTraindata.add(traindata.get(internalIndex));
                }
            }

        }
    }

    return filteredTraindata;
}

From source file:de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java

License:Apache License

/**
 * Reduces the training data sets to the single set that a regression tree predicts to work
 * best for the test data.
 * <p>
 * For every ordered pair of distinct training sets, a J48 classifier trained on the first set
 * is evaluated on the second; the resulting F-measure (class index 1) is the score of a
 * similarity vector that encodes per characteristic whether the second set is "same", "more"
 * or "less" than the first. A REPTree is trained on these vectors and then used to predict the
 * score of each training set relative to the test data. The set with the highest predicted
 * score replaces the content of {@code traindataSet}.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindataSet
 *            the training data sets; reduced in place to the best-scoring set
 * @throws RuntimeException
 *             wrapping any exception raised during training, evaluation, or selection
 */
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // row 0 is compared against rows i + 1 below, i.e., row 0 is treated as the test data and
    // row i + 1 as training set i
    final Instances data = characteristicInstances(testdata, traindataSet);

    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);

    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    double[] similarity = compareCharacteristics(data, i + 1, j + 1);

                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }
        REPTree repTree = new REPTree();
        // the number of folds is fixed to 2
        repTree.setNumFolds(2);
        repTree.buildClassifier(similarityData);

        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            testTrainSimilarity.add(new DenseInstance(1.0, compareCharacteristics(data, 0, i + 1)));
        }

        int bestScoringProductIndex = -1;
        // NEGATIVE_INFINITY is the correct start for a maximum search; Double.MIN_VALUE is the
        // smallest positive double and would reject every score that is zero or negative
        double maxScore = Double.NEGATIVE_INFINITY;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        if (bestScoringProductIndex < 0) {
            throw new IllegalStateException("no training data set received a comparable score");
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    } catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}

/**
 * Encodes per attribute how one row of the characteristics compares to a reference row.
 *
 * @param data
 *            the characteristic instances
 * @param referenceIndex
 *            index of the reference row
 * @param otherIndex
 *            index of the row that is compared to the reference
 * @return array with one entry per attribute plus one trailing entry that is left at 0.0 for
 *         the score; an entry is 2.0 ("less") if the other value is below 90% of the reference
 *         value, 1.0 ("more") if it is above 110% of the reference value, and 0.0 ("same")
 *         otherwise
 */
private static double[] compareCharacteristics(Instances data, int referenceIndex, int otherIndex) {
    final int numAttributes = data.numAttributes();
    final double[] similarity = new double[numAttributes + 1];
    for (int k = 0; k < numAttributes; k++) {
        final double reference = data.get(referenceIndex).value(k);
        final double other = data.get(otherIndex).value(k);
        if (0.9 * reference > other) {
            similarity[k] = 2.0;
        } else if (1.1 * reference < other) {
            similarity[k] = 1.0;
        } else {
            similarity[k] = 0.0;
        }
    }
    return similarity;
}

From source file:de.ugoe.cs.cpdp.dataselection.LACE2.java

License:Apache License

/**
 * Selects training data with a leader-follower scheme and replaces the content of
 * {@code traindataSet} with the selected instances.
 * <p>
 * The training sets are visited in random order. Each set is first reduced with CLIFF. The
 * distance threshold is determined once, from the first visited set, as the median distance
 * between (at most about 100 sampled) instances and their nearest unlike neighbor. A CLIFFed
 * instance is added to the selection unchanged if the selection contains no unlike neighbor
 * for it; if its nearest unlike neighbor in the selection is farther away than the threshold,
 * it is morphed with MORPH and then added; otherwise it is dropped.
 * </p>
 *
 * @param testdata
 *            the test data; only its structure is used for the selected data
 * @param traindataSet
 *            the training data sets; afterwards contains only the selected data
 */
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    Instances selectedData = new Instances(testdata);
    selectedData.clear();

    LinkedList<Instances> traindataCopy = new LinkedList<>(traindataSet);
    Collections.shuffle(traindataCopy);

    CLIFF cliff = new CLIFF();
    cliff.setParameter(Double.toString(percentage));
    MORPH morph = new MORPH();
    Median median = new Median();
    // explicit flag instead of a magic value that marks the threshold as "not yet computed"
    boolean minDistDetermined = false;
    double minDist = 0.0;

    for (Instances traindata : traindataCopy) {
        Instances cliffedData = cliff.applyCLIFF(traindata);
        if (!minDistDetermined) {
            // determine distance for leader-follower algorithm
            Instances sample;
            if (traindata.size() > 100) {
                Resample resample = new Resample();
                // percentage that corresponds to a sample of 100 instances
                resample.setSampleSizePercent(100.0 / traindata.size() * 100.0);
                resample.setBiasToUniformClass(0.0);
                resample.setNoReplacement(true);
                try {
                    resample.setInputFormat(traindata);
                    sample = Filter.useFilter(traindata, resample);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            } else {
                sample = new Instances(traindata);
            }
            double[] distances = new double[sample.size()];
            for (int i = 0; i < sample.size(); i++) {
                Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(sample.get(i), sample);
                distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)),
                        WekaUtils.instanceValues(unlikeNeighbor));
            }
            minDist = median.evaluate(distances);
            minDistDetermined = true;
        }
        for (int i = 0; i < cliffedData.size(); i++) {
            Instance candidate = cliffedData.get(i);
            Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(candidate, selectedData);
            if (unlikeNeighbor == null) {
                selectedData.add(candidate);
            } else {
                double distance = MathArrays.distance(WekaUtils.instanceValues(candidate),
                        WekaUtils.instanceValues(unlikeNeighbor));
                if (distance > minDist) {
                    morph.morphInstance(candidate, cliffedData);
                    selectedData.add(candidate);
                }
            }
        }
    }

    // hand the result back to the caller; without this the selection would simply be discarded
    traindataSet.clear();
    traindataSet.add(selectedData);
}