Example usage for weka.core Instances instance

List of usage examples for weka.core Instances instance

Introduction

In this page you can find the example usage for weka.core Instances instance.

Prototype



public Instance instance(int index)

Source Link

Document

Returns the instance at the given position.

Usage

From source file:elh.eus.absa.CLI.java

License:Open Source License

/**
 * Main access to the train-atc functionalities. Trains ATC using a double one vs. all classifier
 * (E and A) for E#A aspect categories.
 *
 * @param inputStream stream with the training corpus
 * @throws IOException if the corpus or the test set cannot be read
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    // acceptance thresholds for the two one-vs-all prediction passes
    // NOTE(review): hard-coded to 0.5; consider exposing these as CLI parameters
    double threshold = 0.5;
    double threshold2 = 0.5;
    // NOTE(review): hard-coded user-specific path; should come from configuration
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    // in test-only mode, replace the training data with the test set (if it exists)
    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)

    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();      
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {

        //classify.printMultilabelPredictions(classify.multiLabelPrediction());      */   

        // FIRST PASS: one-vs-all over the entity category (entCat).
        // Copy the data and drop the other candidate class attributes so that
        // only entCat remains as the class.
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());
        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        // per-instance map: instance id -> (class value -> prediction score)
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");
        // invert the opinion->instance map so instances can be traced back to opinion ids
        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        // SECOND FEATURE SET: reload instances with the second parameter file
        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        // highest instanceId in the data; new (cloned) instances get ids above it
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            // counts how many classes passed the threshold for this instance
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                    // subsequent matches (oSubId >= 1): clone the instance as a new
                    // one with a fresh instanceId and the extra predicted class
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);

                    }
                    // first match: update the current instance in place
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data                                    
        }

        // SECOND PASS: one-vs-all over the attribute category (attCat)
        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /**
         *  Second classifier
         * 
         * */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        // NOTE(review): models above are trained/saved with prefix paramFile + "attCat",
        // but predictions here use paramFile + "entAttCat" — likely a bug; confirm the
        // model file names WekaWrapper expects before changing.
        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        // NOTE(review): computed differently from the first pass
        // (numDistinctValues(insAtt) - 1 above vs insAtt.numValues() here);
        // verify insAtt.numValues() is a valid k for kthSmallestValue.
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    // NOTE(review): this inner check repeats the outer one with
                    // threshold (== threshold2 here), so it is currently redundant
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                        // subsequent matches: add an extra E#A opinion for this sentence
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);                     
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // first match: replace the original opinion with the labeled one
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                } //finished updating instances data                                    
            }
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        // NOTE(review): all errors (training, prediction, I/O) are swallowed here
        // and only printed; the method still reports DONE below. Consider rethrowing.
        e.printStackTrace();
    }

    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}

From source file:elh.eus.absa.WekaWrapper.java

License:Open Source License

/**
 * Trains one vs. all models over the given training data: for each value of the
 * current class attribute, a binary (value vs. "UNKNOWN") dataset is built and a
 * model is trained, saved and tested on it. The original training data is
 * restored before returning.
 *
 * @param modelpath directory to store each model for the one vs. all method
 * @param prefix prefix the models should have (each model will have the name of its class appended)
 * @throws Exception if training, saving or testing a model fails
 */
public void trainOneVsAll(String modelpath, String prefix) throws Exception {
    // untouched copy of the training data, restored at the end
    Instances orig = new Instances(traindata);
    Enumeration<Object> classValues = traindata.classAttribute().enumerateValues();
    String classAtt = traindata.classAttribute().name();
    while (classValues.hasMoreElements()) {
        String v = (String) classValues.nextElement();
        System.err.println("trainer onevsall for class " + v + " classifier");
        //needed because of weka's sparse data format problems THIS IS TROUBLE! ...
        // "dummy" is a placeholder class value (see below), never a real model target
        if (v.equalsIgnoreCase("dummy")) {
            continue;
        }
        // copy instances and set the same class value
        Instances ovsa = new Instances(orig);
        // Declare a replacement binary class attribute along with its values.
        // "dummy" occupies nominal index 0 as a workaround for weka's sparse
        // format (sparse instances drop index-0 values).
        ArrayList<String> classVal = new ArrayList<String>();
        classVal.add("dummy"); //needed because of weka's sparse data format problems...
        classVal.add(v);
        classVal.add("UNKNOWN");
        ovsa.insertAttributeAt(new Attribute(classAtt + "2", classVal), ovsa.numAttributes());
        // relabel: instances of the current class keep it, all others become "UNKNOWN"
        for (int i = 0; i < ovsa.numInstances(); i++) {
            Instance inst = ovsa.instance(i);
            String instClass = inst.stringValue(ovsa.attribute(classAtt).index());
            if (instClass.equalsIgnoreCase(v)) {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), v);
            } else {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), "UNKNOWN");
            }
        }
        // Delete the old class attribute and set the new one.
        // Order matters: deleting shifts attribute indices, so the class index
        // is set again after the rename.
        ovsa.setClassIndex(ovsa.attribute(classAtt + "2").index());
        ovsa.deleteAttributeAt(ovsa.attribute(classAtt).index());
        ovsa.renameAttribute(ovsa.attribute(classAtt + "2").index(), classAtt);
        ovsa.setClassIndex(ovsa.attribute(classAtt).index());

        //build the classifier, crossvalidate and store the model
        // NOTE(review): no explicit train call here — presumably saveModel()
        // builds the classifier internally; confirm in WekaWrapper.
        setTraindata(ovsa);
        saveModel(modelpath + File.separator + prefix + "_" + v + ".model");
        setTestdata(ovsa);
        testModel(modelpath + File.separator + prefix + "_" + v + ".model");

        System.err.println("trained onevsall " + v + " classifier");
    }

    // restore the original (multi-class) training data
    setTraindata(orig);
}

From source file:entities.ArffFile.java

/**
 * Runs the microaggregation filter: clusters the selected attributes with
 * SimpleKMeans and replaces their values by the cluster centroids. All
 * parameters come from user input.
 *
 * @param df distance function (Euclidean or Manhattan, chosen by the user)
 * @param numCluster number of clusters
 * @param seed RNG seed for k-means
 * @param maxIterations maximum number of k-means iterations
 * @param replaceMissingValues forwarded to the clusterer (see note below)
 * @param preserveInstancesOrder whether the clusterer preserves instance order
 * @param attributes indices of the attributes to generalize via clustering
 * @throws IllegalArgumentException if any selected attribute is of type DATE or STRING
 */
public void microAgregacion(DistanceFunction df, int numCluster, int seed, int maxIterations,
        boolean replaceMissingValues, boolean preserveInstancesOrder, List<Integer> attributes)
        throws Exception {
    SimpleKMeans kMeans = new SimpleKMeans();
    // working copy that will be reduced to only the selected attributes
    Instances uniqueAttributes = new Instances(instancesFilter);
    List<String> names = new ArrayList<>();
    // validate the selection and remember the attribute names to keep
    for (Integer attribute : attributes) {
        if (instancesFilter.attribute(attribute).isDate() || instancesFilter.attribute(attribute).isString())
            throw new IllegalArgumentException(
                    "No se puede hacer cluster con atributos de tipo DATE o STRING");
        names.add(instancesFilter.attribute(attribute).name());
    }
    // drop every attribute not in the selection; only advance the index when
    // the current attribute is kept, since deletion shifts the rest left
    int i = 0;
    while (uniqueAttributes.numAttributes() != attributes.size()) {
        if (!names.contains(uniqueAttributes.attribute(i).name()))
            uniqueAttributes.deleteAttributeAt(i);
        else
            i++;
    }
    try {
        kMeans.setNumClusters(numCluster);
        kMeans.setMaxIterations(maxIterations);
        kMeans.setSeed(seed);
        kMeans.setDisplayStdDevs(false);
        kMeans.setDistanceFunction(df);
        // NOTE(review): the flag is passed straight into setDontReplaceMissingValues,
        // which looks inverted relative to the parameter name — confirm intended
        // semantics before changing (behavior kept as-is here).
        kMeans.setDontReplaceMissingValues(replaceMissingValues);
        kMeans.setPreserveInstancesOrder(preserveInstancesOrder);
        kMeans.buildClusterer(uniqueAttributes);
        // overwrite each instance's selected attributes with its centroid's values
        for (int j = 0; j < uniqueAttributes.numInstances(); j++) {
            int cluster = kMeans.clusterInstance(uniqueAttributes.instance(j));
            for (int k = 0; k < uniqueAttributes.numAttributes(); k++) {
                if (uniqueAttributes.attribute(k).isNumeric())
                    uniqueAttributes.instance(j).setValue(k,
                            Double.parseDouble(kMeans.getClusterCentroids().instance(cluster).toString(k)));
                else
                    uniqueAttributes.instance(j).setValue(k,
                            kMeans.getClusterCentroids().instance(cluster).toString(k));
            }
        }
        // copy the generalized values back into the full dataset
        replaceValues(uniqueAttributes, attributes);
    } catch (Exception ex) {
        Logger.getLogger(ArffFile.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:entities.ArffFile.java

/**
 * Agrega los nuevos valores que se encuentran en uniqueAttribute
 * A instancesFilter para luego ser exportado en archivo arff
 * @param uniqueAttribute /*from w  w  w.  jav a 2s.c o m*/
 */
public void replaceValues(Instances uniqueAttribute, List<Integer> attributes) {
    for (int i = 0; i < instancesFilter.numInstances(); i++) {
        for (int j = 0; j < attributes.size(); j++) {
            if (instancesFilter.attribute(attributes.get(j)).isNumeric())
                instancesFilter.instance(i).setValue(attributes.get(j),
                        Double.parseDouble(uniqueAttribute.instance(i).toString(j)));
            else
                instancesFilter.instance(i).setValue(attributes.get(j),
                        uniqueAttribute.instance(i).toString(j));
        }
    }
}

From source file:entity.DifficultyResamplingManager.java

License:Open Source License

/**
 * Returns the maximum dimensions (positives p, negatives n) of a subdataset of
 * the original dataset whose positive-rate (PR) matches the desired percentage.
 * Starting from the full dataset, p or n is decremented until the ceiling of
 * the PR reaches the target, stepping back one if the previous value was closer.
 *
 * @param originalDataset the dataset to size the subdataset from
 * @param positiveExamplePercentProportion desired positive percentage (only its int value is used)
 * @return the computed (p, n) dimensions
 */
public SubdatasetDimensions calculateSubdatasetDimensionsForProportion(Instances originalDataset,
        BigDecimal positiveExamplePercentProportion) {

    // size of subdataset, initialized to original size
    int total = originalDataset.numInstances();
    // number of positive instances
    int p = 0;
    // number of negative instances
    int n = 0;
    // current PR
    int pp = 0;

    // count positives
    for (int i = 0; i < total; i++) {
        if (originalDataset.instance(i).stringValue(originalDataset.classIndex()).equals(Settings.buggyLabel)) {
            p++;
        }
    }

    n = total - p;

    // finds actual PR
    pp = calculatePositivePercentCeil(p + n, p);

    if (verbose)
        System.out.println(
                "[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] attuale: p=" + p
                        + " n=" + n + " pp = " + pp);

    // if current PR equals desired one, return current dimensions
    if (pp == positiveExamplePercentProportion.intValue())
        return new SubdatasetDimensions(p, n);

    // if current PR is greater than the desired one:
    // decrements p until the ceiling of the current PR is no longer greater than the desired one
    if (pp > positiveExamplePercentProportion.intValue()) {
        while (pp > positiveExamplePercentProportion.intValue()) {
            p--;
            pp = calculatePositivePercentCeil(p + n, p);
            if (verbose)
                System.out
                        .println("[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] p="
                                + p + " n=" + n + " pp = " + pp);
        }
        // goes back if the previous PR was "nearer" to the desired than the current one
        if (isPPPNearerThanPPToDesiredPercent(calculatePositivePercentCeil(p + 1 + n, p + 1), pp,
                positiveExamplePercentProportion.intValue())) {
            p++;
            pp = calculatePositivePercentCeil(p + n, p);
        }
    }

    // if current PR is less than the desired one:
    // decrements n (which raises the PR) until it is no longer less than the desired one
    if (pp < positiveExamplePercentProportion.intValue()) {
        while (pp < positiveExamplePercentProportion.intValue()) {
            n--;
            pp = calculatePositivePercentCeil(p + n, p);
            if (verbose)
                System.out
                        .println("[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] p="
                                + p + " n=" + n + " pp = " + pp);
        }
        // goes back if the previous PR was "nearer" to the desired than the current one
        if (isPPPNearerThanPPToDesiredPercent(calculatePositivePercentCeil(p + n + 1, p), pp,
                positiveExamplePercentProportion.intValue())) {
            n++;
            pp = calculatePositivePercentCeil(p + n, p);
        }
    }

    if (verbose)
        System.out
                .println("[DifficultyResamplingManager , calculateSubdatasetDimensionsForProportion] finale p="
                        + p + " n=" + n + " pp = " + pp);
    return new SubdatasetDimensions(p, n);
}

From source file:entity.DifficultyResamplingManager.java

License:Open Source License

/**
 * Builds a resampled subdataset with exactly the requested number of positive
 * and negative instances, picked from a randomized ordering of the original
 * dataset. Called by generateResampledSubdataset.
 *
 * @param originalDataset source dataset (its instance order is randomized as a side effect)
 * @param subdatasetDimensions requested counts of positives (p) and negatives (n)
 * @return the resampled subdataset
 */
private Instances generateResampledSubdataset(Instances originalDataset,
        SubdatasetDimensions subdatasetDimensions) {

    // empty dataset sharing the original header
    Instances resampledSubdataset = new Instances(originalDataset);
    resampledSubdataset.delete();

    // shuffle so the selection below picks a random subset
    originalDataset.randomize(RandomizationManager.randomGenerator);

    // how many positives are still needed
    int remainingPositives = subdatasetDimensions.getP();
    if (verbose)
        System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] positivesToInsert = "
                + remainingPositives);

    // how many negatives are still needed
    int remainingNegatives = subdatasetDimensions.getN();

    // walk the shuffled dataset, taking instances until both quotas are filled
    for (int idx = 0; idx < originalDataset.numInstances(); idx++) {
        String label = originalDataset.instance(idx).stringValue(originalDataset.classIndex());

        // positive instance still needed → take it
        if (remainingPositives > 0 && label.equals(Settings.buggyLabel)) {
            resampledSubdataset.add(originalDataset.instance(idx));
            remainingPositives--;
        }

        // negative instance still needed → take it
        if (remainingNegatives > 0 && label.equals(Settings.nonbuggyLabel)) {
            resampledSubdataset.add(originalDataset.instance(idx));
            remainingNegatives--;
        }
    }

    if (verbose)
        System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] resampling terminato: "
                + this.printDatasetInfo(resampledSubdataset));
    return resampledSubdataset;
}

From source file:entity.DifficultyResamplingManager.java

License:Open Source License

/**
 * Prints the number of positive and negative instances and the positive percentage.
 *
 * @param dataset the dataset to summarize
 * @return a summary string with total instances, p+n, p, n and %p
 */
public String printDatasetInfo(Instances dataset) {

    int positives = 0;
    int negatives = 0;

    // count instances by class label (labels assumed mutually exclusive)
    for (int i = 0; i < dataset.numInstances(); i++) {

        if (dataset.instance(i).stringValue(dataset.classIndex()).equals(Settings.buggyLabel)) {
            positives++;
        }

        if (dataset.instance(i).stringValue(dataset.classIndex()).equals(Settings.nonbuggyLabel)) {
            negatives++;
        }
    }

    double percent = ((double) positives / (double) dataset.numInstances()) * 100;
    // string concatenation is sufficient here; the redundant new String(...) was removed
    return "totale istanze: " + dataset.numInstances() + ", p+n=" + (positives + negatives) + ", p: "
            + positives + ", n: " + negatives + ", %p : " + percent;
}

From source file:entity.NoiseInjectionManager.java

License:Open Source License

/**
 * Increments fp and fn by the specified percentages.
 * Randomizes the order of instances and flips class labels until the noise
 * quota is reached, then randomizes the instances again.
 * NOTE: it modifies the given dataset, because it is a reference.
 *
 * @param origDataset dataset to inject noise into (modified in place)
 * @param fpPercentage percentage of true negatives to flip into false positives
 * @param fnPercentage percentage of true positives to flip into false negatives
 * @return Instances noisyDataset (the same, modified, origDataset)
 */
public Instances addNoiseToDataset(Instances origDataset, BigDecimal fpPercentage, BigDecimal fnPercentage) {

    // exits if no noise must be added.
    // Fix: use compareTo instead of equals — BigDecimal.equals is scale-sensitive,
    // so e.g. new BigDecimal("0.0") is NOT equal to BigDecimal.ZERO and the early
    // exit would be skipped for zero-valued inputs with a nonzero scale.
    if (fnPercentage.compareTo(BigDecimal.ZERO) == 0 && fpPercentage.compareTo(BigDecimal.ZERO) == 0) {
        if (verbose)
            System.out.println("[NoiseManager , addNoiseToDataset] nessun errore da aggiungere");
        return origDataset;
    }

    // total instances in dataset
    int numInstances = origDataset.numInstances();

    // finds positive (buggy) and negative (non-buggy) instances numbers
    int numOfPositives = 0;
    int numOfNegatives = 0;

    for (int j = 0; j < numInstances; j++) {

        if (origDataset.instance(j).stringValue(origDataset.classIndex()).equals(Settings.buggyLabel)) {
            numOfPositives++;
        }
        // this is a redundant control, but better safe than sorry
        else if (origDataset.instance(j).stringValue(origDataset.classIndex()).equals(Settings.nonbuggyLabel)) {
            numOfNegatives++;
        }
    }

    // calculates the number of false positives to insert
    int fpToInsert = (int) Math.round(numOfNegatives * fpPercentage.doubleValue() / 100);
    int fpInserted = 0;
    if (verbose)
        System.out.println("\n\n[NoiseManager , addNoiseToDataset] fpToInsert= " + fpToInsert
                + ", totIntances= " + origDataset.numInstances() + " true negatives= " + numOfNegatives
                + " %fp= " + fpPercentage);

    // calculates the number of false negatives to insert
    int fnToInsert = (int) Math.round(numOfPositives * fnPercentage.doubleValue() / 100);
    int fnInserted = 0;
    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] fnToInsert= " + fnToInsert + ", totIntances= "
                + origDataset.numInstances() + " true positives= " + numOfPositives + " %fn= " + fnPercentage);

    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] buggy label: " + Settings.buggyLabel
                + " - nonbuggy label: " + Settings.nonbuggyLabel);

    // randomize order of instances so the flipped instances are a random subset
    origDataset.randomize(RandomizationManager.randomGenerator);

    for (int i = 0; i < origDataset.numInstances(); i++) {
        if (verbose)
            System.out.print("\nORIGINAL VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));

        // gets the classification attribute (it HAS to be the last)
        Attribute att = origDataset.instance(i).attribute(origDataset.classIndex());

        // if there are fn to add and this is a positive instance it turns it into a negative, making it a fn
        if ((fnInserted < fnToInsert) && (origDataset.instance(i).stringValue(origDataset.classIndex())
                .equals(Settings.buggyLabel))) {

            origDataset.instance(i).setValue(att, Settings.nonbuggyLabel);
            fnInserted++;
            if (verbose)
                System.out.print(" - added FN, added " + fnInserted + " of " + fnToInsert + " ");
        }

        // if there are fp to add and this is a negative instance it turns it into a positive, making it a fp
        else if ((fpInserted < fpToInsert) && (origDataset.instance(i).stringValue(origDataset.classIndex())
                .equals(Settings.nonbuggyLabel))) {

            origDataset.instance(i).setValue(att, Settings.buggyLabel);
            fpInserted++;
            if (verbose)
                System.out.print(" - added FP, added " + fpInserted + " of " + fpToInsert + " ");

        }

        if (verbose)
            System.out.print(" FINAL ELEMENT VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));
    }

    // randomize order of instances again so flipped instances are not clustered up front
    origDataset.randomize(RandomizationManager.randomGenerator);
    return origDataset;
}

From source file:entity.NoiseInjectionManager.java

License:Open Source License

/**
 * Increments fp and fn in combination by a specified percentage.
 * Randomizes the order of instances and flips class labels until the combined
 * noise quota is reached, then randomizes the instances again.
 * NOTE: it modifies the given dataset, because it is a reference.
 *
 * @param origDataset dataset to inject noise into (modified in place)
 * @param combinedFpFnPercentage combined percentage of instances to flip (fp + fn)
 * @return noisydata (the same, modified, origDataset)
 */
public Instances addNoiseToDataset(Instances origDataset, BigDecimal combinedFpFnPercentage) {

    // exits if no noise must be added.
    // Fix: use compareTo instead of equals — BigDecimal.equals is scale-sensitive,
    // so e.g. new BigDecimal("0.0") is NOT equal to BigDecimal.ZERO and the early
    // exit would be skipped for zero-valued inputs with a nonzero scale.
    if (combinedFpFnPercentage.compareTo(BigDecimal.ZERO) == 0) {
        if (verbose)
            System.out.println("[NoiseManager , addNoiseToDataset] nessun errore da aggiungere");
        return origDataset;
    }

    // total instances in dataset
    int numInstances = origDataset.numInstances();

    // number of labels to flip, computed over the whole dataset
    int fpAndFnToInsert = (int) Math.round(numInstances * combinedFpFnPercentage.doubleValue() / 100);
    int fpAndFnInserted = 0;
    if (verbose)
        System.out.println("\n\n[NoiseManager , addNoiseToDataset] fpAndFnToInsert= " + fpAndFnToInsert
                + ", totIntances= " + origDataset.numInstances());

    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] buggy label: " + Settings.buggyLabel
                + " - nonbuggy label: " + Settings.nonbuggyLabel);

    // randomize order of instances so the flipped instances are a random subset
    origDataset.randomize(RandomizationManager.randomGenerator);

    for (int i = 0; i < origDataset.numInstances(); i++) {
        if (verbose)
            System.out.print("\nORIGINAL VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));

        // gets the classification attribute (it HAS to be the last)
        Attribute att = origDataset.instance(i).attribute(origDataset.classIndex());

        // if there are still labels to flip
        if (fpAndFnInserted < fpAndFnToInsert) {

            // if this is a positive instance it turns it into a negative, making it a fn
            if (origDataset.instance(i).stringValue(origDataset.classIndex()).equals(Settings.buggyLabel)) {

                if (verbose)
                    System.out.print(" - added FN, added " + fpAndFnInserted + " of " + fpAndFnToInsert + " ");
                origDataset.instance(i).setValue(att, Settings.nonbuggyLabel);
                fpAndFnInserted++;
            }

            // if this is a negative instance it turns it into a positive, making it a fp
            else if (origDataset.instance(i).stringValue(origDataset.classIndex())
                    .equals(Settings.nonbuggyLabel)) {

                if (verbose)
                    System.out.print(" - added FP, added " + fpAndFnInserted + " of " + fpAndFnToInsert + " ");
                origDataset.instance(i).setValue(att, Settings.buggyLabel);
                fpAndFnInserted++;
            }
        }

        if (verbose)
            System.out.print(" FINAL ELEMENT VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));
    }

    // randomize order of instances again so flipped instances are not clustered up front
    origDataset.randomize(RandomizationManager.randomGenerator);
    return origDataset;
}

From source file:es.jarias.FMC.ClassCompoundTransformation.java

License:Open Source License

/**
 * Transforms a multi-label dataset into a single-label one by encoding every
 * possible label combination (label powerset) as a nominal class value.
 * Labels are assumed binary, so all 2^numLabels combinations are generated as
 * class values, whether or not they occur in the data.
 *
 * @param mlData the multi-label input data
 * @return the transformed instances
 * @throws Exception if label removal or attribute manipulation fails
 */
public Instances transformInstances(MultiLabelInstances mlData) throws Exception {
    data = mlData.getDataSet();
    numLabels = mlData.getNumLabels();
    labelIndices = mlData.getLabelIndices();

    Instances newData = null;

    // Enumerate ALL 2^numLabels binary label combinations (not only the ones
    // present in the data), so unseen combinations are still valid classes.
    ArrayList<LabelSet> labelSets = new ArrayList<LabelSet>();

    double nCombinations = Math.pow(2, numLabels);

    for (int i = 0; i < nCombinations; i++) {
        // Fix: allocate a fresh array per combination. The original reused a
        // single array across iterations; if LabelSet stores the reference
        // rather than copying, every LabelSet would alias the same (last) data.
        double[] dblLabels = new double[numLabels];
        for (int l = 0; l < numLabels; l++) {
            // test bit (numLabels - 1 - l) of i; integer division yields 0 or 1
            int digit = (int) Math.pow(2, numLabels - 1 - l);
            dblLabels[l] = (digit & i) / digit;
        }
        labelSets.add(new LabelSet(dblLabels));
    }

    // create the class attribute: one nominal value per label combination
    ArrayList<String> classValues = new ArrayList<String>(labelSets.size());
    for (LabelSet subset : labelSets) {
        classValues.add(subset.toBitString());
    }
    newClass = new Attribute("class", classValues);

    // remove all label attributes from the data
    newData = RemoveAllLabels.transformInstances(data, labelIndices);

    // append the new class attribute and make it the class
    newData.insertAttributeAt(newClass, newData.numAttributes());
    newData.setClassIndex(newData.numAttributes() - 1);

    // set each instance's class to the bit string built from its original labels
    for (int i = 0; i < newData.numInstances(); i++) {
        StringBuilder strClass = new StringBuilder();
        for (int j = 0; j < numLabels; j++) {
            int index = labelIndices[j];
            strClass.append(data.attribute(index).value((int) data.instance(i).value(index)));
        }
        newData.instance(i).setClassValue(strClass.toString());
    }
    // keep an empty copy as the transformed-format template
    transformedFormat = new Instances(newData, 0);
    return newData;
}