Example usage for weka.core Instances kthSmallestValue

Introduction

In this page you can find the example usage for weka.core Instances kthSmallestValue.

Prototype

public double kthSmallestValue(int attIndex, int k)

Source Link

Document

Returns the kth-smallest attribute value of a numeric attribute.

Usage

From source file:br.ufrn.ia.core.clustering.SimpleKMeansIaProject.java

License:Open Source License

protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;//from  ww w.j  a  v a  2s .com
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        // in case of Euclidian distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the
        // mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                sortedMembers.kthSmallestValue(j, middle + 1);
                vals[j] = sortedMembers.instance(middle).value(j);
                if (dataIsEven) {
                    sortedMembers.kthSmallestValue(j, middle + 2);
                    vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo)
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    return vals;
}

From source file:clusterer.SimpleKMeansWithSilhouette.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid
 * coordinates based on it's members (objects assigned to the cluster of the
 * centroid) and the distance function being used.
 * /* w  ww . j  a v  a  2s .  c  om*/
 * @param centroidIndex index of the centroid which the coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays
 * @param addToCentroidInstances true if the method is to add the computed
 *          coordinates to the Instances holding the centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo,
        boolean addToCentroidInstances) {

    double[] vals = new double[members.numAttributes()];
    double[][] nominalDists = new double[members.numAttributes()][];
    double[] weightMissing = new double[members.numAttributes()];
    double[] weightNonMissing = new double[members.numAttributes()];

    // Quickly calculate some relevant statistics 
    for (int j = 0; j < members.numAttributes(); j++) {
        if (members.attribute(j).isNominal()) {
            nominalDists[j] = new double[members.attribute(j).numValues()];
        }
    }
    for (Instance inst : members) {
        for (int j = 0; j < members.numAttributes(); j++) {
            if (inst.isMissing(j)) {
                weightMissing[j] += inst.weight();
            } else {
                weightNonMissing[j] += inst.weight();
                if (members.attribute(j).isNumeric()) {
                    vals[j] += inst.weight() * inst.value(j); // Will be overwritten in Manhattan case
                } else {
                    nominalDists[j][(int) inst.value(j)] += inst.weight();
                }
            }
        }
    }
    for (int j = 0; j < members.numAttributes(); j++) {
        if (members.attribute(j).isNumeric()) {
            if (weightNonMissing[j] > 0) {
                vals[j] /= weightNonMissing[j];
            } else {
                vals[j] = Utils.missingValue();
            }
        } else {
            double max = -Double.MAX_VALUE;
            double maxIndex = -1;
            for (int i = 0; i < nominalDists[j].length; i++) {
                if (nominalDists[j][i] > max) {
                    max = nominalDists[j][i];
                    maxIndex = i;
                }
                if (max < weightMissing[j]) {
                    vals[j] = Utils.missingValue();
                } else {
                    vals[j] = maxIndex;
                }
            }
        }
    }

    if (m_DistanceFunction instanceof ManhattanDistance) {

        // Need to replace means by medians
        Instances sortedMembers = null;
        int middle = (members.numInstances() - 1) / 2;
        boolean dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
        for (int j = 0; j < members.numAttributes(); j++) {
            if ((weightNonMissing[j] > 0) && members.attribute(j).isNumeric()) {
                // singleton special case
                if (members.numInstances() == 1) {
                    vals[j] = members.instance(0).value(j);
                } else {
                    vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                    if (dataIsEven) {
                        vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                    }
                }
            }
        }
    }

    if (updateClusterInfo) {
        for (int j = 0; j < members.numAttributes(); j++) {
            m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j];
            m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j];
        }
    }

    if (addToCentroidInstances) {
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }

    return vals;
}

From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid coordinates based 
 * on it's  members (objects assigned to the cluster of the centroid) and the distance 
 * function being used./*from  w ww .j ava 2s. co  m*/
 * @param centroidIndex index of the centroid which the coordinates will be computed
 * @param members the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    //used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        //in case of Euclidian distance the centroid is the mean point
        //in case of Manhattan distance the centroid is the median point
        //in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            //singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                sortedMembers.kthSmallestValue(j, middle + 1);
                vals[j] = sortedMembers.instance(middle).value(j);
                if (dataIsEven) {
                    sortedMembers.kthSmallestValue(j, middle + 2);
                    vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo)
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    return vals;
}

From source file:de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java

License:Apache License

/**
 * <p>/*from   www.  j  a va2 s  .co  m*/
 * Applies the CLAMI processor to the data. The test data is also required, in order to
 * guarantee a consistent metric set.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {

    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }
    // now determine cluster number for each instance
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of cluster number
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now we filter the metrics
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy"
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                } else {
                    // "not buggy"
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator();

    int violationCutoff = violationCountInterator.next();
    // now we filter the data;
    // this is first tried with the metrics with fewest violations. if no buggy/bugfree
    // instances remain, this is repeated with the next metrics with second fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    boolean[] cleanInstances = new boolean[data.numInstances()];
    int numCleanBuggyInstances = 0;
    int numCleanBugfreeInstances = 0;
    do {
        violationCutoff = violationCountInterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy"
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    } else {
                        // "not buggy"
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                } else {
                    numCleanBugfreeInstances++;
                }
            } else {
                cleanInstances[i] = false;
            }
        }
    } while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances
    // drop the metrics (also from the testdata)
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }
    // drop the unclean instances
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        } else {
            // set the classification
            if (clusterNumber[i] > medianClusterNumber) {
                data.get(i).setClassValue(1.0d);
            } else {
                data.get(i).setClassValue(0.0d);
            }
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataprocessing.CLAProcessor.java

License:Apache License

/**
 * Applies the CLA processor the the data.
 * //from ww  w  . j  av a 2  s.co m
 * @param data
 *            data to which the processor is applied
 */
public void applyCLA(Instances data) {
    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }
    // now determine cluster number for each instance
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of cluster number
    Median m = new Median();
    double medianClusterNumber = m.evaluate(Arrays.stream(clusterNumber).distinct().toArray());

    // finally modify the instances
    // drop the unclean instances
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        // set the classification
        if (clusterNumber[i] > medianClusterNumber) {
            data.get(i).setClassValue(1.0d);
        } else {
            data.get(i).setClassValue(0.0d);
        }
    }
}

From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java

License:Apache License

/**
 * <p>/*  w w w.jav  a 2s  . c o m*/
 * Gets an array with the ranges from the data for a given attribute
 * </p>
 *
 * @param data
 *            the data
 * @param j
 *            index of the attribute
 * @return the ranges for the attribute
 */
private double[] getRanges(Instances data, int j) {
    double[] values = new double[numRanges + 1];
    for (int k = 0; k < numRanges; k++) {
        values[k] = data.kthSmallestValue(j, (int) (data.size() * (k + 1.0) / numRanges));
    }
    values[numRanges] = data.attributeStats(j).numericStats.max;
    return values;
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid
 * coordinates based on it's members (objects assigned to the cluster of the
 * centroid) and the distance function being used.
 * /*from  ww w .j  a  v  a 2  s  . c  o m*/
 * @param centroidIndex index of the centroid which the coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance
            || m_DistanceFunction instanceof CustomPairWiseDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        // in case of Euclidian distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance
                || m_DistanceFunction instanceof CustomPairWiseDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Instance.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Instance.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo) {
        m_ClusterCentroids.add(new Instance(1.0, vals));
    }
    return vals;
}

From source file:elh.eus.absa.CLI.java

License:Open Source License

/**
 * Main access to the train-atc functionalities. Train ATC using a double one vs. all classifier
 * (E and A) for E#A aspect categories/*from w ww  .j  a  v a2  s .  c  om*/
 * @throws Exception 
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)

    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();      
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {

        //classify.printMultilabelPredictions(classify.multiLabelPrediction());      */   

        //onevsall
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());
        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");
        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                    // for the first one update the instances
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);

                    }
                    // if the are more create new instances
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data                                    
        }

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /**
         *  Bigarren sailkatzailea
         * 
         * */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                        // for the first one update the instances
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);                     
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // if the are more create new instances
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                } //finished updating instances data                                    
            }
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }

    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}

From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid coordinates based on it's members
 * (objects assigned to the cluster of the centroid) and the distance function being used.
 * /*from  w  w  w.java2  s  . co m*/
 * @param centroidIndex
 *            index of the centroid which the coordinates will be computed
 * @param members
 *            the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo
 *            if the method is supposed to update the m_Cluster arrays
 * @param addToCentroidInstances
 *            true if the method is to add the computed coordinates to the Instances holding the centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo,
        boolean addToCentroidInstances) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        // in case of Euclidian distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (addToCentroidInstances) {
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }
    return vals;
}

From source file:lu.lippmann.cdb.lab.beta.util.WekaUtil2.java

License:Open Source License

/**
 * Generate the centroid coordinates based 
 * on it's  members (objects assigned to the cluster of the centroid) and the distance 
 * function being used.//from   ww w. j a  v a  2  s  .c  o m
 * @return the centroid
 */
public static MixedCentroid computeMixedCentroid(final boolean preserveOrder,
        final NormalizableDistance distanceFunction, final Instances numericInstances,
        final Instances originalInstances, final int clusterIndex) {
    final int numInstances = numericInstances.numInstances();
    final int numAttributes = numericInstances.numAttributes();

    final Map<TupleSI, Integer> addedAttr = new HashMap<TupleSI, Integer>();

    if (numInstances == 1) {
        Instance uniqueNumInstance = numericInstances.firstInstance();
        Instance uniqueMixInstance = originalInstances.firstInstance();
        double[] centroid = uniqueNumInstance.toDoubleArray();
        for (int i = 0; i < uniqueMixInstance.numAttributes(); i++) {
            if (!uniqueMixInstance.attribute(i).isNumeric()) {
                final String catVal = uniqueMixInstance.attribute(i).value((int) uniqueMixInstance.value(i));
                addedAttr.put(new TupleSI(catVal, i), 1);
            }
        }
        return new MixedCentroid(clusterIndex, centroid, addedAttr);
    }

    final double[] vals = new double[numAttributes];

    //used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    final boolean isManhattanDist = (distanceFunction instanceof ManhattanDistance);
    final boolean isEuclideanDist = (distanceFunction instanceof EuclideanDistance);

    if (isManhattanDist) {
        middle = (numInstances - 1) / 2;
        dataIsEven = ((numInstances % 2) == 0);
        if (preserveOrder) {
            sortedMembers = numericInstances;
        } else {
            sortedMembers = new Instances(numericInstances);
        }
    }

    for (int j = 0; j < numAttributes; j++) {
        //in case of Euclidian distance the centroid is the mean point
        //in case of Manhattan distance the centroid is the median point
        //in both cases, if the attribute is nominal, the centroid is the mode            
        if (isEuclideanDist) {
            vals[j] = numericInstances.meanOrMode(j);

            for (int i = 0; i < numInstances; i++) {
                if (!originalInstances.attribute(j).isNumeric()) {
                    final Instance instance = originalInstances.instance(i);
                    final String catVal = instance.attribute(j).value((int) instance.value(j));
                    //Initialize map
                    final TupleSI key = new TupleSI(catVal, j);
                    if (!addedAttr.containsKey(key))
                        addedAttr.put(key, 0);
                    addedAttr.put(key, addedAttr.get(key) + 1);
                }
            }
        } else if (isManhattanDist) {
            sortedMembers.kthSmallestValue(j, middle + 1);
            vals[j] = sortedMembers.instance(middle).value(j);
            if (dataIsEven) {
                sortedMembers.kthSmallestValue(j, middle + 2);
                vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
            }
        } else {
            throw new IllegalStateException("Not handled distance ...");
        }
    }

    return new MixedCentroid(clusterIndex, vals, addedAttr);
}