Example usage for weka.core Instances sort

Introduction

In this page you can find the example usage for weka.core Instances sort.

Prototype

public void sort(Attribute att)

Source Link

Document

Sorts the instances based on an attribute.

Usage

From source file:lu.lippmann.cdb.ext.hydviga.gaps.GapFiller.java

License:Open Source License

private Instances fillAllGapsWithDiscretizedTime(final Instances ds) throws Exception {
    int firstDateIdx = WekaDataStatsUtil.getFirstDateAttributeIdx(ds);
    final String datename = ds.attribute(firstDateIdx).name();
    if (firstDateIdx == -1) {
        throw new Exception("No date attribute in this dataset!");
    }/*w  ww .j  a  v a 2  s .c om*/

    Instances newds = new Instances(ds);

    /* add discretized time */
    newds = WekaTimeSeriesUtil.buildDataSetWithDiscretizedTime(newds);

    /* add fake numerical time */
    newds.insertAttributeAt(new Attribute(datename + "_fake"), newds.numAttributes());
    for (int i = 0; i < newds.numInstances(); i++) {
        newds.instance(i).setValue(newds.numAttributes() - 1, newds.instance(i).value(firstDateIdx));
    }

    /* remove 'true' date */
    while (firstDateIdx != -1) {
        newds.deleteAttributeAt(firstDateIdx);
        firstDateIdx = WekaDataStatsUtil.getFirstDateAttributeIdx(newds);
    }

    /* transform nominal as binaries */
    for (int iidx : WekaDataStatsUtil.getNominalAttributesIndexes(newds)) {
        newds = WekaDataProcessingUtil.buildDataSetWithNominalAsBinary(newds, iidx);
    }

    /* rename attributes for which the name can occur issues in tree evaluation */
    for (int k = 0; k < newds.numAttributes(); k++) {
        String atn = newds.attribute(k).name();
        if (atn.contains("="))
            atn = atn.replaceAll("=", (int) (Math.random() * 1000) + "");
        if (atn.contains("<"))
            atn = atn.replaceAll("<", (int) (Math.random() * 1000) + "");
        if (atn.contains(">"))
            atn = atn.replaceAll(">", (int) (Math.random() * 1000) + "");
        if (atn.contains("."))
            atn = atn.replace(".", (int) (Math.random() * 1000) + "");
        newds = WekaDataProcessingUtil.renameAttribute(newds, k, atn);
    }

    /* replace missing values */
    newds = fillGaps0(newds);

    /* reconstruct date according to discretized time */
    final String df = ds.attribute(WekaDataStatsUtil.getFirstDateAttributeIdx(ds)).getDateFormat();
    newds.insertAttributeAt(new Attribute(datename + "_new", df), newds.numAttributes());
    final int newfirstDateIdx = WekaDataStatsUtil.getFirstDateAttributeIdx(newds);
    for (int i = 0; i < newds.numInstances(); i++) {
        final Instance inst = newds.instance(i);
        inst.setValue(newfirstDateIdx, newds.instance(i).value(newds.numAttributes() - 2));
    }

    /* sort by date ! */
    newds.sort(newfirstDateIdx);

    /* remove discretized time */
    final Set<String> toRemove = new HashSet<String>();
    for (int i = 0; i < newds.numAttributes(); i++) {
        if (newds.attribute(i).name().startsWith("t_"))
            toRemove.add(newds.attribute(i).name());
    }
    for (final String tr : toRemove)
        newds.deleteAttributeAt(newds.attribute(tr).index());

    /* delete the fake attribute time */
    newds.deleteAttributeAt(newds.numAttributes() - 2);

    return newds;
}

From source file:machine_learing_clasifier.MyC45.java

public double BestContinousAttribute(Instances i, Attribute att) {

    i.sort(att);
    Enumeration enumForMissingAttr = i.enumerateInstances();
    double temp = i.get(0).classValue();
    double igtemp = 0;
    double bestthreshold = 0;
    double a;//  ww  w.j a  v a2 s  . co m
    double b = i.get(0).value(att);
    while (enumForMissingAttr.hasMoreElements()) {
        Instance inst = (Instance) enumForMissingAttr.nextElement();
        if (temp != inst.classValue()) {
            temp = inst.classValue();
            a = b;
            b = inst.value(att);
            double threshold = a + ((b - a) / 2);
            double igtemp2 = computeInformationGainContinous(i, att, threshold);
            if (igtemp < igtemp2) {
                bestthreshold = threshold;
                igtemp = igtemp2;
            }

        }

    }
    return bestthreshold;
}

From source file:myclassifier.myC45Pack.SplitModel.java

public void buildClassifier(Instances dataSet) throws Exception {
    // Initialize the remaining instance variables.
    numSubsets = 0;/*from ww w  .  j  a  v  a 2s . co m*/
    splitPointValue = Double.MAX_VALUE;
    infoGain = 0;
    gainRatio = 0;

    // Different treatment for enumerated and numeric attributes.
    if (dataSet.attribute(attribIndex).isNominal()) {
        numOfBranches = dataSet.attribute(attribIndex).numValues();
        numOfSplitPoints = dataSet.attribute(attribIndex).numValues();
        handleNominalAttribute(dataSet);
    } else { //attribute numeric
        numOfBranches = 2;
        numOfSplitPoints = 0;
        dataSet.sort(dataSet.attribute(attribIndex));
        handleNumericAttribute(dataSet);
    }
}

From source file:myid3andc45classifier.Model.MyC45.java

@Override
public void buildClassifier(Instances data) throws Exception {
    getCapabilities().testWithFail(data);

    data = new Instances(data);
    data.deleteWithMissingClass();// ww w .ja v a2 s .c o m

    Enumeration enumAtt = data.enumerateAttributes();
    while (enumAtt.hasMoreElements()) {
        Attribute attr = (Attribute) enumAtt.nextElement();
        if (attr.isNumeric()) {
            ArrayList<Double> mid = new ArrayList<Double>();
            Instances savedData = null;
            double temp, max = Double.NEGATIVE_INFINITY;
            // TODO: split nominal
            data.sort(attr);
            for (int i = 0; i < data.numInstances() - 1; i++) {
                if (data.instance(i).classValue() != data.instance(i + 1).classValue()) {
                    if (data.attribute(attr.name() + " "
                            + (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2) == null) {
                        data = convertInstances(data, attr,
                                (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2);
                        //temp = computeInfoGainRatio(newData, newData.attribute(newData.numAttributes()-1));
                        //System.out.println("attribute "+newData.attribute(newData.numAttributes()-1).name());
                        //if (temp > max) {
                        //    max = temp;
                        //    savedData = newData;
                        //}
                    }
                }
            }

            //Penanganan Missing Value
            AttributeStats attributeStats = data.attributeStats(attr.index());
            double mean = attributeStats.numericStats.mean;
            if (Double.isNaN(mean))
                mean = 0;
            // Replace missing value with mean
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), mean);
                }
            }

            //data = new Instances(savedData);
        } else {
            //Penanganan Missing Value
            AttributeStats attributeStats = data.attributeStats(attr.index());
            int maxIndex = 0;
            for (int i = 1; i < attr.numValues(); i++) {
                if (attributeStats.nominalCounts[maxIndex] < attributeStats.nominalCounts[i]) {
                    maxIndex = i;
                }
            }
            // Replace missing value with max index
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), maxIndex);
                }
            }
        }
    }
    makeMyC45Tree(data);

}

From source file:newdtl.NewJ48.java

/**
 * Computes Gain Ratio for an attribute.
 *
 * @param data the data for which gain ratio is to be computed
 * @param att the attribute//from  www  .ja  v a 2s . c o m
 * @return the gain ratio for the given attribute and data
 * @throws Exception if computation fails
 */
private double[] computeGainRatio(Instances data, Attribute att) {
    if (att.isNumeric()) {
        data.sort(att);
        double[] threshold;
        double[] gainRatios;

        if (data.numInstances() == 1) {
            threshold = new double[1];
            gainRatios = new double[1];

            threshold[0] = data.instance(0).value(att);
            double infoGain = computeInfoGain(data, att, threshold[0]);
            double splitInfo = computeSplitInformation(data, att, threshold[0]);
            gainRatios[0] = infoGain > 0 ? infoGain / splitInfo : infoGain;
        } else {
            threshold = new double[data.numInstances() - 1];
            gainRatios = new double[data.numInstances() - 1];
            for (int i = 0; i < data.numInstances() - 1; i++) {
                threshold[i] = data.instance(i).value(att);
                double infoGain = computeInfoGain(data, att, threshold[i]);
                double splitInfo = computeSplitInformation(data, att, threshold[i]);
                gainRatios[i] = infoGain > 0 ? infoGain / splitInfo : infoGain;
            }
        }
        return new double[] { gainRatios[maxIndex(gainRatios)], threshold[maxIndex(gainRatios)] };
    } else {
        double infoGain = computeInfoGain(data, att);
        double splitInfo = computeSplitInformation(data, att);

        return new double[] { splitInfo > 0 ? infoGain / splitInfo : splitInfo, 0 };
    }
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Trying to get generate distribution of classes
 * //from w ww  .j a va  2 s. c o m
 * @param Instances
 * @Param Attribute index to get distribution of
 * @Param HashMap to put data into
 * 
 * @return HashMap of class distribution data
 */
protected HashMap addDistributionData(Instances instances, int attIndex, HashMap distMap) throws Exception {
    Map<String, Comparable> temp = new HashMap<String, Comparable>();
    ArrayList<Object> distData = new ArrayList();
    // GenerateCSV csv = new GenerateCSV();
    // String data = "";
    boolean isNominal = false;
    instances.sort(attIndex);
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance inst = instances.instance(i);
        if (!Double.isNaN(inst.value(attIndex))) {
            temp = new HashMap<String, Comparable>();
            if (inst.attribute(attIndex).isNominal()) {
                temp.put("value", inst.attribute(attIndex).value((int) inst.value(attIndex)));
                isNominal = true;
                // data+=inst.attribute(m_Attribute).value((int)inst.value(m_Attribute))+",";
            } else {
                temp.put("value", inst.value(attIndex));
                // data+=inst.value(att)+",";
            }
            temp.put("classprob", inst.classAttribute().value((int) inst.classValue()));
            // data+=inst.classAttribute().value((int)
            // inst.classValue())+"\n";
            distData.add(temp);
        }
    }
    if (!distData.isEmpty()) {
        distMap.put("dataArray", distData);
        distMap.put("isNominal", isNominal);
        setDistributionData(distMap);
    }
    return distMap;
    // To check if data is being generated right.
    // csv.generateCsvFile("/home/karthik/Documents/distribution.csv",
    // data);
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Computes class distribution for an attribute.
 * /*from  ww w.  j  a va  2  s .  co  m*/
 * @param props
 * @param dists
 * @param att
 *            the attribute index
 * @param data
 *            the data to work with
 * @throws Exception
 *             if something goes wrong
 */
protected HashMap<String, Double> distribution(double[][] props, double[][][] dists, int att, Instances data,
        double givenSplitPoint, HashMap<String, Classifier> custom_classifiers) throws Exception {

    HashMap<String, Double> mp = new HashMap<String, Double>();
    double splitPoint = givenSplitPoint;
    double origSplitPoint = 0;
    Attribute attribute = null;
    double[][] dist = null;
    int indexOfFirstMissingValue = -1;
    String CustomClassifierId = null;
    CustomSet cSet = null;
    if (att >= data.numAttributes() && att < data.numAttributes() + custom_classifiers.size()) {
        CustomClassifierId = getKeyinMap(custom_classifiers, att, data);
    } else if (att >= data.numAttributes() + custom_classifiers.size()) {
        cSet = getReqCustomSet(att - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList);
    } else {
        attribute = data.attribute(att);
    }
    if (CustomClassifierId == null && cSet == null) {
        if (attribute.isNominal()) {
            // For nominal attributes
            dist = new double[attribute.numValues()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                if (inst.isMissing(att)) {

                    // Skip missing values at this stage
                    if (indexOfFirstMissingValue < 0) {
                        indexOfFirstMissingValue = i;
                    }
                    continue;
                }
                dist[(int) inst.value(att)][(int) inst.classValue()] += inst.weight();
            }
        } else {

            // For numeric attributes
            double[][] currDist = new double[2][data.numClasses()];
            dist = new double[2][data.numClasses()];

            // Sort data
            data.sort(att);

            // Move all instances into second subset
            for (int j = 0; j < data.numInstances(); j++) {
                Instance inst = data.instance(j);
                if (inst.isMissing(att)) {

                    // Can stop as soon as we hit a missing value
                    indexOfFirstMissingValue = j;
                    break;
                }
                currDist[1][(int) inst.classValue()] += inst.weight();
            }

            // Value before splitting
            double priorVal = priorVal(currDist);

            // Save initial distribution
            for (int j = 0; j < currDist.length; j++) {
                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
            }

            if (Double.isNaN(splitPoint)) {
                // Try all possible split points
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {

                        // Can stop as soon as we hit a missing value
                        break;
                    }

                    // Can we place a sensible split point here?
                    if (inst.value(att) > currSplit) {

                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);

                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {

                            // Store value of current point
                            bestVal = currVal;

                            // Save split point
                            splitPoint = (inst.value(att) + currSplit) / 2.0;
                            origSplitPoint = splitPoint;

                            // Save distribution
                            for (int j = 0; j < currDist.length; j++) {
                                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                            }
                        }
                    }
                    currSplit = inst.value(att);

                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();
                }
            } else {
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;
                // Split data set using given split point.
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {
                        // Can stop as soon as we hit a missing value
                        break;
                    }
                    if (inst.value(att) > currSplit) {
                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);
                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {
                            // Store value of current point
                            bestVal = currVal;
                            // Save computed split point
                            origSplitPoint = (inst.value(att) + currSplit) / 2.0;
                        }
                    }
                    currSplit = inst.value(att);
                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();
                    if (inst.value(att) <= splitPoint) {
                        // Save distribution since split point is specified
                        for (int j = 0; j < currDist.length; j++) {
                            System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                        }
                    }
                }
            }
        }
    } else if (CustomClassifierId != null) {
        Classifier fc = custom_classifiers.get(CustomClassifierId);
        dist = new double[data.numClasses()][data.numClasses()];
        Instance inst;
        for (int i = 0; i < data.numInstances(); i++) {
            inst = data.instance(i);
            double predictedClass = fc.classifyInstance(inst);
            if (predictedClass != Instance.missingValue()) {
                dist[(int) predictedClass][(int) inst.classValue()] += inst.weight();
            }
        }
    } else if (cSet != null) {
        dist = new double[data.numClasses()][data.numClasses()];
        JsonNode vertices = mapper.readTree(cSet.getConstraints());
        ArrayList<double[]> attrVertices = generateVerticesList(vertices);
        List<Attribute> aList = generateAttributeList(cSet, data, d);
        double[] testPoint = new double[2];
        int ctr = 0;
        for (int k = 0; k < data.numInstances(); k++) {
            testPoint = new double[2];
            ctr = 0;
            for (Attribute a : aList) {
                if (!data.instance(k).isMissing(a)) {
                    testPoint[ctr] = data.instance(k).value(a);
                    ctr++;
                }
            }
            int check = checkPointInPolygon(attrVertices, testPoint);
            dist[check][(int) data.instance(k).classValue()] += data.instance(k).weight();
        }
    }

    // Compute weights for subsetsCustomClassifierIndex
    props[att] = new double[dist.length];
    for (int k = 0; k < props[att].length; k++) {
        props[att][k] = Utils.sum(dist[k]);
    }
    if (Utils.eq(Utils.sum(props[att]), 0)) {
        for (int k = 0; k < props[att].length; k++) {
            props[att][k] = 1.0 / props[att].length;
        }
    } else {
        Utils.normalize(props[att]);
    }

    // Any instances with missing values ?
    if (indexOfFirstMissingValue > -1) {

        // Distribute weights for instances with missing values
        for (int i = indexOfFirstMissingValue; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (attribute.isNominal()) {

                // Need to check if attribute value is missing
                if (inst.isMissing(att)) {
                    for (int j = 0; j < dist.length; j++) {
                        dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                    }
                }
            } else {

                // Can be sure that value is missing, so no test required
                for (int j = 0; j < dist.length; j++) {
                    dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                }
            }
        }
    }

    // Return distribution and split point
    dists[att] = dist;
    mp.put("split_point", splitPoint);
    mp.put("orig_split_point", origSplitPoint);
    return mp;
}

From source file:org.wkwk.classifier.MyC45.java

public double bestThreshold(Instances data, Attribute attr) {
    data.sort(attr);

    double m_ig = 0;
    double bestThr = 0;
    double classTemp = data.get(0).classValue();
    double valueTemp = data.get(0).value(attr);

    Enumeration instEnum = data.enumerateInstances();
    double dt;//from   ww  w  .  j a  v  a  2  s.  c om
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        if (classTemp != inst.classValue()) {
            classTemp = inst.classValue();
            dt = valueTemp;
            valueTemp = inst.value(attr);
            double threshold = dt + ((valueTemp - dt) / 2);
            double igTemp = computeInfoGainCont(data, attr, threshold);
            if (m_ig < igTemp) {
                m_ig = igTemp;
                bestThr = threshold;
            }
        }
    }
    return bestThr;
}

From source file:Prediccion.PrecidePasoNodo.java

License:Open Source License

Instances cargarDatos(int hora) throws ParseException {
    //Declaramos los atributos de las instancias
    Attribute a0 = new Attribute("Intervalo", "yyyy-MM-dd HH:mm:ss");
    Attribute a1 = new Attribute("Total");

    ArrayList<Attribute> c = new ArrayList<>();
    c.add(a0);//from   www .j  av  a  2  s .  c  o  m
    c.add(a1);

    //Creamos el conjunto de instancias
    Instances instances = new Instances(nodo, c, 1000);

    //Instanciamos conexion con FT
    cFT = new conectarFusionTables();
    Sqlresponse r = cFT.select(TABLAID, "Intervalo, Total",
            "idNodo = " + nodo + " and Intervalo ENDS WITH '00:00:00'",
            "ORDER BY \'Intervalo\' DESC LIMIT 10000");

    for (List<Object> a : r.getRows()) {
        Instance i = new DenseInstance(2);

        String s0 = (String) a.get(0);
        String s1 = (String) a.get(1);

        System.err.println(s0 + " ->" + s1);

        i.setValue(instances.attribute(0), instances.attribute(0).parseDate(s0));
        i.setValue(instances.attribute(1), Integer.parseInt(s1));

        instances.add(i);
    }

    instances.sort(0);

    return instances;
}