Example usage for weka.core Instances attributeStats

List of usage examples for weka.core Instances attributeStats

Introduction

On this page you can find example usages of the weka.core.Instances method attributeStats.

Prototype


public AttributeStats attributeStats(int index) 

Source Link

Document

Calculates summary statistics on the values that appear in this set of instances for a specified attribute.

Usage

From source file:gyc.UnderOverBoostM1.java

License:Open Source License

/**
 * /*from   www .ja v a2  s .c  o  m*/
 * nMajnMin
 * @param data
 * @param i
 * @return
 */
protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) {
    int[] majExamples = new int[copia.numInstances()];
    int[] minExamples = new int[copia.numInstances()];
    int majCount = 0, minCount = 0;
    // First, we copy the examples from the minority class and save the indexes of the majority
    // the new data-set contains samples_min + samples_min * N / 100
    int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100 * 2;
    // class name
    String majClassName = copia.attribute(copia.classIndex()).value(majC);

    for (int i = 0; i < copia.numInstances(); i++) {
        if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) {
            // save index
            majExamples[majCount] = i;
            majCount++;
        } else {
            minExamples[minCount] = i;
            minCount++;
        }
    }

    /* random undersampling of the majority */
    Instances myDataset = new Instances(copia, 0);
    int r;
    for (int i = 0; i < size / 2; i++) {
        r = simplingRandom.nextInt(majCount);
        myDataset.add(copia.instance(majExamples[r]));

        if (minCount > 0) {
            r = simplingRandom.nextInt(minCount);
            myDataset.add(copia.instance(minExamples[r]));
        }
    }

    myDataset.randomize(simplingRandom);
    return myDataset;
}

From source file:j48.C45Split.java

License:Open Source License

/**
 * Prepares this split for the attribute at {@code m_attIndex}.
 *
 * <p>Nominal attributes are delegated to {@code handleEnumeratedAttribute}.
 * For any other attribute the instances are sorted by that attribute, the
 * fields {@code rrrrr} and {@code lllll} are derived from the base-10
 * logarithm of the attribute's standard deviation, and the work is delegated
 * to {@code handleNumericAttribute}.
 *
 * @param trainInstances training data; sorted in place for non-nominal attributes
 * @throws Exception if one of the delegated handlers fails
 */
public void buildClassifier(Instances trainInstances) throws Exception {

    // Reset the split description before evaluating the attribute.
    m_numSubsets = 0;
    m_splitPoint = Double.MAX_VALUE;
    m_infoGain = 0;
    m_gainRatio = 0;

    if (trainInstances.attribute(m_attIndex).isNominal()) {
        // One potential subset per nominal value.
        m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
        m_index = m_complexityIndex;
        handleEnumeratedAttribute(trainInstances);
        return;
    }

    // Non-nominal attribute: binary split on sorted values.
    m_complexityIndex = 2;
    m_index = 0;
    trainInstances.sort(trainInstances.attribute(m_attIndex));

    final double stdDev = trainInstances.attributeStats(m_attIndex).numericStats.stdDev;
    final double logStdDev = Math.log10(stdDev);
    final double largerFactor = logStdDev / 1.2;
    final double smallerFactor = logStdDev / 8;

    // Which field gets the larger factor flips at a standard deviation of 200.
    // NOTE(review): the meaning of rrrrr / lllll is not visible here - confirm with their users.
    if (stdDev > 200) {
        rrrrr = largerFactor;
        lllll = smallerFactor;
    } else {
        lllll = largerFactor;
        rrrrr = smallerFactor;
    }

    handleNumericAttribute(trainInstances);
}

From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java

License:Open Source License

/**
 * Adds one time series per numeric attribute of {@code dataSet} to
 * {@code tsDataset}.
 *
 * <p>Attributes whose values are all missing are reported on standard output
 * and skipped. For the others, every instance yields one millisecond-period
 * entry taken from the attribute at {@code dateIdx}; a missing value is
 * stored as {@code null}. Series that end up empty are not added.
 *
 * @param dataSet   source instances
 * @param dateIdx   index of the attribute holding the timestamp in milliseconds
 * @param tsDataset collection receiving the generated series
 */
private static void fillWithSingleAxis(final Instances dataSet, final int dateIdx,
        final TimeSeriesCollection tsDataset) {
    final int rowCount = dataSet.numInstances();
    final Calendar calendar = Calendar.getInstance();

    for (final int attrIdx : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) {
        final boolean onlyMissing = dataSet.attributeStats(attrIdx).missingCount == rowCount;
        if (onlyMissing) {
            System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(attrIdx).name()
                    + "', so skip it!");
            continue;
        }

        final TimeSeries series = new TimeSeries(dataSet.attribute(attrIdx).name());
        for (int row = 0; row < rowCount; row++) {
            final Instance current = dataSet.instance(row);

            // The date attribute's numeric value is used as epoch milliseconds.
            calendar.setTimeInMillis((long) current.value(dateIdx));
            final Millisecond period = new Millisecond(calendar.getTime());

            if (current.isMissing(attrIdx)) {
                series.addOrUpdate(period, null);
            } else {
                series.addOrUpdate(period, current.value(attrIdx));
            }
        }

        if (!series.isEmpty()) {
            tsDataset.addSeries(series);
        }
    }
}

From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java

License:Open Source License

/**
 * Adds one interval series per numeric attribute of {@code dataSet} to
 * {@code tsDataset}.
 *
 * <p>Attributes whose values are all missing are reported on standard output
 * and skipped; individual missing values are left out of the series. For the
 * attribute at {@code deviatedAttrIdx}, every instance except the first and
 * the last gets the interval {@code [value - deviation, value + deviation]}
 * (and is traced on standard output); all other points get a zero-width
 * interval. Series that end up empty are not added.
 *
 * @param dataSet         source instances
 * @param dateIdx         index of the attribute holding the timestamp in milliseconds
 * @param tsDataset       collection receiving the generated series
 * @param deviation       half-width of the interval for the deviated attribute
 * @param deviatedAttrIdx index of the attribute that is drawn with an interval
 */
private static void fillWithSingleAxisInterval(final Instances dataSet, final int dateIdx,
        final YIntervalSeriesCollection tsDataset, final double deviation, final int deviatedAttrIdx) {
    final int rowCount = dataSet.numInstances();

    for (final int attrIdx : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) {
        final boolean onlyMissing = dataSet.attributeStats(attrIdx).missingCount == rowCount;
        if (onlyMissing) {
            System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(attrIdx).name()
                    + "', so skip it!");
            continue;
        }

        final YIntervalSeries series = new YIntervalSeries(dataSet.attribute(attrIdx).name());
        final boolean deviatedAttribute = attrIdx == deviatedAttrIdx;

        for (int row = 0; row < rowCount; row++) {
            final Instance current = dataSet.instance(row);
            final long timeInMilliSec = (long) current.value(dateIdx);

            // Missing values produce no point at all.
            if (current.isMissing(attrIdx)) {
                continue;
            }

            final double y = current.value(attrIdx);
            final boolean interiorRow = row > 0 && row < (rowCount - 1);

            if (deviatedAttribute && interiorRow) {
                final double lower = y - deviation;
                final double upper = y + deviation;
                System.out.println(rowCount + " " + row + " " + y + " " + lower + " " + upper);
                series.add(timeInMilliSec, y, lower, upper);
            } else {
                // Zero-width interval: low and high collapse onto the value.
                series.add(timeInMilliSec, y, y, y);
            }
        }

        if (!series.isEmpty()) {
            tsDataset.addSeries(series);
        }
    }
}

From source file:lu.lippmann.cdb.datasetview.tabs.StatsTabView.java

License:Open Source License

/**
 * Builds a small data set describing the attributes of {@code dataset}.
 *
 * <p>The result is produced by generating ARFF text with eight string
 * columns (name, min, max, mean, stdDev, missing count, missing percentage,
 * value repartition) and loading it. Numeric non-date attributes fill the
 * statistics columns and leave the repartition empty; nominal attributes
 * leave the statistics empty and fill the repartition. Other attributes
 * produce no row. When {@code dataset} has no nominal attribute, the
 * repartition column is removed from the result.
 *
 * @param dataset data set to summarise
 * @return the loaded summary data set
 * @throws Exception if the generated ARFF text cannot be loaded
 */
public static Instances buildStatsForNumericalAttributes(final Instances dataset) throws Exception {
    final StringBuilder sb = new StringBuilder("@relation blabla\n");
    sb.append("@attribute 'name' string\n");
    sb.append("@attribute 'min' string\n");
    sb.append("@attribute 'max' string\n");
    sb.append("@attribute 'mean' string\n");
    sb.append("@attribute 'stdDev' string\n");
    sb.append("@attribute 'missing values count' string\n");
    sb.append("@attribute 'missing values %' string\n");
    sb.append("@attribute 'values repartition' string\n");
    sb.append("@data\n");

    for (int i = 0; i < dataset.numAttributes(); i++) {
        if (dataset.attribute(i).isNumeric() && !dataset.attribute(i).isDate()) {
            // Compute the statistics once per attribute; every call to
            // attributeStats() walks the data again.
            final weka.core.AttributeStats stats = dataset.attributeStats(i);

            // NOTE(review): the attribute name is written unquoted - confirm
            // names containing spaces, commas or quotes survive the ARFF load.
            sb.append(dataset.attribute(i).name()).append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(stats.numericStats.min))
                    .append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(stats.numericStats.max))
                    .append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(stats.numericStats.mean))
                    .append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(stats.numericStats.stdDev))
                    .append(",").append(stats.missingCount).append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT
                            .format(100d * stats.missingCount / dataset.numInstances()))
                    .append(",").append("''").append("\n");
        } else if (dataset.attribute(i).isNominal()) {
            // Statistics columns stay empty; the last column lists "value=share" pairs.
            sb.append(dataset.attribute(i).name()).append(",'','','','','','','");

            final Map<Object, String> nominalRep = WekaDataStatsUtil
                    .getNominalRepartitionForDescription(dataset, i);
            for (Map.Entry<Object, String> e : nominalRep.entrySet()) {
                sb.append(e.getKey()).append("=").append(e.getValue()).append(" ");
            }

            sb.append("'\n");
        }
    }

    final Instances newds = WekaDataAccessUtil.loadInstancesFromARFFString(sb.toString(), false, false);

    // Without nominal attributes the repartition column is always empty: drop it.
    if (WekaDataStatsUtil.getNominalAttributesIndexes(dataset).length == 0) {
        newds.deleteAttributeAt(newds.numAttributes() - 1);
    }
    return newds;
}

From source file:lu.lippmann.cdb.datasetview.tabs.TimeSeriesCalendarPanel.java

License:Open Source License

/**
 * Rebuilds the content of {@code jxp}: the calendar view, an optional legend
 * and the settings combo boxes.
 *
 * <p>Each instance contributes one entry keyed by its date value (formatted
 * and parsed with the date attribute's own format). The entry is black when
 * no attribute is highlighted; otherwise its color is picked from
 * {@code colors}, starting at {@code firstColorIdx}, according to the
 * instance's value of the highlighted attribute. Changing either combo box
 * calls this method again with the new selection.
 *
 * @param dataSet            instances to display
 * @param dateIdx            index of the date attribute
 * @param attrToHighlightIdx index of the attribute driving the colors, or a
 *                           negative value for no highlighting
 * @param calendarMode       display mode passed to the calendar view
 */
public void refresh(final Instances dataSet, final int dateIdx, final int attrToHighlightIdx,
        final Mode calendarMode) {
    this.jxp.removeAll();

    final SimpleDateFormat f = new SimpleDateFormat(dataSet.attribute(dateIdx).getDateFormat());

    final LinkedHashMap<Date, Color> map = new LinkedHashMap<Date, Color>();
    final AttributeStats attributeStats = (attrToHighlightIdx < 0) ? null
            : dataSet.attributeStats(attrToHighlightIdx);
    for (int i = 0; i < dataSet.numInstances(); i++) {
        final String val = dataSet.instance(i).stringValue(dateIdx);

        try {
            final Date d = f.parse(val);
            if (attrToHighlightIdx < 0) {
                map.put(d, Color.BLACK);
            } else if (dataSet.attribute(attrToHighlightIdx).isNominal()) {
                final int valueIdx = (int) dataSet.instance(i).value(attrToHighlightIdx);
                map.put(d, this.colors[nominalColorIndex(valueIdx, attributeStats.nominalCounts.length)]);
            } else {
                // Scale the value linearly between the attribute's min and max.
                final double normalizedValue = (dataSet.instance(i).value(attrToHighlightIdx)
                        - attributeStats.numericStats.min)
                        / (attributeStats.numericStats.max - attributeStats.numericStats.min);
                final int idxOfColor = (int) (normalizedValue * (this.colors.length - 1 - this.firstColorIdx));
                map.put(d, this.colors[idxOfColor + this.firstColorIdx]);
            }
        } catch (ParseException e) {
            // Best effort: an unparsable date only drops that single entry.
            e.printStackTrace();
        }
    }

    final JScrollPane scrollp = new JScrollPane(
            MonthCalendarView.buildMultPanel(map, calendarMode, (int) jxp.getSize().getWidth()),
            JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, JScrollPane.HORIZONTAL_SCROLLBAR_NEVER);

    scrollp.setPreferredSize(new Dimension((int) jxp.getSize().getWidth() * 95 / 100,
            (int) jxp.getSize().getHeight() * 95 / 100));

    this.jxp.add(scrollp, BorderLayout.CENTER);

    if (attrToHighlightIdx >= 0) {
        final JXPanel legendPanel = new JXPanel();
        legendPanel.setBorder(new TitledBorder("Legend"));
        legendPanel.setBackground(Color.WHITE);
        legendPanel.setLayout(new GridLayout(0, 1));

        if (dataSet.attribute(attrToHighlightIdx).isNominal()) {
            int c = 0;
            final Map<Object, Integer> pv = WekaDataStatsUtil.getNominalRepartition(dataSet,
                    attrToHighlightIdx);
            for (final Map.Entry<Object, Integer> entry : pv.entrySet()) {
                final JXLabel comp = new JXLabel(entry.getKey().toString());
                // Same mapping as the calendar entries, so a label's color
                // is the color drawn for the value at that position.
                // NOTE(review): assumes the repartition map iterates in
                // nominal-value index order - confirm.
                comp.setForeground(this.colors[nominalColorIndex(c, attributeStats.nominalCounts.length)]);
                legendPanel.add(comp);
                c++;
            }
        } else if (dataSet.attribute(attrToHighlightIdx).isNumeric()) {
            final JXLabel compMin = new JXLabel("Min: " + attributeStats.numericStats.min);
            compMin.setForeground(this.colors[this.firstColorIdx]);
            legendPanel.add(compMin);
            final JXLabel compMax = new JXLabel("Max: " + attributeStats.numericStats.max);
            compMax.setForeground(this.colors[this.colors.length - 1]);
            legendPanel.add(compMax);
        }
        this.jxp.add(legendPanel, BorderLayout.NORTH);
    }

    final JXPanel settingsPanel = new JXPanel();
    settingsPanel.setLayout(new GridLayout(1, 0));
    final JComboBox calendarModeCombo = new JComboBox(Mode.values());
    calendarModeCombo.setBorder(new TitledBorder("Mode"));
    final JComboBox attrToHighlightCombo = new JComboBox(
            WekaDataStatsUtil.getAttributeNames(dataSet).toArray());
    attrToHighlightCombo.setBorder(new TitledBorder("Attribute to highlight"));

    // Both combo boxes trigger the same rebuild with the current selections.
    final ActionListener refreshOnChange = new ActionListener() {
        @Override
        public void actionPerformed(ActionEvent e) {
            refresh(dataSet, dateIdx, attrToHighlightCombo.getSelectedIndex(),
                    Mode.valueOf(calendarModeCombo.getSelectedItem().toString()));
        }
    };

    // The selection is set BEFORE the listener is attached so that
    // initialising the combo box does not itself trigger a refresh.
    calendarModeCombo.setSelectedItem(calendarMode);
    calendarModeCombo.addActionListener(refreshOnChange);
    settingsPanel.add(calendarModeCombo);

    attrToHighlightCombo.setSelectedIndex(attrToHighlightIdx);
    attrToHighlightCombo.addActionListener(refreshOnChange);
    settingsPanel.add(attrToHighlightCombo);
    this.jxp.add(settingsPanel, BorderLayout.SOUTH);
}

/**
 * Maps the index of a nominal value to an index into {@code colors}.
 *
 * @param valueIdx  zero-based index of the nominal value
 * @param numValues number of values of the nominal attribute
 * @return index into {@code colors}, never below {@code firstColorIdx}
 *         for a non-negative {@code valueIdx}
 */
private int nominalColorIndex(final int valueIdx, final int numValues) {
    final int idxOfColor = (valueIdx * (this.colors.length - 1 - this.firstColorIdx)) / numValues;
    return idxOfColor + this.firstColorIdx;
}

From source file:lu.lippmann.cdb.lab.mds.MDSViewBuilder.java

License:Open Source License

/**
 * Finds the most frequent class value among the instances of one cluster.
 *
 * @param centroidIndex key of the cluster in the centroid map of {@code mds}
 * @param mds           collapsed instances providing the centroid map
 * @return index of the first class value with the highest count, or
 *         {@code -1} when the class attribute reports no nominal counts
 */
private static Integer getStrongestClass(final Integer centroidIndex, final CollapsedInstances mds) {
    final KmeansResult centroids = mds.getCentroidMap();
    final Instances cluster = centroids.getClusters().get(centroidIndex);
    final AttributeStats classStats = cluster.attributeStats(cluster.classIndex());
    final int[] counts = classStats.nominalCounts;

    int strongest = -1;
    int strongestCount = -1;
    int idx = 0;
    while (idx < counts.length) {
        // Strict comparison keeps the first value on ties.
        if (counts[idx] > strongestCount) {
            strongestCount = counts[idx];
            strongest = idx;
        }
        idx++;
    }

    // NOTE(review): the original author flagged this return as problematic
    // without saying why - confirm how callers handle -1.
    return strongest;
}

From source file:mao.datamining.RemoveUselessColumnsByMissingValues.java

License:Open Source License

/**
 * Signify that this batch of input to the filter is finished.
 *
 * <p>On the first batch, the attributes to remove are determined: every
 * non-class attribute whose percentage of missing values (integer division)
 * exceeds {@code m_maxMissingPercentage}, or whose name is contained in
 * {@code column2DeleteSet}. A {@code Remove} filter is configured with them,
 * the buffered instances are run through it and pushed to the output queue.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input format is defined
 * @throws Exception if the internal remove filter fails
 */
public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
        throw new IllegalStateException("No input instance format defined");
    }
    if (m_removeFilter == null) {

        // establish attributes to remove from first batch

        Instances toFilter = getInputFormat();
        int[] attsToDelete = new int[toFilter.numAttributes()];
        int numToDelete = 0;
        for (int i = 0; i < toFilter.numAttributes(); i++) {
            if (i == toFilter.classIndex()) {
                continue; // skip class
            }
            AttributeStats stats = toFilter.attributeStats(i);

            // High ratio of missing values. With no buffered instances there
            // is no ratio to compute (and it would divide by zero); the
            // multiplication is done in long so large counts cannot overflow.
            boolean tooManyMissing = stats.totalCount > 0
                    && (stats.missingCount * 100L) / stats.totalCount > m_maxMissingPercentage;

            // Columns listed explicitly for removal.
            boolean listedForRemoval = this.column2DeleteSet.contains(toFilter.attribute(i).name());

            // Record each attribute at most once, even when both rules match;
            // this also keeps numToDelete within the bounds of attsToDelete.
            if (tooManyMissing || listedForRemoval) {
                attsToDelete[numToDelete++] = i;
            }
        }

        int[] finalAttsToDelete = new int[numToDelete];
        System.arraycopy(attsToDelete, 0, finalAttsToDelete, 0, numToDelete);

        m_removeFilter = new Remove();
        m_removeFilter.setAttributeIndicesArray(finalAttsToDelete);
        m_removeFilter.setInvertSelection(false);
        m_removeFilter.setInputFormat(toFilter);

        for (int i = 0; i < toFilter.numInstances(); i++) {
            m_removeFilter.input(toFilter.instance(i));
        }
        m_removeFilter.batchFinished();

        Instance processed;
        Instances outputDataset = m_removeFilter.getOutputFormat();

        // restore old relation name to hide attribute filter stamp
        outputDataset.setRelationName(toFilter.relationName());

        setOutputFormat(outputDataset);
        while ((processed = m_removeFilter.output()) != null) {
            processed.setDataset(outputDataset);
            push(processed);
        }
    }
    flushInput();

    m_NewBatch = true;
    return (numPendingOutput() != 0);
}

From source file:mlda.attributes.MeanEntropiesNominalAttributes.java

License:Open Source License

/**
 * Calculate metric value: the mean entropy of the nominal feature attributes.
 *
 * <p>Feature attributes without nominal counts are ignored. When no feature
 * attribute has nominal counts the result is {@code NaN} (0.0 divided by 0).
 * The result is also stored in {@code value}.
 *
 * @param mlData Multi-label dataset to which calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData) {
    Instances data = mlData.getDataSet();
    int[] features = mlData.getFeatureIndices();

    double entropySum = 0.0;
    int nominalFeatures = 0;

    for (int pos = 0; pos < features.length; pos++) {
        int[] counts = data.attributeStats(features[pos]).nominalCounts;
        if (counts == null) {
            // Not a nominal attribute: contributes nothing to the mean.
            continue;
        }
        nominalFeatures++;
        entropySum += Utils.entropy(counts);
    }

    this.value = entropySum / nominalFeatures;
    return value;
}

From source file:mlda.labelsDistribution.MaxEntropy.java

License:Open Source License

/**
 * Calculate metric value/*from w ww.  j a v  a2s. c o m*/
 * 
 * @param mlData Multi-label dataset to which calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData) {
    Instances instances = mlData.getDataSet();

    int nLabels = mlData.getNumLabels();
    int[] labels = mlData.getLabelIndices();

    double[] entropies = new double[nLabels];

    for (int i = 0; i < nLabels; i++) {
        AttributeStats attStats = instances.attributeStats(labels[i]);

        if (attStats.nominalCounts != null) {
            entropies[i] = Utils.entropy(attStats.nominalCounts);
        }
    }

    double maxEntropy = Double.MIN_VALUE;
    for (double e : entropies) {
        if (e > maxEntropy) {
            maxEntropy = e;
        }
    }

    this.value = maxEntropy;

    return value;
}