List of usage examples for weka.core Instances enumerateAttributes
public Enumeration&lt;Attribute&gt; enumerateAttributes()
From source file:dewaweebtreeclassifier.veranda.VerandaTree.java
/**
 * Recursively builds an ID3-style decision tree from the given training data.
 * Chooses the attribute with the highest information gain at each node; if no
 * attribute yields any gain, the node becomes a leaf labelled with the
 * majority class.
 *
 * @param data training instances routed to this node
 */
public void buildTree(Instances data) {
    // Exit if there is no data left in the dataset.
    if (data.numInstances() == 0) {
        mChild = null;
        return;
    }
    // Information gain per attribute, indexed by attribute position.
    double[] informationGains = new double[data.numAttributes()];
    Enumeration enumAttrs = data.enumerateAttributes();
    while (enumAttrs.hasMoreElements()) {
        Attribute attr = (Attribute) enumAttrs.nextElement();
        informationGains[attr.index()] = computeGain(data, attr);
    }
    int maxIdx = Utils.maxIndex(informationGains);
    if (Utils.eq(informationGains[maxIdx], 0)) {
        // No attribute yields any gain: make this node a leaf holding the
        // class distribution, labelled with the majority class.
        mClassDistribution = new int[data.numClasses()];
        Enumeration enumInst = data.enumerateInstances();
        while (enumInst.hasMoreElements()) {
            Instance instance = (Instance) enumInst.nextElement();
            mClassDistribution[(int) instance.classValue()]++;
        }
        mClassValue = Utils.maxIndex(mClassDistribution);
    } else {
        // Split on the best attribute and grow one child per attribute value.
        mSplitAttribute = data.attribute(maxIdx);
        Instances[] splitInstances = splitInstancesOnAttribute(data, mSplitAttribute);
        mChild = new VerandaTree[mSplitAttribute.numValues()];
        for (int i = 0; i < mChild.length; i++) {
            mChild[i] = new VerandaTree();
            mChild[i].buildTree(splitInstances[i]);
        }
    }
}
From source file:dewaweebtreeclassifier.Veranda.java
/** * /* w w w .ja v a 2 s . c om*/ * @param data * @return */ private boolean isAllNominalAttributes(Instances data) { Enumeration enumAttr = data.enumerateAttributes(); while (enumAttr.hasMoreElements()) { Attribute attr = (Attribute) enumAttr.nextElement(); if (!attr.isNominal()) { return false; } } return true; }
From source file:ergasia2pkg.LP_ROS.java
/** * Creates a new MultiLabelInstances object given a list of Instance * * @param Hashmap<String,List<Instance> map from which to create instances * @param MultiLabelInstances used just to get the Label metadata * @return MultiLabelInstances new MultiLabelInstances Object */// ww w . j a v a 2 s. c o m private MultiLabelInstances createNewMultilabelInstance(HashMap<String, List<Instance>> labelsetGroup, MultiLabelInstances mlData) throws InvalidDataFormatException { Instances in = mlData.getDataSet(); Enumeration enumeration = in.enumerateAttributes(); ArrayList attlist = Collections.list(enumeration); int capacity = 0; for (String labelset : labelsetGroup.keySet()) { capacity += labelsetGroup.get(labelset).size(); } Instances newInstances = new Instances("sampledDataset", attlist, capacity); for (String labelset : labelsetGroup.keySet()) { List<Instance> instanceList = (ArrayList<Instance>) labelsetGroup.get(labelset); for (Instance inst : instanceList) { newInstances.add(inst); } } MultiLabelInstances newData = new MultiLabelInstances(newInstances, mlData.getLabelsMetaData()); return newData; }
From source file:gov.va.chir.tagline.dao.DatasetUtil.java
License:Open Source License
/**
 * Builds a Weka dataset from a header template and a collection of documents,
 * one instance per document line.
 *
 * @param header    template Instances defining the attribute layout
 * @param documents documents whose lines become instances
 * @return populated Instances with the class index set to the last attribute
 * @throws Exception if the AddValues filter or instance construction fails
 */
@SuppressWarnings("unchecked")
public static Instances createDataset(final Instances header, final Collection<Document> documents)
        throws Exception {
    // Update header to include all docIDs from the passed in documents
    // (Weka requires all values for nominal features)
    final Set<String> docIds = new TreeSet<String>();
    for (Document document : documents) {
        docIds.add(document.getName());
    }
    final AddValues avf = new AddValues();
    avf.setLabels(StringUtils.join(docIds, ","));
    // Have to add 1 because SingleIndex.setValue() has a bug, expecting
    // the passed in index to be 1-based rather than 0-based.
    // Calling path: AddValues.setInputFormat() -->
    //               SingleIndex.setUpper() -->
    //               SingleIndex.setValue()
    avf.setAttributeIndex(String.valueOf(header.attribute(DOC_ID).index() + 1));
    avf.setInputFormat(header);
    final Instances newHeader = Filter.useFilter(header, avf);
    final Instances instances = new Instances(newHeader, documents.size());
    // Map attribute names to the attributes of the filtered header so that
    // feature values can be set by name below.
    final Map<String, Attribute> attrMap = new HashMap<String, Attribute>();
    final Enumeration<Attribute> en = newHeader.enumerateAttributes();
    while (en.hasMoreElements()) {
        final Attribute attr = en.nextElement();
        attrMap.put(attr.name(), attr);
    }
    // enumerateAttributes() skips the class attribute, so add it explicitly.
    attrMap.put(newHeader.classAttribute().name(), newHeader.classAttribute());
    final Attribute docId = attrMap.get(DOC_ID);
    final Attribute lineId = attrMap.get(LINE_ID);
    final Attribute classAttr = attrMap.get(LABEL);
    // Add one instance per line; line features override/extend doc features.
    for (Document document : documents) {
        final Map<String, Object> docFeatures = document.getFeatures();
        for (Line line : document.getLines()) {
            final Instance instance = new DenseInstance(attrMap.size());
            final Map<String, Object> lineFeatures = line.getFeatures();
            lineFeatures.putAll(docFeatures);
            instance.setValue(docId, document.getName());
            instance.setValue(lineId, line.getLineId());
            if (line.getLabel() == null) {
                // Unlabeled line: leave the class missing (e.g. for scoring).
                instance.setMissing(classAttr);
            } else {
                instance.setValue(classAttr, line.getLabel());
            }
            // Fill every remaining feature from the merged feature map,
            // converting numerics to double and everything else to string.
            for (Attribute attribute : attrMap.values()) {
                if (!attribute.equals(docId) && !attribute.equals(lineId) && !attribute.equals(classAttr)) {
                    final String name = attribute.name();
                    final Object obj = lineFeatures.get(name);
                    if (obj instanceof Double) {
                        instance.setValue(attribute, ((Double) obj).doubleValue());
                    } else if (obj instanceof Integer) {
                        instance.setValue(attribute, ((Integer) obj).doubleValue());
                    } else {
                        // NOTE(review): obj == null here would NPE on
                        // toString() — assumes every attribute has a feature
                        // value; confirm against feature extraction.
                        instance.setValue(attribute, obj.toString());
                    }
                }
            }
            instances.add(instance);
        }
    }
    // Set last attribute as class
    instances.setClassIndex(attrMap.size() - 1);
    return instances;
}
From source file:id3.MyID3.java
/** * Membuat pohon keputusan// w w w . jav a 2 s .co m * @param instances data train * @throws Exception */ @Override public void buildClassifier(Instances instances) throws Exception { // Check if classifier can handle the data getCapabilities().testWithFail(instances); // Remove missing value instance from instances instances = new Instances(instances); instances.deleteWithMissingClass(); // Gather list of attribute in instances ArrayList<Attribute> remainingAttributes = new ArrayList<>(); Enumeration enumAttributes = instances.enumerateAttributes(); while (enumAttributes.hasMoreElements()) { remainingAttributes.add((Attribute) enumAttributes.nextElement()); } // Start build classifier ID3 buildMyID3(instances, remainingAttributes); }
From source file:id3.MyID3.java
/**
 * Demo driver: loads the nominal weather ARFF, trains MyID3, and prints the
 * class distribution, per-attribute information gain, the tree, a
 * full-training evaluation summary, and the confusion matrix.
 *
 * @param args arguments (unused)
 */
public static void main(String[] args) {
    Instances instances;
    try {
        // NOTE(review): hard-coded Windows path — only runs where this
        // exact Weka install exists.
        BufferedReader reader = new BufferedReader(new FileReader("D:\\Weka-3-6\\data\\weather.nominal.arff"));
        try {
            instances = new Instances(reader);
            // Last attribute is the class, per standard ARFF convention.
            instances.setClassIndex(instances.numAttributes() - 1);
            MyID3 id3 = new MyID3();
            try {
                id3.buildClassifier(instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            // Test class distribution
            double[] classDistribution = id3.classDistribution(instances);
            for (int i = 0; i < classDistribution.length; i++) {
                System.out.println(classDistribution[i]);
            }
            // Test entrophy and information gain for each attribute
            System.out.println(id3.computeEntropy(instances));
            Enumeration attributes = instances.enumerateAttributes();
            while (attributes.hasMoreElements()) {
                System.out.println(id3.computeIG(instances, (Attribute) attributes.nextElement()));
            }
            // Test build classifier (second build; exercises retraining)
            try {
                id3.buildClassifier(instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println(id3.toString());
            // Evaluate model from build classifier (full training)
            Evaluation eval = null;
            try {
                eval = new Evaluation(instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            try {
                System.out.println(instances);
                // Evaluating on the training set — optimistic estimate only.
                eval.evaluateModel(id3, instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println(eval.toSummaryString("\nResults Full-Training\n\n", false));
            // Test Confusion Matrix
            System.out.println("Confusion Matrix : ");
            double[][] cmMatrix = eval.confusionMatrix();
            for (int row_i = 0; row_i < cmMatrix.length; row_i++) {
                for (int col_i = 0; col_i < cmMatrix.length; col_i++) {
                    System.out.print(cmMatrix[row_i][col_i]);
                    System.out.print("|");
                }
                System.out.println();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
From source file:iris.ID3.java
public void makeLikeAWhat(Instances instances) { // Create storage for different info gains double[] infoGains = new double[instances.numAttributes()]; // Enumerate through attributes to find the best gain Enumeration attributeEnum = instances.enumerateAttributes(); while (attributeEnum.hasMoreElements()) { // Loop through attributes, adding gain to infoGains array Attribute att = (Attribute) attributeEnum.nextElement(); infoGains[att.index()] = infoGain(instances, att); }//from ww w . ja va 2 s. c o m // Use maxIndex to find the highest info gain in the array highestInfoGain = instances.attribute(Utils.maxIndex(infoGains)); // Make a leaf if there is no more info to gain // Otherwise, create children // Check if there is no more info to gain if (Utils.eq(infoGains[highestInfoGain.index()], 0)) { highestInfoGain = null; // Instantiate maxDistribution maxDistribution = new double[instances.numClasses()]; // Set up enumerator for instances Enumeration instanceEnum = instances.enumerateInstances(); // Tally classes while (instanceEnum.hasMoreElements()) { Instance instance = (Instance) instanceEnum.nextElement(); maxDistribution[(int) instance.classValue()]++; } // Normalize data for easier manipulation Utils.normalize(maxDistribution); // Get the max index of the distrubtion classValue = Utils.maxIndex(maxDistribution); // Save class attribute classAttribute = instances.classAttribute(); } // Create children else { // Split best attribute into bins Instances[] bins = makeBins(instances, highestInfoGain); // Create nodes children = new ID3[highestInfoGain.numValues()]; for (int i = 0; i < highestInfoGain.numValues(); i++) { children[i] = new ID3(); children[i].makeLikeAWhat(bins[i]); } } }
From source file:j48.BinC45ModelSelection.java
License:Open Source License
/**
 * Selects a binary C4.5-type split for the given dataset: builds a candidate
 * BinC45Split per attribute, then picks the one with the best gain ratio
 * among splits whose info gain is at least the (slightly relaxed) average.
 *
 * @param data instances to find a split for
 * @return the best split model, a NoSplit if none is useful, or null on error
 */
public final ClassifierSplitModel selectModel(Instances data) {
    double minResult;
    double currentResult;
    BinC45Split[] currentModel;
    BinC45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    double sumOfWeights;
    int i;
    try {
        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass())))
            return noSplitModel;
        // Check if all attributes are nominal and have a
        // lot of values (multiVal stays true only in that case).
        Enumeration enu = data.enumerateAttributes();
        while (enu.hasMoreElements()) {
            Attribute attribute = (Attribute) enu.nextElement();
            if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(),
                    (0.3 * (double) m_allData.numInstances())))) {
                multiVal = false;
                break;
            }
        }
        currentModel = new BinC45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();
        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {
            // Apart from class attribute.
            if (i != (data).classIndex()) {
                // Get models for current attribute.
                currentModel[i] = new BinC45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);
                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values.
                if (currentModel[i].checkModel())
                    if ((data.attribute(i).isNumeric()) || (multiVal || Utils.sm(
                            (double) data.attribute(i).numValues(),
                            (0.3 * (double) m_allData.numInstances())))) {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
            } else
                currentModel[i] = null;
        }
        // Check if any useful split was found.
        if (validModels == 0)
            return noSplitModel;
        averageInfoGain = averageInfoGain / (double) validModels;
        // Find "best" attribute to split on: best gain ratio among splits
        // whose info gain reaches the average (minus a small tolerance).
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel()))
                // Use 1E-3 here to get a closer approximation to the
                // original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
                        && Utils.gr(currentModel[i].gainRatio(), minResult)) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
        }
        // Check if useful split was found.
        if (Utils.eq(minResult, 0))
            return noSplitModel;
        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());
        // Set the split point analogue to C45 if attribute numeric.
        bestModel.setSplitPoint(m_allData);
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
From source file:j48.C45ModelSelection.java
License:Open Source License
/**
 * Selects a C4.5-type split for the given dataset: builds a candidate
 * C45Split per attribute, then picks the one with the best gain ratio among
 * splits whose info gain is at least the (slightly relaxed) average.
 * Multi-value/min-instance heuristics are skipped when m_allData is null.
 *
 * @param data instances to find a split for
 * @return the best split model, a NoSplit if none is useful, or null on error
 */
public final ClassifierSplitModel selectModel(Instances data) {
    double minResult;
    double currentResult;
    C45Split[] currentModel;
    C45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    Attribute attribute;
    double sumOfWeights;
    int i;
    try {
        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass())))
            return noSplitModel;
        // Check if all attributes are nominal and have a
        // lot of values (multiVal stays true only in that case).
        if (m_allData != null) {
            Enumeration enu = data.enumerateAttributes();
            while (enu.hasMoreElements()) {
                attribute = (Attribute) enu.nextElement();
                if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(),
                        (0.3 * (double) m_allData.numInstances())))) {
                    multiVal = false;
                    break;
                }
            }
        }
        currentModel = new j48.C45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();
        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {
            // Apart from class attribute.
            if (i != (data).classIndex()) {
                // Get models for current attribute.
                currentModel[i] = new j48.C45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);
                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values.
                if (currentModel[i].checkModel())
                    if (m_allData != null) {
                        if ((data.attribute(i).isNumeric()) || (multiVal || Utils.sm(
                                (double) data.attribute(i).numValues(),
                                (0.3 * (double) m_allData.numInstances())))) {
                            averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                            validModels++;
                        }
                    } else {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
            } else
                currentModel[i] = null;
        }
        // Check if any useful split was found.
        if (validModels == 0)
            return noSplitModel;
        averageInfoGain = averageInfoGain / (double) validModels;
        // Find "best" attribute to split on: best gain ratio among splits
        // whose info gain reaches the average (minus a small tolerance).
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel()))
                // Use 1E-3 here to get a closer approximation to the
                // original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
                        && Utils.gr(currentModel[i].gainRatio(), minResult)) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
        }
        // Check if useful split was found.
        if (Utils.eq(minResult, 0))
            return noSplitModel;
        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());
        // Set the split point analogue to C45 if attribute numeric.
        if (m_allData != null)
            bestModel.setSplitPoint(m_allData);
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
From source file:j48.NBTreeModelSelection.java
License:Open Source License
/**
 * Selects an NBTree-type split for the given dataset: builds a global
 * naive-Bayes model at this node, then a candidate NBTreeSplit per attribute,
 * and keeps the split only if it reduces the global error by at least 5%.
 *
 * @param data instances to find a split for
 * @return the best split model, an NBTreeNoSplit if none is useful enough,
 *         or null on error
 */
public final ClassifierSplitModel selectModel(Instances data) {
    double globalErrors = 0;
    double minResult;
    double currentResult;
    NBTreeSplit[] currentModel;
    NBTreeSplit bestModel = null;
    NBTreeNoSplit noSplitModel = null;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    Attribute attribute;
    double sumOfWeights;
    int i;
    try {
        // build the global model at this node
        noSplitModel = new NBTreeNoSplit();
        noSplitModel.buildClassifier(data);
        // Too few instances to consider splitting at all.
        if (data.numInstances() < 5) {
            return noSplitModel;
        }
        // evaluate it
        globalErrors = noSplitModel.getErrors();
        if (globalErrors == 0) {
            return noSplitModel;
        }
        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        if (Utils.sm(checkDistribution.total(), m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass()))) {
            return noSplitModel;
        }
        // Check if all attributes are nominal and have a
        // lot of values (multiVal stays true only in that case).
        if (m_allData != null) {
            Enumeration enu = data.enumerateAttributes();
            while (enu.hasMoreElements()) {
                attribute = (Attribute) enu.nextElement();
                if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(),
                        (0.3 * (double) m_allData.numInstances())))) {
                    multiVal = false;
                    break;
                }
            }
        }
        currentModel = new NBTreeSplit[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();
        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {
            // Apart from class attribute.
            if (i != (data).classIndex()) {
                // Get models for current attribute.
                currentModel[i] = new NBTreeSplit(i, m_minNoObj, sumOfWeights);
                currentModel[i].setGlobalModel(noSplitModel);
                currentModel[i].buildClassifier(data);
                // Check if useful split for current attribute exists.
                if (currentModel[i].checkModel()) {
                    validModels++;
                }
            } else {
                currentModel[i] = null;
            }
        }
        // Check if any useful split was found.
        if (validModels == 0) {
            return noSplitModel;
        }
        // Find "best" attribute to split on: lowest cross-validated error.
        minResult = globalErrors;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel())) {
                if (currentModel[i].getErrors() < minResult) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].getErrors();
                }
            }
        }
        // Check if useful split was found: require at least a 5% relative
        // reduction of the global error, otherwise keep the leaf model.
        if (((globalErrors - minResult) / globalErrors) < 0.05) {
            return noSplitModel;
        }
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}