Example usage for weka.core Instances enumerateInstances

Introduction

In this page you can find the example usage for weka.core Instances enumerateInstances.

Prototype

public Enumeration<Instance> enumerateInstances()

Source Link

Document

Returns an enumeration of all instances in the dataset.

Usage

From source file:org.openml.webapplication.fantail.dc.DCUntils.java

License:Open Source License

/**
 * Partitions a dataset according to the value each instance holds for one attribute.
 *
 * @param data the dataset to partition
 * @param att  the attribute whose value (cast to {@code int}) selects the target partition
 * @return an array with {@code att.numValues()} datasets; element {@code k} receives
 *         every instance whose value for {@code att} casts to {@code k}
 */
private static Instances[] splitData(Instances data, Attribute att) {
    final int numValues = att.numValues();
    final Instances[] partitions = new Instances[numValues];
    for (int v = 0; v < numValues; v++) {
        // Each partition is created from the source dataset with an initial
        // capacity equal to the full instance count.
        partitions[v] = new Instances(data, data.numInstances());
    }

    for (Enumeration<?> e = data.enumerateInstances(); e.hasMoreElements();) {
        Instance current = (Instance) e.nextElement();
        int target = (int) current.value(att);
        partitions[target].add(current);
    }

    // Compact every partition once all instances have been distributed.
    for (int v = 0; v < partitions.length; v++) {
        partitions[v].compactify();
    }
    return partitions;
}

From source file:org.openml.webapplication.fantail.dc.DCUntils.java

License:Open Source License

/**
 * Computes the base-2 entropy of the class distribution of a dataset.
 *
 * <p>With class counts {@code c_k} and {@code N} instances, the value returned is
 * {@code -(sum of c_k * log2(c_k)) / N + log2(N)}.
 *
 * @param data the dataset whose class values are counted
 * @return the entropy of the class distribution
 */
private static double computeEntropy(Instances data) {
    final double[] counts = new double[data.numClasses()];
    for (Enumeration<?> e = data.enumerateInstances(); e.hasMoreElements();) {
        Instance current = (Instance) e.nextElement();
        counts[(int) current.classValue()]++;
    }

    double weightedLogSum = 0;
    for (double count : counts) {
        // Classes that never occur contribute nothing to the sum.
        if (count > 0) {
            weightedLogSum -= count * Utils.log2(count);
        }
    }

    final int total = data.numInstances();
    return weightedLogSum / (double) total + Utils.log2(total);
}

From source file:org.opentox.jaqpot3.qsar.InstancesUtil.java

License:Open Source License

/**
 * Accepts /* w  w w .ja  v  a 2s  . co m*/
 * @param features
 * @param data
 * @param compoundURIposition
 *      Position where the compound URI should be placed. If set to <code>-1</code>
 *      the compound URI will not be included in the created dataset.
 * @return
 *      A subset of the provided dataset (parameter data in this method) with the
 *      features specified in the provided list with that exact order. The compound
 *      URI feature (string) is placed in the position specified by the parameter
 *      compoundURIposition.
 * @throws JaqpotException
 *      A JaqpotException is thrown with error code {@link ErrorCause#FeatureNotInDataset FeatureNotInDataset}
 *      in case you provide a feature that is not found in the sumbitted Instances.
 */
public static Instances sortByFeatureAttrList(List<String> features, final Instances data,
        int compoundURIposition) throws JaqpotException {
    int position = compoundURIposition > features.size() ? features.size() : compoundURIposition;
    if (compoundURIposition != -1) {
        features.add(position, "compound_uri");
    }
    FastVector vector = new FastVector(features.size());
    for (int i = 0; i < features.size(); i++) {
        String feature = features.get(i);
        Attribute attribute = data.attribute(feature);
        if (attribute == null) {
            throw new JaqpotException("The Dataset you provided does not contain feature:" + feature);
        }
        vector.addElement(attribute.copy());
    }
    Instances result = new Instances(data.relationName(), vector, 0);
    Enumeration instances = data.enumerateInstances();
    while (instances.hasMoreElements()) {
        Instance instance = (Instance) instances.nextElement();
        double[] vals = new double[features.size()];
        for (int i = 0; i < features.size(); i++) {
            vals[i] = instance.value(data.attribute(result.attribute(i).name()));
        }
        Instance in = new Instance(1.0, vals);
        result.add(in);
    }
    return result;
}

From source file:org.opentox.jaqpot3.qsar.InstancesUtil.java

License:Open Source License

/**
 * Builds a new dataset whose attributes are the URIs of the given features (in list
 * order, optionally with <code>compound_uri</code> inserted), followed by the
 * attributes of {@code data} found at the indices in {@code trFieldsAttrIndex}.
 *
 * @param list
 *      Features whose URI strings name the attributes to copy.
 * @param trFieldsAttrIndex
 *      Indices into {@code data} of additional attributes appended after the features.
 * @param data
 *      The dataset the attributes and values are taken from.
 * @param compoundURIposition
 *      Position at which <code>compound_uri</code> is inserted among the feature
 *      names; <code>-1</code> leaves it out, and a position beyond the end of the
 *      list is clamped to the end.
 * @return
 *      A new dataset carrying the relation name of {@code data} and one instance
 *      (weight 1.0) per instance of {@code data}.
 * @throws JaqpotException
 *      If a named feature is not an attribute of {@code data}, or an indexed
 *      attribute lookup yields <code>null</code>.
 */
public static Instances sortForPMMLModel(List<Feature> list, List<Integer> trFieldsAttrIndex,
        final Instances data, int compoundURIposition) throws JaqpotException {
    List<String> features = new ArrayList<String>();
    for (Feature current : list) {
        features.add(current.getUri().toString());
    }
    if (compoundURIposition != -1) {
        int insertAt = compoundURIposition > features.size() ? features.size() : compoundURIposition;
        features.add(insertAt, "compound_uri");
    }

    // Header part 1: the attributes addressed by name.
    FastVector header = new FastVector(features.size());
    for (String name : features) {
        Attribute named = data.attribute(name);
        if (named == null) {
            throw new JaqpotException("The Dataset you provided does not contain feature:" + name);
        }
        header.addElement(named.copy());
    }

    // Header part 2: the attributes addressed by index.
    for (int k = 0; k < trFieldsAttrIndex.size(); k++) {
        Attribute indexed = data.attribute(trFieldsAttrIndex.get(k));
        if (indexed == null) {
            throw new JaqpotException("The Dataset you provided does not contain this pmml feature");
        }
        header.addElement(indexed.copy());
    }
    final int attributeSize = features.size() + trFieldsAttrIndex.size();

    Instances result = new Instances(data.relationName(), header, 0);
    for (Enumeration<?> e = data.enumerateInstances(); e.hasMoreElements();) {
        Instance source = (Instance) e.nextElement();
        double[] vals = new double[attributeSize];
        for (int col = 0; col < attributeSize; col++) {
            // Values are fetched through the source attribute with the same name.
            String columnName = result.attribute(col).name();
            vals[col] = source.value(data.attribute(columnName));
        }
        result.add(new Instance(1.0, vals));
    }
    return result;
}

From source file:org.opentox.toxotis.factory.DatasetFactory.java

License:Open Source License

/**
 * Create a dataset using a <code>weka.core.Instances</code> object (based on
 * Weka, version 3.6.2). Since datasets structurally differ from Instances
 * object for they store the information in a more expanded way including meta
 * data and nodes that do not appear in Instances object (or ARFF files), the
 * provided object has to possess a certain structure: The first attribute of
 * it has to be always named <code>compound_uri</code> and be of type <code>string</code>.
 * This attribute stores the URIs of the compounds of the dataset. Second, the rest
 * attributes have to be of type <code>string</code> or <code>numeric</code> or
 * <code>nominal</code> and their name should be an acceptable feature URI (for
 * example <code>http://someserver.com:1234/opentox/feature/54234</code>).
 *
 * @param instances/*from  w  w  w . j ava2s  .c o m*/
 *      Instances object to be converted into a Dataset.
 * @return
 *      The dataset that is created from the provided Instances object.
 * @throws ToxOtisException
 *      In case the conversion is not possible due to structural inconsistencies
 *      of the provided Instances object.
 */
public Dataset createFromArff(Instances instances) throws ToxOtisException {
    if (instances.attribute("compound_uri") == null && instances.attribute("URI") == null) {
        throw new ToxOtisException("Cannot create an OpenTox dataset out of this dataset because "
                + "'compound_uri' was not found in it's attribute list");
    }
    Dataset ds = new Dataset();
    Enumeration instancesEnum = instances.enumerateInstances();
    while (instancesEnum.hasMoreElements()) {
        Instance instance = (Instance) instancesEnum.nextElement();
        ds.getDataEntries().add(createDataEntry(instance));
    }
    try {
        ds.setUri(new VRI(instances.relationName()));
    } catch (URISyntaxException ex) {
        throw new ToxOtisException(
                "The relation name '" + instances.relationName() + "' is not" + "a valid dataset URI!", ex);
    }
    return ds;
}

From source file:org.wkwk.classifier.MyC45.java

/**
 * Computes the base-2 entropy of the class distribution of a dataset.
 *
 * @param data the dataset whose class values are counted
 * @return the sum of {@code -p * log2(p)} over all classes that occur, where
 *         {@code p} is the class count divided by the number of instances
 */
public double computeEntropy(Instances data) {
    // Count the occurrences of each class.
    double[] occurrences = new double[data.numClasses()];
    for (Enumeration<?> e = data.enumerateInstances(); e.hasMoreElements();) {
        Instance current = (Instance) e.nextElement();
        occurrences[(int) current.classValue()]++;
    }

    // Compute the entropy from the relative class frequencies.
    double result = 0;
    for (double count : occurrences) {
        if (count > 0) {
            double proportion = count / data.numInstances();
            result -= proportion * Utils.log2(proportion);
        }
    }
    return result;
}

From source file:org.wkwk.classifier.MyC45.java

/**
 * Partitions a dataset according to the value each instance holds for one attribute.
 *
 * @param data the dataset to partition
 * @param attr the attribute whose value (cast to {@code int}) selects the target subset
 * @return an array with {@code attr.numValues()} datasets; element {@code k} receives
 *         every instance whose value for {@code attr} casts to {@code k}
 */
public Instances[] splitData(Instances data, Attribute attr) {
    final int branches = attr.numValues();
    Instances[] subsets = new Instances[branches];
    for (int b = 0; b < branches; b++) {
        subsets[b] = new Instances(data, data.numInstances());
    }

    for (Enumeration<?> e = data.enumerateInstances(); e.hasMoreElements();) {
        Instance current = (Instance) e.nextElement();
        int branch = (int) current.value(attr);
        subsets[branch].add(current);
    }
    return subsets;
}

From source file:org.wkwk.classifier.MyC45.java

/**
 * Searches for the split threshold on {@code attr} with the highest information gain.
 *
 * <p>The dataset is sorted by {@code attr} in place. Each time the class value
 * changes from one visited instance to the next, a candidate threshold is formed
 * and scored with {@code computeInfoGainCont}; the candidate with the strictly
 * highest gain wins.
 *
 * @param data the dataset to examine; it is sorted by {@code attr} as a side effect
 * @param attr the attribute to find a threshold for
 * @return the best threshold found, or 0 if no candidate had a gain above 0
 */
public double bestThreshold(Instances data, Attribute attr) {
    data.sort(attr);

    double bestGain = 0;
    double bestCut = 0;
    double lastClass = data.get(0).classValue();
    double lastValue = data.get(0).value(attr);

    for (Enumeration<?> e = data.enumerateInstances(); e.hasMoreElements();) {
        Instance current = (Instance) e.nextElement();
        if (lastClass == current.classValue()) {
            continue;
        }
        lastClass = current.classValue();

        // NOTE(review): lastValue is only refreshed on a class change, so the midpoint
        // below lies between the values seen at the previous and the current class
        // change, not between two adjacent instances - confirm this is intended.
        double previousValue = lastValue;
        lastValue = current.value(attr);
        double candidate = previousValue + ((lastValue - previousValue) / 2);

        double gain = computeInfoGainCont(data, attr, candidate);
        if (bestGain < gain) {
            bestGain = gain;
            bestCut = candidate;
        }
    }
    return bestCut;
}

From source file:probcog.bayesnets.learning.CPTLearner.java

License:Open Source License

/**
 * Learns all the examples in the instances. Each instance represents one example.
 * Attributes are mapped to network nodes through
 * {@code bn.getNodeNamesForAttribute}; every node of the network must end up with
 * a matching attribute, otherwise learning fails on the first instance.
 *
 * @param instances the instances to learn from
 * @throws Exception if there are no instances, if an attribute refers to a node
 *                   that is unknown to the network, if a node has no matching
 *                   attribute, or if a value is not found in the domain of its node
 */
public void learn(Instances instances) throws Exception {
    if (!initialized) {
        init();
    }

    // if it's an empty result set, throw exception
    if (instances.numInstances() == 0) {
        throw new Exception("empty result set!");
    }

    BeliefNode[] nodes = bn.bn.getNodes();
    int numAttributes = instances.numAttributes();
    // The network may contain many more nodes than there are attributes, so the
    // two counts are deliberately not required to match.

    // map node indices to attribute index (-1 = no attribute found for the node)
    int[] nodeIdx2colIdx = new int[nodes.length];
    Arrays.fill(nodeIdx2colIdx, -1);
    for (int i = 0; i < numAttributes; i++) {
        Set<String> nodeNames = bn.getNodeNamesForAttribute(instances.attribute(i).name());
        if (nodeNames == null) {
            continue;
        }
        for (String nodeName : nodeNames) {
            int node_idx = bn.getNodeIndex(nodeName);
            if (node_idx == -1) {
                throw new Exception("Unknown node referenced in result set: " + instances.attribute(i).name());
            }
            nodeIdx2colIdx[node_idx] = i;
        }
    }

    // gather data, iterating over the result set
    int[] domainIndices = new int[nodes.length];
    @SuppressWarnings("unchecked")
    Enumeration<Instance> instanceEnum = instances.enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = instanceEnum.nextElement();
        // for each row...
        // - get the indices into the domains of each node
        //   that correspond to the current row of data
        //   (sorted in the same order as the nodes are ordered
        //   in the BeliefNetwork)
        for (int node_idx = 0; node_idx < nodes.length; node_idx++) {
            // Every branch below reads the instance at this column, so the check for
            // an unmapped node is done once up front. Previously the clusterer branch
            // skipped it and would have indexed the instance with -1.
            int colIdx = nodeIdx2colIdx[node_idx];
            if (colIdx < 0) {
                throw new Exception(
                        "No attribute specified for " + bn.bn.getNodes()[node_idx].getName());
            }

            int domain_idx;
            if (clusterers[node_idx] == null) {
                Discrete domain = (Discrete) nodes[node_idx].getDomain();
                String strValue;
                if (domain instanceof Discretized) {
                    // If we have a discretized domain we discretize first...
                    double value = instance.value(colIdx);
                    strValue = ((Discretized) domain).getNameFromContinuous(value);
                } else {
                    strValue = instance.stringValue(colIdx);
                }
                domain_idx = domain.findName(strValue);
                if (domain_idx == -1) {
                    throw new Exception(strValue + " not found in domain of " + nodes[node_idx].getName());
                }
            } else {
                // The clusterer is given a one-attribute instance holding the value.
                Instance inst = new Instance(1);
                inst.setValue(0, instance.value(colIdx));
                domain_idx = clusterers[node_idx].clusterInstance(inst);
            }
            domainIndices[node_idx] = domain_idx;
        }
        // - update each node's CPT
        for (int i = 0; i < nodes.length; i++) {
            counters[i].count(domainIndices);
        }
    }
}

From source file:probcog.bayesnets.learning.DomainLearner.java

License:Open Source License

/**
 * learns all the examples in the result set. Each row in the result set
 * represents one example. All the random variables (nodes) that have been
 * scheduled for learning in the constructor need to be found in each result
 * row as columns that are named accordingly, i.e. for each random variable
 * for which the domain is to be learnt, there must be a column with a
 * matching name in the result set./* w  ww  .  j  av a 2s .  c  o m*/
 * 
 * @param rs
 *            the result set
 * @throws Exception
 *             if the result set is empty
 * @throws SQLException
 *             particularly if there is no matching column for one of the
 *             node names
 */
public void learn(Instances instances) throws Exception, SQLException {
    // if it's an empty result set, throw exception
    if (instances.numInstances() == 0)
        throw new Exception("empty result set!");

    // gather domain data
    int numDirectDomains = directDomains != null ? directDomains.length : 0;
    int numClusteredDomains = clusteredDomains != null ? clusteredDomains.length : 0;
    @SuppressWarnings("unchecked")
    Enumeration<Instance> instanceEnum = instances.enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = instanceEnum.nextElement();
        // for direct learning, add outcomes to the set of outcomes
        for (int i = 0; i < numDirectDomains; i++) {
            directDomainData.get(i).add(instance.stringValue(instances.attribute(directDomains[i].getName())));
        }
        // for clustering, gather all instances
        for (int i = 0; i < numClusteredDomains; i++) {
            Instance inst = new Instance(1);
            inst.setValue(attrValue, instance.value(instances.attribute(clusteredDomains[i].nodeName)));
            clusterData[i].add(inst);
        }
    }
}