Example usage for weka.core Instances checkInstance

List of usage examples for weka.core Instances checkInstance

Introduction

On this page you can find example usages of weka.core.Instances.checkInstance.

Prototype

public boolean checkInstance(Instance instance)

Source Link

Document

Checks if the given instance is compatible with this dataset.

Usage

From source file:edu.umbc.cs.maple.utils.WekaUtils.java

License:Open Source License

/** Merge two instance sets.
 * @param instances1/*from   ww w  .  j av  a 2 s . c o  m*/
 * @param instances2
 * @return the merged instance sets
 */
public static Instances mergeInstances(Instances instances1, Instances instances2) {
    if (instances1 == null)
        return instances2;
    if (instances2 == null)
        return instances1;
    if (!instances1.checkInstance(instances2.firstInstance()))
        throw new IllegalArgumentException("The instance sets are incompatible.");
    Instances mergedInstances = new Instances(instances1);
    Instances tempInstances = new Instances(instances2);
    for (int i = 0; i < tempInstances.numInstances(); i++) {
        mergedInstances.add(tempInstances.instance(i));
    }
    return mergedInstances;
}

From source file:org.opentox.ontology.data.Dataset.java

License:Open Source License

/**
 * The dataset as <code>Instances</code>. These objects are used by weka as
 * input/output object to most algorithms (training, data preprocessing etc).
 * The Instances equivalent of the dataset may contain three different types of
 * <code>attributes</code>: numeric, nominal and/or string ones. The first attribute
 * is always a string one corresponding to the compound of the dataentry while
 * acting as an identifier for it. The name of this attribute is <code>compound_uri</code>
 * and is unique among all data entries.
 *
 * <p>Values that cannot be parsed as numbers, values that refer to a feature
 * which was not found among the features of the model, and data entries
 * without a compound URI are left as missing values. Every statement iterator
 * opened by this method is closed before it returns, including when an
 * exception is thrown.
 *
 * @return Instances object for the dataset.
 * @throws YaqpOntException In case something goes wrong with the provided
 * representation (e.g. it does not correspond to a valid dataset).
 */
public Instances getInstances() throws YaqpOntException {

    // SOME INITIAL DEFINITIONS:
    Resource _DATAENTRY = OTClass.DataEntry.getOntClass(oo), _DATASET = OTClass.Dataset.getOntClass(oo),
            _FEATURE = OTClass.Feature.getOntClass(oo),
            _NUMERIC_FEATURE = OTClass.NumericFeature.getOntClass(oo),
            _NOMINAL_FEATURE = OTClass.NominalFeature.getOntClass(oo),
            _STRING_FEATURE = OTClass.StringFeature.getOntClass(oo);
    Map<Resource, WekaDataTypes> featureTypes = new HashMap<Resource, WekaDataTypes>();
    Map<Resource, ArrayList<String>> featureNominalValues = new HashMap<Resource, ArrayList<String>>();

    // CHECK IF THE RESOURCE IS A DATASET. IF YES, GET ITS IDENTIFIER AND SET
    // THE RELATION NAME ACCORDINGLY. IF NOT THROW AN ImproperEntityException.
    // ALSO CHECK IF THERE ARE MULTIPLE DATASETS AND IF YES THROW EXCEPTION.
    String relationName;
    StmtIterator dataSetIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _DATASET));
    try {
        if (dataSetIterator.hasNext()) {
            relationName = dataSetIterator.next().getSubject().getURI();
            if (dataSetIterator.hasNext()) {
                throw new YaqpOntException(Cause.XONT518, "More than one datasets found");
            }
        } else {
            // this is not a dataset model
            throw new ImproperEntityException(Cause.XIE2, "Not a dataset");
        }
    } finally {
        // Closed here so the iterator is released on the exception paths too.
        dataSetIterator.close();
    }

    //  POPULATE THE MAP WHICH CORRELATES RESOURCES TO WEKA DATA TYPES
    StmtIterator featureIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _FEATURE));
    try {
        while (featureIterator.hasNext()) {
            Resource feature = featureIterator.next().getSubject().as(Resource.class);

            // Collect every rdf:type declared for this feature.
            Set<Resource> featureTypesSet = new HashSet<Resource>();
            StmtIterator featureTypeIterator = oo
                    .listStatements(new SimpleSelector(feature, RDF.type, (RDFNode) null));
            try {
                while (featureTypeIterator.hasNext()) {
                    Resource type = featureTypeIterator.next().getObject().as(Resource.class);
                    featureTypesSet.add(type);
                }
            } finally {
                featureTypeIterator.close();
            }

            // Precedence when several types are declared: numeric, then string, then nominal.
            if (featureTypesSet.contains(_NUMERIC_FEATURE)) {
                featureTypes.put(feature, WekaDataTypes.numeric);
            } else if (featureTypesSet.contains(_STRING_FEATURE)) {
                featureTypes.put(feature, WekaDataTypes.string);
            } else if (featureTypesSet.contains(_NOMINAL_FEATURE)) {
                featureTypes.put(feature, WekaDataTypes.nominal);
                // GET THE RANGE OF THE FEATURE:
                ArrayList<String> nominalValues = new ArrayList<String>();
                StmtIterator acceptValueIterator = oo.listStatements(new SimpleSelector(feature,
                        OTDataTypeProperties.acceptValue.createProperty(oo), (RDFNode) null));
                try {
                    while (acceptValueIterator.hasNext()) {
                        nominalValues.add(acceptValueIterator.next().getObject().as(Literal.class).getString());
                    }
                } finally {
                    acceptValueIterator.close();
                }
                featureNominalValues.put(feature, nominalValues);
            } else {
                assert (featureTypesSet.contains(_FEATURE));
                featureTypes.put(feature, WekaDataTypes.general);
            }
        }
    } finally {
        featureIterator.close();
    }

    // GET THE ATTRIBUTES FOR THE DATASET:
    FastVector attributes = getAttributes(featureTypes, featureNominalValues);
    Instances data = new Instances(relationName, attributes, 0);

    // ITERATE OVER ALL DATA ENTRIES IN THE DATASET:
    StmtIterator dataEntryIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _DATAENTRY));
    try {
        while (dataEntryIterator.hasNext()) {
            Statement dataEntry = dataEntryIterator.next();

            // Start from an all-missing row; only values that can be resolved overwrite it.
            double[] vals = new double[data.numAttributes()];
            for (int i = 0; i < data.numAttributes(); i++) {
                vals[i] = Instance.missingValue();
            }

            String compoundName = null;
            StmtIterator compoundNamesIterator = oo.listStatements(new SimpleSelector(dataEntry.getSubject(),
                    OTObjectProperties.compound.createProperty(oo), (Resource) null));
            try {
                if (compoundNamesIterator.hasNext()) {
                    compoundName = compoundNamesIterator.next().getObject().as(Resource.class).getURI();
                }
            } finally {
                compoundNamesIterator.close();
            }
            // A data entry without a compound (or with an anonymous one) keeps a
            // missing compound_uri instead of handing null to addStringValue.
            if (compoundName != null) {
                vals[data.attribute(compound_uri).index()] = data.attribute(compound_uri)
                        .addStringValue(compoundName);
            }

            /*
             * B2. For every dataEntry, iterate over all values nodes.
             */
            StmtIterator valuesIterator = oo.listStatements(new SimpleSelector(dataEntry.getSubject(),
                    OTObjectProperties.values.createProperty(oo), (Resource) null));
            try {
                while (valuesIterator.hasNext()) {
                    Statement values = valuesIterator.next();

                    /*
                     * A pair of the form (AttributeName, AttributeValue) is created.
                     * This will be registered in an Instance-type object which
                     * is turn will be used to update the dataset.
                     */

                    // atVal is the value of the attribute
                    String atVal = values.getProperty(OTDataTypeProperties.value.createProperty(oo)).getObject()
                            .as(Literal.class).getValue().toString();
                    // and atName is the name of the corresponding attribute.
                    String atName = values.getProperty(OTObjectProperties.feature.createProperty(oo)).getObject()
                            .as(Resource.class).getURI();

                    WekaDataTypes featureType = featureTypes.get(oo.createResource(atName));
                    if (featureType == null) {
                        // The value points at a feature that was not found in the
                        // feature scan above; leave it missing rather than fail.
                        continue;
                    }

                    if (featureType.equals(WekaDataTypes.numeric)) {
                        try {
                            vals[data.attribute(atName).index()] = Double.parseDouble(atVal);
                        } catch (NumberFormatException ex) {
                            /*
                             * Handles values declared as numeric (double, float etc) whose
                             * text cannot be parsed as a double: the value is deliberately
                             * left out of the dataset (it stays missing).
                             */
                        }
                    } else if (featureType.equals(WekaDataTypes.string)) {
                        vals[data.attribute(atName).index()] = data.attribute(atName).addStringValue(atVal);
                    } else if (XSDDatatype.XSDdate.getURI().equals(atName)) {
                        // NOTE(review): this compares the feature URI with the XSD date
                        // datatype URI, so it looks like it rarely (if ever) matches - confirm intent.
                        try {
                            vals[data.attribute(atName).index()] = data.attribute(atName).parseDate(atVal);
                        } catch (ParseException ex) {
                            System.out.println(ex);
                            //Logger.getLogger(Dataset.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                    // NOTE(review): nominal and general values are not written here and
                    // therefore stay missing - confirm this is intended.
                }
            } finally {
                valuesIterator.close();
            }

            Instance temp = new Instance(1.0, vals);

            // Add the Instance only if its compatible with the dataset!
            if (data.checkInstance(temp)) {
                data.add(temp);
            } else {
                System.err.println("Warning! The instance " + temp + " is not compatible with the dataset!");
            }
        }
    } finally {
        dataEntryIterator.close();
    }

    return data;

}

From source file:org.opentox.toxotis.core.component.Dataset.java

License:Open Source License

/**
 * <p align="justify">Creates and returns a <code>weka.core.Instances</code>
 * object from the data contained in this Dataset. The Instances object created has the following specific structure:
 * The first element in each Instance is always the Compound's URI. It is
 * identified by the keyword <code>compound_uri</code>. Following that comes a sequence
 * of all Features contained the Dataset's DataEntries, described as
 * either <code>String</code>,<code>Numeric</code> or <code> Nominal</code>.
 * If a compound doesn't possess a value for a specific Feature, or the value is
 * unreadable or unacceptable (e.g. a String value is present when a Numeric is
 * expected), a missing value is placed instead. If a Feature is tagged as both
 * Numeric|String and Nominal, the Nominal property wins. If it is tagged as
 * both Numeric and String, the String property wins.
 * </p>/*w w w.j  a va2 s  . c  om*/
 *
 * @return
 *      Weka Instances from the data contained in this Dataset.
 */
public Instances getInstances() {
    long timeFlag = System.currentTimeMillis();
    // GET THE ATTRIBUTES FOR THE DATASET:
    FastVector attributes = new FastVector();
    Set<Feature> features = getContainedFeatures();
    // THE EXISTENCE OF THE (STRING) ATTRIBUTE 'COMPOUND_URI' IS MANDATORY FOR ALL
    // DATASETS. THIS IS ALWAYS THE FIRST ATTRIBUTE IN THE LIST.
    attributes.addElement(new Attribute(COMPOUND_URI, (FastVector) null));
    // ADD NUMERIC AND STRING ATTRIBUTES INTO THE FASTVECTOR:
    for (Feature feature : features) {
        WekaDataTypes dataType = WekaDataTypes.getFromFeature(feature);
        if (dataType.equals(WekaDataTypes.numeric)) {
            attributes.addElement(new Attribute(feature.getUri().getStringNoQuery()));
        } else if (dataType.equals(WekaDataTypes.string) || dataType.equals(WekaDataTypes.general)) {
            attributes.addElement(new Attribute(feature.getUri().getStringNoQuery(), (FastVector) null));
        } else if (dataType.equals(WekaDataTypes.nominal)) {
            // COPE WITH NOMINAL VALUES:
            FastVector nominalFVec = new FastVector(feature.getAdmissibleValues().size());
            for (LiteralValue value : feature.getAdmissibleValues()) {
                nominalFVec.addElement(value.getValue());
            }
            attributes.addElement(new Attribute(feature.getUri().getStringNoQuery(), nominalFVec));
        }
    }

    Instances data = new Instances(this.getUri().getStringNoQuery(), attributes, 0);

    //POPULATE WITH VALUES:
    for (DataEntry dataEntry : this.getDataEntries()) {
        double[] vals = new double[data.numAttributes()];
        for (int i = 0; i < data.numAttributes(); i++) {
            vals[i] = Instance.missingValue();
        }

        Compound conformer = dataEntry.getConformer();

        vals[data.attribute(COMPOUND_URI).index()] = data.attribute(COMPOUND_URI)
                .addStringValue(conformer.getUri().getStringNoQuery());

        for (FeatureValue featureValue : dataEntry.getFeatureValues()) {
            Feature feature = featureValue.getFeature();
            String featureName = feature.getUri().getStringNoQuery();
            LiteralValue value = featureValue.getValue();

            if (value != null) {
                if (WekaDataTypes.getFromFeature(feature).equals(WekaDataTypes.numeric)) {
                    try {
                        vals[data.attribute(featureName).index()] = Double
                                .parseDouble(value.getValue().toString());
                    } catch (NumberFormatException ex) {
                        logger.warn("NFE while trying to convert to double the value " + value.getValue(), ex);
                    }
                } else if (WekaDataTypes.getFromFeature(feature).equals(WekaDataTypes.string)) {
                    vals[data.attribute(featureName).index()] = data.attribute(featureName)
                            .addStringValue((String) value.getValue().toString());
                } else if (XSDDatatype.XSDdate.getURI().equals(featureName)) {
                    try {
                        vals[data.attribute(featureName).index()] = data.attribute(featureName)
                                .parseDate((String) value.getValue());
                    } catch (ParseException ex) {
                        logger.error("Parsing Exception for Date in Dataset", ex);
                    }
                } else if (WekaDataTypes.getFromFeature(feature).equals(WekaDataTypes.nominal)) {
                    //TODO: Nominals may not work, testing is needed.
                    vals[data.attribute(featureName).index()] = data.attribute(featureName)
                            .indexOfValue(value.getValue().toString());
                }
            }
        }

        Instance valuesInstance = new Instance(1.0, vals);
        // Add the Instance only if its compatible with the dataset!
        if (data.checkInstance(valuesInstance)) {
            data.add(valuesInstance);
        } else {
            logger.warn("Warning! The instance " + valuesInstance + " is not compatible with the dataset!");
        }
    }
    timeInstancesConversion = System.currentTimeMillis() - timeFlag;
    return data;
}