List of usage examples for weka.core Instances checkInstance
public boolean checkInstance(Instance instance)
From source file:edu.umbc.cs.maple.utils.WekaUtils.java
License:Open Source License
/** Merge two instance sets. * @param instances1/*from ww w . j av a 2 s . c o m*/ * @param instances2 * @return the merged instance sets */ public static Instances mergeInstances(Instances instances1, Instances instances2) { if (instances1 == null) return instances2; if (instances2 == null) return instances1; if (!instances1.checkInstance(instances2.firstInstance())) throw new IllegalArgumentException("The instance sets are incompatible."); Instances mergedInstances = new Instances(instances1); Instances tempInstances = new Instances(instances2); for (int i = 0; i < tempInstances.numInstances(); i++) { mergedInstances.add(tempInstances.instance(i)); } return mergedInstances; }
From source file:org.opentox.ontology.data.Dataset.java
License:Open Source License
/** * The dataset as <code>Instances</code>. These objects are used by weka as * input/output object to most algorithms (training, data preprocessing etc). * The Instances equivalent of the dataset may contain three different types of * <code>attributes</code>: numeric, nominal and/or string ones. The first attribute * is always a string one corresponding to the compound of the dataentry while * acting as an identifier for it. The name of this attribute is <code>compound_uri</code> * and is unique among all data entries. * @return Instances object for the dataset. * @throws YaqpOntException In case something goes wrong with the provided * representation (e.g. it does not correspond to a valid dataset). *//*w ww.j a v a2 s .c o m*/ public Instances getInstances() throws YaqpOntException { // SOME INITIAL DEFINITIONS: Resource _DATAENTRY = OTClass.DataEntry.getOntClass(oo), _DATASET = OTClass.Dataset.getOntClass(oo), _FEATURE = OTClass.Feature.getOntClass(oo), _NUMERIC_FEATURE = OTClass.NumericFeature.getOntClass(oo), _NOMINAL_FEATURE = OTClass.NominalFeature.getOntClass(oo), _STRING_FEATURE = OTClass.StringFeature.getOntClass(oo); FastVector attributes = null; Instances data = null; StmtIterator dataSetIterator = null, featureIterator = null, valuesIterator = null, dataEntryIterator = null; String relationName = null; Map<Resource, WekaDataTypes> featureTypes = new HashMap<Resource, WekaDataTypes>(); Map<Resource, ArrayList<String>> featureNominalValues = new HashMap<Resource, ArrayList<String>>(); // CHECK IF THE RESOURCE IS A DATASET. IF YES, GET ITS IDENTIFIER AND SET // THE RELATION NAME ACCORDINGLY. IF NOT THROW AN ImproperEntityException. // ALSO CHECK IF THERE ARE MULTIPLE DATASETS AND IF YES THROW EXCEPTION. 
dataSetIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _DATASET)); if (dataSetIterator.hasNext()) { relationName = dataSetIterator.next().getSubject().getURI(); if (dataSetIterator.hasNext()) { throw new YaqpOntException(Cause.XONT518, "More than one datasets found"); } } else { // this is not a dataset model throw new ImproperEntityException(Cause.XIE2, "Not a dataset"); } dataSetIterator.close(); // POPULATE THE MAP WHICH CORRELATES RESOURCES TO WEKA DATA TYPES ArrayList<String> nominalValues = new ArrayList<String>(); featureIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _FEATURE)); while (featureIterator.hasNext()) { Resource feature = featureIterator.next().getSubject().as(Resource.class); StmtIterator featureTypeIterator = oo .listStatements(new SimpleSelector(feature, RDF.type, (RDFNode) null)); Set<Resource> featureTypesSet = new HashSet<Resource>(); while (featureTypeIterator.hasNext()) { Resource type = featureTypeIterator.next().getObject().as(Resource.class); featureTypesSet.add(type); } if (featureTypesSet.contains(_NUMERIC_FEATURE)) { featureTypes.put(feature, WekaDataTypes.numeric); } else if (featureTypesSet.contains(_STRING_FEATURE)) { featureTypes.put(feature, WekaDataTypes.string); } else if (featureTypesSet.contains(_NOMINAL_FEATURE)) { featureTypes.put(feature, WekaDataTypes.nominal); StmtIterator acceptValueIterator = oo.listStatements(new SimpleSelector(feature, OTDataTypeProperties.acceptValue.createProperty(oo), (RDFNode) null)); // GET THE RANGE OF THE FEATURE: while (acceptValueIterator.hasNext()) { nominalValues.add(acceptValueIterator.next().getObject().as(Literal.class).getString()); } featureNominalValues.put(feature, nominalValues); nominalValues = new ArrayList<String>(); } else { assert (featureTypesSet.contains(_FEATURE)); featureTypes.put(feature, WekaDataTypes.general); } } // GET THE ATTRIBUTES FOR THE DATASET: attributes = getAttributes(featureTypes, featureNominalValues); data = new 
Instances(relationName, attributes, 0); // ITERATE OVER ALL DATA ENTRIES IN THE DATASET: dataEntryIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _DATAENTRY)); while (dataEntryIterator.hasNext()) { Statement dataEntry = dataEntryIterator.next(); /** * B2. For every dataEntry, iterate over all values nodes. */ Instance temp = null; valuesIterator = oo.listStatements(new SimpleSelector(dataEntry.getSubject(), OTObjectProperties.values.createProperty(oo), (Resource) null)); double[] vals = new double[data.numAttributes()]; for (int i = 0; i < data.numAttributes(); i++) { vals[i] = Instance.missingValue(); } StmtIterator compoundNamesIterator = oo.listStatements(new SimpleSelector(dataEntry.getSubject(), OTObjectProperties.compound.createProperty(oo), (Resource) null)); String compoundName = null; if (compoundNamesIterator.hasNext()) { compoundName = compoundNamesIterator.next().getObject().as(Resource.class).getURI(); } vals[data.attribute(compound_uri).index()] = data.attribute(compound_uri).addStringValue(compoundName); while (valuesIterator.hasNext()) { Statement values = valuesIterator.next(); /* * A pair of the form (AttributeName, AttributeValue) is created. * This will be registered in an Instance-type object which * is turn will be used to update the dataset. */ // atVal is the value of the attribute String atVal = values.getProperty(OTDataTypeProperties.value.createProperty(oo)).getObject() .as(Literal.class).getValue().toString(); // and atName is the name of the corresponding attribute. String atName = values.getProperty(OTObjectProperties.feature.createProperty(oo)).getObject() .as(Resource.class).getURI(); if (featureTypes.get(oo.createResource(atName)).equals(WekaDataTypes.numeric)) { try { vals[data.attribute(atName).index()] = Double.parseDouble(atVal); /** * The following catch rule, handles cases where some values are declared * as numeric (double, float etc) but their value cannot be cast as * double. 
*/ } catch (NumberFormatException ex) { /* Just don't include this value in the dataset */ } } else if (featureTypes.get(oo.createResource(atName)).equals(WekaDataTypes.string)) { vals[data.attribute(atName).index()] = data.attribute(atName).addStringValue(atVal); } else if (XSDDatatype.XSDdate.getURI().equals(atName)) { try { vals[data.attribute(atName).index()] = data.attribute(atName).parseDate(atVal); } catch (ParseException ex) { System.out.println(ex); //Logger.getLogger(Dataset.class.getName()).log(Level.SEVERE, null, ex); } } } temp = new Instance(1.0, vals); // Add the Instance only if its compatible with the dataset! if (data.checkInstance(temp)) { data.add(temp); } else { System.err.println("Warning! The instance " + temp + " is not compatible with the dataset!"); } } dataEntryIterator.close(); return data; }
From source file:org.opentox.toxotis.core.component.Dataset.java
License:Open Source License
/** * <p align="justify">Creates and returns a <code>weka.core.Instances</code> * object from the data contained in this Dataset. The Instances object created has the following specific structure: * The first element in each Instance is always the Compound's URI. It is * identified by the keyword <code>compound_uri</code>. Following that comes a sequence * of all Features contained the Dataset's DataEntries, described as * either <code>String</code>,<code>Numeric</code> or <code> Nominal</code>. * If a compound doesn't possess a value for a specific Feature, or the value is * unreadable or unacceptable (e.g. a String value is present when a Numeric is * expected), a missing value is placed instead. If a Feature is tagged as both * Numeric|String and Nominal, the Nominal property wins. If it is tagged as * both Numeric and String, the String property wins. * </p>/*w w w.j a va2 s . c om*/ * * @return * Weka Instances from the data contained in this Dataset. */ public Instances getInstances() { long timeFlag = System.currentTimeMillis(); // GET THE ATTRIBUTES FOR THE DATASET: FastVector attributes = new FastVector(); Set<Feature> features = getContainedFeatures(); // THE EXISTENCE OF THE (STRING) ATTRIBUTE 'COMPOUND_URI' IS MANDATORY FOR ALL // DATASETS. THIS IS ALWAYS THE FIRST ATTRIBUTE IN THE LIST. 
attributes.addElement(new Attribute(COMPOUND_URI, (FastVector) null)); // ADD NUMERIC AND STRING ATTRIBUTES INTO THE FASTVECTOR: for (Feature feature : features) { WekaDataTypes dataType = WekaDataTypes.getFromFeature(feature); if (dataType.equals(WekaDataTypes.numeric)) { attributes.addElement(new Attribute(feature.getUri().getStringNoQuery())); } else if (dataType.equals(WekaDataTypes.string) || dataType.equals(WekaDataTypes.general)) { attributes.addElement(new Attribute(feature.getUri().getStringNoQuery(), (FastVector) null)); } else if (dataType.equals(WekaDataTypes.nominal)) { // COPE WITH NOMINAL VALUES: FastVector nominalFVec = new FastVector(feature.getAdmissibleValues().size()); for (LiteralValue value : feature.getAdmissibleValues()) { nominalFVec.addElement(value.getValue()); } attributes.addElement(new Attribute(feature.getUri().getStringNoQuery(), nominalFVec)); } } Instances data = new Instances(this.getUri().getStringNoQuery(), attributes, 0); //POPULATE WITH VALUES: for (DataEntry dataEntry : this.getDataEntries()) { double[] vals = new double[data.numAttributes()]; for (int i = 0; i < data.numAttributes(); i++) { vals[i] = Instance.missingValue(); } Compound conformer = dataEntry.getConformer(); vals[data.attribute(COMPOUND_URI).index()] = data.attribute(COMPOUND_URI) .addStringValue(conformer.getUri().getStringNoQuery()); for (FeatureValue featureValue : dataEntry.getFeatureValues()) { Feature feature = featureValue.getFeature(); String featureName = feature.getUri().getStringNoQuery(); LiteralValue value = featureValue.getValue(); if (value != null) { if (WekaDataTypes.getFromFeature(feature).equals(WekaDataTypes.numeric)) { try { vals[data.attribute(featureName).index()] = Double .parseDouble(value.getValue().toString()); } catch (NumberFormatException ex) { logger.warn("NFE while trying to convert to double the value " + value.getValue(), ex); } } else if (WekaDataTypes.getFromFeature(feature).equals(WekaDataTypes.string)) { 
vals[data.attribute(featureName).index()] = data.attribute(featureName) .addStringValue((String) value.getValue().toString()); } else if (XSDDatatype.XSDdate.getURI().equals(featureName)) { try { vals[data.attribute(featureName).index()] = data.attribute(featureName) .parseDate((String) value.getValue()); } catch (ParseException ex) { logger.error("Parsing Exception for Date in Dataset", ex); } } else if (WekaDataTypes.getFromFeature(feature).equals(WekaDataTypes.nominal)) { //TODO: Nominals may not work, testing is needed. vals[data.attribute(featureName).index()] = data.attribute(featureName) .indexOfValue(value.getValue().toString()); } } } Instance valuesInstance = new Instance(1.0, vals); // Add the Instance only if its compatible with the dataset! if (data.checkInstance(valuesInstance)) { data.add(valuesInstance); } else { logger.warn("Warning! The instance " + valuesInstance + " is not compatible with the dataset!"); } } timeInstancesConversion = System.currentTimeMillis() - timeFlag; return data; }