/*
 *
 * YAQP - Yet Another QSAR Project:
 * Machine Learning algorithms designed for the prediction of toxicological
 * features of chemical compounds become available on the Web. Yaqp is developed
 * under OpenTox (http://opentox.org) which is an FP7-funded EU research project.
 * This project was developed at the Automatic Control Lab in the Chemical Engineering
 * School of National Technical University of Athens. Please read README for more
 * information.
 *
 * Copyright (C) 2009-2010 Pantelis Sopasakis & Charalampos Chomenides
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact:
 * Pantelis Sopasakis
 * chvng@mail.ntua.gr
 * Address: Iroon Politechniou St. 9, Zografou, Athens Greece
 * tel. +30 210 7723236
 */
package org.opentox.ontology.data;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.SimpleSelector;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.vocabulary.RDF;
import java.net.URI;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.opentox.core.exceptions.Cause;
import org.opentox.core.processors.Pipeline;
import org.opentox.io.processors.InputProcessor;
import org.opentox.io.publishable.OntObject;
import org.opentox.io.publishable.RDFObject;
import org.opentox.ontology.exceptions.ImproperEntityException;
import org.opentox.ontology.exceptions.YaqpOntException;
import org.opentox.ontology.namespaces.OTClass;
import org.opentox.ontology.namespaces.OTDataTypeProperties;
import org.opentox.ontology.namespaces.OTObjectProperties;
import org.opentox.ontology.processors.InstancesProcessor;
import org.opentox.qsar.processors.filters.AbstractFilter;
import org.opentox.qsar.processors.filters.AttributeCleanup;
import org.opentox.qsar.processors.filters.AttributeCleanup.ATTRIBUTE_TYPE;
import org.opentox.qsar.processors.filters.SimpleMVHFilter;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

/**
 * A set of data which can be used for training or testing a model.
 *
 * @author Pantelis Sopasakis
 * @author Charalampos Chomenides
 */
@SuppressWarnings({"unchecked"})
public class Dataset {

    private OntObject oo = null;
    /**
     * The name of the first attribute in the dataset, corresponding to a
     * unique identifier for the compound.
     */
    private static final String compound_uri = "compound_uri";

    private enum WekaDataTypes {
        string, nominal, numeric, general;
    }

    /**
     * A dataset is instantiated by providing an OntObject, which is an ontological
     * model. The class {@link OntObject } is in fact an extension of
     * <code>OntModelImpl</code> of jena. Such an object (OntObject) can be retrieved
     * from a remote dataset server, or from a local resource (e.g. a file) using the
     * <code>InputProcessor</code>.
     *
     * @param oo An ontological object holding a representation of a dataset. If an improper
     * ontological entity is provided to construct the Dataset, methods like
     * {@link Dataset#getInstances() getInstances()} are not likely to work, so you have to
     * check that the resource you provided is a dataset resource.
     * @see DatasetBuilder
     */
    public Dataset(OntObject oo) {
        this.oo = oo;
    }
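    /**
     * A minimal construction sketch (illustrative only, not part of the original
     * API): parses a dataset representation from a URI and wraps it in a
     * {@link Dataset}. It assumes that {@link InputProcessor#process} accepts a
     * <code>java.net.URI</code> and returns the parsed ontological model, as the
     * pipeline in {@link #main} suggests.
     */
    private static Dataset exampleFromUri(URI datasetUri) throws Exception {
        InputProcessor<OntObject> parser = new InputProcessor<OntObject>();
        OntObject model = parser.process(datasetUri); // fetch and parse the RDF representation
        return new Dataset(model);                    // getInstances() performs the weka conversion
    }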
    /**
     * The dataset as <code>Instances</code>. Such objects are used by weka as
     * input/output to most algorithms (training, data preprocessing etc.).
     * The Instances equivalent of the dataset may contain three different types of
     * <code>attributes</code>: numeric, nominal and/or string ones. The first attribute
     * is always a string one corresponding to the compound of the data entry while
     * acting as an identifier for it. The name of this attribute is
     * <code>compound_uri</code> and its value is unique among all data entries.
     *
     * @return Instances object for the dataset.
     * @throws YaqpOntException In case something goes wrong with the provided
     * representation (e.g. it does not correspond to a valid dataset).
     */
    public Instances getInstances() throws YaqpOntException {
        // SOME INITIAL DEFINITIONS:
        Resource _DATAENTRY = OTClass.DataEntry.getOntClass(oo),
                _DATASET = OTClass.Dataset.getOntClass(oo),
                _FEATURE = OTClass.Feature.getOntClass(oo),
                _NUMERIC_FEATURE = OTClass.NumericFeature.getOntClass(oo),
                _NOMINAL_FEATURE = OTClass.NominalFeature.getOntClass(oo),
                _STRING_FEATURE = OTClass.StringFeature.getOntClass(oo);
        FastVector attributes = null;
        Instances data = null;
        StmtIterator dataSetIterator = null,
                featureIterator = null,
                valuesIterator = null,
                dataEntryIterator = null;
        String relationName = null;
        Map<Resource, WekaDataTypes> featureTypes = new HashMap<Resource, WekaDataTypes>();
        Map<Resource, ArrayList<String>> featureNominalValues = new HashMap<Resource, ArrayList<String>>();

        // CHECK IF THE RESOURCE IS A DATASET. IF YES, GET ITS IDENTIFIER AND SET
        // THE RELATION NAME ACCORDINGLY. IF NOT, THROW AN ImproperEntityException.
        // ALSO CHECK IF THERE ARE MULTIPLE DATASETS AND IF YES THROW AN EXCEPTION.
        dataSetIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _DATASET));
        if (dataSetIterator.hasNext()) {
            relationName = dataSetIterator.next().getSubject().getURI();
            if (dataSetIterator.hasNext()) {
                throw new YaqpOntException(Cause.XONT518, "More than one dataset found");
            }
        } else {
            // this is not a dataset model
            throw new ImproperEntityException(Cause.XIE2, "Not a dataset");
        }
        dataSetIterator.close();

        // POPULATE THE MAP WHICH CORRELATES RESOURCES TO WEKA DATA TYPES
        ArrayList<String> nominalValues = new ArrayList<String>();
        featureIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _FEATURE));
        while (featureIterator.hasNext()) {
            Resource feature = featureIterator.next().getSubject().as(Resource.class);
            StmtIterator featureTypeIterator =
                    oo.listStatements(new SimpleSelector(feature, RDF.type, (RDFNode) null));
            Set<Resource> featureTypesSet = new HashSet<Resource>();
            while (featureTypeIterator.hasNext()) {
                Resource type = featureTypeIterator.next().getObject().as(Resource.class);
                featureTypesSet.add(type);
            }
            if (featureTypesSet.contains(_NUMERIC_FEATURE)) {
                featureTypes.put(feature, WekaDataTypes.numeric);
            } else if (featureTypesSet.contains(_STRING_FEATURE)) {
                featureTypes.put(feature, WekaDataTypes.string);
            } else if (featureTypesSet.contains(_NOMINAL_FEATURE)) {
                featureTypes.put(feature, WekaDataTypes.nominal);
                StmtIterator acceptValueIterator = oo.listStatements(new SimpleSelector(feature,
                        OTDataTypeProperties.acceptValue.createProperty(oo), (RDFNode) null));
                // GET THE RANGE OF THE FEATURE:
                while (acceptValueIterator.hasNext()) {
                    nominalValues.add(acceptValueIterator.next().getObject().as(Literal.class).getString());
                }
                featureNominalValues.put(feature, nominalValues);
                nominalValues = new ArrayList<String>();
            } else {
                assert (featureTypesSet.contains(_FEATURE));
                featureTypes.put(feature, WekaDataTypes.general);
            }
        }

        // GET THE ATTRIBUTES FOR THE DATASET:
        attributes = getAttributes(featureTypes, featureNominalValues);
        data = new Instances(relationName, attributes, 0);

        // ITERATE OVER ALL DATA ENTRIES IN THE DATASET:
        dataEntryIterator = oo.listStatements(new SimpleSelector(null, RDF.type, _DATAENTRY));
        while (dataEntryIterator.hasNext()) {
            Statement dataEntry = dataEntryIterator.next();
            /*
             * B2. For every dataEntry, iterate over all values nodes.
             */
            Instance temp = null;
            valuesIterator = oo.listStatements(new SimpleSelector(dataEntry.getSubject(),
                    OTObjectProperties.values.createProperty(oo), (Resource) null));
            double[] vals = new double[data.numAttributes()];
            for (int i = 0; i < data.numAttributes(); i++) {
                vals[i] = Instance.missingValue();
            }

            StmtIterator compoundNamesIterator = oo.listStatements(new SimpleSelector(dataEntry.getSubject(),
                    OTObjectProperties.compound.createProperty(oo), (Resource) null));
            String compoundName = null;
            if (compoundNamesIterator.hasNext()) {
                compoundName = compoundNamesIterator.next().getObject().as(Resource.class).getURI();
            }
            vals[data.attribute(compound_uri).index()] = data.attribute(compound_uri).addStringValue(compoundName);

            while (valuesIterator.hasNext()) {
                Statement values = valuesIterator.next();
                /*
                 * A pair of the form (AttributeName, AttributeValue) is created.
                 * This will be registered in an Instance-type object which in
                 * turn will be used to update the dataset.
                 */
                // atVal is the value of the attribute
                String atVal = values.getProperty(OTDataTypeProperties.value.createProperty(oo)).getObject()
                        .as(Literal.class).getValue().toString();
                // and atName is the name of the corresponding attribute.
                String atName = values.getProperty(OTObjectProperties.feature.createProperty(oo)).getObject()
                        .as(Resource.class).getURI();

                if (featureTypes.get(oo.createResource(atName)).equals(WekaDataTypes.numeric)) {
                    try {
                        vals[data.attribute(atName).index()] = Double.parseDouble(atVal);
                        /*
                         * The following catch rule handles cases where some values are declared
                         * as numeric (double, float etc.) but their value cannot be cast to
                         * double.
                         */
                    } catch (NumberFormatException ex) {
                        /* Just don't include this value in the dataset */
                    }
                } else if (featureTypes.get(oo.createResource(atName)).equals(WekaDataTypes.string)) {
                    vals[data.attribute(atName).index()] = data.attribute(atName).addStringValue(atVal);
                } else if (XSDDatatype.XSDdate.getURI().equals(atName)) {
                    // Note: this branch fires only when the feature URI itself equals the xsd:date URI.
                    try {
                        vals[data.attribute(atName).index()] = data.attribute(atName).parseDate(atVal);
                    } catch (ParseException ex) {
                        System.out.println(ex);
                        //Logger.getLogger(Dataset.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }

            temp = new Instance(1.0, vals);
            // Add the Instance only if it is compatible with the dataset!
            if (data.checkInstance(temp)) {
                data.add(temp);
            } else {
                System.err.println("Warning! The instance " + temp + " is not compatible with the dataset!");
            }
        }
        dataEntryIterator.close();
        return data;
    }
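    /**
     * Illustrative sketch (not part of the original class) of how the
     * <code>Instances</code> returned by {@link #getInstances()} might be
     * consumed: the first attribute, <code>compound_uri</code>, is a string
     * identifier for each row, while the remaining attributes are numeric,
     * string or nominal features named after their URIs.
     */
    private static void exampleInspect(Instances data) {
        Attribute compound = data.attribute(compound_uri);
        for (int i = 0; i < data.numInstances(); i++) {
            Instance entry = data.instance(i);
            // Each data entry is identified by the URI of its compound:
            System.out.println("compound = " + entry.stringValue(compound));
        }
    }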
    private FastVector getAttributes(Map<Resource, WekaDataTypes> featureTypes,
            Map<Resource, ArrayList<String>> nominalValues) {
        FastVector atts = new FastVector();
        Set<Entry<Resource, WekaDataTypes>> entrySetDatatypes = featureTypes.entrySet();
        // THE EXISTENCE OF THE (STRING) ATTRIBUTE 'COMPOUND_URI' IS MANDATORY FOR ALL
        // DATASETS. THIS IS ALWAYS THE FIRST ATTRIBUTE IN THE LIST.
        atts.addElement(new Attribute(compound_uri, (FastVector) null));
        // ADD NUMERIC AND STRING ATTRIBUTES INTO THE FASTVECTOR:
        for (Entry<Resource, WekaDataTypes> entry : entrySetDatatypes) {
            WekaDataTypes dataType = entry.getValue();
            if (dataType.equals(WekaDataTypes.numeric)) {
                atts.addElement(new Attribute(entry.getKey().getURI()));
            } else if (dataType.equals(WekaDataTypes.string) || dataType.equals(WekaDataTypes.general)) {
                atts.addElement(new Attribute(entry.getKey().getURI(), (FastVector) null));
            }
        }
        // COPE WITH NOMINAL VALUES:
        Set<Entry<Resource, ArrayList<String>>> nominalAttsSet = nominalValues.entrySet();
        for (Entry<Resource, ArrayList<String>> entry : nominalAttsSet) {
            FastVector nominalFVec = new FastVector(entry.getValue().size());
            for (String nominalValue : entry.getValue()) {
                nominalFVec.addElement(nominalValue);
            }
            atts.addElement(new Attribute(entry.getKey().toString(), nominalFVec));
        }
        return atts;
    }
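    /**
     * Illustrative sketch (not in the original class) of the three weka
     * attribute constructors used by {@link #getAttributes}: a null FastVector
     * declares a string attribute, the name-only constructor a numeric one,
     * and a FastVector of labels a nominal one. The feature URIs and labels
     * below are hypothetical.
     */
    private static FastVector exampleAttributes() {
        FastVector atts = new FastVector();
        atts.addElement(new Attribute("compound_uri", (FastVector) null)); // string attribute
        atts.addElement(new Attribute("http://example.org/feature/1"));    // numeric (hypothetical URI)
        FastVector labels = new FastVector(2);
        labels.addElement("active");
        labels.addElement("inactive");
        atts.addElement(new Attribute("http://example.org/feature/2", labels)); // nominal (hypothetical)
        return atts;
    }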
    public static void main(String[] args) throws Exception {
        // Parse a dataset from a URI, build its Instances equivalent and filter it:
        InputProcessor<OntObject> p1 = new InputProcessor<OntObject>();
        DatasetBuilder p2 = new DatasetBuilder();
        InstancesProcessor p3 = new InstancesProcessor();
        AbstractFilter filter1 = new AttributeCleanup(new ATTRIBUTE_TYPE[]{ATTRIBUTE_TYPE.string});
        AbstractFilter filter = new SimpleMVHFilter();
        Pipeline pipe = new Pipeline();
        pipe.add(p1);
        pipe.add(p2);
        pipe.add(p3);
        pipe.add(filter1);
        pipe.add(filter);
        Instances data = (Instances) pipe.process(new URI("http://localhost/9"));
    }

    public RDFObject getRDF() {
        return new RDFObject(oo);
    }
}