// Java tutorial: Stanford CoreNLP wrapper producing NIF (NLP Interchange Format) RDF output.
/******************************************************************************/
/* Copyright (C) 2010-2011, Sebastian Hellmann                                */
/*                                                                            */
/* Licensed under the Apache License, Version 2.0 (the "License");            */
/* you may not use this file except in compliance with the License.           */
/* You may obtain a copy of the License at                                    */
/*                                                                            */
/*     http://www.apache.org/licenses/LICENSE-2.0                             */
/*                                                                            */
/* Unless required by applicable law or agreed to in writing, software        */
/* distributed under the License is distributed on an "AS IS" BASIS,          */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   */
/* See the License for the specific language governing permissions and        */
/* limitations under the License.                                             */
/******************************************************************************/
package org.nlp2rdf.implementation.stanfordcorenlp;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.ontology.Individual;
import com.hp.hpl.jena.ontology.ObjectProperty;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.vocabulary.OWL;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;
import org.nlp2rdf.core.*;
import org.nlp2rdf.core.urischemes.URIScheme;
import org.nlp2rdf.core.vocab.NIFAnnotationProperties;
import org.nlp2rdf.core.vocab.NIFDatatypeProperties;
import org.nlp2rdf.core.vocab.NIFObjectProperties;
import org.nlp2rdf.core.vocab.RLOGIndividuals;
import org.nlp2rdf.vm.dep.StanfordSimple;
import org.nlp2rdf.vm.olia.models.Penn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.TreeMap;

/**
 * The basic code was inspired by the ClearTK Project
 * http://code.google.com/p/cleartk
 * who have written a UIMA wrapper.
 * The original file by Steven Bethard can be found here:
 * http://code.google.com/p/cleartk/source/browse/trunk/cleartk-stanford-corenlp/src/main/java/org/cleartk/stanford/StanfordCoreNLPAnnotator.java
 * Licence http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 * <p/>
 * Debug with echo -n "This is a sentence." | mvn compile exec:java -e -Dexec.mainClass="org.nlp2rdf.implementation.stanfordcore.StanfordCoreCLI" -Dexec.args="-f text -i -" | less
 *
 * @author Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
 */
public class StanfordWrapper extends NIFWrapper {

    private static final Logger log = LoggerFactory.getLogger(StanfordWrapper.class);

    /**
     * Creates the Stanford CoreNLP pipeline used by {@link #process}.
     * The pipeline runs tokenization, sentence splitting, POS tagging,
     * lemmatization and parsing; NER and coreference are intentionally
     * disabled (commented out) to keep the pipeline cheap.
     *
     * @param nifParameters runtime parameters; a non-null config is currently ignored (TODO)
     * @return a configured {@link StanfordCoreNLP} annotator
     */
    protected Annotator buildAnnotator(NIFParameters nifParameters) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse"); // ner, dcoref");
        if (nifParameters.getConfig() != null) {
            //TODO implement proper config parsing
        }
        return new StanfordCoreNLP(props);
    }

    /**
     * Annotates the context string with Stanford CoreNLP and writes the result
     * as NIF triples (sentence/word individuals, lemmas, OLiA POS links and
     * dependency relations) into {@code outputModel}.
     *
     * @param context       the nif:Context individual carrying the text as nif:isString
     * @param inputModel    model the context was read from
     * @param outputModel   model all generated triples and log messages are added to
     * @param nifParameters prefix, URI scheme and logging options
     */
    public void process(Individual context, OntModel inputModel, OntModel outputModel, NIFParameters nifParameters) {
        // the text to annotate is attached to the context via nif:isString
        String contextString = context
                .getPropertyValue(NIFDatatypeProperties.isString.getDatatypeProperty(inputModel)).asLiteral()
                .getString();
        String prefix = nifParameters.getPrefix();
        URIScheme urischeme = nifParameters.getUriScheme();

        Annotator pipeline = buildAnnotator(nifParameters);

        // create an empty Annotation just with the given text and run all annotators on it
        Annotation document = new Annotation(contextString);
        pipeline.annotate(document);

        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

        // read sentences and words into an intermediate offset structure
        // NOTE: this can be greatly optimized of course; for now it is just simple and cheap
        int wordCount = 0;
        TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
        for (CoreMap sentence : sentences) {
            Span sentenceSpan = new Span(sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                    sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            List<Span> wordSpans = new ArrayList<Span>();
            for (CoreLabel coreLabel : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                wordSpans.add(new Span(coreLabel.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                        coreLabel.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)));
                wordCount++;
            }
            tokenizedText.put(sentenceSpan, wordSpans);
        }

        /**
         * Basic Model Setup
         **/
        // generate the NIF individuals for all sentences and words
        Text2RDF text2RDF = new Text2RDF();
        text2RDF.generateNIFModel(prefix, context, urischeme, outputModel, tokenizedText);
        outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(),
                "Finished creating " + tokenizedText.size() + " sentence(s) with " + wordCount + " word(s) ",
                RLOGIndividuals.DEBUG, this.getClass().getCanonicalName(), null, null));
        // text2RDF.addNextAndPreviousProperties(prefix,urischeme,model);

        // FIX: loop-invariant prefix registration hoisted out of the per-token loop
        outputModel.setNsPrefix("olia", "http://purl.org/olia/olia.owl#");

        // traversing the words in the current sentence;
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreMap sentence : sentences) {
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                Span wordSpan = new Span(token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                        token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));

                // the word individual should have been created by generateNIFModel above
                Individual wordIndividual = outputModel
                        .getIndividual(urischeme.generate(prefix, contextString, wordSpan));
                if (wordIndividual == null) {
                    log.error("SKIPPING: word was not found in the model: "
                            + urischeme.generate(prefix, contextString, wordSpan));
                    continue;
                }

                /********************************
                 * Lemma
                 ******/
                if (token.get(CoreAnnotations.LemmaAnnotation.class) != null) {
                    wordIndividual.addProperty(NIFDatatypeProperties.lemma.getDatatypeProperty(outputModel),
                            token.get(CoreAnnotations.LemmaAnnotation.class), XSDDatatype.XSDstring);
                }

                /********************************
                 * POS tag
                 ******/
                String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
                List<String> oliaIndividual = (List<String>) Penn.hasTag.get(posTag);
                if (oliaIndividual != null) {
                    for (String s : oliaIndividual) {
                        // link the word to the OLiA individual for its Penn tag
                        wordIndividual.addProperty(NIFObjectProperties.oliaLink.getObjectProperty(outputModel),
                                outputModel.createIndividual(s, OWL.Thing));
                        List<String> pennlinks = (List<String>) Penn.links.get(s);
                        if (pennlinks != null) {
                            for (String oc : pennlinks) {
                                wordIndividual.addProperty(
                                        NIFAnnotationProperties.oliaCategory.getAnnotationProperty(outputModel),
                                        outputModel.createClass(oc));
                            }
                        } else {
                            outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(),
                                    "missing oliaLinks for " + s, RLOGIndividuals.ERROR,
                                    this.getClass().getCanonicalName(), null, null));
                        }
                    }
                } else {
                    outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(),
                            "missing oliaLinks for " + posTag, RLOGIndividuals.ERROR,
                            this.getClass().getCanonicalName(), null, null));
                }
            }

            /********************************
             * Dependencies
             ******/
            SemanticGraph dependencies = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
            if (dependencies != null) {
                // time to add the prefix
                StanfordSimple.addStanfordSimplePrefix(outputModel);

                // create relation annotations for each Stanford dependency
                for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
                    Span govSpan = new Span(
                            stanfordEdge.getGovernor().get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                            stanfordEdge.getGovernor().get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
                    Span depSpan = new Span(
                            stanfordEdge.getDependent().get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                            stanfordEdge.getDependent().get(CoreAnnotations.CharacterOffsetEndAnnotation.class));

                    String[] edgeURIs = StanfordSimple.getURIforEdgeLabel(stanfordEdge.getRelation().toString());
                    ObjectProperty relation = null;
                    switch (edgeURIs.length) {
                        case 1:
                            relation = outputModel.createObjectProperty(edgeURIs[0]);
                            break;
                        case 2:
                            relation = outputModel.createObjectProperty(edgeURIs[0]);
                            relation.addSubProperty(outputModel.createObjectProperty(edgeURIs[1]));
                            break;
                        default:
                            // FIX: was "" + edgeURIs, which printed the array reference, not its content
                            String message = "Empty edge label, no URI written: " + String.join(", ", edgeURIs);
                            outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(), message,
                                    RLOGIndividuals.ERROR, this.getClass().getCanonicalName(), null, null));
                            continue;
                    }

                    Individual gov = text2RDF.createCStringIndividual(prefix, context, govSpan, urischeme,
                            outputModel);
                    Individual dep = text2RDF.createCStringIndividual(prefix, context, depSpan, urischeme,
                            outputModel);
                    // FIX: null check must happen BEFORE gov/dep are dereferenced
                    // (previously the guard came after gov.addProperty and was unreachable on NPE)
                    if (gov == null || dep == null) {
                        String message = "SKIPPING Either gov or dep was null for the dependencies\n" + "gov: " + gov
                                + "\ndep: " + dep;
                        outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(), message,
                                RLOGIndividuals.ERROR, this.getClass().getCanonicalName(), null, null));
                        continue;
                    }
                    gov.addProperty(relation, dep);
                    relation.addSuperProperty(NIFObjectProperties.inter.getObjectProperty(outputModel));
                    relation.addSuperProperty(NIFObjectProperties.dependency.getObjectProperty(outputModel));
                }
            } // end dependencies

            /**************
             * Syntax Tree
             **/
            // Tree processing (TreeAnnotation -> processTree) was removed for 2.0;
            // the legacy commented-out implementation (OLiA/Stanford class-hierarchy
            // mapping, phrase annotation) is preserved in version control history.
        } // end sentences
    }
}