org.dkpro.core.io.nif.internal.Nif2DKPro.java Source code

Introduction

Here is the source code for org.dkpro.core.io.nif.internal.Nif2DKPro.java
Source

/*
 * Licensed to the Technische Universität Darmstadt under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The Technische Universität Darmstadt
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.
 *  
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.dkpro.core.io.nif.internal;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.collections4.iterators.IteratorIterable;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.rdf.model.Statement;
import org.apache.jena.vocabulary.RDF;
import org.apache.uima.cas.Type;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * Converts the NIF annotations that refer to a given NIF context node in a Jena RDF model into
 * DKPro Core UIMA annotations (headings, paragraphs, sentences, tokens with lemma/stem/POS, and
 * named entities).
 */
public class Nif2DKPro {
    private MappingProvider posMappingProvider;

    /**
     * Sets the mapping provider used to resolve NIF {@code posTag} values to UIMA POS types.
     * It is dereferenced by {@link #convert(Statement, JCas)} whenever a word carries a
     * {@code posTag} property, so it must be set before converting such data.
     *
     * @param aPosMappingProvider
     *            the mapping provider to use for POS tags.
     */
    public void setPosMappingProvider(MappingProvider aPosMappingProvider) {
        posMappingProvider = aPosMappingProvider;
    }

    /**
     * Reads the document text from the subject of the given context statement and creates UIMA
     * annotations in the given JCas for all NIF resources in the statement's model which
     * reference that subject as their context.
     *
     * @param aContext
     *            a statement whose subject is the NIF context node and whose model holds the
     *            annotations to convert.
     * @param aJCas
     *            the JCas receiving the document text and the annotations.
     */
    public void convert(Statement aContext, JCas aJCas) {
        final Model m = aContext.getModel();
        final Resource context = aContext.getSubject();

        final Resource tSentence = m.createResource(NIF.TYPE_SENTENCE);
        final Resource tWord = m.createResource(NIF.TYPE_WORD);
        final Resource tTitle = m.createResource(NIF.TYPE_TITLE);
        final Resource tParagraph = m.createResource(NIF.TYPE_PARAGRAPH);

        final Property pReferenceContext = m.createProperty(NIF.PROP_REFERENCE_CONTEXT);
        final Property pIsString = m.createProperty(NIF.PROP_IS_STRING);
        final Property pBeginIndex = m.createProperty(NIF.PROP_BEGIN_INDEX);
        final Property pEndIndex = m.createProperty(NIF.PROP_END_INDEX);
        final Property pLemma = m.createProperty(NIF.PROP_LEMMA);
        final Property pStem = m.createProperty(NIF.PROP_STEM);
        final Property pPosTag = m.createProperty(NIF.PROP_POS_TAG);
        final Property pTaIdentRef = m.createProperty(ITS.PROP_TA_IDENT_REF);
        final Property pTaClassRef = m.createProperty(ITS.PROP_TA_CLASS_REF);

        // Convert context node -> document text
        String text = m.getProperty(context, pIsString).getString();
        aJCas.setDocumentText(text);

        // Convert headings/titles
        Iterator<Resource> headingIterator = m.listResourcesWithProperty(RDF.type, tTitle)
                .filterKeep(res -> isInContext(res, pReferenceContext, context));
        for (Resource nifTitle : new IteratorIterable<Resource>(headingIterator)) {
            int begin = nifTitle.getProperty(pBeginIndex).getInt();
            int end = nifTitle.getProperty(pEndIndex).getInt();
            Heading uimaHeading = new Heading(aJCas, begin, end);
            uimaHeading.addToIndexes();

            assert assertSanity(nifTitle, uimaHeading);
        }

        // Convert paragraphs
        Iterator<Resource> paragraphIterator = m.listResourcesWithProperty(RDF.type, tParagraph)
                .filterKeep(res -> isInContext(res, pReferenceContext, context));
        for (Resource nifParagraph : new IteratorIterable<Resource>(paragraphIterator)) {
            int begin = nifParagraph.getProperty(pBeginIndex).getInt();
            int end = nifParagraph.getProperty(pEndIndex).getInt();
            Paragraph uimaParagraph = new Paragraph(aJCas, begin, end);
            uimaParagraph.addToIndexes();

            assert assertSanity(nifParagraph, uimaParagraph);
        }

        // Convert sentences (processed in order of their begin offset)
        List<Resource> nifSentences = m.listResourcesWithProperty(RDF.type, tSentence)
                .filterKeep(res -> isInContext(res, pReferenceContext, context))
                .toList();
        // Integer.compare instead of subtraction: subtraction can overflow and mis-order.
        nifSentences.sort((a, b) -> Integer.compare(
                a.getProperty(pBeginIndex).getInt(),
                b.getProperty(pBeginIndex).getInt()));
        for (Resource nifSentence : nifSentences) {
            int begin = nifSentence.getProperty(pBeginIndex).getInt();
            int end = nifSentence.getProperty(pEndIndex).getInt();
            Sentence uimaSentence = new Sentence(aJCas, begin, end);
            uimaSentence.addToIndexes();

            assert assertSanity(nifSentence, uimaSentence);
        }

        // Convert tokens
        Iterator<Resource> tokenIterator = m.listResourcesWithProperty(RDF.type, tWord)
                .filterKeep(res -> isInContext(res, pReferenceContext, context));
        for (Resource nifWord : new IteratorIterable<Resource>(tokenIterator)) {
            int begin = nifWord.getProperty(pBeginIndex).getInt();
            int end = nifWord.getProperty(pEndIndex).getInt();
            Token uimaToken = new Token(aJCas, begin, end);
            uimaToken.addToIndexes();

            assert assertSanity(nifWord, uimaToken);

            // Convert lemma
            if (nifWord.hasProperty(pLemma)) {
                Lemma uimaLemma = new Lemma(aJCas, uimaToken.getBegin(), uimaToken.getEnd());
                uimaLemma.setValue(nifWord.getProperty(pLemma).getString());
                uimaLemma.addToIndexes();
                uimaToken.setLemma(uimaLemma);
            }

            // Convert stem (guarded by the stem property itself, not by the lemma property)
            if (nifWord.hasProperty(pStem)) {
                Stem uimaStem = new Stem(aJCas, uimaToken.getBegin(), uimaToken.getEnd());
                uimaStem.setValue(nifWord.getProperty(pStem).getString());
                uimaStem.addToIndexes();
                uimaToken.setStem(uimaStem);
            }

            // Convert posTag (this is discouraged, the better alternative should be oliaLink)
            if (nifWord.hasProperty(pPosTag)) {
                String tag = nifWord.getProperty(pPosTag).getString();
                Type posTag = posMappingProvider.getTagType(tag);
                POS uimaPos = (POS) aJCas.getCas().createAnnotation(posTag, uimaToken.getBegin(),
                        uimaToken.getEnd());
                uimaPos.setPosValue(tag.intern());
                // Only subtypes of POS carry a coarse value, derived from the type's short name.
                uimaPos.setCoarseValue(
                        uimaPos.getClass().equals(POS.class) ? null : uimaPos.getType().getShortName().intern());
                uimaPos.addToIndexes();
                uimaToken.setPos(uimaPos);
            }
        }

        // Convert named entities
        //
        // NIF uses taIdentRef to link to a unique instance of an entity and taClassRef to identify
        // the category of the entity. Named entity recognizers in DKPro Core just categorize the
        // entity, e.g. as a person, location, or whatnot. For what NIF uses, we'd need a named
        // entity linker, not just a recognizer. Furthermore, the DKPro Core named entity
        // recognizers are not mapped to a common tag set (unlike e.g. POS which is mapped to
        // the universal POS tags).
        //
        // So, what we do here is treating the URI of the taClassRef in NIF simply as the
        // named entity category and store it.
        //
        // Here we use duck-typing, i.e. if it has a taIdentRef or a taClassRef property then it
        // is likely a named entity. NIF 2.1 [1] appears to introduce a representation of named
        // entities using the class "EntityOccurrence", but e.g. kore50 [2] doesn't seem to use
        // that - it uses "Phrase" instead.
        //
        // [1] http://nif.readthedocs.io/en/2.1-rc/prov-and-conf.html
        // [2] https://datahub.io/dataset/kore-50-nif-ner-corpus
        Set<Resource> nifNamedEntities = new HashSet<Resource>();
        nifNamedEntities.addAll(m.listResourcesWithProperty(pTaIdentRef)
                .filterKeep(res -> isInContext(res, pReferenceContext, context))
                .toSet());
        nifNamedEntities.addAll(m.listResourcesWithProperty(pTaClassRef)
                .filterKeep(res -> isInContext(res, pReferenceContext, context))
                .toSet());
        for (Resource nifNamedEntity : nifNamedEntities) {
            int begin = nifNamedEntity.getProperty(pBeginIndex).getInt();
            int end = nifNamedEntity.getProperty(pEndIndex).getInt();
            NamedEntity uimaNamedEntity = new NamedEntity(aJCas, begin, end);
            if (nifNamedEntity.hasProperty(pTaClassRef)) {
                uimaNamedEntity.setValue(nifNamedEntity.getProperty(pTaClassRef).getResource().getURI());
            }
            if (nifNamedEntity.hasProperty(pTaIdentRef)) {
                uimaNamedEntity.setIdentifier(nifNamedEntity.getProperty(pTaIdentRef).getResource().getURI());
            }
            uimaNamedEntity.addToIndexes();

            assert assertSanity(nifNamedEntity, uimaNamedEntity);
        }
    }

    /**
     * Checks whether the given resource references the given context node. Resources that have
     * no reference-context property at all are reported as not belonging to the context.
     *
     * @param aResource
     *            the NIF resource to test.
     * @param aReferenceContext
     *            the reference-context property.
     * @param aContext
     *            the context node.
     * @return {@code true} if the resource has the context node as a reference context.
     */
    private static boolean isInContext(Resource aResource, Property aReferenceContext,
            Resource aContext) {
        return aResource.hasProperty(aReferenceContext, aContext);
    }

    /**
     * Sanity checks run via {@code assert}: if the NIF resource has an anchorOf property, its
     * value must equal the text covered by the UIMA annotation, and the annotation offsets must
     * lie within the document text.
     *
     * @param aNif
     *            the NIF resource the annotation was created from.
     * @param aUima
     *            the created UIMA annotation.
     * @return always {@code true}, so the method can itself be invoked from an {@code assert}
     *         statement.
     */
    private static boolean assertSanity(Resource aNif, Annotation aUima) {
        final Property pAnchorOf = aNif.getModel().createProperty(NIF.PROP_ANCHOR_OF);

        int docLength = aUima.getCAS().getDocumentText().length();

        if (aNif.hasProperty(pAnchorOf)) {
            String nifText = aNif.getProperty(pAnchorOf).getString();
            String uimaText = aUima.getCoveredText();
            assert nifText.equals(uimaText);
        }
        assert aUima.getBegin() >= 0 && aUima.getBegin() <= docLength;
        assert aUima.getEnd() >= 0 && aUima.getEnd() <= docLength;

        return true;
    }
}