org.dkpro.core.io.nif.internal.DKPro2Nif.java Source code

Java tutorial

Introduction

Here is the source code for org.dkpro.core.io.nif.internal.DKPro2Nif.java

Source

/*
 * Licensed to the Technische Universität Darmstadt under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The Technische Universität Darmstadt 
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.
 *  
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.dkpro.core.io.nif.internal;

import static org.apache.jena.datatypes.xsd.XSDDatatype.XSDnonNegativeInteger;
import static org.apache.jena.datatypes.xsd.XSDDatatype.XSDstring;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;

import org.apache.commons.lang3.StringUtils;
import org.apache.jena.ontology.Individual;
import org.apache.jena.ontology.OntModel;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * Converts DKPro Core annotations stored in a {@link JCas} into NIF (NLP Interchange Format)
 * individuals in a Jena {@link OntModel}.
 * <p>
 * Every converted annotation becomes an individual whose URI has the form
 * {@code <document-uri>#offset_<begin>_<end>}. Annotations of different types that cover exactly
 * the same span therefore share a single individual carrying several types.
 */
public class DKPro2Nif {
    /**
     * Adds a NIF representation of the given CAS to the target model.
     * <p>
     * The following is converted: the document text (as the NIF context), headings, paragraphs,
     * sentences, tokens (including stem, lemma and POS tag values if present) and those named
     * entities whose value and/or identifier starts with {@code http://}.
     *
     * @param aJCas
     *            the CAS to read the annotations and the document meta data from.
     * @param aTarget
     *            the model to which the NIF statements are added.
     */
    public static void convert(JCas aJCas, OntModel aTarget) {
        // Shorten down variable name for model
        OntModel m = aTarget;

        // Set up the NIF classes
        final Resource tContext = m.createResource(NIF.TYPE_CONTEXT);
        final Resource tSentence = m.createResource(NIF.TYPE_SENTENCE);
        final Resource tWord = m.createResource(NIF.TYPE_WORD);
        final Resource tTitle = m.createResource(NIF.TYPE_TITLE);
        final Resource tParagraph = m.createResource(NIF.TYPE_PARAGRAPH);
        final Resource tEntityOccurrence = m.createResource(NIF.TYPE_ENTITY_OCCURRENCE);

        // Set up the properties used directly in this method. The properties shared by all
        // spans (reference context, anchor, offsets) are handled in createSpan().
        final Property pIsString = m.createProperty(NIF.PROP_IS_STRING);
        final Property pBeginIndex = m.createProperty(NIF.PROP_BEGIN_INDEX);
        final Property pEndIndex = m.createProperty(NIF.PROP_END_INDEX);
        final Property pStem = m.createProperty(NIF.PROP_STEM);
        final Property pLemma = m.createProperty(NIF.PROP_LEMMA);
        final Property pPosTag = m.createProperty(NIF.PROP_POS_TAG);
        final Property pWord = m.createProperty(NIF.PROP_WORD);
        final Property pNextWord = m.createProperty(NIF.PROP_NEXT_WORD);
        final Property pPreviousWord = m.createProperty(NIF.PROP_PREVIOUS_WORD);
        final Property pSentence = m.createProperty(NIF.PROP_SENTENCE);
        final Property pNextSentence = m.createProperty(NIF.PROP_NEXT_SENTENCE);
        final Property pPreviousSentence = m.createProperty(NIF.PROP_PREVIOUS_SENTENCE);
        final Property pTaIdentRef = m.createProperty(ITS.PROP_TA_IDENT_REF);
        final Property pTaClassRef = m.createProperty(ITS.PROP_TA_CLASS_REF);

        // Get a URI for the document, falling back to a URN built from the document ID
        DocumentMetaData dmd = DocumentMetaData.get(aJCas);
        String docuri = dmd.getDocumentUri() != null ? dmd.getDocumentUri()
                : "urn:" + dmd.getDocumentId();

        // Convert document -> context node. The context spans the whole document text and
        // carries the text itself (isString) instead of an anchor.
        String documentText = aJCas.getDocumentText();
        Individual context = m.createIndividual(offsetUri(docuri, 0, documentText.length()),
                tContext);
        context.addLiteral(pIsString, m.createTypedLiteral(documentText, XSDstring));
        context.addLiteral(pBeginIndex, m.createTypedLiteral(0, XSDnonNegativeInteger));
        context.addLiteral(pEndIndex,
                m.createTypedLiteral(documentText.length(), XSDnonNegativeInteger));

        // Convert headings/titles
        for (Heading uimaHeading : select(aJCas, Heading.class)) {
            createSpan(m, tTitle, context, docuri, uimaHeading.getBegin(), uimaHeading.getEnd(),
                    uimaHeading.getCoveredText());
        }

        // Convert paragraphs
        for (Paragraph uimaParagraph : select(aJCas, Paragraph.class)) {
            createSpan(m, tParagraph, context, docuri, uimaParagraph.getBegin(),
                    uimaParagraph.getEnd(), uimaParagraph.getCoveredText());
        }

        // Convert sentences
        Individual previousNifSentence = null;
        for (Sentence uimaSentence : select(aJCas, Sentence.class)) {
            Individual nifSentence = createSpan(m, tSentence, context, docuri,
                    uimaSentence.getBegin(), uimaSentence.getEnd(),
                    uimaSentence.getCoveredText());

            // Link sentence sequence
            if (previousNifSentence != null) {
                previousNifSentence.addProperty(pNextSentence, nifSentence);
                nifSentence.addProperty(pPreviousSentence, previousNifSentence);
            }
            previousNifSentence = nifSentence;

            // Convert tokens. The word chain is restarted for every sentence, so next/previous
            // word links never cross a sentence boundary.
            Individual previousNifWord = null;
            for (Token uimaToken : selectCovered(Token.class, uimaSentence)) {
                Individual nifWord = createSpan(m, tWord, context, docuri, uimaToken.getBegin(),
                        uimaToken.getEnd(), uimaToken.getCoveredText());

                // Link sentence <-> word
                nifWord.addProperty(pSentence, nifSentence);
                nifSentence.addProperty(pWord, nifWord);

                // Link word sequence
                if (previousNifWord != null) {
                    previousNifWord.addProperty(pNextWord, nifWord);
                    nifWord.addProperty(pPreviousWord, previousNifWord);
                }
                previousNifWord = nifWord;

                // Convert stem
                if (uimaToken.getStemValue() != null) {
                    nifWord.addProperty(pStem, uimaToken.getStemValue());
                }

                // Convert lemma
                if (uimaToken.getLemmaValue() != null) {
                    nifWord.addProperty(pLemma, uimaToken.getLemmaValue());
                }

                // Convert posTag (this is discouraged, the better alternative should be oliaLink)
                if (uimaToken.getPosValue() != null) {
                    nifWord.addProperty(pPosTag, uimaToken.getPosValue());
                }
            }
        }

        // Convert named entities
        //
        // Actually, the named entity in NIF is different from the one in DKPro Core. NIF uses
        // taIdentRef to link to a unique instance of an entity. Named entity recognizers in DKPro
        // Core just categorize the entity, e.g. as a person, location, or whatnot. For what NIF
        // uses, we'd need a named entity linker, not just a recognizer.
        //
        // We create NEs using the NIF 2.1 class "EntityOccurrence".
        //
        // So here, we check if the DKPro Core NE value/identifier looks like a URI. A URI-like
        // value is stored as taClassRef and a URI-like identifier as taIdentRef. Entities having
        // neither are ignored because NIF does not have the concept of a plain NE category.
        for (NamedEntity uimaNamedEntity : select(aJCas, NamedEntity.class)) {
            String neClass = uimaNamedEntity.getValue();
            String neIdentifier = uimaNamedEntity.getIdentifier();

            // The crudest form of checking for a URI, but since "http://" appears to be the default
            // prefix in the semantic web, let's just stick with it for the moment.
            // StringUtils.startsWith is null-safe, so unset features simply count as "not a URI".
            boolean neClassIsUri = StringUtils.startsWith(neClass, "http://");
            boolean neIdentifierIsUri = StringUtils.startsWith(neIdentifier, "http://");

            if (!neClassIsUri && !neIdentifierIsUri) {
                continue;
            }

            Individual nifNamedEntity = createSpan(m, tEntityOccurrence, context, docuri,
                    uimaNamedEntity.getBegin(), uimaNamedEntity.getEnd(),
                    uimaNamedEntity.getCoveredText());

            if (neClassIsUri) {
                nifNamedEntity.addProperty(pTaClassRef, m.createResource(neClass));
            }

            if (neIdentifierIsUri) {
                nifNamedEntity.addProperty(pTaIdentRef, m.createResource(neIdentifier));
            }
        }
    }

    /**
     * Builds the offset-based URI identifying a span of the document.
     * <p>
     * Plain concatenation is used on purpose: unlike {@code String.format("%d", ...)} it does not
     * depend on the default locale and thus always renders the offsets with ASCII digits.
     */
    private static String offsetUri(String aDocUri, int aBegin, int aEnd) {
        return aDocUri + "#offset_" + aBegin + "_" + aEnd;
    }

    /**
     * Creates (or re-uses, if the URI already exists in the model) the individual for a span and
     * attaches what all NIF spans have in common: reference context, anchor text and offsets.
     */
    private static Individual createSpan(OntModel aModel, Resource aType, Individual aContext,
            String aDocUri, int aBegin, int aEnd, String aCoveredText) {
        Individual span = aModel.createIndividual(offsetUri(aDocUri, aBegin, aEnd), aType);
        span.addProperty(aModel.createProperty(NIF.PROP_REFERENCE_CONTEXT), aContext);
        span.addLiteral(aModel.createProperty(NIF.PROP_ANCHOR_OF), aCoveredText);
        span.addLiteral(aModel.createProperty(NIF.PROP_BEGIN_INDEX),
                aModel.createTypedLiteral(aBegin, XSDnonNegativeInteger));
        span.addLiteral(aModel.createProperty(NIF.PROP_END_INDEX),
                aModel.createTypedLiteral(aEnd, XSDnonNegativeInteger));
        return span;
    }
}