/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stanbol.enhancer.nlp.utils;

import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.EnumMap;
import java.util.Map;

import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.commons.io.IOUtils;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.nif.SsoOntology;
import org.apache.stanbol.enhancer.nlp.nif.StringOntology;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;

public final class NIFHelper {

    private static final LiteralFactory lf = LiteralFactory.getInstance();

    private NIFHelper() {}
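    /**
     * Read-only map from the {@link SpanTypeEnum} to the according
     * {@link SsoOntology} type: {@link SpanTypeEnum#Sentence} maps to
     * sso:Sentence, {@link SpanTypeEnum#Chunk} to sso:Phrase and
     * {@link SpanTypeEnum#Token} to sso:Word. {@link SpanTypeEnum#Text} and
     * {@link SpanTypeEnum#TextSection} have no mapping.
     */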
    public static final Map<SpanTypeEnum, IRI> SPAN_TYPE_TO_SSO_TYPE;
    static {
        Map<SpanTypeEnum, IRI> mapping = new EnumMap<SpanTypeEnum, IRI>(SpanTypeEnum.class);
        //mapping.put(SpanTypeEnum.Text, null);
        //mapping.put(SpanTypeEnum.TextSection, null);
        mapping.put(SpanTypeEnum.Sentence, SsoOntology.Sentence.getUri());
        mapping.put(SpanTypeEnum.Chunk, SsoOntology.Phrase.getUri());
        mapping.put(SpanTypeEnum.Token, SsoOntology.Word.getUri());
        SPAN_TYPE_TO_SSO_TYPE = Collections.unmodifiableMap(mapping);
    }

    /**
     * Read-only map that maps from the {@link LexicalCategory} to the OLIA
     * Concept representing the Phrase (e.g. {@link LexicalCategory#Noun} maps
     * to "<code>http://purl.org/olia/olia.owl#NounPhrase</code>").
     */
    public static final Map<LexicalCategory, IRI> LEXICAL_TYPE_TO_PHRASE_TYPE;
    static {
        String olia = "http://purl.org/olia/olia.owl#";
        Map<LexicalCategory, IRI> mapping = new EnumMap<LexicalCategory, IRI>(LexicalCategory.class);
        mapping.put(LexicalCategory.Noun, new IRI(olia + "NounPhrase"));
        mapping.put(LexicalCategory.Verb, new IRI(olia + "VerbPhrase"));
        mapping.put(LexicalCategory.Adjective, new IRI(olia + "AdjectivePhrase"));
        mapping.put(LexicalCategory.Adverb, new IRI(olia + "AdverbPhrase"));
        mapping.put(LexicalCategory.Conjuction, new IRI(olia + "ConjuctionPhrase"));
        LEXICAL_TYPE_TO_PHRASE_TYPE = Collections.unmodifiableMap(mapping);
    }

    /**
     * Creates a NIF 2.0 Fragment URI using the passed base URI and the
     * start/end indexes.
     * @param base the base URI
     * @param start the start position. If <code>&lt; 0</code> then zero is used.
     * @param end the end position, or a value <code>&lt; 0</code> for open
     * ended selections.
     * @return the NIF 2.0 Fragment URI
     * @throws IllegalArgumentException if <code>null</code> is passed as base
     * {@link IRI} or the end position is <code>&gt;= 0</code> but smaller
     * than the passed start position.
     */
    public static final IRI getNifFragmentURI(IRI base, int start, int end) {
        if (base == null) {
            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
        }
        StringBuilder sb = new StringBuilder(base.getUnicodeString());
        sb.append("#char=");
        sb.append(start >= 0 ? start : 0).append(',');
        if (end >= 0) {
            if (end < start) {
                throw new IllegalArgumentException("End index '" + end
                        + "' < start '" + start + "'!");
            }
            sb.append(end);
        } //else open ended ...
        return new IRI(sb.toString());
    }

    /**
     * Creates a NIF 1.0 offset based URI
     * (<code>{base}#offset_{start}_{end}</code>) using the passed base URI
     * and the start/end indexes. Negative indexes are handled as described
     * for {@link #getNifFragmentURI(IRI, int, int)}.
     */
    public static final IRI getNifOffsetURI(IRI base, int start, int end) {
        if (base == null) {
            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
        }
        StringBuilder sb = new StringBuilder(base.getUnicodeString());
        sb.append("#offset_");
        sb.append(start >= 0 ? start : 0).append('_');
        if (end >= 0) {
            if (end < start) {
                throw new IllegalArgumentException("End index '" + end
                        + "' < start '" + start + "'!");
            }
            sb.append(end);
        } //else open ended ...
        return new IRI(sb.toString());
    }
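    /*
     * For illustration (example values, not defined by this class): given the
     * base URI <http://example.org/doc>, start=3 and end=20, the two helpers
     * above return
     *     http://example.org/doc#char=3,20     (getNifFragmentURI)
     *     http://example.org/doc#offset_3_20   (getNifOffsetURI)
     * while a negative end index yields the open ended forms
     * "...#char=3," and "...#offset_3_".
     */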
    public static final int NIF_HASH_CONTEXT_LENGTH = 10;
    public static final int NIF_HASH_MAX_STRING_LENGTH = 20;
    public static final Charset UTF8 = Charset.forName("UTF8");

    /**
     * Creates a NIF context hash based URI of the form
     * <code>{base}#hash_{contextLength}_{selectionLength}_{md5}_{selectedText}</code>
     * for the passed base URI, start/end indexes and text.
     */
    public static final IRI getNifHashURI(IRI base, int start, int end, String text) {
        if (base == null) {
            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
        }
        start = start < 0 ? 0 : start;
        end = end < 0 ? start : end;
        if (end < start) {
            throw new IllegalArgumentException("End index '" + end
                    + "' < start '" + start + "'!");
        }
        if (end >= text.length()) {
            throw new IllegalArgumentException("The End index '" + end
                    + "' exceeds the length of the text '" + text.length() + "'!");
        }
        int contextStart = Math.max(0, start - NIF_HASH_CONTEXT_LENGTH);
        int contextEnd = Math.min(text.length(), end + NIF_HASH_CONTEXT_LENGTH);
        StringBuilder sb = new StringBuilder(base.getUnicodeString());
        sb.append("#hash_");
        sb.append(NIF_HASH_CONTEXT_LENGTH);
        sb.append('_');
        sb.append(end - start);
        sb.append('_');
        sb.append(getContextDigest(text, contextStart, start, end, contextEnd));
        sb.append('_');
        sb.append(text.substring(start, Math.min(end, start + NIF_HASH_MAX_STRING_LENGTH)));
        return new IRI(sb.toString());
    }

    /**
     * Creates the UTF-8 byte representation of
     * <code>'{prefix}({selected}){suffix}'</code> calculated based on the
     * passed parameters and digests it with MD5.
     * @param text the text
     * @param contextStart the start index of the prefix
     * @param start the start index of the selected text part
     * @param end the end index of the selected text part
     * @param contextEnd the end index of the suffix
     * @return the hash string representation of the MD5 over
     * <code>'{prefix}({selected}){suffix}'</code> (NOTE the brackets that are
     * added at the start/end of the selected text)
     */
    private static String getContextDigest(String text, int contextStart, int start, int end, int contextEnd) {
        ByteArrayOutputStream contextOs = new ByteArrayOutputStream();
        Writer contextWriter = new OutputStreamWriter(contextOs, UTF8);
        try {
            if (contextStart < start) {
                contextWriter.append(text, contextStart, start);
            }
            contextWriter.append('(');
            if (start < end) {
                contextWriter.append(text, start, end);
            }
            contextWriter.append(')');
            if (end < contextEnd) {
                contextWriter.append(text, end, contextEnd);
            }
            contextWriter.flush();
            return ContentItemHelper.streamDigest(
                    new ByteArrayInputStream(contextOs.toByteArray()), null, "MD5");
        } catch (IOException e) {
            //no IOExceptions in in-memory stream implementations
            throw new IllegalStateException(e);
        } finally {
            IOUtils.closeQuietly(contextOs);
        }
    }
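    /*
     * For illustration (example values, not defined by this class): selecting
     * the word "Stanbol" (start=7, end=14) in the text
     * "Apache Stanbol enhances content" yields a URI of the form
     *     http://example.org/doc#hash_10_7_{md5}_Stanbol
     * where {md5} is the digest over '{prefix}(Stanbol){suffix}' with up to
     * NIF_HASH_CONTEXT_LENGTH characters of context on each side, here
     * 'Apache (Stanbol) enhances '.
     */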
    /**
     * Writes basic information of the passed span by using NIF 1.0 including
     * the {@link SsoOntology} Sentence/Phrase/Word type based on the
     * {@link Span#getType()}.<p>
     * As {@link AnalysedText} is based on the plain text version of the
     * ContentItem this uses the {@link StringOntology#OffsetBasedString}
     * notation.<p>
     * <i>NOTE:</i> This DOES NOT write string relations, lemma, pos ...
     * information that might be stored as {@link Annotation} with the passed
     * {@link Span}.
     * @param graph the graph to add the triples to
     * @param base the base URI
     * @param text the {@link AnalysedText}
     * @param language the {@link Language} or <code>null</code> if not known
     * @param span the {@link Span} to write
     * @return the {@link IRI} representing the passed {@link Span} in the
     * graph
     */
    public static IRI writeSpan(Graph graph, IRI base, AnalysedText text, Language language, Span span) {
        IRI segment = getNifOffsetURI(base, span.getStart(), span.getEnd());
        graph.add(new TripleImpl(segment, RDF_TYPE, StringOntology.OffsetBasedString.getUri()));
        graph.add(new TripleImpl(segment, StringOntology.anchorOf.getUri(),
                new PlainLiteralImpl(span.getSpan(), language)));
        graph.add(new TripleImpl(segment, StringOntology.beginIndex.getUri(),
                lf.createTypedLiteral(span.getStart())));
        graph.add(new TripleImpl(segment, StringOntology.endIndex.getUri(),
                lf.createTypedLiteral(span.getEnd())));
        switch (span.getType()) {
            case Token:
                graph.add(new TripleImpl(segment, RDF_TYPE, SsoOntology.Word.getUri()));
                break;
            case Chunk:
                graph.add(new TripleImpl(segment, RDF_TYPE, SsoOntology.Phrase.getUri()));
                break;
            case Sentence:
                graph.add(new TripleImpl(segment, RDF_TYPE, SsoOntology.Sentence.getUri()));
                break;
            //case Text:
            //    graph.add(new TripleImpl(segment, RDF_TYPE, StringOntology.Document.getUri()));
            //no default
        }
        return segment;
    }

    /**
     * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 1.0 to the
     * passed RDF graph by using the passed segmentUri as subject.
     * @param graph the graph
     * @param annotated the annotated element (e.g. a {@link Token})
     * @param segmentUri the URI of the resource representing the passed
     * annotated element in the graph
     */
    public static void writePos(Graph graph, Annotated annotated, IRI segmentUri) {
        Value<PosTag> posTag = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
        if (posTag != null) {
            if (posTag.value().isMapped()) {
                for (Pos pos : posTag.value().getPos()) {
                    graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), pos.getUri()));
                }
                for (LexicalCategory cat : posTag.value().getCategories()) {
                    graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), cat.getUri()));
                }
            }
            graph.add(new TripleImpl(segmentUri, SsoOntology.posTag.getUri(),
                    lf.createTypedLiteral(posTag.value().getTag())));
            graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE,
                    lf.createTypedLiteral(posTag.probability())));
        }
    }

    /**
     * Writes a {@link NlpAnnotations#PHRASE_ANNOTATION} as NIF 1.0 to the
     * passed RDF graph by using the segmentUri as subject.
     * @param graph the graph
     * @param annotated the annotated element (e.g. a {@link Chunk})
     * @param segmentUri the URI of the resource representing the passed
     * annotated element in the graph
     */
    public static void writePhrase(Graph graph, Annotated annotated, IRI segmentUri) {
        Value<PhraseTag> phraseTag = annotated.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
        if (phraseTag != null) {
            IRI phraseTypeUri = LEXICAL_TYPE_TO_PHRASE_TYPE.get(phraseTag.value().getCategory());
            if (phraseTypeUri != null) { //add the oliaLink for the Phrase
                graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), phraseTypeUri));
                graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE,
                        lf.createTypedLiteral(phraseTag.probability())));
            }
        }
    }
}
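/*
 * Usage sketch (illustrative, not part of the original utility): shows how
 * the helpers above are typically combined to serialize an AnalysedText as
 * NIF. The class and method names are hypothetical; it assumes Clerezza's
 * SimpleGraph as Graph implementation and that the contained spans are
 * iterated via AnalysedText#getEnclosed(Set<SpanTypeEnum>).
 */
final class NIFHelperExample {

    private NIFHelperExample() {}

    /**
     * Writes all sentences, chunks and tokens of the given {@link AnalysedText}
     * as NIF triples to a newly created {@link Graph}.
     */
    static Graph writeNif(IRI base, AnalysedText at, Language language) {
        Graph graph = new org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph();
        java.util.Iterator<Span> spans = at.getEnclosed(java.util.EnumSet.of(
                SpanTypeEnum.Sentence, SpanTypeEnum.Chunk, SpanTypeEnum.Token));
        while (spans.hasNext()) {
            Span span = spans.next();
            //write offset URI, anchorOf, begin/end index and the SSO type
            IRI segment = NIFHelper.writeSpan(graph, base, at, language, span);
            //add POS and phrase annotations where present (no-ops otherwise)
            NIFHelper.writePos(graph, span, segment);
            NIFHelper.writePhrase(graph, span, segment);
        }
        return graph;
    }
}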