de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java
Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.opennlp;

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.uimafit.descriptor.ConfigurationParameter;

import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;

/**
 * Part-of-Speech annotator using OpenNLP. Requires {@link Sentence}s to be annotated before.
 * 
 * @author Richard Eckart de Castilho
 */
public class OpenNlpSegmenter extends SegmenterBase {
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    private CasConfigurableProviderBase<SentenceDetectorME> sentenceModelProvider;
    private CasConfigurableProviderBase<TokenizerME> tokenModelProvider;

    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        sentenceModelProvider = new CasConfigurableProviderBase<SentenceDetectorME>() {
            {
                setDefault(VERSION, "20120616.0");
                setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
                setDefault(ARTIFACT_ID,
                        "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-${language}-${variant}");

                setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                        + "sentence-${language}-${variant}.bin");
                setDefault(VARIANT, "maxent");

                setOverride(LOCATION, modelLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);
            }

            @Override
            protected SentenceDetectorME produceResource(URL aUrl) throws IOException {
                InputStream is = null;
                try {
                    is = aUrl.openStream();
                    SentenceModel model = new SentenceModel(is);
                    return new SentenceDetectorME(model);
                } finally {
                    closeQuietly(is);
                }
            }
        };

        tokenModelProvider = new CasConfigurableProviderBase<TokenizerME>() {
            {
                setDefault(VERSION, "1.5");
                setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
                setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-${language}-${variant}");

                setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                        + "token-${language}-${variant}.bin");
                setDefault(VARIANT, "maxent");

                setOverride(LOCATION, modelLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);
            }

            @Override
            protected TokenizerME produceResource(URL aUrl) throws IOException {
                InputStream is = null;
                try {
                    is = aUrl.openStream();
                    TokenizerModel model = new TokenizerModel(is);
                    return new TokenizerME(model);
                } finally {
                    closeQuietly(is);
                }
            }
        };
    }

    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        CAS cas = aJCas.getCas();
        sentenceModelProvider.configure(cas);
        tokenModelProvider.configure(cas);

        super.process(aJCas);
    }

    @Override
    protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
        for (Span sSpan : sentenceModelProvider.getResource().sentPosDetect(aText)) {
            createSentence(aJCas, sSpan.getStart() + aZoneBegin, sSpan.getEnd() + aZoneBegin);
            for (Span tSpan : tokenModelProvider.getResource()
                    .tokenizePos(aText.substring(sSpan.getStart(), sSpan.getEnd()))) {
                createToken(aJCas, tSpan.getStart() + sSpan.getStart() + aZoneBegin,
                        tSpan.getEnd() + sSpan.getStart() + aZoneBegin);
            }
        }
    }
}