de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpParser.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpParser.java

Source

/*
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.clearnlp;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.util.Level.INFO;
import static org.apache.uima.util.Level.WARNING;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.net.URL;
import java.util.List;
import java.util.Properties;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.FileUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.AnalysisComponent;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import com.clearnlp.classification.model.StringModel;
import com.clearnlp.component.dep.AbstractDEPParser;
import com.clearnlp.dependency.DEPNode;
import com.clearnlp.dependency.DEPTree;
import com.clearnlp.nlp.NLPGetter;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT;

/**
 * CLEAR parser annotator.
 *
 * <p>
 * For every {@link Sentence} in the CAS, the covered {@link Token}s are converted into a ClearNLP
 * {@link DEPTree} (word form, POS value and, if present, lemma), the tree is processed by the
 * dependency parser, and one {@link Dependency} annotation (or {@link ROOT} annotation for nodes
 * whose head has id 0) is added to the indexes for every node that has a head.
 * </p>
 */
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }, outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class ClearNlpParser extends JCasAnnotator_ImplBase {
    /**
     * Write the tag set(s) to the log when a model is loaded.
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
    protected boolean printTagSet;

    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Variant of the model. Used to address a specific model if there are multiple models for
     * one language.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Location from which the model is read.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    // NOTE(review): this field is never assigned anywhere in this class, so the clean-up in
    // collectionProcessComplete() currently has nothing to delete.
    private File workingDir;

    private CasConfigurableProviderBase<AbstractDEPParser> parserProvider;

    /**
     * Sets up the model provider that loads the ClearNLP dependency parser.
     *
     * @param context
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the superclass initialization fails.
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        parserProvider = new ModelProviderBase<AbstractDEPParser>(this, "clearnlp", "parser") {
            /**
             * Reads a gzip-compressed, serialized parser model from the given URL and registers
             * the dependency tags found in the model labels as a tag set.
             *
             * @param aUrl
             *            location of the model.
             * @return the loaded parser.
             * @throws IOException
             *             if the model cannot be read; non-I/O failures are wrapped as the cause.
             */
            @Override
            protected AbstractDEPParser produceResource(URL aUrl) throws IOException {
                InputStream is = null;
                BufferedInputStream bis = null;
                ObjectInputStream ois = null;
                GZIPInputStream gis = null;

                try {
                    is = aUrl.openStream();
                    // Named differently from the enclosing component's "language" field to
                    // avoid shadowing it.
                    String modelLanguage = getAggregatedProperties().getProperty(LANGUAGE);
                    gis = new GZIPInputStream(is);
                    bis = new BufferedInputStream(gis);
                    ois = new ObjectInputStream(bis);
                    AbstractDEPParser parser = NLPGetter.getDEPParser(ois, modelLanguage);
                    Properties metadata = getResourceMetaData();

                    SingletonTagset depTags = new SingletonTagset(Dependency.class,
                            metadata.getProperty("dependency.tagset"));

                    try {
                        for (StringModel model : parser.getModels()) {
                            for (String label : model.getLabels()) {
                                // Only labels that split into exactly three "_"-separated
                                // fields contribute a tag (their third field); all other
                                // labels are skipped silently.
                                String[] fields = label.split("_");
                                if (fields.length == 3) {
                                    depTags.add(fields[2]);
                                }
                            }
                        }
                    } catch (Exception e) {
                        // Tag set extraction is best-effort: the parser stays usable, but the
                        // cause is logged so that the failure can be diagnosed.
                        getContext().getLogger().log(WARNING, "Unable to find tagset information.",
                                e);
                    }

                    addTagset(depTags);

                    if (printTagSet) {
                        getContext().getLogger().log(INFO, getTagset().toString());
                    }

                    return parser;
                } catch (IOException e) {
                    // Already the declared exception type - do not wrap it a second time.
                    throw e;
                } catch (Exception e) {
                    throw new IOException(e);
                } finally {
                    closeQuietly(ois);
                    closeQuietly(bis);
                    closeQuietly(gis);
                    closeQuietly(is);
                }
            }
        };
    }

    /**
     * Deletes the working directory if one is set and it is a directory.
     *
     * @see AnalysisComponent#collectionProcessComplete()
     */
    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        if ((workingDir != null) && workingDir.isDirectory()) {
            FileUtils.deleteQuietly(workingDir);
        }
    }

    /**
     * Parses every sentence of the given CAS and adds the resulting dependency annotations.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if a token inside a sentence carries no POS annotation.
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        parserProvider.configure(aJCas.getCas());

        // The provider is configured once per CAS, so the parser is looked up once here
        // instead of once per sentence.
        AbstractDEPParser parser = parserProvider.getResource();

        for (Sentence sentence : select(aJCas, Sentence.class)) {
            List<Token> tokens = selectCovered(aJCas, Token.class, sentence);

            DEPTree tree = toDepTree(tokens);
            parser.process(tree);
            addDependencies(aJCas, tree, tokens);
        }
    }

    /**
     * Converts the tokens of one sentence into the input format required by the parser. Node ids
     * start at 1, i.e. the token at list index {@code i} becomes the node with id {@code i + 1}.
     *
     * @param tokens
     *            the tokens of the sentence in document order.
     * @return the tree holding one node per token.
     * @throws AnalysisEngineProcessException
     *             if a token has no POS annotation.
     */
    private DEPTree toDepTree(List<Token> tokens) throws AnalysisEngineProcessException {
        DEPTree tree = new DEPTree();

        for (int i = 0; i < tokens.size(); i++) {
            Token token = tokens.get(i);

            // Fail with a descriptive error instead of a bare NullPointerException when no POS
            // annotation is attached to the token.
            if (token.getPos() == null) {
                throw new AnalysisEngineProcessException(new IllegalStateException(
                        "Token [" + token.getCoveredText() + "] at offset " + token.getBegin()
                                + " has no POS annotation, but the parser requires POS tags."));
            }

            DEPNode node = new DEPNode(i + 1, token.getCoveredText());
            node.pos = token.getPos().getPosValue();
            // The lemma is optional and is only set when the token has one.
            if (token.getLemma() != null) {
                node.lemma = token.getLemma().getValue();
            }
            tree.add(node);
        }

        return tree;
    }

    /**
     * Creates one dependency annotation for every node of the parsed tree that has a head. Nodes
     * are visited starting at index 1. A node whose head has id 0 yields a {@link ROOT}
     * annotation that uses the token itself as governor and the type "ROOT"; every other node
     * yields a {@link Dependency} governed by the head's token and typed with the node label.
     *
     * @param aJCas
     *            the CAS to which the annotations are added.
     * @param tree
     *            the tree after it has been processed by the parser.
     * @param tokens
     *            the tokens from which the tree was built (node id {@code n} maps to list index
     *            {@code n - 1}).
     */
    private void addDependencies(JCas aJCas, DEPTree tree, List<Token> tokens) {
        for (int i = 1; i < tree.size(); i++) {
            DEPNode node = tree.get(i);
            if (!node.hasHead()) {
                continue;
            }

            DEPNode head = node.getHead();
            Token dependent = tokens.get(node.id - 1);

            Dependency dep;
            if (head.id != 0) {
                dep = new Dependency(aJCas);
                dep.setGovernor(tokens.get(head.id - 1));
                dep.setDependencyType(node.getLabel());
            } else {
                dep = new ROOT(aJCas);
                dep.setGovernor(dependent);
                dep.setDependencyType("ROOT");
            }

            // The annotation always spans the dependent token.
            dep.setDependent(dependent);
            dep.setBegin(dependent.getBegin());
            dep.setEnd(dependent.getEnd());
            dep.setFlavor(DependencyFlavor.BASIC);
            dep.addToIndexes();
        }
    }
}