Java tutorial
/******************************************************************************* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.dkpro.core.opennlp; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.util.Level.INFO; import static org.uimafit.util.JCasUtil.select; import static org.uimafit.util.JCasUtil.selectCovered; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.List; import opennlp.model.AbstractModel; import opennlp.tools.parser.AbstractBottomUpParser; import opennlp.tools.parser.Parse; import opennlp.tools.parser.Parser; import opennlp.tools.parser.ParserFactory; import opennlp.tools.parser.ParserModel; import opennlp.tools.util.Span; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.uimafit.component.JCasAnnotator_ImplBase; import org.uimafit.descriptor.ConfigurationParameter; import org.uimafit.util.FSCollectionFactory; 
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; /** * Parser annotator using OpenNLP. Requires {@link Sentence}s to be annotated before. * * @author Richard Eckart de Castilho */ public class OpenNlpParser extends JCasAnnotator_ImplBase { private static final String CONPACKAGE = Constituent.class.getPackage().getName() + "."; public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; public static final String PARAM_TAGGER_MAPPING_LOCATION = ComponentParameters.PARAM_TAGGER_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_TAGGER_MAPPING_LOCATION, mandatory = false) protected String mappingLocation; public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") private boolean internTags; public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected 
boolean printTagSet; /** * Sets whether to create or not to create POS tags. The creation of * constituent tags must be turned on for this to work.<br/> * Default: {@code true} */ public static final String PARAM_CREATE_POS_TAGS = "createPosTags"; @ConfigurationParameter(name = PARAM_CREATE_POS_TAGS, mandatory = true, defaultValue = "true") private boolean createPosTags; /** * If this paramter is set to true, each sentence is annotated with a * PennTree-Annotation, containing the whole parse tree in Prenn Treebank * style format.<br/> * Default: {@code false} */ public static final String PARAM_CREATE_PENN_TREE_STRING = "createPennTreeString"; @ConfigurationParameter(name = PARAM_CREATE_PENN_TREE_STRING, mandatory = true, defaultValue = "false") private boolean createPennTreeString; private CasConfigurableProviderBase<Parser> modelProvider; private MappingProvider mappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); modelProvider = new CasConfigurableProviderBase<Parser>() { { setDefault(VERSION, "20120616.0"); setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-parser-${language}-${variant}"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + "parser-${language}-${variant}.bin"); setDefault(VARIANT, "maxent"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, variant); } @Override protected Parser produceResource(URL aUrl) throws IOException { InputStream is = null; try { is = aUrl.openStream(); ParserModel model = new ParserModel(is); if (printTagSet) { printTags("tagger", model.getParserTaggerModel().getPosModel()); printTags("parser", model.getParserChunkerModel().getChunkerModel()); } return ParserFactory.create(model); } finally { closeQuietly(is); } } }; mappingProvider = new MappingProvider(); 
mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" + "core/api/lexmorph/tagset/${language}-${tagger.tagset}-tagger.map"); mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName()); mappingProvider.setDefault("tagger.tagset", "default"); mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation); mappingProvider.setOverride(MappingProvider.LANGUAGE, language); mappingProvider.addImport("tagger.tagset", modelProvider); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); modelProvider.configure(cas); mappingProvider.configure(cas); for (Sentence sentence : select(aJCas, Sentence.class)) { List<Token> tokens = selectCovered(aJCas, Token.class, sentence); Parse parseInput = new Parse(cas.getDocumentText(), new Span(sentence.getBegin(), sentence.getEnd()), AbstractBottomUpParser.INC_NODE, 0, 0); int i = 0; for (Token t : tokens) { parseInput.insert(new Parse(cas.getDocumentText(), new Span(t.getBegin(), t.getEnd()), AbstractBottomUpParser.TOK_NODE, 0, i)); i++; } Parse parseOutput = modelProvider.getResource().parse(parseInput); createConstituentAnnotationFromTree(aJCas, parseOutput, null, tokens); if (createPennTreeString) { StringBuffer sb = new StringBuffer(); parseOutput.setType("ROOT"); // in DKPro the root is ROOT, not TOP parseOutput.show(sb); PennTree pTree = new PennTree(aJCas, sentence.getBegin(), sentence.getEnd()); pTree.setPennTree(sb.toString()); pTree.addToIndexes(); } } } private void printTags(String aType, AbstractModel aModel) { List<String> tags = new ArrayList<String>(); for (int i = 0; i < aModel.getNumOutcomes(); i++) { tags.add(aModel.getOutcome(i)); } Collections.sort(tags); StringBuilder sb = new StringBuilder(); sb.append("Model of " + aType + " contains [").append(tags.size()).append("] tags: "); for (String tag : tags) { sb.append(tag); sb.append(" "); } getContext().getLogger().log(INFO, sb.toString()); } /** * 
Creates linked constituent annotations + POS annotations * * @param aNode * the source tree * @param aParentFS * @param aCreatePos * sets whether to create or not to create POS tags * @param aCreateLemmas * sets whether to create or not to create Lemmas * @return the child-structure (needed for recursive call only) */ private Annotation createConstituentAnnotationFromTree(JCas aJCas, Parse aNode, Annotation aParentFS, List<Token> aTokens) { // If the node is a word-level constituent node (== POS): // create parent link on token and (if not turned off) create POS tag if (aNode.isPosTag()) { Token token = getToken(aTokens, aNode.getSpan().getStart(), aNode.getSpan().getEnd()); // link token to its parent constituent if (aParentFS != null) { token.setParent(aParentFS); } // only add POS to index if we want POS-tagging if (createPosTags) { Type posTag = mappingProvider.getTagType(aNode.getType()); POS posAnno = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); posAnno.setPosValue(internTags ? aNode.getType().intern() : aNode.getType()); posAnno.addToIndexes(); token.setPos((POS) posAnno); } return token; } // Check if node is a constituent node on sentence or phrase-level else { String typeName = aNode.getType(); if (AbstractBottomUpParser.TOP_NODE.equals(typeName)) { typeName = "ROOT"; // in DKPro the root is ROOT, not TOP } // create the necessary objects and methods String constituentTypeName = CONPACKAGE + typeName; Type type = aJCas.getTypeSystem().getType(constituentTypeName); //if type is unknown, map to X-type if (type == null) { type = aJCas.getTypeSystem().getType(CONPACKAGE + "X"); } Constituent constAnno = (Constituent) aJCas.getCas().createAnnotation(type, aNode.getSpan().getStart(), aNode.getSpan().getEnd()); constAnno.setConstituentType(typeName); // link to parent if (aParentFS != null) { constAnno.setParent(aParentFS); } // Do we have any children? 
List<Annotation> childAnnotations = new ArrayList<Annotation>(); for (Parse child : aNode.getChildren()) { Annotation childAnnotation = createConstituentAnnotationFromTree(aJCas, child, constAnno, aTokens); if (childAnnotation != null) { childAnnotations.add(childAnnotation); } } // Now that we know how many children we have, link annotation of // current node with its children FSArray childArray = (FSArray) FSCollectionFactory.createFSArray(aJCas, childAnnotations); constAnno.setChildren(childArray); // write annotation for current node to index aJCas.addFsToIndexes(constAnno); return constAnno; } } /** * Given a list of tokens (e.g. those from a sentence) return the one at the specified position. */ private Token getToken(List<Token> aTokens, int aBegin, int aEnd) { for (Token t : aTokens) { if (aBegin == t.getBegin() && aEnd == t.getEnd()) { return t; } } throw new IllegalStateException("Token not found"); } }