Java tutorial
/******************************************************************************* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.dkpro.core.maltparser; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.util.Level.INFO; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URL; import java.util.List; import java.util.Properties; import java.util.jar.JarEntry; import java.util.jar.JarInputStream; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.AnalysisComponent; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; import org.apache.uima.util.Logger; import org.maltparser.MaltParserService; import org.maltparser.core.exception.MaltChainedException; import org.maltparser.core.options.OptionManager; import org.maltparser.core.symbol.SymbolTable; import org.maltparser.core.syntaxgraph.DependencyStructure; import org.maltparser.core.syntaxgraph.edge.Edge; import org.maltparser.core.syntaxgraph.node.TokenNode; import org.maltparser.parser.SingleMalt; import org.springframework.beans.PropertyAccessor; import org.springframework.beans.PropertyAccessorFactory; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; /** * <p> * DKPro Annotator for the MaltParser * </p> * * Required annotations:<br/> * <ul> * <li>Token</li> * <li>Sentence</li> * <li>POS</li> * </ul> * * Generated annotations:<br/> * <ul> * <li>Dependency (annotated over sentence-span)</li> * </ul> * * * @author Oliver Ferschke * @author Richard Eckart de Castilho */ @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) public class MaltParser extends JCasAnnotator_ImplBase { /** * Use this language instead of the document language to resolve the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Override the default variant used to locate the model. */ public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; /** * Load the model from this location instead of locating the model automatically. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; /** * Log the tag set(s) when a model is loaded. * * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; // Not sure if we'll ever have to use different symbol tables // public static final String SYMBOL_TABLE = "symbolTableName"; // @ConfigurationParameter(name = SYMBOL_TABLE, mandatory = true, defaultValue = "DEPREL") private final String symbolTableName = "DEPREL"; private Logger logger; private SymbolTable symbolTable; private File workingDir; private CasConfigurableProviderBase<MaltParserService> modelProvider; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); logger = getContext().getLogger(); try { workingDir = File.createTempFile("maltparser", ".tmp"); workingDir.delete(); workingDir.mkdirs(); workingDir.deleteOnExit(); } catch (IOException e) { throw new ResourceInitializationException(e); } modelProvider = new ModelProviderBase<MaltParserService>() { private MaltParserService parser; { setContextObject(MaltParser.this); setDefault(ARTIFACT_ID, "${groupId}.maltparser-model-parser-${language}-${variant}"); setDefault(VARIANT, "linear"); setDefault(LOCATION, "classpath:/${package}/lib/parser-${language}-${variant}.mco"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, variant); } @Override protected MaltParserService produceResource(URL aUrl) throws IOException { if (parser != null) { // Terminates the parser model try { parser.terminateParserModel(); parser = null; } catch (MaltChainedException e) { logger.log(Level.SEVERE, "MaltParser exception while terminating parser model: " + e.getMessage()); } } try { // However, Maltparser is not happy at all if the model file does not have the right // name, so we are forced to create a temporary directory and place the file there. File modelFile = new File(workingDir, getRealName(aUrl)); if (!modelFile.exists()) { InputStream is = null; OutputStream os = null; try { is = aUrl.openStream(); os = new FileOutputStream(modelFile); IOUtils.copy(is, os); modelFile.deleteOnExit(); } finally { IOUtils.closeQuietly(is); IOUtils.closeQuietly(os); } } // Maltparser has a very odd way of finding out which command line options it supports. // By manually initializing the OptionManager before Maltparser tries it, we can work // around Maltparsers' own broken code. if (OptionManager.instance().getOptionContainerIndices().size() == 0) { OptionManager.instance().loadOptionDescriptionFile( MaltParserService.class.getResource("/appdata/options.xml")); OptionManager.instance().generateMaps(); } // Ok, now we can finally initialize the parser parser = new MaltParserService(); parser.initializeParserModel("-w " + workingDir + " -c " + modelFile.getName() + " -m parse"); // parser.initializeParserModel("-u " + modelUrl.toString() + " -m parse"); Properties metadata = getResourceMetaData(); PropertyAccessor paDirect = PropertyAccessorFactory.forDirectFieldAccess(parser); SingleMalt singleMalt = (SingleMalt) paDirect.getPropertyValue("singleMalt"); SingletonTagset posTags = new SingletonTagset(POS.class, metadata.getProperty("pos.tagset")); SymbolTable posTagTable = singleMalt.getSymbolTables().getSymbolTable("POSTAG"); for (int i : posTagTable.getCodes()) { posTags.add(posTagTable.getSymbolCodeToString(i)); } addTagset(posTags); SingletonTagset depTags = new SingletonTagset(Dependency.class, metadata.getProperty("dependency.tagset")); SymbolTable depRelTable = singleMalt.getSymbolTables().getSymbolTable("DEPREL"); for (int i : depRelTable.getCodes()) { depTags.add(depRelTable.getSymbolCodeToString(i)); } addTagset(depTags); if (printTagSet) { getContext().getLogger().log(INFO, getTagset().toString()); } return parser; } catch (MaltChainedException e) { logger.log(Level.SEVERE, "MaltParser exception while initializing parser model: " + e.getMessage()); throw new IOException(e); } } }; } /** * @see AnalysisComponent#collectionProcessComplete() */ @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { if (workingDir != null && workingDir.isDirectory()) { FileUtils.deleteQuietly(workingDir); } } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { modelProvider.configure(aJCas.getCas()); // Iterate over all sentences for (Sentence curSentence : select(aJCas, Sentence.class)) { // Generate list of tokens for current sentence List<Token> tokens = selectCovered(Token.class, curSentence); // Generate input format required by parser String[] parserInput = new String[tokens.size()]; for (int i = 0; i < parserInput.length; i++) { Token t = tokens.get(i); // This only works for the English model. Other models have different input // formats. See http://www.maltparser.org/mco/mco.html parserInput[i] = String.format("%1$d\t%2$s\t_\t%3$s\t%3$s\t_", i + 1, t.getCoveredText(), t.getPos().getPosValue()); } // Parse sentence DependencyStructure graph = null; try { // Parses the sentence graph = modelProvider.getResource().parse(parserInput); symbolTable = graph.getSymbolTables().getSymbolTable(symbolTableName); } catch (MaltChainedException e) { logger.log(Level.WARNING, "MaltParser exception while parsing sentence: " + e.getMessage(), e); // don't pass on exception - go on with next sentence continue; } /* * Generate annotations: NOTE: Index of token in tokenList corresponds to node in * DependencyGraph with NodeIndex+1 */ try { // iterate over all tokens in current sentence for (int i = 0; i < tokens.size(); i++) { // Start with Node 1 - we omit ROOT-dependencies, // because we don't have a ROOT-token. TokenNode curNode = graph.getTokenNode(i + 1); // iterate over all dependencies for current token for (Edge edge : curNode.getHeadEdges()) { int sourceIdx = edge.getSource().getIndex(); int targetIdx = edge.getTarget().getIndex(); // get corresponding token for node in DependencyGraph Token sourceToken = sourceIdx > 0 ? tokens.get(sourceIdx - 1) : null; Token targetToken = targetIdx > 0 ? tokens.get(targetIdx - 1) : null; // create dep-annotation for current edge if (sourceToken != null && targetToken != null) { Dependency dep = new Dependency(aJCas); dep.setDependencyType(edge.getLabelSymbol(symbolTable)); dep.setGovernor(sourceToken); // TODO check if source=Governor dep.setDependent(targetToken); // TODO check if target=Dependent dep.setBegin(dep.getDependent().getBegin()); dep.setEnd(dep.getDependent().getEnd()); dep.addToIndexes(); } } } } catch (MaltChainedException e) { logger.log(Level.WARNING, "MaltParser exception creating dependency annotations: " + e.getMessage(), e); // don't pass on exception - go on with next sentence continue; } } } private String getRealName(URL aUrl) throws IOException { JarEntry je = null; JarInputStream jis = null; try { jis = new JarInputStream(aUrl.openConnection().getInputStream()); while ((je = jis.getNextJarEntry()) != null) { String entryName = je.getName(); if (entryName.endsWith(".info")) { int indexUnderScore = entryName.lastIndexOf('_'); int indexSeparator = entryName.lastIndexOf(File.separator); if (indexSeparator == -1) { indexSeparator = entryName.lastIndexOf('/'); } if (indexSeparator == -1) { indexSeparator = entryName.lastIndexOf('\\'); } int indexDot = entryName.lastIndexOf('.'); if (indexUnderScore == -1 || indexDot == -1) { throw new IllegalStateException( "Could not find the configuration name and type from the URL '" + aUrl.toString() + "'. "); } return entryName.substring(indexSeparator + 1, indexUnderScore) + ".mco"; } } throw new IllegalStateException( "Could not find the configuration name and type from the URL '" + aUrl.toString() + "'. "); } finally { IOUtils.closeQuietly(jis); } } }