Java tutorial
/*
 *   Copyright 2013
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.annotators;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.n2n.types.Entity;
import de.tudarmstadt.lt.n2n.types.Relation;
import de.tudarmstadt.lt.n2n.utilities.N2nUtils;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 * Annotator that (optionally) creates {@link Relation} annotations between pairs of noun tokens and
 * (optionally) attaches to every relation the shortest path between its two entities in the
 * dependency graph of the covering sentence. Relations for which no path exists, or whose path
 * length lies outside the configured bounds, are removed from the CAS indexes.
 *
 * @author Steffen Remus
 */
public class RelationAnnotator extends JCasAnnotator_ImplBase {

    private final static Logger LOG = LoggerFactory.getLogger(RelationAnnotator.class);

    /** If true, new relations are created between noun tokens before paths are searched. */
    public static final String PARAM_FIND_NEW_RELATIONS = "_search_for_new_relations";
    @ConfigurationParameter(name = PARAM_FIND_NEW_RELATIONS, mandatory = true, defaultValue = { "false" })
    private boolean _search_for_new_relations;

    /**
     * e.g. Sentence.class or S.class
     *
     * default: Sentence.class
     */
    public static final String PARAM_COVERING_ANNOTATION_TYPE = "_covering_annotation_type";
    @ConfigurationParameter(name = PARAM_COVERING_ANNOTATION_TYPE, mandatory = false)
    private Class<? extends Annotation> _covering_annotation_type = Sentence.class;

    /** If true, a shortest dependency path is searched for every relation in the CAS. */
    public static final String PARAM_FIND_DEPENDENCY_PATH = "_search_for_dependency_path";
    @ConfigurationParameter(name = PARAM_FIND_DEPENDENCY_PATH, mandatory = true, defaultValue = { "true" })
    private boolean _search_for_dependency_path;

    /** Relations whose path has fewer edges than this value are removed. */
    public static final String PARAM_MIN_DEPENDENCY_PATH_LENGTH = "_min_dependency_path_length";
    @ConfigurationParameter(name = PARAM_MIN_DEPENDENCY_PATH_LENGTH, mandatory = true, defaultValue = { "1" })
    private int _min_dependency_path_length;

    /** Relations whose path has more edges than this value are removed. */
    public static final String PARAM_MAX_DEPENDENCY_PATH_LENGTH = "_max_dependency_path_length";
    @ConfigurationParameter(name = PARAM_MAX_DEPENDENCY_PATH_LENGTH, mandatory = true, defaultValue = { "5" })
    private int _max_dependency_path_length;

    /**
     * If true, minimum and maximum path length are overwritten for every CAS with
     * (number of tokens - 1), see {@link #process(JCas)}.
     */
    public static final String PARAM_GOOGLESYNTACTICS_DYNAMIC_PATHLENGTHS = "_googlesyntactics_dynamic_pathlength";
    @ConfigurationParameter(name = PARAM_GOOGLESYNTACTICS_DYNAMIC_PATHLENGTHS, mandatory = false, defaultValue = { "false" })
    private boolean _googlesyntactics_dynamic_pathlength;

    /** If true, noun pairs / path edges that are directly connected by an 'nn' dependency are dropped. */
    public static final String PARAM_REMOVE_IMMEDIATE_NN_RELATIONS_FROM_PATH = "_remove_immediate_nn_relations";
    @ConfigurationParameter(name = PARAM_REMOVE_IMMEDIATE_NN_RELATIONS_FROM_PATH, mandatory = true, defaultValue = { "true" })
    private boolean _remove_immediate_nn_relations;

    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
    }

    /**
     * Runs the configured steps on one CAS: optional adjustment of the path length bounds,
     * optional creation of new relations, optional search for dependency paths.
     *
     * Any exception raised while processing is logged and NOT propagated, i.e. a failing CAS does
     * not abort the pipeline.
     *
     * @param aJCas the CAS to process
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        try {
            // dynamic path lengths from google's syntactic ngrams
            // here we want to extract only the paths of a certain length, e.g. paths of length 1 from arcs,
            // paths of length 2 from biarcs, etc.
            if (_googlesyntactics_dynamic_pathlength) {
                // check if we are currently processing a quadarc, triarc, biarc, arc, and set the min and max
                // pathlengths accordingly
                DocumentMetaData meta = DocumentMetaData.get(aJCas);
                String docid = meta.getDocumentId();
                int exact_pathlength = JCasUtil.select(aJCas, Token.class).size() - 1;
                LOG.trace("{} contains {} words. [{}]", docid, exact_pathlength + 1, aJCas.getDocumentText());
                LOG.debug("setting minimum and maximum path length to {}. ", exact_pathlength);
                _min_dependency_path_length = exact_pathlength;
                _max_dependency_path_length = exact_pathlength;
            }

            // search for entities and create the relation annotations
            if (_search_for_new_relations)
                searchForRelationEntities(aJCas);

            // in the set of relations search for the shortest path in the dependency graph
            if (_search_for_dependency_path)
                searchForPath(aJCas);

        } catch (Exception e) {
            // deliberate best effort: log and continue with the next CAS
            LOG.error("Error while processing cas.", e);
        }
    }

    /**
     * Creates relations for every sentence of the CAS. Within a sentence the search is restricted to
     * annotations of the configured covering type; annotations that themselves cover further
     * annotations of that type are skipped.
     *
     * @param aJCas the CAS to search in
     */
    private void searchForRelationEntities(JCas aJCas) {
        // Resolve the effective type once. A null configuration value means "use sentences"; the
        // field itself must not be dereferenced below because it may be null.
        Class<? extends Annotation> covering_type = _covering_annotation_type;
        if (covering_type == null)
            covering_type = Sentence.class;
        final boolean sentence_level = Sentence.class.equals(covering_type);

        for (Sentence sentence : JCasUtil.select(aJCas, Sentence.class)) {
            List<? extends Annotation> searchlist;
            if (sentence_level)
                searchlist = Arrays.asList(sentence);
            else
                searchlist = JCasUtil.selectCovered(covering_type, sentence);

            for (Annotation clause_like_covering_anno : searchlist) {
                // only look into annotations that do not cover further annotations of the same type
                if (!JCasUtil.selectCovered(covering_type, clause_like_covering_anno).isEmpty())
                    continue;
                String abbreviated_text = StringUtils.abbreviate(clause_like_covering_anno.getCoveredText(), 50);
                LOG.trace("searching for new relations in {} annotation [{}].", covering_type.getSimpleName(),
                        abbreviated_text);
                searchAndCreateEntityPairs(aJCas, sentence, clause_like_covering_anno);
                LOG.debug("found {} relations for {} annotation [{}].",
                        JCasUtil.selectCovered(Relation.class, clause_like_covering_anno).size(),
                        covering_type.getSimpleName(), abbreviated_text);
            }
        }
    }

    /**
     * Creates an {@link Entity} pair and a {@link Relation} for every ordered pair of noun tokens
     * (e1 strictly before e2) covered by the given annotation, unless a relation with the same
     * entity offsets already exists or (if configured) both nouns are directly connected by an
     * 'nn' dependency.
     *
     * @param aJCas the CAS the new annotations are added to
     * @param sentence the sentence that is stored as covering sentence of each new relation
     * @param clause_like_covering_anno the annotation whose covered tokens are paired
     */
    private void searchAndCreateEntityPairs(JCas aJCas, Sentence sentence, Annotation clause_like_covering_anno) {
        // List<Integer> is in fact a quadruple consisting of {e1.begin, e1.end, e2.begin, e2.end}
        Set<List<Integer>> existing_entity_pairs = new HashSet<List<Integer>>();
        for (Relation rel : JCasUtil.selectCovered(Relation.class, clause_like_covering_anno))
            existing_entity_pairs.add(Arrays.asList(rel.getE1().getBegin(), rel.getE1().getEnd(),
                    rel.getE2().getBegin(), rel.getE2().getEnd()));

        // collect the candidate tokens once instead of re-selecting them for every e1
        // TODO: replace the POS prefix test as soon as the POS values are available as real types, which will
        // be provided by the PreParsedSentencesReader
        List<Token> nouns = new ArrayList<Token>();
        for (Token token : JCasUtil.selectCovered(Token.class, clause_like_covering_anno))
            if (isNoun(token))
                nouns.add(token);

        for (Token ne1 : nouns) { // e1 loop
            for (Token ne2 : nouns) { // e2 loop
                // find relations between e1 and e2 only once, backward relations are considered later in
                // further processing. This saves computation time
                if (ne2.getBegin() <= ne1.getBegin())
                    continue;

                if (_remove_immediate_nn_relations) {
                    // if both nouns are directly connected via an 'nn' dependency we don't want the pair
                    Dependency nn = findImmediateNnDependency(ne1, ne2);
                    if (nn != null) {
                        LOG.trace(
                                "Skipping enitity e2 '{}-{}' because it is the dependent of an 'nn' relation with '{}-{}'. We don't want that.",
                                ne2.getCoveredText(), ne2.getBegin(), nn.getGovernor().getCoveredText(),
                                nn.getGovernor().getBegin());
                        continue;
                    }
                }

                if (existing_entity_pairs
                        .contains(Arrays.asList(ne1.getBegin(), ne1.getEnd(), ne2.getBegin(), ne2.getEnd()))) {
                    LOG.trace(
                            "Skipping enitities '{}-{}' and '{}-{}' because relation covering both already exists.",
                            ne1.getCoveredText(), ne1.getBegin(), ne2.getCoveredText(), ne2.getBegin());
                    continue;
                }

                Entity e1 = new Entity(aJCas, ne1.getBegin(), ne1.getEnd());
                e1.addToIndexes();
                e1.setRepresentativeToken(N2nUtils.getLastElement(JCasUtil.selectCovered(Token.class, e1), true));

                Entity e2 = new Entity(aJCas, ne2.getBegin(), ne2.getEnd());
                e2.addToIndexes();
                e2.setRepresentativeToken(N2nUtils.getLastElement(JCasUtil.selectCovered(Token.class, e2), true));

                Relation relation = new Relation(aJCas, e1.getBegin(), e2.getEnd());
                relation.setE1(e1);
                relation.setE2(e2);
                relation.setCoveringSentence(sentence);
                relation.addToIndexes();
                LOG.trace("Found new relation with '{}-{}' as e1 and '{}-{}' as e2.", e1.getCoveredText(),
                        e1.getBegin(), e2.getCoveredText(), e2.getBegin());
            }
        }
    }

    /** Returns true if the token carries a POS annotation whose value starts with "N". */
    private static boolean isNoun(Token token) {
        if (token.getPos() == null) // token without POS annotation can not be identified as noun
            return false;
        String pos_value = token.getPos().getPosValue();
        return pos_value != null && pos_value.startsWith("N");
    }

    /**
     * Returns the first 'nn' dependency covering {@code second} that directly connects the two
     * tokens (in either direction), or null if there is none.
     */
    private static Dependency findImmediateNnDependency(Token first, Token second) {
        for (Dependency dep : JCasUtil.selectCovering(Dependency.class, second)) {
            if (!"nn".equals(dep.getDependencyType()))
                continue;
            // equals() instead of '==': tokens are compared by what they represent, not by object identity
            boolean first_governs_second = first.equals(dep.getGovernor()) && second.equals(dep.getDependent());
            boolean second_governs_first = second.equals(dep.getGovernor()) && first.equals(dep.getDependent());
            if (first_governs_second || second_governs_first)
                return dep;
        }
        return null;
    }

    /**
     * Searches, for every relation in the CAS, the shortest path between its entities in the
     * dependency graph of the relation's covering sentence and stores it as the relation's
     * dependency path. Relations without a path, or with a path length outside
     * [min length, max length], are removed from the CAS indexes after the iteration.
     *
     * @param aJCas the CAS whose relations are processed
     */
    private void searchForPath(JCas aJCas) {
        // removal is deferred so that the index is not modified while iterating over it
        List<Relation> relations_to_remove = new LinkedList<Relation>();
        for (Relation relation : JCasUtil.select(aJCas, Relation.class)) {
            Entity e1 = relation.getE1();
            Entity e2 = relation.getE2();
            Sentence covering_sentence = relation.getCoveringSentence();
            String sentence_text = StringUtils.abbreviate(covering_sentence.getCoveredText(), 50);
            LOG.trace(
                    "searching for a shortest path in the dependency graph for relation ({}-{},{}-{}) in sentence [{}].",
                    e1.getCoveredText(), e1.getBegin(), e2.getCoveredText(), e2.getBegin(), sentence_text);

            List<Dependency> dependency_path = null;
            try {
                dependency_path = find_path(e1, e2, JCasUtil.selectCovered(Dependency.class, covering_sentence));
                if (dependency_path == null) { // no path found
                    relations_to_remove.add(relation);
                    LOG.debug("Removing relation ({}-{},{}-{}) in sentence [{}]. No dependency path was found.",
                            e1.getCoveredText(), e1.getBegin(), e2.getCoveredText(), e2.getBegin(), sentence_text);
                    continue;
                }
            } catch (IllegalStateException e) {
                relations_to_remove.add(relation);
                LOG.warn(String.format("%s: '%s.'", e.getClass().getSimpleName(), e.getMessage()));
                continue;
            }

            int path_length = dependency_path.size(); // dependencies are edges thus the path length is len(edges)
            LOG.trace("Found shortest path with length {} for relation ({}-{},{}-{}) in sentence [{}].",
                    path_length, e1.getCoveredText(), e1.getBegin(), e2.getCoveredText(), e2.getBegin(),
                    sentence_text);

            // if the direct neighbor of either entities representative tokens is connected via 'nn' relation and
            // the representative token is not the governor then ignore this edge
            if (_remove_immediate_nn_relations) {
                if (path_length > 0 && "nn".equals(dependency_path.get(0).getDependencyType())
                        && dependency_path.get(0).getDependent().equals(e1.getRepresentativeToken())) { // check first edge
                    dependency_path.remove(0);
                    path_length--;
                    LOG.debug(
                            "The immediate connecting edge for e1 ({}-{}) in sentence [{}] is an 'nn' relation and e1 is the dependent. It is removed from the path. New path length is {}.",
                            e1.getCoveredText(), e1.getBegin(), sentence_text, path_length);
                }
                if (path_length > 0 && "nn".equals(dependency_path.get(path_length - 1).getDependencyType())
                        && dependency_path.get(path_length - 1).getDependent()
                                .equals(e2.getRepresentativeToken())) { // check last edge
                    dependency_path.remove(path_length - 1);
                    path_length--;
                    LOG.debug(
                            "The immediate connecting edge for e2 ({}-{}) in sentence [{}] is an 'nn' relation and e2 is the dependent. It is removed from the path. New path length is {}.",
                            e2.getCoveredText(), e2.getBegin(), sentence_text, path_length);
                }
            }

            // if path is too short skip it
            if (path_length < _min_dependency_path_length) {
                LOG.debug(
                        "Path for relation ({}-{},{}-{}) in sentence [{}] is shorter than minlength={} ({}). Removing relation from cas index.",
                        e1.getCoveredText(), e1.getBegin(), e2.getCoveredText(), e2.getBegin(), sentence_text,
                        _min_dependency_path_length, path_length);
                relations_to_remove.add(relation);
                continue;
            }

            // if path is too long skip it
            if (path_length > _max_dependency_path_length) {
                LOG.debug(
                        "Path for relation ({}-{},{}-{}) in sentence [{}] is longer than maxlength={} ({}). Removing relation from cas index.",
                        e1.getCoveredText(), e1.getBegin(), e2.getCoveredText(), e2.getBegin(), sentence_text,
                        _max_dependency_path_length, path_length);
                relations_to_remove.add(relation);
                continue;
            }

            relation.setDependencyPath((FSArray) aJCas.getCas().createArrayFS(dependency_path.size()));
            for (int i = 0; i < dependency_path.size(); i++)
                relation.setDependencyPath(i, dependency_path.get(i));
        }

        for (Relation relation : relations_to_remove)
            relation.removeFromIndexes(aJCas);
    }

    /**
     * Finds the shortest path between the representative tokens of two entities in the undirected
     * graph spanned by the given dependencies. If an entity has no representative token yet, it is
     * determined via {@code N2nUtils.getLastElement} on the tokens covered by the entity and stored
     * in the entity.
     *
     * @param e1 start entity
     * @param e2 destination entity
     * @param dependencies the edges of the graph
     * @return the edges of the shortest path from e1 to e2, or null if one of the representative
     *         tokens is missing or is not part of any dependency
     * @throws IllegalStateException if both tokens are nodes of the graph but not connected
     */
    protected List<Dependency> find_path(Entity e1, Entity e2, List<Dependency> dependencies)
            throws IllegalStateException {
        Token e1r = e1.getRepresentativeToken();
        if (e1r == null) {
            e1r = N2nUtils.getLastElement(JCasUtil.selectCovered(Token.class, e1), true); // representative token for e1
            e1.setRepresentativeToken(e1r);
        }
        Token e2r = e2.getRepresentativeToken();
        if (e2r == null) {
            e2r = N2nUtils.getLastElement(JCasUtil.selectCovered(Token.class, e2), true); // representative token for e2
            e2.setRepresentativeToken(e2r);
        }

        // without a representative token there is nothing to search for (and nothing to log about it)
        if (e1r == null) {
            LOG.debug("No representative token found for entity '{}-{}'. Removing relation.", e1.getCoveredText(),
                    e1.getBegin());
            return null;
        }
        if (e2r == null) {
            LOG.debug("No representative token found for entity '{}-{}'. Removing relation.", e2.getCoveredText(),
                    e2.getBegin());
            return null;
        }

        // create an inverse index (map tokens to dependencies)
        Map<Token, List<Dependency>> t2d = new HashMap<Token, List<Dependency>>();
        for (Dependency d : dependencies) {
            addToIndex(t2d, d.getGovernor(), d);
            addToIndex(t2d, d.getDependent(), d);
        }

        if (!t2d.containsKey(e1r)) { // e1r isn't a node, we cannot find a path
            LOG.debug("'{}-{}' is not a node, maybe it is part of a collapsed prepositon. Removing relation.",
                    e1r.getCoveredText(), e1r.getBegin());
            return null;
        }
        if (!t2d.containsKey(e2r)) { // e2r isn't a node, we cannot find a path
            LOG.debug("'{}-{}' is not a node, maybe it is part of a collapsed prepositon. Removing relation.",
                    e2r.getCoveredText(), e2r.getBegin());
            return null;
        }

        // start at e1r and try to walk to e2r
        return find_path_dijkstra(e1r, e2r, t2d.keySet(), t2d);
    }

    /** Appends the dependency to the list stored for the token, creating the list on first use. */
    private static void addToIndex(Map<Token, List<Dependency>> index, Token token, Dependency dependency) {
        List<Dependency> dependencies_of_token = index.get(token);
        if (dependencies_of_token == null) {
            dependencies_of_token = new ArrayList<Dependency>();
            index.put(token, dependencies_of_token);
        }
        dependencies_of_token.add(dependency);
    }

    /**
     * Dijkstra's algorithm on an undirected graph in which every edge has weight 1.
     *
     * @param start the node the search starts at
     * @param dest the node to reach
     * @param nodes all nodes of the graph
     * @param edges maps every node to the dependencies (edges) it takes part in
     * @return the edges of a shortest path, ordered from start to dest; empty if start equals dest
     * @throws IllegalStateException if the remaining nodes, including dest, are not reachable from start
     */
    protected List<Dependency> find_path_dijkstra(Token start, Token dest, Collection<Token> nodes,
            Map<Token, List<Dependency>> edges) throws IllegalStateException {
        List<Dependency> shortest_path = new ArrayList<Dependency>();

        final Map<Token, Integer> dist = new HashMap<Token, Integer>();
        final Map<Token, Dependency> prev = new HashMap<Token, Dependency>();
        for (Token t : nodes)
            dist.put(t, Integer.MAX_VALUE);
        dist.put(start, 0);

        // PriorityQueue rejects an initial capacity < 1
        PriorityQueue<Token> Q = new PriorityQueue<Token>(Math.max(1, edges.size()), new Comparator<Token>() {
            @Override
            public int compare(Token o1, Token o2) {
                return dist.get(o1).compareTo(dist.get(o2));
            }
        });
        Q.addAll(nodes);

        while (!Q.isEmpty()) {
            Token u = Q.poll(); // initially source node

            // The closest remaining node is unreachable, hence so is every node still to be visited,
            // dest included. This test has to precede the dest test: otherwise an unreachable dest
            // that happens to be polled first would be reported as an (empty) path.
            if (dist.get(u) == Integer.MAX_VALUE)
                throw new IllegalStateException(String.format(
                        "Could not find path from token '%s' to token '%s'. Perhaps start or dest is part of a preposition? (%s)",
                        start.getCoveredText(), dest.getCoveredText(),
                        DocumentMetaData.get(u.getCAS()).getDocumentId()));

            if (u.equals(dest)) // stop if dest
                break;

            List<Dependency> connected_edges = edges.get(u);
            if (connected_edges == null)
                continue;

            for (Dependency d : connected_edges) {
                // the node at the other end of the edge
                Token v = u.equals(d.getGovernor()) ? d.getDependent() : d.getGovernor();
                if (!Q.contains(v)) // already settled
                    continue;
                int alt = dist.get(u) + 1; // dist(u,v) = 1
                if (alt < dist.get(v)) {
                    dist.put(v, alt);
                    prev.put(v, d);
                    Q.remove(v); // reinsert v so that Q is recomputed
                    Q.offer(v);
                }
            }
        }

        // walk back from dest to start along the recorded edges
        Token u = dest;
        Dependency e = prev.get(u);
        while (e != null) {
            shortest_path.add(0, e);
            // equals() as in the forward search; with '==' a token that is equal but not identical to
            // the governor would never advance the walk
            if (u.equals(e.getGovernor()))
                u = e.getDependent();
            else
                u = e.getGovernor();
            e = prev.get(u);
        }

        return shortest_path;
    }

}