Java tutorial
/******************************************************************************* * Copyright (c) 2010-2012 Nikita Zhiltsov. * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Public License v3.0 * which accompanies this distribution, and is available at * http://www.gnu.org/licenses/gpl.html * * Contributors: * Nikita Zhiltsov - initial API and implementation * Azat Khasanshin - implementation ******************************************************************************/ package ru.ksu.niimm.cll.mocassin.crawl.parser.gate.util; import static java.lang.String.format; import static com.google.common.base.Preconditions.*; import gate.Annotation; import gate.AnnotationSet; import gate.Document; import gate.Factory; import gate.FeatureMap; import gate.util.OffsetComparator; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.slf4j.Logger; import ru.ksu.niimm.cll.mocassin.crawl.analyzer.impl.Formula; import ru.ksu.niimm.cll.mocassin.crawl.analyzer.impl.MathExpression; import ru.ksu.niimm.cll.mocassin.crawl.analyzer.impl.Term; import ru.ksu.niimm.cll.mocassin.crawl.analyzer.impl.Variable; import ru.ksu.niimm.cll.mocassin.crawl.parser.arxmliv.ArxmlivStructureElementTypes; import ru.ksu.niimm.cll.mocassin.crawl.parser.gate.GateFormatConstants; import ru.ksu.niimm.cll.mocassin.crawl.parser.gate.Token; import ru.ksu.niimm.cll.mocassin.crawl.parser.gate.TokenImpl; import ru.ksu.niimm.cll.mocassin.util.CollectionUtil; import ru.ksu.niimm.cll.mocassin.util.inject.log.InjectLogger; import com.google.common.collect.Iterables; import com.google.inject.Inject; import com.google.inject.name.Named; public class AnnotationUtilImpl implements AnnotationUtil { private static final String FREE_EXPRESSION_MARK = ".Ex"; private final String TOKEN_ANNOTATION_NAME; private final String ARXMLIV_MATH_ANNOTATION_NAME; private final String ARXMLIV_MATH_TEX_ANNOTATION_NAME; private final String ARXMLIV_MARKUP_NAME; private final String SPACE_TOKEN_ANNOTATION_NAME; private final String SENTENCE_ANNOTATION_NAME; private static final Set<String> NAME_SET = ArxmlivStructureElementTypes.toNameSet(); private final String DOMAIN_ONTOLOGY_URI; private final String NORMALIZED_FORM_ANNOTATION_NAME; private final String TERM_ANNOTATION_NAME; @InjectLogger private Logger logger; @Inject AnnotationUtilImpl(@Named("token.annotation.name") String tokenAnnotationName, @Named("arxmliv.math.annotation.name") String arxmlivMathAnnotationName, @Named("arxmliv.math.tex.annotation.name") String arxmlivMathTexAnnotationName, @Named("arxmliv.markup.name") String arxmlivMarkupName, @Named("space.token.annotation.name") String spaceTokenAnnotationName, @Named("sentence.annotation.name") String sentenceAnnotationName, @Named("term.annotation.name") String termAnnotationName, @Named("domain.ontology.uri") String domainOntologyURI, @Named("normalized.form.annotation.name") String normalizedFormAnnotationName) { this.TOKEN_ANNOTATION_NAME = tokenAnnotationName; this.ARXMLIV_MATH_ANNOTATION_NAME = arxmlivMathAnnotationName; this.ARXMLIV_MATH_TEX_ANNOTATION_NAME = arxmlivMathTexAnnotationName; this.ARXMLIV_MARKUP_NAME = arxmlivMarkupName; this.SPACE_TOKEN_ANNOTATION_NAME = spaceTokenAnnotationName; this.SENTENCE_ANNOTATION_NAME = sentenceAnnotationName; this.DOMAIN_ONTOLOGY_URI = domainOntologyURI; this.NORMALIZED_FORM_ANNOTATION_NAME = normalizedFormAnnotationName; this.TERM_ANNOTATION_NAME = termAnnotationName; } @Override public AnnotationSet getStructuralAnnotations(Document document) { AnnotationSet annotationSet = document.getAnnotations(ARXMLIV_MARKUP_NAME); AnnotationSet equations = document.getAnnotations(ARXMLIV_MARKUP_NAME).get("equation"); Set<Annotation> equationsForRemove = new HashSet<Annotation>(); for (Annotation equation : equations) { AnnotationSet coveringGroups = annotationSet.getCovering("equationgroup", equation.getStartNode().getOffset(), equation.getEndNode().getOffset()); String xmlIdAttr = (String) equation.getFeatures().get("xml:id"); if (!coveringGroups.isEmpty() || xmlIdAttr != null && xmlIdAttr.contains(FREE_EXPRESSION_MARK)) { equationsForRemove.add(equation); } } annotationSet.removeAll(equationsForRemove); return annotationSet.get(NAME_SET); } @Override public Annotation getEnclosingSentence(Document document, Annotation annotation) { AnnotationSet sentenceSet = document.getAnnotations(GateFormatConstants.DEFAULT_ANNOTATION_SET_NAME) .getCovering(SENTENCE_ANNOTATION_NAME, annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()); if (sentenceSet.size() == 0) { AnnotationSet allSentences = document.getAnnotations(GateFormatConstants.DEFAULT_ANNOTATION_SET_NAME) .get(SENTENCE_ANNOTATION_NAME); long distance = Long.MAX_VALUE; Annotation closestSentence = null; for (Annotation sentence : allSentences) { long endDistance = Math .abs(sentence.getEndNode().getOffset() - annotation.getStartNode().getOffset()); long startDistance = Math .abs(sentence.getStartNode().getOffset() - annotation.getStartNode().getOffset()); long minDistance = Math.min(endDistance, startDistance); if (minDistance < distance) { closestSentence = sentence; distance = minDistance; } } if (closestSentence == null) { throw new RuntimeException( String.format("couldn't locate sentence for annotation with id='%s'", annotation.getId())); } else { return closestSentence; } } return sentenceSet.iterator().next(); } /* * (non-Javadoc) * * @see * ru.ksu.niimm.cll.mocassin.nlp.util.impl.AnnotationUtil#getTokensForAnnotation * (gate.Document, gate.Annotation) */ public List<Token> getTokensForAnnotation(Document document, Annotation annotation) { List<Token> returningTokens; returningTokens = new LinkedList<Token>(); List<Annotation> tokenList = getSortedTokenList(document, annotation, false); for (int i = 0; i < tokenList.size(); i++) { Annotation a = tokenList.get(i); String kind = (String) a.getFeatures().get("kind"); if (!kind.equals("word")) continue; String tokenValue = (String) a.getFeatures().get(GateFormatConstants.TOKEN_FEATURE_NAME); String stemValue = (String) a.getFeatures().get(GateFormatConstants.STEM_FEATURE_NAME); String pos = (String) a.getFeatures().get(GateFormatConstants.POS_FEATURE_NAME); Token token = new TokenImpl(tokenValue, pos, stemValue); returningTokens.add(token); } return returningTokens; } @Override public String[] getPureTokensForAnnotation(Document document, Annotation annotation, boolean useStemming) { List<Annotation> tokenList = getSortedTokenList(document, annotation, false); List<String> strTokens = new ArrayList<String>(); for (Annotation a : tokenList) { String kind = (String) a.getFeatures().get("kind"); if (!kind.equals("word")) continue; String tokenFeatureName = useStemming ? GateFormatConstants.STEM_FEATURE_NAME : GateFormatConstants.TOKEN_FEATURE_NAME; String tokenValue = (String) a.getFeatures().get(tokenFeatureName); strTokens.add(tokenValue); } return Iterables.toArray(strTokens, String.class); } @Override public String[] getTokensWithMathAnnotation(Document document, Annotation annotation) { return extractTokensWithMathExpressions(document, annotation, null); } private String[] extractTokensWithMathExpressions(Document document, Annotation annotation, String symbol) { List<Annotation> tokenList = getSortedTokenList(document, annotation, false); tokenList.addAll(getMathTokens(document, annotation)); Collections.sort(tokenList, new OffsetComparator()); List<String> strTokens = new ArrayList<String>(); for (Annotation a : tokenList) { String value = null; if (a.getType().equals(TOKEN_ANNOTATION_NAME)) { value = (String) a.getFeatures().get(GateFormatConstants.TOKEN_FEATURE_NAME); } else if (a.getType().equals(ARXMLIV_MATH_ANNOTATION_NAME)) { value = symbol == null ? String.format("$%s$", (String) a.getFeatures().get(ARXMLIV_MATH_TEX_ANNOTATION_NAME)) : symbol; } if (value != null) { strTokens.add(value); } } return Iterables.toArray(strTokens, String.class); } @Override public String[] getTokensWithTemplatedMathAnnotations(Document document, Annotation annotation, char symbol) { return extractTokensWithMathExpressions(document, annotation, String.valueOf(symbol)); } @Override public String getTextContentsForAnnotation(Document document, Annotation annotation) { List<Annotation> tokenList = getSortedTokenList(document, annotation, true); StringBuffer sb = new StringBuffer(); for (Annotation tokenAnnotation : tokenList) { String tokenValue = (String) tokenAnnotation.getFeatures().get(GateFormatConstants.TOKEN_FEATURE_NAME); sb.append(tokenValue); } return sb.toString().trim(); } @Override public String getTextContentsForAnnotationWithReplacements(Document document, Annotation annotation, Annotation annotationForReplace, final String replacementString) { List<Annotation> tokenList = getSortedTokenList(document, annotation, true); StringBuffer sb = new StringBuffer(); for (Annotation tokenAnnotation : tokenList) { if (tokenAnnotation.equals(annotationForReplace)) { sb.append(replacementString); } else { String tokenValue = (String) tokenAnnotation.getFeatures() .get(GateFormatConstants.TOKEN_FEATURE_NAME); sb.append(tokenValue); } sb.append(" "); } return sb.toString().trim(); } @Override public List<Term> getTerms(String paperUrl, Document document, Annotation annotation, float confidenceThreshold) { checkArgument(confidenceThreshold >= 0 && confidenceThreshold <= 1); List<Term> terms = new ArrayList<Term>(); AnnotationSet termSet = document.getAnnotations(ARXMLIV_MARKUP_NAME).get(TERM_ANNOTATION_NAME) .getContained(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()); for (Annotation termAnnotation : termSet) { String ontologyTermId = (String) termAnnotation.getFeatures().get("OMtermID"); if (ontologyTermId != null) { String confidenceScoreStr = (String) termAnnotation.getFeatures().get("OMtermVal"); if (confidenceScoreStr != null) { float confidenceScore = Float.parseFloat(confidenceScoreStr); if (confidenceScore < confidenceThreshold) continue; String uri = format("%s/%d", paperUrl, termAnnotation.getId()); String classUri = format("%s#%s", DOMAIN_ONTOLOGY_URI, ontologyTermId); String initialView = getTextContentsForAnnotation(document, termAnnotation); String normalizedView = (String) termAnnotation.getFeatures() .get(NORMALIZED_FORM_ANNOTATION_NAME); List<MathExpression> mathExpressions = extractContainingMathExpressions(paperUrl, document, termAnnotation); Term term = new Term(termAnnotation.getId(), uri, classUri, normalizedView, initialView, confidenceScore, mathExpressions); terms.add(term); } else { logger.error("There's no any confidence score for a annotation = {} in a document = {}", termAnnotation.getId(), document.getName()); } } } return terms; } private List<MathExpression> extractContainingMathExpressions(String paperUrl, Document document, Annotation termAnnotation) { List<MathExpression> mathExpressions = new ArrayList<MathExpression>(); FeatureMap featureMap = Factory.newFeatureMap(); featureMap.put("termid", termAnnotation.getId()); AnnotationSet mathAnnotations = document.getAnnotations(ARXMLIV_MARKUP_NAME) .get(ARXMLIV_MATH_ANNOTATION_NAME, featureMap); for (Annotation mathAnnotation : mathAnnotations) { String latexExpression = (String) mathAnnotation.getFeatures().get(ARXMLIV_MATH_TEX_ANNOTATION_NAME); Integer varId = (Integer) mathAnnotation.getFeatures().get("varid"); if (varId != null) { String varUri = format("%s/%d", paperUrl, mathAnnotation.getId()); mathExpressions.add(new Variable(mathAnnotation.getId(), varUri, latexExpression)); } else { List<Variable> variables = extractContainingVariables(paperUrl, document, mathAnnotation); String formulaUri = format("%s/%d", paperUrl, mathAnnotation.getId()); mathExpressions.add(new Formula(mathAnnotation.getId(), formulaUri, latexExpression, variables)); } } return mathExpressions; } private List<Variable> extractContainingVariables(String paperUrl, Document document, Annotation mathAnnotation) { String varsAttribute = (String) mathAnnotation.getFeatures().get("vars"); List<Variable> variables = new ArrayList<Variable>(); if (varsAttribute != null) { String[] vars = varsAttribute.split(";"); FeatureMap varFeatureMap = Factory.newFeatureMap(); for (String var : vars) { varFeatureMap.put("varid", Integer.parseInt(var)); } AnnotationSet containingVariableAnnotations = document.getAnnotations(ARXMLIV_MARKUP_NAME) .get(ARXMLIV_MATH_ANNOTATION_NAME, varFeatureMap); for (Annotation contVarAnnotation : containingVariableAnnotations) { String tex = (String) contVarAnnotation.getFeatures().get(ARXMLIV_MATH_TEX_ANNOTATION_NAME); String varUri = format("%s/%d", paperUrl, contVarAnnotation.getId()); variables.add(new Variable(contVarAnnotation.getId(), varUri, tex)); } } return variables; } private List<Annotation> getSortedTokenList(Document document, Annotation annotation, boolean withSpaces) { AnnotationSet tokenSet = getTokenSetWithoutSpaces(document, annotation); List<Annotation> tokenList = new ArrayList<Annotation>(); for (Annotation token : tokenSet) { AnnotationSet coveringMathAnnotations = document.getAnnotations(ARXMLIV_MARKUP_NAME).getCovering( ARXMLIV_MATH_ANNOTATION_NAME, token.getStartNode().getOffset(), token.getEndNode().getOffset()); if (coveringMathAnnotations.isEmpty()) { tokenList.add(token); } } if (withSpaces) { tokenList.addAll(getTokenSetWithSpaces(document, annotation)); } Collections.sort(tokenList, new OffsetComparator()); return tokenList; } private AnnotationSet getTokenSetWithoutSpaces(Document document, Annotation annotation) { AnnotationSet tokenSet = document.getAnnotations(GateFormatConstants.DEFAULT_ANNOTATION_SET_NAME) .get(TOKEN_ANNOTATION_NAME) .getContained(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()); return tokenSet; } private List<Annotation> getMathTokens(Document document, Annotation annotation) { AnnotationSet mathTokens = document.getAnnotations(ARXMLIV_MARKUP_NAME).get(ARXMLIV_MATH_ANNOTATION_NAME) .getContained(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()); return CollectionUtil.asList(mathTokens); } private AnnotationSet getTokenSetWithSpaces(Document document, Annotation annotation) { AnnotationSet spaceTokenSet = document.getAnnotations(GateFormatConstants.DEFAULT_ANNOTATION_SET_NAME) .get(SPACE_TOKEN_ANNOTATION_NAME) .getContained(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()); return spaceTokenSet; } }