Java tutorial
/* * Copyright 2015 Textocat * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.textocat.textokit.io.brat; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.collect.*; import com.textocat.textokit.commons.DocumentMetadata; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.*; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.component.CasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.OperationalProperties; import org.apache.uima.fit.factory.initializable.InitializableFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.nlplab.brat.ann.*; import org.nlplab.brat.configuration.*; import org.nlplab.brat.configuration.EventRole.Cardinality; import java.io.*; import java.nio.file.Path; import java.util.List; import java.util.Map; import java.util.Set; import static org.nlplab.brat.BratConstants.*; /** * TODO adjust this top javadoc * <p/> * UIMA Annotator is CAS Annotator to convert UIMA annotations to brat standoff * format annotations. 1) defines input, ouput files directories 2) reading * annotator descriptor file and converts parameters to brat configuration file * saved as annotation.conf 3) saves annotations text file using specified file * name parameter in DocumentMetadata annotations. 4) reading UIMA annotations * and converts them to brat annotation (*.ann files) * <p/> * T: text-bound annotation R: relation E: event A: attribute M: modification * (alias for attribute, for backward compatibility) N: normalization #: note * <p/> * For event annotation you have to add additional info about event entities * into desc file * * @author Rinat Gareev * @author pathfinder */ @OperationalProperties(modifiesCas = false, multipleDeploymentAllowed = false) public class UIMA2BratAnnotator extends CasAnnotator_ImplBase { public final static String BRAT_OUT = "BratOutputDir"; public final static String ENTITIES_TO_BRAT = "EntitiesToBrat"; public final static String RELATIONS_TO_BRAT = "RelationsToBrat"; public final static String EVENTS_TO_BRAT = "EventsToBrat"; public final static String BRAT_NOTE_MAPPERS = "BratNoteMappers"; public static final String PARAM_OUTPUT_PATH_FUNCTION = "outputPathFunction"; // annotator configuration fields @ConfigurationParameter(name = BRAT_OUT, mandatory = true) private File bratDirectory; @ConfigurationParameter(name = ENTITIES_TO_BRAT, mandatory = false) private String[] entitiesToBratRaw; private List<EntityDefinitionValue> entitiesToBrat; @ConfigurationParameter(name = RELATIONS_TO_BRAT, mandatory = false) private String[] relationsToBratRaw; private List<StructureDefinitionValue> relationsToBrat; @ConfigurationParameter(name = EVENTS_TO_BRAT, mandatory = false) private String[] eventsToBratRaw; private List<StructureDefinitionValue> eventsToBrat; @ConfigurationParameter(name = BRAT_NOTE_MAPPERS, mandatory = false) private String[] noteMappersDefinitionsRaw; private List<NoteMapperDefinitionValue> noteMappersDefinitions; @ConfigurationParameter(name = PARAM_OUTPUT_PATH_FUNCTION, mandatory = false, defaultValue = "com.textocat.textokit.consumer.DefaultSourceURI2OutputFilePathFunction") private Class<? extends Function> outPathFuncClass; // derived configuration fields private BratTypesConfiguration bratTypesConfig; private UimaBratMapping mapping; private Function<DocumentMetadata, Path> outPathFunc; // state fields private TypeSystem ts; // per-CAS state fields private String currentDocName; private BratAnnotationContainer bac; private ToBratMappingContext context; @Override public void initialize(UimaContext ctx) throws ResourceInitializationException { super.initialize(ctx); getLogger().info("Annotator is initializing ..."); if (entitiesToBratRaw == null) { entitiesToBrat = ImmutableList.of(); } else { entitiesToBrat = Lists.newLinkedList(); for (String valStr : entitiesToBratRaw) { entitiesToBrat.add(EntityDefinitionValue.fromString(valStr)); } entitiesToBrat = ImmutableList.copyOf(entitiesToBrat); } if (relationsToBratRaw == null) { relationsToBrat = ImmutableList.of(); } else { relationsToBrat = Lists.newLinkedList(); for (String valStr : relationsToBratRaw) { StructureDefinitionValue val = StructureDefinitionValue.fromString(valStr); if (val.roleDefinitions.size() != 2) { throw new IllegalArgumentException(String.format("Illegal relation definition: %s", valStr)); } relationsToBrat.add(val); } relationsToBrat = ImmutableList.copyOf(relationsToBrat); } if (eventsToBratRaw == null) { eventsToBrat = ImmutableList.of(); } else { eventsToBrat = Lists.newLinkedList(); for (String valStr : eventsToBratRaw) { eventsToBrat.add(StructureDefinitionValue.fromString(valStr)); } eventsToBrat = ImmutableList.copyOf(eventsToBrat); } if (noteMappersDefinitionsRaw == null) { noteMappersDefinitions = ImmutableList.of(); } else { noteMappersDefinitions = Lists.newLinkedList(); for (String defStr : noteMappersDefinitionsRaw) { noteMappersDefinitions.add(NoteMapperDefinitionValue.fromString(defStr)); } noteMappersDefinitions = ImmutableList.copyOf(noteMappersDefinitions); } // //noinspection unchecked outPathFunc = InitializableFactory.create(ctx, outPathFuncClass); } @Override public void typeSystemInit(TypeSystem ts) throws AnalysisEngineProcessException { super.typeSystemInit(ts); this.ts = ts; // getLogger().info("Reading UIMA types to convert to brat annotations ... "); createBratTypesConfiguration(); Writer acWriter = null; try { if (!bratDirectory.isDirectory()) bratDirectory.mkdirs(); File annotationConfFile = new File(bratDirectory, ANNOTATION_CONF_FILE); acWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(annotationConfFile), ANNOTATION_CONF_ENCODING)); bratTypesConfig.writeTo(acWriter); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } finally { IOUtils.closeQuietly(acWriter); } } @Override public void process(CAS cas) throws AnalysisEngineProcessException { // extract target file name currentDocName = extractDocName(cas); // prepare paths BratDocument bratDoc = new BratDocument(bratDirectory, currentDocName); // write doc text String txt = cas.getDocumentText(); try { FileUtils.write(bratDoc.getTxtFile(), txt, TXT_FILES_ENCODING); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } // populate Brat annotation container bac = new BratAnnotationContainer(bratTypesConfig); context = new ToBratMappingContext(); // start with entities for (Type uType : mapping.getEntityUimaTypes()) { UimaBratEntityMapping entMapping = mapping.getEntityMapping(uType); for (AnnotationFS uEntity : cas.getAnnotationIndex(uType)) { mapEntity(entMapping, uEntity); } } // then relations for (Type uType : mapping.getRelationUimaTypes()) { UimaBratRelationMapping relMapping = mapping.getRelationMapping(uType); for (AnnotationFS uRelation : cas.getAnnotationIndex(uType)) { mapRelation(relMapping, uRelation); } } // then events for (Type uType : mapping.getEventUimaTypes()) { UimaBratEventMapping evMapping = mapping.getEventMapping(uType); for (AnnotationFS uEvent : cas.getAnnotationIndex(uType)) { mapEvent(evMapping, uEvent); } } // write .ann file Writer annWriter = null; try { annWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(bratDoc.getAnnFile()), ANN_FILES_ENCODING)); bac.writeTo(annWriter); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } finally { IOUtils.closeQuietly(annWriter); } // clear per-CAS state currentDocName = null; bac = null; context = null; } private void mapEntity(UimaBratEntityMapping entMapping, AnnotationFS uEntity) { if (context.isMapped(uEntity)) { return; } BratEntityType bType = entMapping.bratType; // create brat annotation instance BratEntity bEntity = new BratEntity(bType, uEntity.getBegin(), uEntity.getEnd(), uEntity.getCoveredText()); // add to container - it assigns ID bEntity = bac.register(bEntity); // map to note mapNotes(entMapping, bEntity, uEntity); // memorize context.mapped(uEntity, bEntity); } private void mapRelation(UimaBratRelationMapping relMapping, AnnotationFS uRelation) { if (context.isMapped(uRelation)) { return; } BratRelationType bType = relMapping.bratType; Map<String, BratEntity> argMap = makeArgMap(uRelation, bType, relMapping.roleFeatures); if (argMap == null) { return; } // create BratRelation bRelation = new BratRelation(bType, argMap); // assign id bRelation = bac.register(bRelation); // map to note mapNotes(relMapping, bRelation, uRelation); // memorize context.mapped(uRelation, bRelation); } private void mapEvent(UimaBratEventMapping evMapping, AnnotationFS uEvent) { if (context.isMapped(uEvent)) { return; } BratEventType bType = evMapping.bratType; // use UIMA event annotation boundaries as Brat event trigger boundaries BratEventTrigger trigger = new BratEventTrigger(bType, uEvent.getBegin(), uEvent.getEnd(), uEvent.getCoveredText()); // assign id to trigger trigger = bac.register(trigger); // fill slots Multimap<String, BratAnnotation<?>> roleAnnotations = makeRoleMap(uEvent, bType, evMapping.roleFeatures); // create BratEvent bEvent = new BratEvent(bType, trigger, roleAnnotations); // assign id bEvent = bac.register(bEvent); // map to note mapNotes(evMapping, bEvent, uEvent); // memorize context.mapped(uEvent, bEvent); } // fill relation roles private Map<String, BratEntity> makeArgMap(AnnotationFS uAnno, BratRelationType bratType, Map<String, Feature> argFeatMap) { Map<String, BratEntity> argAnnotations = Maps.newHashMapWithExpectedSize(2); for (String argName : argFeatMap.keySet()) { Feature argFeat = argFeatMap.get(argName); FeatureStructure argFS = uAnno.getFeatureValue(argFeat); if (argFS == null) { getLogger().warn(String.format("Can't map %s to Brat relation. Its feature '%s' is not set.", toPrettyString(uAnno), argFeat)); return null; } BratEntity argValue = context.demandEntity(argFS); argAnnotations.put(argName, argValue); } return argAnnotations; } // fill event roles private Multimap<String, BratAnnotation<?>> makeRoleMap(AnnotationFS uAnno, BratEventType bratType, Map<String, Feature> roleFeatMap) { Multimap<String, BratAnnotation<?>> roleAnnotations = LinkedHashMultimap.create(); for (String roleName : roleFeatMap.keySet()) { EventRole roleDesc = bratType.getRole(roleName); // check role range types boolean entityInRange = isEveryInstanceOf(roleDesc.getRangeTypes(), BratEntityType.class); if (!entityInRange && !isEveryInstanceOf(roleDesc.getRangeTypes(), BratEventType.class)) { throw new UnsupportedOperationException( String.format("Mixed entity/event types in role range is not supported: %s", roleDesc)); } // Feature roleFeat = roleFeatMap.get(roleName); FeatureStructure _roleFS = uAnno.getFeatureValue(roleFeat); if (_roleFS == null) { continue; } List<FeatureStructure> roleFSList; if (PUtils.hasCollectionRange(roleFeat)) { roleFSList = PUtils.toList(roleFeat, _roleFS); } else { roleFSList = ImmutableList.of(_roleFS); } // for (FeatureStructure roleFS : roleFSList) { BratAnnotation<?> rv; if (entityInRange) { rv = context.demandEntity(roleFS); } else { // role value should be event rv = context.getEvent(roleFS, false); if (rv == null) { // means that a sub-event has not been mapped yet // TODO implement nested event mapping throw new UnsupportedOperationException("Nested event mapping is not supported yet"); } } roleAnnotations.put(roleName, rv); } } return roleAnnotations; } private static boolean isEveryInstanceOf(Iterable<?> srcCol, Class<?> testClass) { for (Object e : srcCol) { if (!testClass.isInstance(e)) { return false; } } return true; } /* * PRECONDITIONS: bAnno must have ID */ private <BT extends BratType> void mapNotes(UimaBratTypeMappingBase<BT> mapping, BratAnnotation<BT> bAnno, AnnotationFS uAnno) { assert bAnno.getId() != null; BratNoteMapper noteMapper = mapping.noteMapper; if (noteMapper != null) { String noteContent = noteMapper.makeNote(uAnno); if (noteContent != null) { BratNoteAnnotation noteAnno = new BratNoteAnnotation(bratTypesConfig.getUiNoteType(), bAnno, noteContent); noteAnno = bac.register(noteAnno); } } } private String toPrettyString(AnnotationFS anno) { return String.format("<%s, offset %s in %s>", anno.getCoveredText(), anno.getBegin(), currentDocName); } private String toPrettyString(FeatureStructure fs) { if (fs instanceof AnnotationFS) { return toPrettyString((AnnotationFS) fs); } return String.valueOf(fs); } private String extractDocName(CAS cas) { JCas jcas = null; try { jcas = cas.getJCas(); } catch (CASException e) { throw new IllegalStateException(e); } DocumentMetadata metaAnno = JCasUtil.selectSingle(jcas, DocumentMetadata.class); Path outPath = outPathFunc.apply(metaAnno); String docName = flattenToSingleFilename(outPath); if (StringUtils.isBlank(docName)) { throw new IllegalStateException(String.format("Extracted empty doc name from meta: %s", metaAnno)); } return docName; } private static final Joiner PATH_ELEM_JOINER = Joiner.on('-'); private static String flattenToSingleFilename(Path p) { return PATH_ELEM_JOINER.join(p); } private void createBratTypesConfiguration() throws AnalysisEngineProcessException { // type configuration builder final BratTypesConfiguration.Builder tcBuilder = BratTypesConfiguration.builder(); /* * define mapping initializer that will incrementally build * required Brat type system as side effect */ UimaBratMappingInitializer initializer = new UimaBratMappingInitializer(ts, entitiesToBrat, relationsToBrat, eventsToBrat, noteMappersDefinitions) { @Override protected BratEntityType getEntityType(String typeName) { return tcBuilder.addEntityType(typeName); } @Override protected BratRelationType getRelationType(String typeName, Map<String, String> argTypeNames) { return tcBuilder.addRelationType(typeName, argTypeNames); } @Override protected BratEventType getEventType(String typeName, Map<String, String> roleTypeNames, Set<String> multiValuedRoles) { Map<String, Cardinality> roleCardinalities = Maps.newHashMap(); for (String roleName : roleTypeNames.keySet()) { Cardinality card = multiValuedRoles.contains(roleName) ? Cardinality.ARRAY : Cardinality.OPTIONAL; roleCardinalities.put(roleName, card); } return tcBuilder.addEventType(typeName, Multimaps.forMap(roleTypeNames), roleCardinalities); } }; mapping = initializer.create(); bratTypesConfig = tcBuilder.build(); } private class ToBratMappingContext { private Map<AnnotationFS, BratAnnotation<?>> mappedAnnos = Maps.newHashMap(); private boolean isMapped(AnnotationFS anno) { return mappedAnnos.containsKey(anno); } private BratEntity demandEntity(FeatureStructure fs) { return getMapped(fs, BratEntity.class, true); } private BratEvent getEvent(FeatureStructure fs, boolean require) { return getMapped(fs, BratEvent.class, require); } @SuppressWarnings("unchecked") private <B extends BratAnnotation<?>> B getMapped(FeatureStructure fs, Class<B> targetClass, boolean require) { BratAnnotation<?> result = mappedAnnos.get(fs); if (result == null) { if (require) { throw new IllegalStateException(String.format("Can't find mapped instance for %s in %s", toPrettyString(fs), currentDocName)); } return null; } if (!targetClass.isInstance(result)) { throw new IllegalStateException( String.format("Unexpected mapped instance type for %s:\n required: %s\n actual:%s", toPrettyString(fs), targetClass.getName(), result.getClass().getName())); } return (B) result; } private void mapped(AnnotationFS uAnno, BratAnnotation<?> bAnno) { mappedAnnos.put(uAnno, bAnno); } } }