// Java tutorial
/******************************************************************************* * Copyright (C) 2015 - 2016 Carnegie Mellon University * Author: Oliver Ferschke * * This file is part of DiscourseDB. * * DiscourseDB is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * DiscourseDB is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with DiscourseDB. If not, see <http://www.gnu.org/licenses/> * or write to the Free Software Foundation, Inc., 51 Franklin Street, * Fifth Floor, Boston, MA 02110-1301 USA *******************************************************************************/ package edu.cmu.cs.lti.discoursedb.annotation.brat.io; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Objects; import java.util.Set; import java.util.TreeMap; import java.util.stream.Collectors; import javax.persistence.EntityNotFoundException; import javax.persistence.Table; import org.apache.commons.io.FileUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.hateoas.Identifiable; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; import org.springframework.util.Assert; import com.google.common.collect.Lists; import antlr.Utils; import 
edu.cmu.cs.lti.discoursedb.annotation.brat.model.BratAnnotation; import edu.cmu.cs.lti.discoursedb.annotation.brat.model.BratSeparator; import edu.cmu.cs.lti.discoursedb.annotation.brat.model.BratTypes; import edu.cmu.cs.lti.discoursedb.annotation.brat.model.BratTypes.AnnotationSourceType; import edu.cmu.cs.lti.discoursedb.annotation.brat.model.BratTypes.BratAnnotationType; import edu.cmu.cs.lti.discoursedb.annotation.brat.util.UtilService; import edu.cmu.cs.lti.discoursedb.annotation.brat.model.CleanupInfo; import edu.cmu.cs.lti.discoursedb.annotation.brat.model.OffsetInfo; import edu.cmu.cs.lti.discoursedb.annotation.brat.model.VersionInfo; import edu.cmu.cs.lti.discoursedb.core.model.BaseEntity; import edu.cmu.cs.lti.discoursedb.core.model.annotation.AnnotationInstance; import edu.cmu.cs.lti.discoursedb.core.model.annotation.Feature; import edu.cmu.cs.lti.discoursedb.core.model.macro.Content; import edu.cmu.cs.lti.discoursedb.core.model.macro.Contribution; import edu.cmu.cs.lti.discoursedb.core.model.macro.Discourse; import edu.cmu.cs.lti.discoursedb.core.model.macro.DiscoursePart; import edu.cmu.cs.lti.discoursedb.system.model.system.SystemUser; import edu.cmu.cs.lti.discoursedb.core.service.annotation.AnnotationService; import edu.cmu.cs.lti.discoursedb.core.service.macro.ContentService; import edu.cmu.cs.lti.discoursedb.core.service.macro.ContributionService; import edu.cmu.cs.lti.discoursedb.core.service.macro.DiscoursePartService; import edu.cmu.cs.lti.discoursedb.core.service.macro.DiscourseService; import edu.cmu.cs.lti.discoursedb.system.service.system.SystemUserService; import edu.cmu.cs.lti.discoursedb.core.service.user.UserService; import lombok.NonNull; import lombok.RequiredArgsConstructor; import lombok.extern.log4j.Log4j; @Log4j @Service @RequiredArgsConstructor(onConstructor = @__(@Autowired)) public class BratService { private final @NonNull DiscourseService discourseService; @Autowired private final @NonNull ContributionService contribService; 
private final @NonNull ContentService contentService; private final @NonNull AnnotationService annoService; private final @NonNull SystemUserService sysUserService; private final @NonNull DiscoursePartService dpService; @Autowired private final @NonNull UtilService utilService; /** * Imports the annotations of all brat-annotated documents located in the provided folder. * * @param inputFolder the path to the brat corpus folder to import * @throws IOException if an Exception occurs accessing the folder */ public void importDataset(String inputFolder) throws IOException { Assert.hasText(inputFolder, "inputFolder parameter cannot be empty [importDataset(" + inputFolder + ")]"); File dir = new File(inputFolder); Assert.isTrue(dir.isDirectory(), "Provided parameter has to be a path to a folder. [importDataset(" + inputFolder + ")]"); // retrieve all files that end with ann, strip off the extension and save the file name without extension in a list List<String> baseFileNames = Arrays.stream(dir.listFiles((d, name) -> name.endsWith(".ann"))) .map(f -> f.getName().substring(0, f.getName().length() - 4)).collect(Collectors.toList()); for (String baseFileName : baseFileNames) { importThread(inputFolder, baseFileName); } } /** * Imports annotations from a particular brat-annotated thread into DiscourseDB. * * @param inputFolder the folder with the annotation and meta data files * @param baseFileName the base file name of the current thread * @throws IOException in case an error occurs reading the files */ public void importThread(String inputFolder, String baseFileName) throws IOException { Assert.hasText(inputFolder, "inputFolder parameter cannot be empty [importThread(" + inputFolder + ", " + baseFileName + ")]"); Assert.hasText(baseFileName, "baseFileName parameter cannot be empty [importThread(" + inputFolder + ", " + baseFileName + ")]"); File dir = new File(inputFolder); Assert.isTrue(dir.isDirectory(), "Provided parameter has to be a path to a folder. 
[importThread(" + inputFolder + ", " + baseFileName + ")]"); // The importThreadFromBrat call performs the main import work // and the cleanup call deletes discoursedb annotations that have been // deleted in Brat. They need to run in separate transactions for the // deletion to work. log.info("Starting import of " + baseFileName); cleanupAfterImport(importThreadFromBrat(inputFolder, baseFileName)); log.trace("Finished import of " + baseFileName); } /** * Imports a thread in Brat stand-off format into discoursedb. * * @param inputFolder folder with the brat annotation and meta data * @param baseFileName the base filename for the current thread to be imported * @return an info object containing lists of ids of annotations and featured to be deleted after the import * @throws IOException if any exception occurs while reading the brat annotations or meta data */ @Transactional(value = "coreTransactionManager", propagation = Propagation.REQUIRED, readOnly = false) private CleanupInfo importThreadFromBrat(String inputFolder, String baseFileName) throws IOException { Assert.hasText(inputFolder, "inputFolder parameter cannot be empty [importThread(" + inputFolder + ", " + baseFileName + ")]"); Assert.hasText(baseFileName, "baseFileName parameter cannot be empty [importThread(" + inputFolder + ", " + baseFileName + ")]"); File annFile = new File(inputFolder, baseFileName + ".ann"); File offsetFile = new File(inputFolder, baseFileName + ".offsets"); File versionsFile = new File(inputFolder, baseFileName + ".versions"); // get mapping from entity to offset TreeMap<Integer, OffsetInfo> offsetToOffsetInfo = getOffsetToOffsetInfoMap(offsetFile); // keep track of versions of orginally exported annotations and features Map<String, VersionInfo> annotationBratIdToVersionInfo = getBratIdToDdbIdMap(versionsFile, AnnotationSourceType.DDB_ANNOTATION); Map<String, VersionInfo> featureBratIdToVersionInfo = getBratIdToDdbIdMap(versionsFile, AnnotationSourceType.DDB_FEATURE); DiscoursePart 
dp = dpService .findOne(Long.parseLong(baseFileName.substring(baseFileName.lastIndexOf("_") + 1))).get(); SystemUser sysUser = sysUserService.getSystemUser().get(); //Init ddb annotation stats for deletion handling Set<Long> ddbAnnotationIds = new HashSet<>(); Set<Long> ddbFeatureIds = new HashSet<>(); //extract annotations on Contributions for (AnnotationInstance anno : annoService.findContributionAnnotationsByDiscoursePart(dp)) { ddbAnnotationIds.add(anno.getId()); anno.setAnnotator(sysUser); ; if (anno.getFeatures() != null) { ddbFeatureIds.addAll(anno.getFeatures().stream().map(f -> f.getId()).collect(Collectors.toList())); } } //extract annotations on Content entities for (AnnotationInstance anno : annoService.findCurrentRevisionAnnotationsByDiscoursePart(dp)) { ddbAnnotationIds.add(anno.getId()); anno.setAnnotator(sysUser); if (anno.getFeatures() != null) { ddbFeatureIds.addAll(anno.getFeatures().stream().map(f -> f.getId()).collect(Collectors.toList())); } } log.info(ddbAnnotationIds.size() + " annotations within current thread available in DiscoursDB."); log.info(ddbFeatureIds.size() + " features within current thread available in DiscoursDB."); List<String> bratStandoffEncodedStrings = FileUtils.readLines(annFile); //sorting in reverse order assures that Attribute annotations (A) are imported after text-bound annotations (T) Collections.sort(bratStandoffEncodedStrings, Collections.reverseOrder()); for (String bratStandoffEncodedString : bratStandoffEncodedStrings) { // create BratAnnotation object from Brat-Stand-off-Encoded String // offset correction will be done later BratAnnotation bratAnno = new BratAnnotation(bratStandoffEncodedString); if (bratAnno.getType() == BratAnnotationType.BRAT_TEXT) { Entry<Integer, OffsetInfo> offset = offsetToOffsetInfo.floorEntry(bratAnno.getBeginIndex()); Contribution contrib = contribService.findOne(offset.getValue().getDiscourseDbContributionId()) .get(); Content content = 
contentService.findOne(offset.getValue().getDiscourseDbContentId()).get(); long separatorStartIndex = offset.getKey(); long separatorEndIndex = separatorStartIndex + BratSeparator.length; long textEndIndex = separatorEndIndex + content.getText().length(); // CONTRIBUTION LABEL: Annotation is completely within a separator if (bratAnno.getBeginIndex() >= separatorStartIndex && bratAnno.getBeginIndex() <= separatorEndIndex && bratAnno.getEndIndex() >= separatorStartIndex && bratAnno.getEndIndex() <= separatorEndIndex) { // check if annotation already existed before if (annotationBratIdToVersionInfo.keySet().contains(bratAnno.getId())) { VersionInfo entityInfo = annotationBratIdToVersionInfo.get(bratAnno.getId()); ddbAnnotationIds.remove(entityInfo.getDiscourseDBEntityId()); //update deletion stats AnnotationInstance existingAnno = annoService .findOneAnnotationInstance(entityInfo.getDiscourseDBEntityId()).get(); //check if the anno version in the database still matches the anno version we initially exported if (existingAnno.getEntityVersion() == entityInfo.getDiscourseDBEntityVersion()) { existingAnno.setBeginOffset(0); existingAnno.setEndOffset(0); existingAnno.setType(bratAnno.getAnnotationLabel()); } else { log.error( "Entity changed in DiscourseDB since the data was last import but also changed in the exported file. Cannot import annotation."); } } else { // anno is new and didn't exist in ddb before AnnotationInstance newAnno = annoService .createTypedAnnotation(bratAnno.getAnnotationLabel()); annoService.addAnnotation(contrib, newAnno); contribService.save(contrib); //this should happen in addAnnotation. 
Filed Issue #15 //update version file annotationBratIdToVersionInfo.put(bratAnno.getId(), new VersionInfo(AnnotationSourceType.DDB_ANNOTATION, bratAnno.getId(), newAnno.getId(), newAnno.getEntityVersion())); } } // SPAN ANNOTATION WITHIN CONTRIBUTION TEXT (does neither span over separator nor over multiple contributions) else if (bratAnno.getBeginIndex() > separatorEndIndex && bratAnno.getBeginIndex() <= textEndIndex && bratAnno.getEndIndex() > separatorEndIndex && bratAnno.getEndIndex() <= textEndIndex) { // calculate offset corrected index values for span annotation int offsetCorrectedBeginIdx = bratAnno.getBeginIndex() - offset.getKey() - BratSeparator.length - 1; int offsetCorrectedEndIdx = bratAnno.getEndIndex() - offset.getKey() - BratSeparator.length - 1; // check if annotation already existed before if (annotationBratIdToVersionInfo.keySet().contains(bratAnno.getId())) { VersionInfo entityInfo = annotationBratIdToVersionInfo.get(bratAnno.getId()); ddbAnnotationIds.remove(entityInfo.getDiscourseDBEntityId()); //update deletion stats // Anno already existed. Check for changes. AnnotationInstance existingAnno = annoService .findOneAnnotationInstance(entityInfo.getDiscourseDBEntityId()).get(); //check if the anno version in the database still matches the anno version we initially exported //if so, we can update if (existingAnno.getEntityVersion() == entityInfo.getDiscourseDBEntityVersion()) { existingAnno.setBeginOffset(offsetCorrectedBeginIdx); existingAnno.setEndOffset(offsetCorrectedEndIdx); existingAnno.setType(bratAnno.getAnnotationLabel()); } else { log.error( "Entity changed in DiscourseDB since the data was last import but also changed in the exported file. Cannot import annotation."); } } else { // Anno is new and didn't exist in ddb before. Create it. 
AnnotationInstance newAnno = annoService .createTypedAnnotation(bratAnno.getAnnotationLabel()); newAnno.setBeginOffset(offsetCorrectedBeginIdx); newAnno.setEndOffset(offsetCorrectedEndIdx); annoService.addAnnotation(content, newAnno); contentService.save(content); //this should happen in addAnnotation. Filed Issue #15 //update version file annotationBratIdToVersionInfo.put(bratAnno.getId(), new VersionInfo(AnnotationSourceType.DDB_ANNOTATION, bratAnno.getId(), newAnno.getId(), newAnno.getEntityVersion())); } } else { log.error( "Annotation extends over contribution separator(s) AND text. You can only annotate within a separator or within a contribution. Skipping this annotation..."); } } else if (bratAnno.getType() == BratAnnotationType.BRAT_NOTE) { VersionInfo entityInfo = featureBratIdToVersionInfo.get(bratAnno.getId()); // check if feature already existed before if (featureBratIdToVersionInfo.keySet().contains(bratAnno.getId())) { ddbFeatureIds.remove(entityInfo.getDiscourseDBEntityId()); //update deletion stats // feature already existed Feature existingFeature = annoService.findOneFeature(entityInfo.getDiscourseDBEntityId()).get(); //check if the feature version in the database still matches the feature version we initially exported if (existingFeature.getEntityVersion() == entityInfo.getDiscourseDBEntityVersion()) { //check for and apply changes if (existingFeature.getValue().equalsIgnoreCase(bratAnno.getAnnotationLabel())) { existingFeature.setValue(bratAnno.getAnnotationLabel()); } } else { log.error( "Entity changed in DiscourseDB since the data was last import but also changed in the exported file. Cannot import feature."); } } else { // feature didn't exist in database yet. Create it. 
VersionInfo referenceAnnotationInfo = annotationBratIdToVersionInfo .get(bratAnno.getSourceAnnotationId()); if (referenceAnnotationInfo != null) { AnnotationInstance referenceAnno = annoService .findOneAnnotationInstance(referenceAnnotationInfo.getDiscourseDBEntityId()).get(); Feature newFeature = annoService.createTypedFeature(bratAnno.getNoteText(), bratAnno.getType().name()); //update version file featureBratIdToVersionInfo.put(bratAnno.getId(), new VersionInfo(AnnotationSourceType.DDB_FEATURE, bratAnno.getId(), newFeature.getId(), newFeature.getEntityVersion())); annoService.addFeature(referenceAnno, newFeature); annoService.saveFeature(newFeature); //this should happen in addFeature. Filed Issue #15 } else { log.error("Cannot find the annotation this feature applies to."); } } } else { //Implement import capabilities for other Brat Annotation types here log.error("Unsupported Annotation type " + bratAnno.getType() + " Skipping."); } } //Regenerate the version infos updated data from the newly created annotations List<VersionInfo> updatedVersionInfo = new ArrayList<>(); updatedVersionInfo.addAll(annotationBratIdToVersionInfo.values()); updatedVersionInfo.addAll(featureBratIdToVersionInfo.values()); FileUtils.writeLines(versionsFile, updatedVersionInfo); //return info about entities to be deleted return new CleanupInfo(versionsFile, ddbFeatureIds, ddbAnnotationIds); } /** * Deletes annotations and features identified by a list of ids. * Also updates the versions file. 
* * @param featureIds a list of discourse db feature ids * @param annotationIds a list of discoursedb annotation ids */ @Transactional(propagation = Propagation.REQUIRES_NEW, readOnly = false) private void cleanupAfterImport(CleanupInfo cleanupInfo) throws IOException { Assert.notNull(cleanupInfo, "cleanup info cannot be null"); //delete features from DiscourseDB that have been deleted in brat for (Long id : cleanupInfo.getFeaturesToDelete()) { log.info("Delete feature " + id); annoService.deleteFeature(id); } //delete annotations from DiscourseDB that have been deleted in brat for (Long id : cleanupInfo.getAnnotationsToDelete()) { log.info("Delete annotation " + id); annoService.deleteAnnotation(id); } //cleanup versions file - remove deleted entities List<VersionInfo> filteredVersionFile = new ArrayList<>(); for (String line : FileUtils.readLines(cleanupInfo.getVersionsFile())) { VersionInfo info = new VersionInfo(line); if (info.getType() == AnnotationSourceType.DDB_ANNOTATION) { if (!cleanupInfo.getAnnotationsToDelete().contains(info.getDiscourseDBEntityId())) { filteredVersionFile.add(info); } } else if (info.getType() == AnnotationSourceType.DDB_FEATURE) { if (!cleanupInfo.getFeaturesToDelete().contains(info.getDiscourseDBEntityId())) { filteredVersionFile.add(info); } } } FileUtils.writeLines(cleanupInfo.getVersionsFile(), filteredVersionFile); } @Transactional(propagation = Propagation.REQUIRED, readOnly = true) public void exportDiscoursePart(DiscoursePart dp, String outputFolder) throws IOException { exportDiscoursePart(dp, outputFolder, false); } public String discoursePart2BratName(DiscoursePart dp) { return dp.getName().replaceAll("[^a-zA-Z0-9]", "_") + "_" + dp.getId().toString(); } @Transactional(propagation = Propagation.REQUIRED, readOnly = true) public void exportDiscoursePart(DiscoursePart dp, String outputFolder, Boolean threaded) throws IOException { Assert.notNull(dp, "The DiscoursePart cannot be null"); Assert.hasText(outputFolder, "The 
outputFolder has to be specified"); //define a common base filename for all files associated with this DiscoursePart String baseFileName = discoursePart2BratName(dp); //dp.getClass().getAnnotation(Table.class).name() + "_"+dp.getId(); // delete me //The offset mapping keeps track of the start positions of each contribution/content in the aggregated txt file List<OffsetInfo> entityOffsetMapping = new ArrayList<>(); List<String> discoursePartText = new ArrayList<>(); List<BratAnnotation> bratAnnotations = new ArrayList<>(); BratIdGenerator bratIdGenerator = new BratIdGenerator(); int spanOffset = 0; // Sort contributions by their start time, without crashing on null List<Contribution> contribsTimeOrdered = Lists.newArrayList(contribService.findAllByDiscoursePart(dp)); //This should be (maybe, optionally) a depth-first sort, with start time as a tiebreaker. contribsTimeOrdered.sort((c1, c2) -> { if (c1 == null) { return -1; } else if (c2 == null) { return 1; } else if (c1.getStartTime() == c2.getStartTime()) { return c1.getId().compareTo(c2.getId()); } else { return c1.getStartTime().compareTo(c2.getStartTime()); } }); List<Contribution> contribs = null; if (threaded) { contribs = utilService.threadsort(contribsTimeOrdered, c -> c.getId(), c -> { Contribution p = contribService.getOneRelatedContribution(c); if (p == null) { return 0L; } else { return p.getId(); } }); } else { contribs = contribsTimeOrdered; } // Export current revision of sorted contributions for (Contribution contrib : contribs) { Content curRevision = contrib.getCurrentRevision(); String text = curRevision.getText(); String sep = new BratSeparator(0, contrib.getCurrentRevision().getAuthor().getUsername(), contrib.getCurrentRevision().getTitle(), contrib.getStartTime()).get(); discoursePartText.add(sep); discoursePartText.add(text); //annotations on content for (AnnotationInstance anno : annoService.findAnnotations(curRevision)) { bratAnnotations .addAll(convertAnnotationToBrat(anno, spanOffset, sep, 
text, curRevision, bratIdGenerator)); } //annotations on contributions for (AnnotationInstance anno : annoService.findAnnotations(contrib)) { bratAnnotations .addAll(convertAnnotationToBrat(anno, spanOffset, sep, text, contrib, bratIdGenerator)); } //keep track of offsets entityOffsetMapping.add(new OffsetInfo(spanOffset, contrib.getId(), curRevision.getId())); //update span offsets spanOffset += text.length() + 1; spanOffset += BratSeparator.length + 1; } if (contribs.size() > 0) { FileUtils.writeLines(new File(outputFolder, baseFileName + ".txt"), discoursePartText); FileUtils.writeLines(new File(outputFolder, baseFileName + ".ann"), bratAnnotations); FileUtils.writeLines(new File(outputFolder, baseFileName + ".offsets"), entityOffsetMapping); FileUtils.writeLines(new File(outputFolder, baseFileName + ".versions"), bratAnnotations.stream() .map(anno -> anno.getVersionInfo()).filter(Objects::nonNull).collect(Collectors.toList())); } } /** * Converts a DiscourseDB annotation into Brat annotations. 
* A single DiscourseDB annotation might result in multiple Brat annotations * * @param dbAnno the DiscourseDB annotation to convert * @param spanOffset the current offset of the Contribution or Content within the aggregate document * @param text the contribution text * @param entity the annotated entity (Contribution or Content) * @param annotationVersionInfo a list in which version information about the exported annotations will be stored * @return a list of BratAnnotations */ @Transactional(propagation = Propagation.REQUIRED, readOnly = true) private <T extends BaseEntity & Identifiable<Long>> List<BratAnnotation> convertAnnotationToBrat( AnnotationInstance dbAnno, int spanOffset, String sep, String text, T entity, BratIdGenerator bratIdGenerator) { Assert.notNull(dbAnno, "The annotation instance to be converted cannot be null."); Assert.notNull(text, "The text may be empty, but not null."); Assert.notNull(entity, "The entity associated with the annotation cannot be null."); Assert.notNull(bratIdGenerator, "The Brat IDGenerator cannot be null."); //one DiscourseDB annotation could result in multiple BRAT annotations List<BratAnnotation> newAnnotations = new ArrayList<>(); //PRODUCE Text-Bound Annotation for ALL other annotations BratAnnotation textBoundAnnotation = new BratAnnotation(); textBoundAnnotation.setType(BratAnnotationType.BRAT_TEXT); textBoundAnnotation .setId(bratIdGenerator.getNextAvailableBratId(BratAnnotationType.BRAT_TEXT, dbAnno.getId())); textBoundAnnotation.setAnnotationLabel(dbAnno.getType()); //CALC OFFSET if (entity instanceof Contribution) { //annotations on contributions are always annotated on the contribution separator as an entity label textBoundAnnotation.setBeginIndex(spanOffset); textBoundAnnotation.setEndIndex(spanOffset + sep.length()); textBoundAnnotation.setCoveredText(sep); } else if (entity instanceof Content) { //content labels are always annotated as text spans on the currentRevision content entity if (dbAnno.getEndOffset() 
== 0) { log.warn("Labels on Content entites should define a span and should not be entity labels."); } textBoundAnnotation.setBeginIndex(spanOffset + dbAnno.getBeginOffset() + sep.length() + 1); textBoundAnnotation.setEndIndex(spanOffset + dbAnno.getEndOffset() + sep.length() + 1); textBoundAnnotation.setCoveredText(text.substring(dbAnno.getBeginOffset(), dbAnno.getEndOffset())); } textBoundAnnotation.setVersionInfo(new VersionInfo(AnnotationSourceType.DDB_ANNOTATION, textBoundAnnotation.getId(), dbAnno.getId(), dbAnno.getEntityVersion())); newAnnotations.add(textBoundAnnotation); //FEATURE VALUES ARE USED TO CREATE BRAT ANNOTATION ATTRIBUTES. Feature types are ignored. for (Feature f : dbAnno.getFeatures()) { BratAnnotation newAttribute = new BratAnnotation(); newAttribute.setType(BratAnnotationType.BRAT_NOTE); newAttribute.setId(bratIdGenerator.getNextAvailableBratId(BratAnnotationType.BRAT_NOTE, f.getId())); newAttribute.setAnnotationLabel("AnnotatorNotes"); newAttribute.setNoteText(f.getValue()); newAttribute.setSourceAnnotationId(textBoundAnnotation.getId()); newAttribute.setVersionInfo(new VersionInfo(AnnotationSourceType.DDB_FEATURE, newAttribute.getId(), f.getId(), f.getEntityVersion())); newAnnotations.add(newAttribute); } return newAnnotations; } /** * Parses the offsets file and provides a Map from offset to discoursedb id. This is used to * identify discoursedb entities by offset in order to identify the contribution at a specific point in the aggated (thread-level) document. 
* * @param offsetFile file with the offset mapping * @return a TreeMap (has to be a TreeMap because of the required floorEntry method) mapping offset values to entity ids for the given entity type * @throws IOException if an exception occurred while accessing the offset file */ private TreeMap<Integer, OffsetInfo> getOffsetToOffsetInfoMap(File offsetFile) throws IOException { Assert.notNull(offsetFile, "OffsetFile has to be specified."); TreeMap<Integer, OffsetInfo> offsetToOffsetInfo = new TreeMap<>(); for (String line : FileUtils.readLines(offsetFile)) { OffsetInfo info = new OffsetInfo(line); offsetToOffsetInfo.put(info.getSpanOffset(), info); } return offsetToOffsetInfo; } /** * Parses the a version file and returns a map from brat annotation id to DDBEntityInfo objects which provide * meta information about the corresponding discourse db entities * * @param versionFile the file containing the version information * @param sourceType the brat-type of annotations that should be extracted from the versions file (e.g. text-bound annotations) * @return a map from brat annotation ids to DDBEntityInfo objects * @throws IOException if an error occured reading the versions file */ private Map<String, VersionInfo> getBratIdToDdbIdMap(File versionFile, AnnotationSourceType sourceType) throws IOException { Assert.notNull(versionFile, "OffsetFile has to be specified."); Assert.notNull(sourceType, "A source type needs to be specified."); Map<String, VersionInfo> bratIdToDdbVersion = new HashMap<>(); for (String line : FileUtils.readLines(versionFile)) { VersionInfo info = new VersionInfo(line); if (info.getType() == sourceType) { bratIdToDdbVersion.put(info.getBratAnnotationId(), info); } } return bratIdToDdbVersion; } /** * Generates a Brat annotation.conf file with all DiscourseDB annotation types that occur in the set of annotations on contributions or current revisions within the provided discourse registered as Brat annotations. 
* Relations, events and attribute sections are left empty and not further configuration is generated. * * @param discourseName the name of the discourse for which to export annotation types * @param outputFolder the folder to which the config file should be written * @throws IOException if an exception occurs writing the config file */ @Transactional(propagation = Propagation.REQUIRED, readOnly = true) public void generateBratConfig(String discourseName, String outputFolder) throws IOException { Assert.hasText(discourseName, "The discourse name cannot be empty."); Assert.hasText(outputFolder, "The output folder path cannot be empty."); generateBratConfig(discourseService.findOne(discourseName).orElseThrow( () -> new EntityNotFoundException("Discourse with name " + discourseName + " does not exist.")), outputFolder); } /** * Generates a Brat annotation.conf file with all DiscourseDB annotation types that occur in the set of annotations on contributions or current revisions within the provided discourse registered as Brat annotations. * Relations, events and attribute sections are left empty and not further configuration is generated. 
* * @param discourseName the name of the discourse for which to export annotation types * @param outputFolder the folder to which the config file should be written * @throws IOException if an exception occurs writing the config file */ @Transactional(propagation = Propagation.REQUIRED, readOnly = true) public void generateBratConfig(String outputFolder) throws IOException { Assert.hasText(outputFolder, "The output folder path cannot be empty."); Set<String> annoTypes = new HashSet<>(); for (Discourse curDiscourse : discourseService.findAll()) { for (DiscoursePart dp : dpService.findAllByDiscourse(curDiscourse)) { annoTypes.addAll(annoService.findContributionAnnotationsByDiscoursePart(dp).stream() .map(anno -> BratAnnotation.cleanString(anno.getType())).collect(Collectors.toSet())); annoTypes.addAll(annoService.findCurrentRevisionAnnotationsByDiscoursePart(dp).stream() .map(anno -> BratAnnotation.cleanString(anno.getType())).collect(Collectors.toSet())); } } generateBratConfig(outputFolder, annoTypes); } /** * Generates a Brat annotation.conf file with all DiscourseDB annotation types that occur in the set of annotations on contributions or current revisions within the provided discourse registered as Brat annotations. * Relations, events and attribute sections are left empty and not further configuration is generated. 
* * @param discourse the discourse for which to export annotation types * @param outputFolder the folder to which the config file should be written * @throws IOException if an exception occurs writing the config file */ @Transactional(propagation = Propagation.REQUIRED, readOnly = true) public void generateBratConfig(Discourse discourse, String outputFolder) throws IOException { Assert.notNull(discourse, "The discourse cannot be null."); Assert.hasText(outputFolder, "The output folder path cannot be empty."); Set<String> annoTypes = new HashSet<>(); for (DiscoursePart dp : dpService.findAllByDiscourse(discourse)) { annoTypes.addAll(annoService.findContributionAnnotationsByDiscoursePart(dp).stream() .map(anno -> BratAnnotation.cleanString(anno.getType())).collect(Collectors.toSet())); annoTypes.addAll(annoService.findCurrentRevisionAnnotationsByDiscoursePart(dp).stream() .map(anno -> BratAnnotation.cleanString(anno.getType())).collect(Collectors.toSet())); } generateBratConfig(outputFolder, annoTypes); } /** * Generates an empty brat anotation.conf file and registers the provided set of annotation types. * * @param outputFolder the folder to which the config file should be written * @param annotationTypes a set of annotation types to register in the brat configuration file * @throws IOException if an exception occurs writing the config file */ @Transactional(propagation = Propagation.REQUIRED, readOnly = true) public void generateBratConfig(String outputFolder, Set<String> annotationTypes) throws IOException { Assert.hasText(outputFolder, "The output folder path cannot be empty."); Assert.notNull(annotationTypes, "The set holding annotation types was null. 
Please pass a (potentially empty) set."); List<String> annotationConf = new ArrayList<>(); annotationConf.add("[relations]"); annotationConf.add("[events]"); annotationConf.add("[attributes]"); annotationConf.add("[entities]"); annotationConf.addAll(annotationTypes); FileUtils.writeLines(new File(outputFolder, "annotation.conf"), annotationConf); } /** * The Brat UI auto-generates annotations starting with ID 1. * If an annotation with id 1 is deleted, the next annotation created will again get id 1. * If we delete an annotation that exists in DiscourseDB that has brat id 1 and then create a new annotation that does not yet exist in discoursedb it * will get the brat id 1 and the import process would not create the annotation since it thinks the annotation with brat id 1 is already there. * That's why we offset all annotations that came from discoursedb with a large number upon export to create a certain id range that will always be * associated with annotations already available in discoursedb. * * This generator produces brat ids and ensured they don't collide with previously generated ids. */ protected class BratIdGenerator { private static final int BRAT_ID_OFFSET = 100000; private List<String> ids = new ArrayList<>(); public String getNextAvailableBratId(BratAnnotationType type, long baseId) { long offsetId = BRAT_ID_OFFSET + baseId; String curBratId = type.toString() + offsetId; while (ids.contains(curBratId)) { offsetId++; curBratId = type.toString() + offsetId; } ids.add(curBratId); return curBratId; } } }