org.apache.ctakes.temporal.ae.THYMEAnaforaXMLReader.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.ctakes.temporal.ae.THYMEAnaforaXMLReader.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.temporal.ae;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Map;

import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.refsem.Event;
import org.apache.ctakes.typesystem.type.refsem.EventProperties;
import org.apache.ctakes.typesystem.type.relation.AspectualTextRelation;
import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
import org.apache.ctakes.typesystem.type.relation.RelationArgument;
import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.Markable;
import org.apache.ctakes.typesystem.type.textsem.TimeMention;
import org.apache.log4j.Logger;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.EmptyFSList;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.NonEmptyFSList;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.util.ViewUriUtil;
import org.cleartk.util.cr.UriCollectionReader;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

public class THYMEAnaforaXMLReader extends JCasAnnotator_ImplBase {
    private static Logger LOGGER = Logger.getLogger(THYMEAnaforaXMLReader.class);

    public static final String PARAM_ANAFORA_DIRECTORY = "anaforaDirectory";

    @ConfigurationParameter(name = PARAM_ANAFORA_DIRECTORY, description = "root directory of the Anafora-annotated files, with one subdirectory for "
            + "each annotated file")
    private File anaforaDirectory;

    public static final String PARAM_ANAFORA_XML_SUFFIXES = "anaforaSuffixes";

    @ConfigurationParameter(name = PARAM_ANAFORA_XML_SUFFIXES, mandatory = false, description = "list of suffixes that might be added to a file name to identify the Anafora "
            + "XML annotations file; only the first suffix corresponding to a file will be used")
    private String[] anaforaXMLSuffixes = new String[] { ".Temporal-Relations.gold.completed.xml",
            ".Temporal-Relation.gold.completed.xml", ".Temporal.dave.completed.xml",
            ".Temporal-Relation-Adjudication.gold.completed.xml",
            ".Temporal-Entity-Adjudication.gold.completed.xml",
            ".temporal.Temporal-Adjudication.gold.completed.xml", ".temporal.Temporal-Entities.gold.completed.xml",
            ".Temporal-Entity.gold.completed.xml", ".Gold_Temporal_Entities.xml", ".Gold_Temporal_Relations.xml" };

    public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
        return AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class);
    }

    public static AnalysisEngineDescription getDescription(File anaforaDirectory)
            throws ResourceInitializationException {
        return AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class,
                THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY, anaforaDirectory);
    }

    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        // determine source text file
        File textFile = new File(ViewUriUtil.getURI(jCas));
        LOGGER.info("processing " + textFile);

        // determine possible Anafora XML file names
        File corefFile = new File(textFile.getPath() + ".Coreference.gold.completed.xml");
        List<File> possibleXMLFiles = Lists.newArrayList();
        for (String anaforaXMLSuffix : this.anaforaXMLSuffixes) {
            if (this.anaforaDirectory == null) {
                possibleXMLFiles.add(new File(textFile + anaforaXMLSuffix));
            } else {
                possibleXMLFiles.add(new File(textFile.getPath() + anaforaXMLSuffix));
            }
        }

        // find an Anafora XML file that actually exists
        File xmlFile = null;
        for (File possibleXMLFile : possibleXMLFiles) {
            if (possibleXMLFile.exists()) {
                xmlFile = possibleXMLFile;
                break;
            }
        }
        if (this.anaforaXMLSuffixes.length > 0 && xmlFile == null) {
            throw new IllegalArgumentException("no Anafora XML file found from " + possibleXMLFiles);
        }

        if (xmlFile != null) {
            processXmlFile(jCas, xmlFile);
        }
        if (corefFile.exists()) {
            processXmlFile(jCas, corefFile);
        }
    }

    private static void processXmlFile(JCas jCas, File xmlFile) throws AnalysisEngineProcessException {
        // load the XML
        Element dataElem;
        try {
            dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
        } catch (MalformedURLException e) {
            throw new AnalysisEngineProcessException(e);
        } catch (JDOMException e) {
            throw new AnalysisEngineProcessException(e);
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }

        int curEventId = 1;
        int curTimexId = 1;
        int curRelId = 1;
        int docLen = jCas.getDocumentText().length();

        for (Element annotationsElem : dataElem.getChildren("annotations")) {

            Map<String, Annotation> idToAnnotation = Maps.newHashMap();
            for (Element entityElem : annotationsElem.getChildren("entity")) {
                String id = removeSingleChildText(entityElem, "id", null);
                Element spanElem = removeSingleChild(entityElem, "span", id);
                String type = removeSingleChildText(entityElem, "type", id);
                Element propertiesElem = removeSingleChild(entityElem, "properties", id);

                // UIMA doesn't support disjoint spans, so take the span enclosing
                // everything
                int begin = Integer.MAX_VALUE;
                int end = Integer.MIN_VALUE;
                for (String spanString : spanElem.getText().split(";")) {
                    String[] beginEndStrings = spanString.split(",");
                    if (beginEndStrings.length != 2) {
                        error("span not of the format 'number,number'", id);
                    }
                    int spanBegin = Integer.parseInt(beginEndStrings[0]);
                    int spanEnd = Integer.parseInt(beginEndStrings[1]);
                    if (spanBegin < begin) {
                        begin = spanBegin;
                    }
                    if (spanEnd > end) {
                        end = spanEnd;
                    }
                }
                if (begin < 0 || end >= docLen) {
                    error("Illegal begin or end boundary", id);
                    continue;
                }

                Annotation annotation;
                if (type.equals("EVENT")) {
                    String docTimeRel = removeSingleChildText(propertiesElem, "DocTimeRel", id);
                    if (docTimeRel == null) {
                        error("no docTimeRel, assuming OVERLAP", id);
                        docTimeRel = "OVERLAP";
                    }
                    String eventType = removeSingleChildText(propertiesElem, "Type", id);
                    String degree = removeSingleChildText(propertiesElem, "Degree", id);
                    String polarity = removeSingleChildText(propertiesElem, "Polarity", id);
                    String contextualModality = removeSingleChildText(propertiesElem, "ContextualModality", id);
                    String contextualAspect = removeSingleChildText(propertiesElem, "ContextualAspect", id);
                    String permanence = removeSingleChildText(propertiesElem, "Permanence", id);
                    EventMention eventMention = new EventMention(jCas, begin, end);
                    Event event = new Event(jCas);
                    EventProperties eventProperties = new EventProperties(jCas);
                    eventProperties.setDocTimeRel(docTimeRel);
                    eventProperties.setCategory(eventType);
                    eventProperties.setDegree(degree);
                    if (polarity.equals("POS")) {
                        eventProperties.setPolarity(CONST.NE_POLARITY_NEGATION_ABSENT);
                    } else if (polarity.equals("NEG")) {
                        eventProperties.setPolarity(CONST.NE_POLARITY_NEGATION_PRESENT);
                    } else {
                        error("polarity that was not POS or NEG", id);
                    }
                    eventProperties.setContextualModality(contextualModality);
                    eventProperties.setContextualAspect(contextualAspect);
                    eventProperties.setPermanence(permanence);
                    eventProperties.addToIndexes();
                    event.setConfidence(1.0f);
                    event.setDiscoveryTechnique(CONST.NE_DISCOVERY_TECH_GOLD_ANNOTATION);
                    event.setProperties(eventProperties);
                    event.setMentions(new FSArray(jCas, 1));
                    event.setMentions(0, eventMention);
                    event.addToIndexes();
                    eventMention.setId(curEventId++);
                    eventMention.setConfidence(1.0f);
                    eventMention.setDiscoveryTechnique(CONST.NE_DISCOVERY_TECH_GOLD_ANNOTATION);
                    eventMention.setEvent(event);
                    eventMention.addToIndexes();
                    annotation = eventMention;

                } else if (type.equals("TIMEX3")) {
                    String timeClass = removeSingleChildText(propertiesElem, "Class", id);
                    TimeMention timeMention = new TimeMention(jCas, begin, end);
                    timeMention.setId(curTimexId++);
                    timeMention.setTimeClass(timeClass);
                    timeMention.addToIndexes();
                    annotation = timeMention;

                } else if (type.equals("DOCTIME")) {
                    TimeMention timeMention = new TimeMention(jCas, begin, end);
                    timeMention.setId(curTimexId++);
                    timeMention.setTimeClass(type);
                    timeMention.addToIndexes();
                    annotation = timeMention;

                } else if (type.equals("SECTIONTIME")) {
                    TimeMention timeMention = new TimeMention(jCas, begin, end);
                    timeMention.setId(curTimexId++);
                    timeMention.setTimeClass(type);
                    timeMention.addToIndexes();
                    annotation = timeMention;

                } else if (type.equals("Markable")) {
                    while (end >= begin && (jCas.getDocumentText().charAt(end - 1) == '\n'
                            || jCas.getDocumentText().charAt(end - 1) == '\r')) {
                        end--;
                    }
                    Markable markable = new Markable(jCas, begin, end);
                    markable.addToIndexes();
                    annotation = markable;

                } else if (type.equals("DUPLICATE")) {
                    LOGGER.warn("Ignoring duplicate sections in annotations.");
                    continue;
                } else {
                    throw new UnsupportedOperationException("unsupported entity type: " + type);
                }

                // match the annotation to it's ID for later use
                idToAnnotation.put(id, annotation);

                // make sure all XML has been consumed
                removeSingleChild(entityElem, "parentsType", id);
                if (!propertiesElem.getChildren().isEmpty() || !entityElem.getChildren().isEmpty()) {
                    List<String> children = Lists.newArrayList();
                    for (Element child : propertiesElem.getChildren()) {
                        children.add(child.getName());
                    }
                    for (Element child : entityElem.getChildren()) {
                        children.add(child.getName());
                    }
                    error("unprocessed children " + children, id);
                }
            }

            for (Element relationElem : annotationsElem.getChildren("relation")) {
                String id = removeSingleChildText(relationElem, "id", null);
                String type = removeSingleChildText(relationElem, "type", id);
                Element propertiesElem = removeSingleChild(relationElem, "properties", id);

                if (type.equals("TLINK")) {
                    String sourceID = removeSingleChildText(propertiesElem, "Source", id);
                    String targetID = removeSingleChildText(propertiesElem, "Target", id);
                    String tlinkType = removeSingleChildText(propertiesElem, "Type", id);
                    TemporalTextRelation relation = new TemporalTextRelation(jCas);
                    relation.setId(curRelId++);
                    addRelation(jCas, relation, sourceID, targetID, tlinkType, idToAnnotation, id);

                } else if (type.equals("ALINK")) {
                    String sourceID = removeSingleChildText(propertiesElem, "Source", id);
                    String targetID = removeSingleChildText(propertiesElem, "Target", id);
                    String alinkType = removeSingleChildText(propertiesElem, "Type", id);
                    AspectualTextRelation relation = new AspectualTextRelation(jCas);
                    addRelation(jCas, relation, sourceID, targetID, alinkType, idToAnnotation, id);

                } else if (type.equals("Identical")) {
                    CollectionTextRelation chain = new CollectionTextRelation(jCas);
                    String mention = removeSingleChildText(propertiesElem, "FirstInstance", id);
                    NonEmptyFSList list = new NonEmptyFSList(jCas);
                    NonEmptyFSList root = list;
                    Markable antecedent, anaphor;
                    antecedent = (Markable) idToAnnotation.get(mention);
                    list.setHead(antecedent);
                    List<Element> corefs = propertiesElem.getChildren("Coreferring_String");
                    //          while((mention = removeSingleChildText(propertiesElem, "Coreferring_String", id)) != null){
                    for (Element coref : corefs) {
                        mention = coref.getText();
                        NonEmptyFSList child = new NonEmptyFSList(jCas);
                        anaphor = (Markable) idToAnnotation.get(mention);
                        child.setHead(anaphor);
                        CoreferenceRelation pair = new CoreferenceRelation(jCas);
                        pair.setCategory("Identity");
                        RelationArgument arg1 = new RelationArgument(jCas);
                        arg1.setArgument(antecedent);
                        arg1.setRole("antecedent");
                        pair.setArg1(arg1);
                        RelationArgument arg2 = new RelationArgument(jCas);
                        arg2.setArgument(anaphor);
                        arg2.setRole("anaphor");
                        pair.setArg2(arg2);
                        pair.addToIndexes();
                        list.setTail(child);
                        list = child;
                        antecedent = anaphor;
                    }
                    propertiesElem.removeChildren("Coreferring_String");
                    EmptyFSList tail = new EmptyFSList(jCas);
                    list.setTail(tail);
                    root.addToIndexes();
                    chain.setMembers(root);
                    chain.addToIndexes();
                } else if (type.equals("Set/Subset")) {
                    error("This reader has not implemented reading of Set/Subset relations yet", id);

                } else if (type.equals("Whole/Part")) {
                    error("This reader has not implemented reading of Whole/Part relations yet", id);

                } else if (type.equals("Appositive")) {
                    error("This reader has not implemented reading of Appositive relations yet", id);

                } else {
                    throw new UnsupportedOperationException("unsupported relation type: " + type);
                }

                // make sure all XML has been consumed
                removeSingleChild(relationElem, "parentsType", id);
                if (!propertiesElem.getChildren().isEmpty() || !relationElem.getChildren().isEmpty()) {
                    List<String> children = Lists.newArrayList();
                    for (Element child : propertiesElem.getChildren()) {
                        children.add(child.getName());
                    }
                    for (Element child : relationElem.getChildren()) {
                        children.add(child.getName());
                    }
                    error("unprocessed children " + children, id);
                }
            }
        }
    }

    private static Element getSingleChild(Element elem, String elemName, String causeID) {
        List<Element> children = elem.getChildren(elemName);
        if (children.size() != 1) {
            error(String.format("not exactly one '%s' child", elemName), causeID);
        }
        return children.size() > 0 ? children.get(0) : null;
    }

    private static Element removeSingleChild(Element elem, String elemName, String causeID) {
        Element child = getSingleChild(elem, elemName, causeID);
        elem.removeChildren(elemName);
        return child;
    }

    private static String removeSingleChildText(Element elem, String elemName, String causeID) {
        Element child = getSingleChild(elem, elemName, causeID);
        String text = child.getText();
        if (text.isEmpty()) {
            error(String.format("an empty '%s' child", elemName), causeID);
            text = null;
        }
        elem.removeChildren(elemName);
        return text;
    }

    private static void addRelation(JCas jCas, BinaryTextRelation relation, String sourceID, String targetID,
            String category, Map<String, Annotation> idToAnnotation, String causeID) {
        if (sourceID != null && targetID != null) {
            Annotation source = getArgument(sourceID, idToAnnotation, causeID);
            Annotation target = getArgument(targetID, idToAnnotation, causeID);
            if (source != null && target != null) {
                RelationArgument sourceArg = new RelationArgument(jCas);
                sourceArg.setArgument(source);
                sourceArg.addToIndexes();
                RelationArgument targetArg = new RelationArgument(jCas);
                targetArg.setArgument(target);
                targetArg.addToIndexes();
                relation.setCategory(category);
                relation.setArg1(sourceArg);
                relation.setArg2(targetArg);
                relation.addToIndexes();
            }
        }
    }

    private static Annotation getArgument(String id, Map<String, Annotation> idToAnnotation, String causeID) {
        Annotation annotation = idToAnnotation.get(id);
        if (annotation == null) {
            error("no annotation with id " + id, causeID);
        }
        return annotation;
    }

    private static void error(String found, String id) {
        LOGGER.error(String.format("found %s in annotation with ID %s", found, id));
    }

    public static void main(String[] args) throws Exception {
        List<File> files = Lists.newArrayList();
        for (String path : args) {
            files.add(new File(path));
        }
        CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
        AnalysisEngine engine = AnalysisEngineFactory.createEngine(THYMEAnaforaXMLReader.class);
        SimplePipeline.runPipeline(reader, engine);
    }
}