de.tudarmstadt.ukp.clarin.webanno.tcf.TcfWriter.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.clarin.webanno.tcf.TcfWriter.java

Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.clarin.webanno.tcf;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.exists;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamedWithReplaceableLayers;
import eu.clarin.weblicht.wlfxb.io.WLDObjector;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
import eu.clarin.weblicht.wlfxb.tc.api.DependencyParsingLayer;
import eu.clarin.weblicht.wlfxb.tc.api.LemmasLayer;
import eu.clarin.weblicht.wlfxb.tc.api.NamedEntitiesLayer;
import eu.clarin.weblicht.wlfxb.tc.api.PosTagsLayer;
import eu.clarin.weblicht.wlfxb.tc.api.Reference;
import eu.clarin.weblicht.wlfxb.tc.api.ReferencesLayer;
import eu.clarin.weblicht.wlfxb.tc.api.SentencesLayer;
import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus;
import eu.clarin.weblicht.wlfxb.tc.api.TokensLayer;
import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag;
import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored;
import eu.clarin.weblicht.wlfxb.xb.WLData;

/**
 * Writer for the WebLicht TCF format.
 *
 * @author Seid Muhie Yimam
 * @author Richard Eckart de Castilho
 */
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
        "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity",
        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
        "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain",
        "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink",
        "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class TcfWriter extends JCasFileWriter_ImplBase {
    /**
     * Relation name marking a coreference link as expletive. Links whose reference relation
     * equals this value are written as a single-reference relation instead of being chained to
     * the previous link.
     */
    private static final String REL_TYPE_EXPLETIVE = "expletive";

    /**
     * Specify the suffix of output files. Default value <code>.tcf</code>. If the suffix is not
     * needed, provide an empty string as value.
     */
    public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix";
    // Suffix handed to getOutputStream(...) when the output stream for a document is requested.
    @ConfigurationParameter(name = PARAM_FILENAME_SUFFIX, mandatory = true, defaultValue = ".tcf")
    private String filenameSuffix;

    /**
     * If there are no annotations for a particular layer in the CAS, preserve any potentially
     * existing annotations in the original TCF. This setting is only consulted when merging
     * with a source TCF file; when it is {@code false}, the part-of-speech, lemma, named entity,
     * dependency and reference layers are always marked for replacement.<br>
     * Default: {@code false}
     */
    public static final String PARAM_PRESERVE_IF_EMPTY = "preserveIfEmpty";
    @ConfigurationParameter(name = PARAM_PRESERVE_IF_EMPTY, mandatory = true, defaultValue = "false")
    private boolean preserveIfEmpty;

    /**
     * Merge with source TCF file if one is available. If the source cannot be opened or merging
     * with it fails, the document is written without merging.<br>
     * Default: {@code true}
     */
    public static final String PARAM_MERGE = "merge";
    @ConfigurationParameter(name = PARAM_MERGE, mandatory = true, defaultValue = "true")
    private boolean merge;

    /**
     * Serializes the given CAS as TCF. If merging is enabled, the document referenced by the
     * document URI in the CAS metadata is opened and the CAS annotations are merged into it. If
     * that source cannot be opened, or merging fails for any reason, the CAS is written as a
     * new TCF document instead.
     *
     * @param aJCas
     *            the CAS to write.
     * @throws AnalysisEngineProcessException
     *             wrapping any exception that is not absorbed by the merge fallback.
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        // Kept at method scope so that it is closed only after all writing is done
        InputStream sourceStream = null;
        try {
            boolean merged = false;

            if (!merge) {
                getLogger().debug("Merging disabled");
            } else {
                OutputStream mergeTarget = null;
                try {
                    mergeTarget = getOutputStream(aJCas, filenameSuffix);

                    // The original TCF file is located via the document URI of the CAS
                    DocumentMetaData meta = DocumentMetaData.get(aJCas);
                    URL sourceUrl = new URL(meta.getDocumentUri());
                    try {
                        sourceStream = sourceUrl.openStream();

                        try {
                            getLogger().debug("Merging with [" + meta.getDocumentUri() + "]");
                            casToTcfWriter(sourceStream, aJCas, mergeTarget);
                            merged = true;
                        }
                        // Ideally only WLFormatException would be caught here, but see
                        // https://github.com/weblicht/wlfxb/issues/7 - as a workaround, every
                        // exception is treated as "the source is not a TCF file".
                        catch (Exception ex) {
                            getLogger().debug("Source file is not TCF: " + ex.getMessage());
                        }
                    } catch (IOException e) {
                        getLogger().debug("Cannot open source file to merge with: " + e.getMessage());
                    }
                } finally {
                    closeQuietly(mergeTarget);
                }
            }

            // Merging was disabled or did not succeed: write the CAS on its own
            if (!merged) {
                OutputStream plainTarget = null;
                try {
                    plainTarget = getOutputStream(aJCas, filenameSuffix);
                    casToTcfWriter(aJCas, plainTarget);
                } finally {
                    closeQuietly(plainTarget);
                }
            }
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        } finally {
            closeQuietly(sourceStream);
        }
    }

    /**
     * Builds a new TCF document from the CAS alone and writes it to the given stream. The
     * corpus language and text are taken from the CAS; all further layers are produced by
     * {@code write(JCas, TextCorpus)}.
     *
     * @param aJCas
     *            the JCas.
     * @param aOs
     *            the output stream.
     * @throws WLFormatException
     *             if a TCF problem occurs.
     */
    public void casToTcfWriter(JCas aJCas, OutputStream aOs) throws WLFormatException {
        // The corpus carries the document language of the CAS
        TextCorpusStored corpus = new TextCorpusStored(aJCas.getDocumentLanguage());

        // The text layer holds the complete document text
        corpus.createTextLayer().addText(aJCas.getDocumentText());

        // Add the annotation layers
        write(aJCas, corpus);

        // Serialize the populated corpus to the output stream
        WLDObjector.write(new WLData(corpus), aOs);
    }

    /**
     * Merge annotations from CAS into an existing TCF file.
     * <p>
     * The tokens and sentences layers are read from the TCF input if present. The
     * part-of-speech, lemma, named entity, dependency and reference layers are replaced if the
     * CAS contains annotations for them, or unconditionally if {@code preserveIfEmpty} is
     * {@code false}. If no layer needs to be replaced, an empty set of layers is passed on.
     *
     * @param aIs
     *            the TCF file with an existing annotation layers
     * @param aJCas
     *            an annotated CAS object
     * @param aOs
     *            the output stream.
     * @throws WLFormatException
     *             if a TCF problem occurs.
     */
    public void casToTcfWriter(InputStream aIs, JCas aJCas, OutputStream aOs) throws WLFormatException {
        // If these layers are present in the TCF file, we use them from there, otherwise
        // we generate them
        EnumSet<TextCorpusLayerTag> layersToRead = EnumSet.of(TextCorpusLayerTag.TOKENS,
                TextCorpusLayerTag.SENTENCES);

        // If we have annotations for these layers in the CAS, we rewrite those layers.
        // The set is built directly as an EnumSet: EnumSet.copyOf(Collection) throws an
        // IllegalArgumentException for an empty non-EnumSet collection, which is what happens
        // when preserveIfEmpty is set and the CAS carries none of these annotations.
        EnumSet<TextCorpusLayerTag> layersToReplace = EnumSet.noneOf(TextCorpusLayerTag.class);
        if (exists(aJCas, POS.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.POSTAGS);
        }
        if (exists(aJCas, Lemma.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.LEMMAS);
        }
        if (exists(aJCas, NamedEntity.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.NAMED_ENTITIES);
        }
        if (exists(aJCas, Dependency.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.PARSING_DEPENDENCY);
        }
        if (exists(aJCas, CoreferenceChain.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.REFERENCES);
        }

        TextCorpusStreamedWithReplaceableLayers textCorpus = null;
        try {
            textCorpus = new TextCorpusStreamedWithReplaceableLayers(aIs, layersToRead,
                    layersToReplace, aOs);

            write(aJCas, textCorpus);
        } finally {
            if (textCorpus != null) {
                try {
                    textCorpus.close();
                } catch (IOException e) {
                    // Closing stays best-effort so that an exception from the try block is not
                    // masked, but the failure is no longer dropped without a trace.
                    getLogger().warn("Error while closing the merged TCF text corpus", e);
                }
            }
        }
    }

    /**
     * Writes all supported layers from the CAS into the given text corpus. Tokens are handled
     * first because every other layer writer receives the token map produced by that step.
     *
     * @param aJCas
     *            the CAS to read annotations from.
     * @param aTextCorpus
     *            the TCF corpus to add layers to.
     */
    private void write(JCas aJCas, TextCorpus aTextCorpus) {
        // Maps the begin offset of each CAS token to its TCF counterpart
        Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> tokenByBegin = writeTokens(aJCas,
                aTextCorpus);

        writeSentence(aJCas, aTextCorpus, tokenByBegin);
        writePosTags(aJCas, aTextCorpus, tokenByBegin);
        writeLemmas(aJCas, aTextCorpus, tokenByBegin);
        writeDependency(aJCas, aTextCorpus, tokenByBegin);
        writeNamedEntity(aJCas, aTextCorpus, tokenByBegin);
        writeCoreference(aJCas, aTextCorpus, tokenByBegin);
    }

    /**
     * Ensures that the corpus has a tokens layer and maps CAS tokens to TCF tokens.
     * <p>
     * If the corpus has no tokens layer yet, one is created and filled with the covered text of
     * every CAS token. If a layer already exists it is left untouched. In both cases the n-th
     * CAS token is associated with the n-th token of the layer.
     *
     * @param aJCas
     *            the CAS providing the tokens.
     * @param aTextCorpus
     *            the TCF corpus.
     * @return a map from the begin offset of each CAS token to the corresponding TCF token.
     */
    private Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> writeTokens(JCas aJCas, TextCorpus aTextCorpus) {
        TokensLayer layer = aTextCorpus.getTokensLayer();

        // Tokens are only added when this method had to create the layer itself
        final boolean createdHere = (layer == null);
        if (createdHere) {
            layer = aTextCorpus.createTokensLayer();
            getLogger().debug("Layer [" + TextCorpusLayerTag.TOKENS.getXmlName() + "]: created");
        } else {
            getLogger().debug("Layer [" + TextCorpusLayerTag.TOKENS.getXmlName() + "]: found");
        }

        Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> tokenByBegin = new HashMap<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token>();

        int index = 0;
        for (Token casToken : select(aJCas, Token.class)) {
            if (createdHere) {
                layer.addToken(casToken.getCoveredText());
            }

            // CAS tokens and TCF tokens are paired up by their position in the sequence
            eu.clarin.weblicht.wlfxb.tc.api.Token tcfToken = layer.getToken(index);
            tokenByBegin.put(casToken.getBegin(), tcfToken);
            index += 1;
        }

        return tokenByBegin;
    }

    /**
     * Creates the part-of-speech layer from the POS annotations attached to the CAS tokens.
     * Nothing is written if the CAS contains no POS annotation at all. The tagset name is taken
     * from the first tagset description registered for the POS layer and falls back to "STTS".
     *
     * @param aJCas
     *            the CAS to read from.
     * @param aTextCorpus
     *            the TCF corpus; its tokens layer must already exist.
     * @param aTokensBeginPositionMap
     *            token map by begin offset (not consulted here, tokens are addressed by index).
     */
    private void writePosTags(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
        final boolean hasPos = JCasUtil.exists(aJCas, POS.class);
        if (!hasPos) {
            // No part-of-speech tags in the CAS, so there is nothing to write
            getLogger().debug("Layer [" + TextCorpusLayerTag.POSTAGS.getXmlName() + "]: empty");
            return;
        }

        // The tokens layer has been set up before this method is called
        TokensLayer tcfTokens = aTextCorpus.getTokensLayer();

        // Determine the tagset name, using "STTS" unless the CAS declares one for POS
        String tagsetName = "STTS";
        final String posLayerName = POS.class.getName();
        for (TagsetDescription description : select(aJCas, TagsetDescription.class)) {
            if (description.getLayer().equals(posLayerName)) {
                tagsetName = description.getName();
                break;
            }
        }

        PosTagsLayer tcfPosTags = aTextCorpus.createPosTagsLayer(tagsetName);

        getLogger().debug("Layer [" + TextCorpusLayerTag.POSTAGS.getXmlName() + "]: created");

        // Walk the CAS tokens in order; the running index addresses the matching TCF token
        int index = 0;
        for (Token casToken : select(aJCas, Token.class)) {
            POS pos = casToken.getPos();
            if (pos != null && tcfPosTags != null) {
                tcfPosTags.addTag(pos.getPosValue(), tcfTokens.getToken(index));
            }
            index += 1;
        }
    }

    /**
     * Creates the lemmas layer from the lemma annotations attached to the CAS tokens. Nothing
     * is written if the CAS contains no lemma annotation at all.
     *
     * @param aJCas
     *            the CAS to read from.
     * @param aTextCorpus
     *            the TCF corpus; its tokens layer must already exist.
     * @param aTokensBeginPositionMap
     *            token map by begin offset (not consulted here, tokens are addressed by index).
     */
    private void writeLemmas(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
        final boolean hasLemmas = JCasUtil.exists(aJCas, Lemma.class);
        if (!hasLemmas) {
            // No lemmas in the CAS, so there is nothing to write
            getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: empty");
            return;
        }

        // The tokens layer has been set up before this method is called
        TokensLayer tcfTokens = aTextCorpus.getTokensLayer();

        LemmasLayer tcfLemmas = aTextCorpus.createLemmasLayer();

        getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: created");

        // Walk the CAS tokens in order; the running index addresses the matching TCF token
        int index = 0;
        for (Token casToken : select(aJCas, Token.class)) {
            Lemma lemma = casToken.getLemma();
            if (lemma != null && tcfLemmas != null) {
                tcfLemmas.addLemma(lemma.getValue(), tcfTokens.getToken(index));
            }
            index += 1;
        }
    }

    /**
     * Creates the sentences layer from the CAS sentences unless the corpus already has one, in
     * which case the existing layer is kept as it is.
     *
     * @param aJCas
     *            the CAS to read from.
     * @param aTextCorpus
     *            the TCF corpus.
     * @param aTokensBeginPositionMap
     *            TCF tokens by the begin offset of the corresponding CAS token.
     */
    private void writeSentence(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
        // A sentences layer that came with the source TCF is left alone
        SentencesLayer tcfSentences = aTextCorpus.getSentencesLayer();
        if (tcfSentences != null) {
            getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: found");
            return;
        }

        // No sentences layer yet: add one (sentences are required for BRAT)
        tcfSentences = aTextCorpus.createSentencesLayer();

        getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: created");

        for (Sentence casSentence : select(aJCas, Sentence.class)) {
            // Collect the TCF tokens for all CAS tokens covered by the sentence
            List<eu.clarin.weblicht.wlfxb.tc.api.Token> sentenceTokens = new ArrayList<eu.clarin.weblicht.wlfxb.tc.api.Token>();
            for (Token casToken : selectCovered(Token.class, casSentence)) {
                eu.clarin.weblicht.wlfxb.tc.api.Token tcfToken = aTokensBeginPositionMap
                        .get(casToken.getBegin());
                sentenceTokens.add(tcfToken);
            }
            tcfSentences.addSentence(sentenceTokens);
        }
    }

    /**
     * Creates the dependency parsing layer with one parse per CAS sentence that covers at least
     * one dependency. Nothing is written if the CAS contains no dependency at all. The tagset
     * name is taken from the first tagset description registered for the dependency layer and
     * falls back to "tiger".
     *
     * @param aJCas
     *            the CAS to read from.
     * @param aTextCorpus
     *            the TCF corpus.
     * @param aTokensBeginPositionMap
     *            TCF tokens by the begin offset of the corresponding CAS token.
     */
    private void writeDependency(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
        final boolean hasDependencies = JCasUtil.exists(aJCas, Dependency.class);
        if (!hasDependencies) {
            // No dependencies in the CAS, so there is nothing to write
            getLogger().debug("Layer [" + TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName() + "]: empty");
            return;
        }

        // Determine the tagset name, using "tiger" unless the CAS declares one for dependencies
        String tagsetName = "tiger";
        final String dependencyLayerName = Dependency.class.getName();
        for (TagsetDescription description : select(aJCas, TagsetDescription.class)) {
            if (description.getLayer().equals(dependencyLayerName)) {
                tagsetName = description.getName();
                break;
            }
        }

        DependencyParsingLayer tcfParses = aTextCorpus.createDependencyParsingLayer(tagsetName,
                false, true);

        getLogger().debug("Layer [" + TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName() + "]: created");

        for (Sentence casSentence : select(aJCas, Sentence.class)) {
            List<eu.clarin.weblicht.wlfxb.tc.api.Dependency> parse = new ArrayList<eu.clarin.weblicht.wlfxb.tc.api.Dependency>();

            for (Dependency casDependency : selectCovered(Dependency.class, casSentence)) {
                String relation = casDependency.getDependencyType();
                eu.clarin.weblicht.wlfxb.tc.api.Token dependent = aTokensBeginPositionMap
                        .get(casDependency.getDependent().getBegin());
                eu.clarin.weblicht.wlfxb.tc.api.Token governor = aTokensBeginPositionMap
                        .get(casDependency.getGovernor().getBegin());

                parse.add(tcfParses.createDependency(relation, dependent, governor));
            }

            // Sentences without any dependency do not get a parse
            if (!parse.isEmpty()) {
                tcfParses.addParse(parse);
            }
        }
    }

    /**
     * Creates the named entities layer with one entity per named entity annotation in the CAS,
     * spanning the tokens covered by that annotation. Nothing is written if the CAS contains no
     * named entity at all. The tagset name is taken from the first tagset description
     * registered for the named entity layer and falls back to "BART".
     *
     * @param aJCas
     *            the CAS to read from.
     * @param aTextCorpus
     *            the TCF corpus.
     * @param aTokensBeginPositionMap
     *            TCF tokens by the begin offset of the corresponding CAS token.
     */
    private void writeNamedEntity(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
        final boolean hasEntities = JCasUtil.exists(aJCas, NamedEntity.class);
        if (!hasEntities) {
            // No named entities in the CAS, so there is nothing to write
            getLogger().debug("Layer [" + TextCorpusLayerTag.NAMED_ENTITIES.getXmlName() + "]: empty");
            return;
        }

        // Determine the tagset name, using "BART" unless the CAS declares one for named entities
        String tagsetName = "BART";
        final String entityLayerName = NamedEntity.class.getName();
        for (TagsetDescription description : select(aJCas, TagsetDescription.class)) {
            if (description.getLayer().equals(entityLayerName)) {
                tagsetName = description.getName();
                break;
            }
        }

        NamedEntitiesLayer tcfEntities = aTextCorpus.createNamedEntitiesLayer(tagsetName);

        getLogger().debug("Layer [" + TextCorpusLayerTag.NAMED_ENTITIES.getXmlName() + "]: created");

        for (NamedEntity casEntity : select(aJCas, NamedEntity.class)) {
            // Translate the CAS tokens inside the entity span into their TCF counterparts
            List<eu.clarin.weblicht.wlfxb.tc.api.Token> entityTokens = new ArrayList<eu.clarin.weblicht.wlfxb.tc.api.Token>();
            for (Token casToken : selectCovered(aJCas, Token.class, casEntity.getBegin(),
                    casEntity.getEnd())) {
                entityTokens.add(aTokensBeginPositionMap.get(casToken.getBegin()));
            }

            tcfEntities.addEntity(casEntity.getValue(), entityTokens);
        }
    }

    /**
     * Creates the references layer with one referent per coreference chain in the CAS. Every
     * link becomes a reference over the tokens it covers; consecutive non-expletive links are
     * connected using the relation stored on the earlier link. Nothing is written if the CAS
     * contains no coreference chain at all. The tagset name is taken from the first tagset
     * description registered for the coreference link layer and falls back to "TueBaDz".
     *
     * @param aJCas
     *            the CAS to read from.
     * @param aTextCorpus
     *            the TCF corpus.
     * @param aTokensBeginPositionMap
     *            TCF tokens by the begin offset of the corresponding CAS token.
     */
    private void writeCoreference(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
        final boolean hasChains = JCasUtil.exists(aJCas, CoreferenceChain.class);
        if (!hasChains) {
            // No coreference chains in the CAS, so there is nothing to write
            getLogger().debug("Layer [" + TextCorpusLayerTag.REFERENCES.getXmlName() + "]: empty");
            return;
        }

        // Determine the tagset name, using "TueBaDz" unless the CAS declares one for links
        String tagsetName = "TueBaDz";
        final String linkLayerName = CoreferenceLink.class.getName();
        for (TagsetDescription description : select(aJCas, TagsetDescription.class)) {
            if (description.getLayer().equals(linkLayerName)) {
                tagsetName = description.getName();
                break;
            }
        }

        ReferencesLayer tcfReferences = aTextCorpus.createReferencesLayer(null, tagsetName, null);

        getLogger().debug("Layer [" + TextCorpusLayerTag.REFERENCES.getXmlName() + "]: created");

        for (CoreferenceChain casChain : select(aJCas, CoreferenceChain.class)) {
            List<Reference> referent = new ArrayList<Reference>();
            CoreferenceLink previousLink = null;
            Reference previousReference = null;

            for (CoreferenceLink casLink : casChain.links()) {
                // TCF tokens for the CAS tokens covered by the link
                List<eu.clarin.weblicht.wlfxb.tc.api.Token> linkTokens = new ArrayList<eu.clarin.weblicht.wlfxb.tc.api.Token>();
                for (Token casToken : selectCovered(Token.class, casLink)) {
                    linkTokens.add(aTokensBeginPositionMap.get(casToken.getBegin()));
                }

                Reference reference = tcfReferences.createReference(casLink.getReferenceType(),
                        linkTokens, null);

                if (REL_TYPE_EXPLETIVE.equals(casLink.getReferenceRelation())) {
                    // An expletive link only gets a relation of its own. It is neither added to
                    // the referent nor used as the predecessor of the following link; the chain
                    // is expected not to continue after it.
                    // NOTE(review): the reference of an expletive link is therefore never passed
                    // to addReferent - confirm that this is intended.
                    tcfReferences.addRelation(reference, REL_TYPE_EXPLETIVE);
                } else {
                    // Connect the previous reference to this one using the relation stored on
                    // the previous link
                    if (previousLink != null) {
                        tcfReferences.addRelation(previousReference,
                                previousLink.getReferenceRelation(), reference);
                    }

                    previousLink = casLink;
                    previousReference = reference;
                    referent.add(reference);
                }
            }

            tcfReferences.addReferent(referent);
        }
    }
}