de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv1Reader.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv1Reader.java, a UIMA collection reader that converts WebAnno TSV (version 1) annotation files into CAS objects.

Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.clarin.webanno.tsv;

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.Level;

import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 * Reads a specific TSV file (nine TABs per line, i.e. ten columns) and converts the annotations
 * into a CAS object. Example of an input line: <br>
 * 1 Heutzutage heutzutage ADV _ _ 2 ADV _ _ <br>
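 * (column 1: token number in the sentence, 2: token, 3: lemma, 4: POS, 5: named entity,
 * 6: unused, 7: dependency head token number, 8: dependency function, 9-10: unused) <br>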
 * Columns are separated by a TAB character and sentences are separated by a blank line.
 *
 *
 */
public class WebannoTsv1Reader extends JCasResourceCollectionReader_ImplBase {

    private String fileName;

    public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
        StringBuilder text = new StringBuilder();
        Map<Integer, String> tokens = new HashMap<Integer, String>();
        Map<Integer, String> pos = new HashMap<Integer, String>();
        Map<Integer, String> lemma = new HashMap<Integer, String>();
        Map<Integer, String> namedEntity = new HashMap<Integer, String>();
        Map<Integer, String> dependencyFunction = new HashMap<Integer, String>();
        Map<Integer, Integer> dependencyDependent = new HashMap<Integer, Integer>();

        List<Integer> firstTokenInSentence = new ArrayList<Integer>();

        DocumentMetaData documentMetadata = DocumentMetaData.get(aJCas);
        fileName = documentMetadata.getDocumentTitle();
        setAnnotations(aIs, aEncoding, text, tokens, pos, lemma, namedEntity, dependencyFunction,
                dependencyDependent, firstTokenInSentence);

        aJCas.setDocumentText(text.toString());

        Map<String, Token> tokensStored = new HashMap<String, Token>();

        createToken(aJCas, text, tokens, pos, lemma, tokensStored);

        createNamedEntity(namedEntity, aJCas, tokens, tokensStored);

        createDependency(aJCas, tokens, dependencyFunction, dependencyDependent, tokensStored);

        createSentence(aJCas, firstTokenInSentence, tokensStored);
    }

    /**
     * Creates {@link Token} annotations in the {@link CAS}. If the lemma and POS columns are not
     * empty, it also creates {@link Lemma} and {@link POS} annotations.
     */
    private void createToken(JCas aJCas, StringBuilder text, Map<Integer, String> tokens, Map<Integer, String> pos,
            Map<Integer, String> lemma, Map<String, Token> tokensStored) {
        int tokenBeginPosition = 0;
        int tokenEndPosition = 0;

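        // Tokens are located by scanning the document text from left to right. The
        // search offset advances past each match, so repeated token strings map to
        // successive occurrences instead of always hitting the first one.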
        for (int i = 1; i <= tokens.size(); i++) {
            tokenBeginPosition = text.indexOf(tokens.get(i), tokenBeginPosition);
            tokenEndPosition = tokenBeginPosition + tokens.get(i).length();
            Token outToken = new Token(aJCas, tokenBeginPosition, tokenEndPosition);
            tokenBeginPosition = tokenEndPosition;
            outToken.addToIndexes();

            // Add POS annotation to the CAS if present
            if (!pos.get(i).equals("_")) {
                POS outPos = new POS(aJCas, outToken.getBegin(), outToken.getEnd());
                outPos.setPosValue(pos.get(i));
                outPos.addToIndexes();
                outToken.setPos(outPos);
            }

            // Add lemma annotation if present
            if (!lemma.get(i).equals("_")) {
                Lemma outLemma = new Lemma(aJCas, outToken.getBegin(), outToken.getEnd());
                outLemma.setValue(lemma.get(i));
                outLemma.addToIndexes();
                outToken.setLemma(outLemma);
            }
            tokensStored.put("t_" + i, outToken);
        }
    }

    /**
     * Adds dependency relations to the CAS.
     */
    private void createDependency(JCas aJCas, Map<Integer, String> tokens, Map<Integer, String> dependencyFunction,
            Map<Integer, Integer> dependencyDependent, Map<String, Token> tokensStored) {
        for (int i = 1; i <= tokens.size(); i++) {
            if (dependencyFunction.get(i) != null) {
                Dependency outDependency = new Dependency(aJCas);
                outDependency.setDependencyType(dependencyFunction.get(i));

                // if span A has (start,end)= (20, 26) and B has (start,end)= (30, 36)
                // arc drawn from A to B, dependency will have (start, end) = (20, 36)
                // arc drawn from B to A, still dependency will have (start, end) = (20, 36)
                int begin = 0, end = 0;
                // if not ROOT
                if (dependencyDependent.get(i) != 0) {
                    Token depToken = tokensStored.get("t_" + i);
                    Token headToken = tokensStored.get("t_" + dependencyDependent.get(i));
                    begin = Math.min(depToken.getBegin(), headToken.getBegin());
                    end = Math.max(depToken.getEnd(), headToken.getEnd());
                } else {
                    begin = tokensStored.get("t_" + i).getBegin();
                    end = tokensStored.get("t_" + i).getEnd();
                }

                outDependency.setBegin(begin);
                outDependency.setEnd(end);
                outDependency.setDependent(tokensStored.get("t_" + i));
                if (dependencyDependent.get(i) == 0) {
                    outDependency.setGovernor(tokensStored.get("t_" + i));
                } else {
                    outDependency.setGovernor(tokensStored.get("t_" + dependencyDependent.get(i)));
                }
                outDependency.addToIndexes();
            }
        }
    }

    /**
     * Adds the sentence layer to the CAS.
     */
    private void createSentence(JCas aJCas, List<Integer> firstTokenInSentence, Map<String, Token> tokensStored) {
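        // Note: apart from the first entry, firstTokenInSentence holds the global number
        // of the last token of the preceding sentence (see setAnnotations), so a sentence
        // begins one character after that token and ends with the last token before the
        // next boundary.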
        for (int i = 0; i < firstTokenInSentence.size(); i++) {
            Sentence outSentence = new Sentence(aJCas);
            // Only the last sentence, and not the only sentence in the document (i != 0)
            if (i == firstTokenInSentence.size() - 1 && i != 0) {
                outSentence.setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)).getEnd());
                outSentence.setEnd(tokensStored.get("t_" + (tokensStored.size())).getEnd());
                outSentence.addToIndexes();
                break;
            }
            if (i == firstTokenInSentence.size() - 1 && i == 0) {
                outSentence.setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)).getBegin());
                outSentence.setEnd(tokensStored.get("t_" + (tokensStored.size())).getEnd());
                outSentence.addToIndexes();
            } else if (i == 0) {
                outSentence.setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)).getBegin());
                outSentence.setEnd(tokensStored.get("t_" + firstTokenInSentence.get(i + 1)).getEnd());
                outSentence.addToIndexes();
            } else {
                outSentence.setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)).getEnd() + 1);
                outSentence.setEnd(tokensStored.get("t_" + firstTokenInSentence.get(i + 1)).getEnd());
                outSentence.addToIndexes();
            }
        }
    }

    /**
     * Iterates through all lines and collects the available annotations.<br>
     * The first column is the token number within the sentence; a blank line marks the end of a
     * sentence.<br>
     * The second column is the token.<br>
     * The third column is the lemma annotation.<br>
     * The fourth column is the POS annotation.<br>
     * The fifth column is used for named entity annotations (multiple annotations separated by
     * the | character).<br>
     * The sixth column is skipped (kept for compatibility with the previous WebAnno TSV
     * reader).<br>
     * The seventh column is the token number of the dependency head.<br>
     * The eighth column is the function/type of the dependency relation.<br>
     * The ninth and tenth columns are currently undefined.
     */
    private void setAnnotations(InputStream aIs, String aEncoding, StringBuilder text, Map<Integer, String> tokens,
            Map<Integer, String> pos, Map<Integer, String> lemma, Map<Integer, String> namedEntity,
            Map<Integer, String> dependencyFunction, Map<Integer, Integer> dependencyDependent,
            List<Integer> firstTokenInSentence) throws IOException {
        int tokenNumber = 0;
        boolean first = true;
        int base = 0;

        LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
        boolean textFound = false;
        StringBuilder tmpText = new StringBuilder();
        while (lineIterator.hasNext()) {
            String line = lineIterator.next().trim();
            if (line.startsWith("#text=")) {
                text.append(line.substring(6) + "\n");
                textFound = true;
                continue;
            }
            if (line.startsWith("#")) {
                continue;// it is a comment line
            }
            int count = StringUtils.countMatches(line, "\t");
            if (line.isEmpty()) {
                continue;
            }
            if (count != 9) { // not a proper TSV line: nine TABs (ten columns) expected
                getUimaContext().getLogger().log(Level.INFO, "This is not a valid TSV File");
                throw new IOException(fileName + " This is not a valid TSV File");
            }
            StringTokenizer lineTk = new StringTokenizer(line, "\t");

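            // Token numbers restart at 1 in every sentence; 'base' is a running offset
            // that converts them into document-global numbers used as keys in the maps.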
            if (first) {
                tokenNumber = Integer.parseInt(line.substring(0, line.indexOf("\t")));
                firstTokenInSentence.add(tokenNumber);
                first = false;
            } else {
                int lineNumber = Integer.parseInt(line.substring(0, line.indexOf("\t")));
                if (lineNumber == 1) {
                    base = tokenNumber;
                    firstTokenInSentence.add(base);
                }
                tokenNumber = base + Integer.parseInt(line.substring(0, line.indexOf("\t")));
            }

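            // Each pass through this loop consumes exactly the ten columns of the
            // current line, so it effectively runs once per line.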
            while (lineTk.hasMoreElements()) {
                lineTk.nextToken();
                String token = lineTk.nextToken();

                // for backward compatibility
                tmpText.append(token + " ");

                tokens.put(tokenNumber, token);
                lemma.put(tokenNumber, lineTk.nextToken());
                pos.put(tokenNumber, lineTk.nextToken());
                String ne = lineTk.nextToken();
                lineTk.nextToken(); // skipped; kept compatible with the previous WebAnno TSV reader
                namedEntity.put(tokenNumber, (ne.equals("_") || ne.equals("-")) ? "O" : ne);
                String dependentValue = lineTk.nextToken();
                if (NumberUtils.isDigits(dependentValue)) {
                    int dependent = Integer.parseInt(dependentValue);
                    dependencyDependent.put(tokenNumber, dependent == 0 ? 0 : base + dependent);
                    dependencyFunction.put(tokenNumber, lineTk.nextToken());
                } else {
                    lineTk.nextToken();
                }
                lineTk.nextToken();
                lineTk.nextToken();
            }
        }
        if (!textFound) {
            text.append(tmpText);
        }
    }

    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    @Override
    public void getNext(JCas aJCas) throws IOException, CollectionException {
        Resource res = nextFile();
        initCas(aJCas, res);
        InputStream is = null;
        try {
            is = res.getInputStream();
            convertToCas(aJCas, is, encoding);
        } finally {
            closeQuietly(is);
        }
    }

    /**
     * Creates {@link NamedEntity} annotations in the CAS from the CoNLL BIO/IOB format.
     */
    private void createNamedEntity(Map<Integer, String> aNamedEntityMap, JCas aJCas,
            Map<Integer, String> aTokensMap, Map<String, Token> aJcasTokens) {

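        // Keeps one "open" NamedEntity per stack position so that an I- tag at stack
        // position n extends the span opened by the B- tag at the same position.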
        Map<Integer, NamedEntity> indexedNeAnnos = new LinkedHashMap<Integer, NamedEntity>();

        for (int i = 1; i <= aTokensMap.size(); i++) {
            if (aNamedEntityMap.get(i).equals("O")) {
                continue;
            }
            int index = 1; // tracks stacked ('|'-separated) NE annotations at the same position
            for (String ne : aNamedEntityMap.get(i).split("\\|")) {

                if (ne.equals("O")) {// for annotations such as B_LOC|O|I_PER and the like
                    index++;
                } else if (ne.startsWith("B_") || ne.startsWith("B-")) {
                    NamedEntity outNamedEntity = new NamedEntity(aJCas, aJcasTokens.get("t_" + i).getBegin(),
                            aJcasTokens.get("t_" + i).getEnd());
                    outNamedEntity.setValue(ne.substring(2));
                    outNamedEntity.addToIndexes();
                    indexedNeAnnos.put(index, outNamedEntity);
                    index++;
                } else if (ne.startsWith("I_") || ne.startsWith("I-")) {
                    NamedEntity outNamedEntity = indexedNeAnnos.get(index);
                    outNamedEntity.setEnd(aJcasTokens.get("t_" + i).getEnd());
                    outNamedEntity.addToIndexes();
                    index++;
                } else { // NE is not in IOB format: store one NE per token; multi-token
                         // NEs cannot be detected
                    NamedEntity outNamedEntity = new NamedEntity(aJCas, aJcasTokens.get("t_" + i).getBegin(),
                            aJcasTokens.get("t_" + i).getEnd());
                    outNamedEntity.setValue(ne);
                    outNamedEntity.addToIndexes();
                    indexedNeAnnos.put(index, outNamedEntity);
                    index++;
                }
            }
        }
    }
}
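
Usage

This reader is a standard uimaFIT collection reader, so it can be instantiated and run with the
usual uimaFIT factories. The snippet below is a minimal sketch, not part of the original source:
it assumes uimaFIT and DKPro Core are on the classpath, that PARAM_SOURCE_LOCATION and
PARAM_PATTERNS are inherited from ResourceCollectionReaderBase (as for other DKPro Core readers),
and that the input directory "data" and the file pattern are placeholders.

import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv1Reader;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

public class WebannoTsv1ReaderExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical input location: read all *.tsv files directly under "data".
        CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
                WebannoTsv1Reader.class,
                WebannoTsv1Reader.PARAM_SOURCE_LOCATION, "data",
                WebannoTsv1Reader.PARAM_PATTERNS, "[+]*.tsv");

        // Iterate over the CASes produced by the reader and print each token.
        for (JCas jcas : SimplePipeline.iteratePipeline(reader)) {
            for (Token token : JCasUtil.select(jcas, Token.class)) {
                System.out.println(token.getCoveredText());
            }
        }
    }
}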