de.tudarmstadt.lt.n2n.preparsed.annotators.PreParsedLineParser.java Source code

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.preparsed.annotators.PreParsedLineParser.java
Source

/*
 *   Copyright 2013
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.preparsed.annotators;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 * Turns pre-parsed, tab-separated lines that are stored as the document text of a {@link JCas} into
 * annotations.
 * <p>
 * Every line of the document text is expected to consist of the following columns:
 * <ol start="0">
 * <li>sentence text -- currently ignored</li>
 * <li>token/pos-tag pairs, separated by {@code ' '}</li>
 * <li>Stanford dependencies (separated by {@code ;;;;;}) -- currently ignored</li>
 * <li>collapsed dependencies with token positions (separated by {@code ';'} or {@code ','})</li>
 * <li>penn tree string</li>
 * <li>if existent should be empty</li>
 * </ol>
 * {@link Token}, {@link POS}, {@link Sentence} and {@link PennTree} annotations are anchored on the
 * token/pos-tag column of the respective line.
 *
 * @author Steffen Remus
 */
public class PreParsedLineParser {

    private static final Logger LOG = LoggerFactory.getLogger(PreParsedLineParser.class);

    /** Orders tokens by their begin offset in the document text. */
    private static final Comparator<Token> BY_BEGIN_OFFSET = new Comparator<Token>() {
        @Override
        public int compare(Token o1, Token o2) {
            int begin1 = o1.getBegin();
            int begin2 = o2.getBegin();
            return begin1 < begin2 ? -1 : (begin1 == begin2 ? 0 : 1);
        }
    };

    /**
     * Creates a parser. The parser itself holds no state.
     */
    public PreParsedLineParser() {
        // TODO: set up a POS mapping provider here once POS tags should be mapped to specific POS types;
        // currently the plain tag string is stored in a generic POS annotation (see parseTokens).
    }

    /**
     * Splits the document text of the given CAS into lines and parses every line with
     * {@link #parseLine(JCas, boolean, boolean, boolean, boolean, boolean, boolean, int, int)}.
     * Nothing happens if the document text is missing or consists of whitespace only.
     *
     * @param aJCas the CAS whose document text is parsed and which receives the annotations
     * @param create_sentences whether {@link Sentence} annotations are created
     * @param create_tokens whether {@link Token} and {@link POS} annotations are created
     * @param create_dependencies currently unused
     * @param create_collapsed_dependencies whether {@link Dependency} annotations are created from the
     *            collapsed dependency column
     * @param create_constituents currently unused
     * @param write_penntree whether a {@link PennTree} annotation is created
     * @throws IllegalStateException if a token/pos pair or a collapsed dependency is malformed
     */
    public void parse(JCas aJCas, boolean create_sentences, boolean create_tokens, boolean create_dependencies,
            boolean create_collapsed_dependencies, boolean create_constituents, boolean write_penntree)
            throws IllegalStateException {

        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        LOG.trace("[{}] Splitting text into lines '{}'.", cas_id, StringUtils.abbreviate(cas_text, 50));
        if (cas_text == null || cas_text.trim().isEmpty()) {
            LOG.trace("[{}] Text is empty.", cas_id);
            return;
        }

        // line_idxs holds the begin offset of every line; consecutive entries delimit one line
        // (the trailing newline character belongs to the line it terminates)
        List<Integer> line_idxs = new ArrayList<Integer>();
        line_idxs.add(0);
        for (int newline = cas_text.indexOf('\n'); newline >= 0; newline = cas_text.indexOf('\n', newline + 1)) {
            line_idxs.add(newline + 1);
        }
        int newline_count = line_idxs.size() - 1;
        if (line_idxs.get(line_idxs.size() - 1) < cas_text.length()) { // if cas doesn't end with a new line
            line_idxs.add(cas_text.length());
        }
        LOG.debug("[{}] Found {} newline characters -> {} lines [{}]", cas_id, newline_count,
                line_idxs.size() - 1, StringUtils.abbreviate(cas_text, 50));

        for (int i = 0; i < line_idxs.size() - 1; i++) {
            int bline_idx = line_idxs.get(i);
            int eline_idx = line_idxs.get(i + 1);
            parseLine(aJCas, create_sentences, create_tokens, create_dependencies, create_collapsed_dependencies,
                    create_constituents, write_penntree, bline_idx, eline_idx);
        }
    }

    /**
     * Parses a single line of the document text and creates the requested annotations for it. A line
     * that consists of whitespace only is skipped with a warning.
     *
     * @param aJCas the CAS whose document text contains the line and which receives the annotations
     * @param create_sentences whether a {@link Sentence} annotation spanning the token/pos-tag column is
     *            created
     * @param create_tokens whether {@link Token} and {@link POS} annotations are created
     * @param create_dependencies currently unused
     * @param create_collapsed_dependencies whether {@link Dependency} annotations are created from the
     *            collapsed dependency column
     * @param create_constituents currently unused
     * @param write_penntree whether a {@link PennTree} annotation spanning the token/pos-tag column is
     *            created
     * @param bline_offset begin offset of the line in the document text (inclusive)
     * @param eline_offset end offset of the line in the document text (exclusive)
     * @throws IllegalStateException if a token/pos pair or a collapsed dependency is malformed
     */
    public void parseLine(JCas aJCas, boolean create_sentences, boolean create_tokens, boolean create_dependencies,
            boolean create_collapsed_dependencies, boolean create_constituents, boolean write_penntree,
            int bline_offset, int eline_offset) throws IllegalStateException {

        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        String line_text = cas_text.substring(bline_offset, eline_offset);
        LOG.trace("[{}] Trying to parse line '{}'.", cas_id, StringUtils.abbreviate(line_text, 50));
        if (line_text.trim().isEmpty()) {
            LOG.warn("[{}] Line is empty.", cas_id);
            return;
        }

        // tab_idxs holds, as offsets into the CAS text, the position right after every tab character,
        // i.e. entry k is the begin of column k+1 and (entry k) - 1 is the end of column k
        List<Integer> tab_idxs = new ArrayList<Integer>();
        for (int tab = line_text.indexOf('\t'); tab >= 0; tab = line_text.indexOf('\t', tab + 1)) {
            tab_idxs.add(bline_offset + tab + 1);
        }
        int tab_count = tab_idxs.size();
        // if the line doesn't end with a tab character, terminate the last column at the end of the line;
        // the sentinel has to be a CAS offset like all other entries (hence eline_offset, not the line length)
        if (tab_idxs.isEmpty() || tab_idxs.get(tab_idxs.size() - 1) < eline_offset) {
            tab_idxs.add(eline_offset + 1);
        }

        LOG.debug("[{}] Found {} tab characters -> {} columns [{}]", cas_id, tab_count, tab_count + 1,
                StringUtils.abbreviate(line_text, 50));

        /*
         * 0 = sentence -- currently ignored
         * 1 = token/pos-tag (separated by ' ')
         * 2 = StanfordDependencies (separated by ;;;;;) -- currently ignored
         * 3 = collapsed dependencies with positions (separated by ';' or ',')
         * 4 = penn tree string
         * 5 = if existent should be empty
         */
        String text = cas_text.substring(bline_offset, tab_idxs.get(0) - 1); // currently unused
        LOG.trace("[{}] Sentence text '{}'.", cas_id, text);

        boolean has_token_column = tab_idxs.size() >= 2;
        int token_list_begin_offset_in_cas = has_token_column ? tab_idxs.get(0) : bline_offset;
        int token_list_end_offset_in_cas = has_token_column ? tab_idxs.get(1) - 1 : bline_offset;
        String token_tag_list = columnText(cas_text, tab_idxs, 1);
        LOG.trace("[{}] Token+tag-list text '{}'.", cas_id, token_tag_list);

        String dependency_list = columnText(cas_text, tab_idxs, 2); // currently unused
        LOG.trace("[{}] Dependency-list text '{}'.", cas_id, dependency_list);

        String collapsed_dependency_list = columnText(cas_text, tab_idxs, 3);
        LOG.trace("[{}] Collapsed dependency-list text '{}'.", cas_id, collapsed_dependency_list);

        String penn_tree_string = columnText(cas_text, tab_idxs, 4);
        if (penn_tree_string != null) {
            penn_tree_string = penn_tree_string.trim(); // trim because sometimes there is a trailing tab character
        }
        LOG.trace("[{}] Collapsed penn-tree-string '{}'.", cas_id, penn_tree_string);

        // create token annotations?
        if (create_tokens && StringUtils.isNotBlank(token_tag_list)) {
            parseTokens(aJCas, token_tag_list, cas_id, token_list_begin_offset_in_cas,
                    token_list_end_offset_in_cas);
        }

        // create collapsed dependency annotations? (needs the tokens, so it runs after parseTokens)
        if (create_collapsed_dependencies && StringUtils.isNotBlank(collapsed_dependency_list)) {
            parseCollapsedDependencies(aJCas, collapsed_dependency_list, cas_id, bline_offset, eline_offset);
        }

        // create penntree string annotation?
        if (write_penntree && StringUtils.isNotBlank(penn_tree_string)) {
            parsePennTree(aJCas, penn_tree_string, token_list_begin_offset_in_cas, token_list_end_offset_in_cas);
        }

        // create sentence annotation?
        if (create_sentences && StringUtils.isNotBlank(token_tag_list)) {
            new Sentence(aJCas, token_list_begin_offset_in_cas, token_list_end_offset_in_cas).addToIndexes();
        }

        // TODO: create constituents annotations
        // TODO: create stanford dependencies annotations

    }

    /**
     * Returns the text of a column of the current line.
     *
     * @param cas_text the complete document text
     * @param tab_idxs CAS offsets right after each tab character of the line, including the end-of-line
     *            sentinel
     * @param column index of the requested column, must be {@code >= 1}
     * @return the text between the enclosing tab characters, or {@code null} if the line has no such column
     */
    private static String columnText(String cas_text, List<Integer> tab_idxs, int column) {
        if (tab_idxs.size() < column + 1) {
            return null;
        }
        return cas_text.substring(tab_idxs.get(column - 1), tab_idxs.get(column) - 1);
    }

    /**
     * Creates a {@link Dependency} annotation for every relation in the collapsed dependency column.
     * Relations look like {@code rel(gov-i; dep-j)} or {@code rel(gov-i, dep-j)}, where {@code i} and
     * {@code j} are 1-based positions into the tokens of the current line. Relations that refer to
     * position 0 or to a position beyond the available tokens are skipped with a warning.
     *
     * @param aJCas the CAS that holds the tokens and receives the dependencies
     * @param collapsed_dependency_list text of the collapsed dependency column
     * @param cas_id document id, only used for log and error messages
     * @param bline_offset begin offset of the current line; only tokens inside the line are considered
     * @param eline_offset end offset of the current line; only tokens inside the line are considered
     * @throws IllegalStateException if a relation does not follow the expected format
     */
    private void parseCollapsedDependencies(JCas aJCas, String collapsed_dependency_list, String cas_id,
            int bline_offset, int eline_offset) throws IllegalStateException {
        // token positions in the relations are relative to the line, so tokens of other lines of the
        // same CAS must not take part in the position lookup
        List<Token> tokens = new ArrayList<Token>();
        for (Token token : JCasUtil.select(aJCas, Token.class)) {
            if (token.getBegin() >= bline_offset && token.getEnd() <= eline_offset) {
                tokens.add(token);
            }
        }
        Collections.sort(tokens, BY_BEGIN_OFFSET);

        for (String collapsed_dep : collapsed_dependency_list.split("\\);")) {
            String relation = collapsed_dep.trim();
            if (relation.isEmpty()) {
                continue; // e.g. the line break that follows the last ");" of the column
            }
            if (relation.endsWith(")")) {
                // the last relation keeps its closing parenthesis if the list isn't terminated by ");"
                relation = relation.substring(0, relation.length() - 1);
            }
            LOG.trace("[{}] Found collapsed-dependcy-string string '{}'.", cas_id, collapsed_dep);

            int open_paren = relation.indexOf("(");
            if (open_paren < 0) {
                throw new IllegalStateException(String
                        .format("Unexpected collapsed dependency relation: '%s' (%s).", collapsed_dep, cas_id));
            }
            String relation_name = relation.substring(0, open_paren).trim();

            int first_dash = relation.indexOf("-", open_paren); // skip until "-num"
            if (first_dash < 0) {
                throw new IllegalStateException(String
                        .format("Unexpected collapsed dependency relation: '%s' (%s).", collapsed_dep, cas_id));
            }
            int separator = relation.indexOf(";", first_dash);
            if (separator < 0) {
                separator = relation.indexOf(",", first_dash);
                if (separator < 0) {
                    throw new IllegalStateException(String
                            .format("Unexpected collapsed dependency relation: '%s' (%s).", collapsed_dep, cas_id));
                }
            }
            String w1 = relation.substring(open_paren + 1, separator).trim();
            String w2 = relation.substring(separator + 1).trim();

            LOG.trace("[{}] Found collapsed-dependcy constituents rel='{}', gov='{}', dep='{}'.", cas_id,
                    relation_name, w1, w2);
            int w2_dash = w2.lastIndexOf('-');
            if (w2_dash < 0) {
                throw new IllegalStateException(String.format(
                        "Unexpected collapsed dependency relation: '%s' (%s). Could not parse token index.",
                        collapsed_dep, cas_id));
            }
            int i_w1, i_w2;
            try {
                // positions in the relation are 1-based, the token list is 0-based
                i_w1 = Integer.parseInt(w1.substring(w1.lastIndexOf('-') + 1)) - 1;
                i_w2 = Integer.parseInt(w2.substring(w2_dash + 1)) - 1;
            } catch (NumberFormatException e) {
                throw new IllegalStateException(String.format(
                        "Unexpected collapsed dependency relation: '%s' (%s). Could not parse token index.",
                        collapsed_dep, cas_id));
            }
            if (i_w1 < 0 || i_w2 < 0) {
                LOG.warn("[{}] Found a dependency constituent with index <= 0 in relation '{}'.", cas_id,
                        collapsed_dep);
                continue;
            }
            if (i_w1 >= tokens.size() || i_w2 >= tokens.size()) {
                LOG.warn(
                        "[{}] Found a dependency constituent with index > than available tokens in relation '{}'. ({} or {} >= {})",
                        cas_id, collapsed_dep, i_w1, i_w2, tokens.size());
                continue;
            }

            Token gov = tokens.get(i_w1);
            Token dep = tokens.get(i_w2);
            LOG.trace("[{}] Assuming following tokens for collapsed-dependcy constituents gov='{}', dep='{}'.",
                    cas_id, gov.getCoveredText(), dep.getCoveredText());

            // TODO: add the correct dependency type here
            Dependency d = new Dependency(aJCas);
            d.setDependencyType(relation_name);
            d.setGovernor(gov);
            d.setDependent(dep);
            // the dependency spans from the leftmost begin to the rightmost end of its two tokens
            d.setBegin(Math.min(gov.getBegin(), dep.getBegin()));
            d.setEnd(Math.max(gov.getEnd(), dep.getEnd()));
            d.addToIndexes();
        }
    }

    /**
     * Stores the penn tree string in a {@link PennTree} annotation with the given span.
     *
     * @param aJCas the CAS that receives the annotation
     * @param penn_tree_string the penn tree string to store
     * @param begin_annotation begin offset of the annotation
     * @param end_annotation end offset of the annotation
     */
    private void parsePennTree(JCas aJCas, String penn_tree_string, int begin_annotation, int end_annotation) {
        PennTree pt = new PennTree(aJCas, begin_annotation, end_annotation);
        pt.setPennTree(penn_tree_string);
        pt.addToIndexes();
    }

    /**
     * Creates a {@link Token} with an attached {@link POS} annotation for every {@code token/tag} pair of
     * the token/pos-tag column. A pair ends at the first {@code ' '} that follows its first {@code '/'};
     * within a pair the last {@code '/'} separates the token from the tag.
     *
     * @param aJCas the CAS that receives the annotations
     * @param token_tag_list text of the token/pos-tag column
     * @param cas_id document id, only used for log and error messages
     * @param token_list_begin_offset_in_cas begin offset of the column in the document text
     * @param token_list_end_offset_in_cas end offset of the column in the document text
     * @throws IllegalStateException if a pair has no {@code '/'} or an empty token
     */
    private void parseTokens(JCas aJCas, String token_tag_list, String cas_id, int token_list_begin_offset_in_cas,
            int token_list_end_offset_in_cas) throws IllegalStateException {
        // String.trim() removes every leading character <= ' '; shift the begin offset by exactly that
        // amount so that positions in the trimmed list still map onto the right CAS offsets
        int leading_whitespace = 0;
        while (leading_whitespace < token_tag_list.length() && token_tag_list.charAt(leading_whitespace) <= ' ') {
            leading_whitespace++;
        }
        int list_begin_in_cas = token_list_begin_offset_in_cas + leading_whitespace;
        String pairs = token_tag_list.trim(); // remove white spaces at beginning and end

        for (int index = -1; index < pairs.length();) {

            // extract token+tag string
            int token_tag_begin = index + 1;
            int first_slash = pairs.indexOf('/', token_tag_begin); // advance to pos type
            if (first_slash < 0) {
                // without this check the search for the next space would restart at the begin of the list
                throw new IllegalStateException(String.format("Unexpected token/pos pair: '%s' (%s).",
                        pairs.substring(token_tag_begin), cas_id));
            }
            int next_space = pairs.indexOf(' ', first_slash + 1); // advance to next token/pos pair
            int token_tag_end = next_space >= 0 ? next_space : pairs.length();
            index = token_tag_end;
            String token_tag = pairs.substring(token_tag_begin, token_tag_end);
            LOG.trace("[{}] Found token+tag string '{}'.", cas_id, token_tag);

            // extract tokens and tags individually
            int slash_idx = token_tag.lastIndexOf('/');
            if (slash_idx <= 0) {
                throw new IllegalStateException(
                        String.format("Unexpected token/pos pair: '%s' (%s).", token_tag, cas_id));
            }

            String tag_str = token_tag.substring(slash_idx + 1, token_tag.length());
            LOG.trace("[{}] Found token='{}' + tag='{}'.", cas_id, token_tag.substring(0, slash_idx), tag_str);
            Token t = new Token(aJCas, list_begin_in_cas + token_tag_begin, // begin of token in cas text
                    list_begin_in_cas + token_tag_begin + slash_idx // end of token in cas text
            );

            assert (t.getBegin() >= list_begin_in_cas) : "Token begin must be in token+tag-list area!";
            assert (t.getEnd() <= token_list_end_offset_in_cas) : "Token end must be in token+tag-list area!";

            // TODO: add the correct pos type here
            POS pos = new POS(aJCas, t.getBegin(), t.getEnd());
            pos.setPosValue(tag_str);
            pos.addToIndexes();

            t.setPos(pos);
            t.addToIndexes();
        }
    }
}