de.tudarmstadt.lt.n2n.preparsed.annotators.GoogleSyntacticNgramsAnnotator.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.preparsed.annotators.GoogleSyntacticNgramsAnnotator.java

Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.preparsed.annotators;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.utilities.types.RepeatedSentence;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 *
 * @author Steffen Remus
 **/
/**
 * Annotates a CAS whose document text contains lines in the Google Syntactic-Ngrams
 * format. Each line consists of up to four tab-separated columns:
 *
 * <pre>
 * word TAB context TAB number-of-occurrences TAB occurrences-per-year
 * </pre>
 *
 * The context column is a space-separated sequence of
 * {@code word/postag/deplabel/headindex} tuples (the head index is 1-based; 0 marks
 * the root). For each line this annotator creates a {@link RepeatedSentence} covering
 * the context span, one {@link Token} with {@link POS} per tuple, and one
 * {@link Dependency} per non-root tuple. Each annotation kind can be switched off via
 * a configuration parameter.
 *
 * @author Steffen Remus
 **/
public class GoogleSyntacticNgramsAnnotator extends JCasAnnotator_ImplBase {

    private final static Logger LOG = LoggerFactory.getLogger(GoogleSyntacticNgramsAnnotator.class);

    /** Create {@link RepeatedSentence} annotations (default: true). */
    public static final String PARAM_CREATE_SENTENCE_ANNOTATIONS = "_create_sentences";
    @ConfigurationParameter(name = PARAM_CREATE_SENTENCE_ANNOTATIONS, mandatory = true, defaultValue = { "true" })
    private boolean _create_sentences;

    /** Create {@link Token} and {@link POS} annotations (default: true). */
    public static final String PARAM_CREATE_TOKEN_ANNOTATIONS = "_create_tokens";
    @ConfigurationParameter(name = PARAM_CREATE_TOKEN_ANNOTATIONS, mandatory = true, defaultValue = { "true" })
    private boolean _create_tokens;

    /** Create {@link Dependency} annotations; requires token creation (default: true). */
    public static final String PARAM_CREATE_DEPENDENCY_ANNOTATIONS = "_create_dependencies";
    @ConfigurationParameter(name = PARAM_CREATE_DEPENDENCY_ANNOTATIONS, mandatory = true, defaultValue = { "true" })
    private boolean _create_dependencies;

    /** Rethrow parsing failures instead of only logging them (default: false). */
    public static final String PARAM_FAIL_EARLY = "_fail_early";
    @ConfigurationParameter(name = PARAM_FAIL_EARLY, mandatory = false, defaultValue = { "false" })
    private boolean _fail_early;

    /**
     * Parses the document text of {@code aJCas} line by line. Any exception is
     * logged as a warning and swallowed unless {@link #PARAM_FAIL_EARLY} is set,
     * in which case it is rethrown wrapped in an
     * {@link AnalysisEngineProcessException}.
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        try {
            parseCasLines(aJCas);
        } catch (Exception e) {
            LOG.warn("Could not process cas {}, text: '{}' ({}: {})", DocumentMetaData.get(aJCas).getDocumentId(),
                    StringUtils.abbreviate(aJCas.getDocumentText(), 50), e.getClass().getSimpleName(),
                    e.getMessage());
            if (_fail_early)
                throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Splits the document text at newline characters and delegates each line to
     * {@link #parseCasLine(JCas, int, int)}. A line's end offset is the position
     * right after its terminating newline (or end-of-text for the last line).
     */
    void parseCasLines(JCas aJCas) {
        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        LOG.trace("[{}] Splitting text into lines '{}'.", cas_id, StringUtils.abbreviate(cas_text, 50));
        if (cas_text.trim().isEmpty()) {
            LOG.trace("[{}] Text is empty.", cas_id);
            return;
        }

        // line_idxs holds the begin offset of every line: 0, the position right
        // after each newline character, and an end-of-text sentinel if the text
        // does not already end with a newline. Line i spans
        // [line_idxs.get(i), line_idxs.get(i + 1)).
        List<Integer> line_idxs = new ArrayList<Integer>();
        line_idxs.add(0);
        for (int index = 0; (index = cas_text.indexOf('\n', index) + 1) > 0; line_idxs.add(index))
            ;
        // BUGFIX: the list contains an initial 0 in addition to one entry per
        // newline, so the newline count is size - 1 (previously overstated by one).
        LOG.debug("[{}] Found {} newline characters [{}]", cas_id, line_idxs.size() - 1,
                StringUtils.abbreviate(cas_text, 50));
        if (line_idxs.get(line_idxs.size() - 1) < cas_text.length()) // if cas doesn't end with a newline
            line_idxs.add(cas_text.length());

        for (int i = 0; i < line_idxs.size() - 1; i++) {
            int bline_idx = line_idxs.get(i);
            int eline_idx = line_idxs.get(i + 1);
            parseCasLine(aJCas, bline_idx, eline_idx);
        }

    }

    /**
     * Parses a single syntactic-ngram line located at
     * {@code [bline_offset, eline_offset)} in the document text. Expected
     * tab-separated columns: 0 = word; 1 = context; 2 = number of occurrences;
     * 3 = number of occurrences per year. Missing trailing columns are tolerated
     * (context defaults to empty, occurrences to 0).
     */
    void parseCasLine(JCas aJCas, int bline_offset, int eline_offset) {

        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        String line_text = cas_text.substring(bline_offset, eline_offset);
        LOG.trace("[{}] Trying to parse line '{}'.", cas_id, StringUtils.abbreviate(line_text, 50));
        if (line_text.trim().isEmpty()) {
            LOG.warn("[{}] Line is empty.", cas_id);
            return;
        }

        // tab_idxs holds, in cas-absolute coordinates, the position right after
        // each tab character, plus a sentinel one past the end of the line when
        // the line does not end with a tab. Column i then spans
        // [tab_idxs.get(i - 1), tab_idxs.get(i) - 1).
        List<Integer> tab_idxs = new ArrayList<Integer>();
        for (int index = 0; (index = line_text.indexOf('\t', index) + 1) > 0; tab_idxs.add(index + bline_offset))
            ;
        // BUGFIX: the stored indexes are cas-absolute (they include bline_offset),
        // so the end-of-line check must compare against bline_offset +
        // line_text.length(); the old line-relative comparison skipped the
        // sentinel for every line but the first. Also tolerate lines without any
        // tab character (previously get(-1) threw IndexOutOfBoundsException).
        if (tab_idxs.isEmpty() || tab_idxs.get(tab_idxs.size() - 1) < bline_offset + line_text.length())
            tab_idxs.add(bline_offset + line_text.length() + 1);

        LOG.debug("[{}] Found {} columns [{}]", cas_id, tab_idxs.size(),
                StringUtils.abbreviate(line_text, 50));

        // 0 = word; 1 = context; 2 = number of occurrences; 3 = number of occurrences per year

        // tab_idxs is guaranteed non-empty here, so column 0 always exists.
        String word = cas_text.substring(bline_offset, tab_idxs.get(0) - 1); // unused, traced only
        LOG.trace("[{}] Word '{}'.", cas_id, word);

        int context_begin_offset_in_cas = tab_idxs.size() >= 2 ? tab_idxs.get(0) : bline_offset,
                context_end_offset_in_cas = tab_idxs.size() >= 2 ? tab_idxs.get(1) - 1 : bline_offset;
        String context = tab_idxs.size() >= 2
                ? cas_text.substring(context_begin_offset_in_cas, context_end_offset_in_cas)
                : "";
        LOG.trace("[{}] Context '{}'.", cas_id, context);

        int num_occurrences = Integer
                .parseInt(tab_idxs.size() >= 3 ? cas_text.substring(tab_idxs.get(1), tab_idxs.get(2) - 1) : "0");
        LOG.trace("[{}] Number-of-occurrences '{}'.", cas_id, num_occurrences);

        String num_occurrences_per_year = tab_idxs.size() >= 4
                ? cas_text.substring(tab_idxs.get(2), tab_idxs.get(3) - 1)
                : ""; // unused, traced only
        LOG.trace("[{}] Number-of-occurrences-per-year '{}'.", cas_id, num_occurrences_per_year);

        parseContext(aJCas, context, cas_id, context_begin_offset_in_cas, context_end_offset_in_cas,
                num_occurrences);

    }

    /**
     * Parses the context column — a space-separated list of
     * {@code word/postag/deplabel/headindex} tuples — and creates the configured
     * annotations at cas-absolute offsets.
     *
     * @param context                     raw context column text
     * @param cas_id                      document id, used for logging only
     * @param context_begin_offset_in_cas begin of the context in the document text
     * @param context_end_offset_in_cas   end of the context in the document text
     * @param num_occurrences             ngram count from column 2; stored as
     *                                    repetition count ({@code count - 1})
     */
    void parseContext(JCas aJCas, String context, String cas_id, int context_begin_offset_in_cas,
            int context_end_offset_in_cas, int num_occurrences) throws IllegalStateException {
        // skip leading whitespaces, keeping the cas offset in sync
        for (int offset = 0; offset < context.length()
                && context.charAt(offset) == ' '; offset++, context_begin_offset_in_cas++)
            ;
        // NOTE(review): trim() also removes trailing whitespace, but
        // context_end_offset_in_cas is not shortened accordingly, so the sentence
        // annotation may include a trailing space/newline — pre-existing behavior.
        context = context.trim(); // remove white spaces at beginning and end
        if (context.isEmpty()) {
            LOG.warn("[{}] Context is empty.", cas_id);
            return;
        }

        // create a pseudo sentence covering the whole context
        if (_create_sentences) {
            RepeatedSentence s = new RepeatedSentence(aJCas, context_begin_offset_in_cas,
                    context_end_offset_in_cas);
            s.setRepetitionCount(num_occurrences - 1);
            s.addToIndexes();
        }
        //      with/IN/prep/6 //NNP/dep/3 ever/NNP/pobj/
        // word/postag/deplabel/headindex word/postag/deplabel/headindex ...
        // word/postag/deplabel/headindex contains no whitespace characters; head == governor
        // space_idxs holds the begin offset of every tuple within context: 0 plus
        // the position after each space, plus an end sentinel. Tuple i spans
        // [space_idxs.get(i), space_idxs.get(i + 1) - 1).
        List<Integer> space_idxs = new ArrayList<Integer>();
        space_idxs.add(0);
        for (int index = 0; (index = context.indexOf(' ', index) + 1) > 0; space_idxs.add(index))
            ;
        if (space_idxs.get(space_idxs.size() - 1) < context.length()) // if context doesn't end with a space character
            space_idxs.add(context.length() + 1);

        if (!_create_tokens)
            return;
        Token[] tokens = new Token[space_idxs.size() - 1];
        String[][] context_constituents = new String[space_idxs.size() - 1][];
        // first pass: create one Token + POS per tuple
        for (int i = 0; i < space_idxs.size() - 1; i++) {
            int btoken_idx = space_idxs.get(i);
            int etoken_idx = space_idxs.get(i + 1);

            String synt_token = context.substring(btoken_idx, etoken_idx - 1);
            LOG.trace("[{}] Syntactic token '{}'.", cas_id, synt_token);

            String[] splits = synt_token.split("/");
            LOG.trace("[{}] Syntactic token constituents '{}'.", cas_id, Arrays.asList(splits));

            // a tuple without at least word and postag is malformed; the resulting
            // exception is handled at process() level like any other parse failure
            String word = splits[0];
            String postag = splits[1];

            btoken_idx += context_begin_offset_in_cas; // switch to cas-absolute offsets

            Token token = new Token(aJCas, btoken_idx, btoken_idx + word.length());
            POS pos = new POS(aJCas, btoken_idx, btoken_idx + word.length());
            pos.setPosValue(postag);
            token.setPos(pos);
            pos.addToIndexes();
            token.addToIndexes();

            tokens[i] = token;
            context_constituents[i] = splits;
        }

        if (!_create_dependencies)
            return;
        // second pass: connect dependents to their governors
        for (int i = 0; i < tokens.length; i++) {
            Token dep = tokens[i];
            String[] splits = context_constituents[i];
            String deplabel = splits[2];

            // The head index is normally the 4th field, but a '/' inside the word
            // shifts the fields to the right; fall back to the last field then.
            // BUGFIX: guard the array length — splits[3] used to throw
            // ArrayIndexOutOfBoundsException (not NumberFormatException) on short
            // tuples, bypassing the fallback and crashing the error log itself.
            int headindex = 0;
            String head_field = splits.length > 3 ? splits[3] : splits[splits.length - 1];
            try {
                headindex = Integer.parseInt(head_field) - 1;
            } catch (NumberFormatException e) {
                try {
                    headindex = Integer.parseInt(splits[splits.length - 1]) - 1;
                } catch (NumberFormatException ee) {
                    LOG.error("Could not parse {} or {} as number.", head_field, splits[splits.length - 1]);
                }
            }
            if (headindex < 0) // skip root node pointing to nowhere
                continue;
            if (headindex >= tokens.length) { // BUGFIX: skip malformed head references instead of crashing
                LOG.warn("[{}] Head index {} out of range ({} tokens).", cas_id, headindex, tokens.length);
                continue;
            }
            Token gov = tokens[headindex];

            Dependency d = new Dependency(aJCas);
            d.setBegin(Math.min(gov.getBegin(), dep.getBegin()));
            d.setEnd(Math.max(gov.getEnd(), dep.getEnd()));
            d.setDependencyType(deplabel);
            d.setGovernor(gov);
            d.setDependent(dep);
            d.addToIndexes();
        }

    }
}