de.tudarmstadt.lt.n2n.io.SemEvalReader.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.io.SemEvalReader.java

Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.n2n.types.Entity;
import de.tudarmstadt.lt.n2n.types.Relation;
import de.tudarmstadt.lt.utilities.IOUtils.CountingLineIterator;
import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;

/**
 * Collection reader that turns every sentence line of a SemEval-style relation file into one CAS.
 * <p>
 * A <em>sentence line</em> is a line of the form
 * {@code <id> "text with <e1>first entity</e1> and <e2>second entity</e2>"} (see
 * {@link #SEMEVAL_SENTENCE_PATTERN}). The non-empty lines that directly follow a sentence line and are
 * not sentence lines themselves are attached to the sentence's {@link Relation}: the first one as
 * <em>info</em>, all further ones as <em>comment</em>. Empty lines between sentences are skipped.
 * (Presumably this is the SemEval-2010 Task 8 data layout - confirm against the data you feed in.)
 * <p>
 * The reader always reads one line ahead; the look-ahead line is kept in {@link #_last_line} until the
 * next call of {@link #getNext(CAS)} consumes it.
 * 
 * @author Steffen Remus
 * 
 */
public class SemEvalReader extends ResourceCollectionReaderBase {

    private final static Logger LOG = LoggerFactory.getLogger(SemEvalReader.class);

    /**
     * Name of the parameter that switches the creation of {@link Entity} and {@link Relation} annotations
     * on or off. Enabling it implicitly enables sentence annotations, see {@link #initialize(UimaContext)}.
     */
    public static final String PARAM_CREATE_RELATION_ANNOTATIONS = "_create_relation_annos";
    @ConfigurationParameter(name = PARAM_CREATE_RELATION_ANNOTATIONS, mandatory = true)
    private boolean _create_relation_annos;

    /**
     * Name of the parameter that switches the creation of one {@link Sentence} annotation spanning the
     * whole document text on or off.
     */
    public static final String PARAM_CREATE_SENTENCE_ANNOTATIONS = "_create_sentence_annos";
    @ConfigurationParameter(name = PARAM_CREATE_SENTENCE_ANNOTATIONS, mandatory = true)
    private boolean _create_sentence_annos;

    /** Matches a sentence line: numeric id, whitespace, quoted text containing e1 followed by e2. */
    private static final Pattern SEMEVAL_SENTENCE_PATTERN = Pattern
            .compile("^\\d+\\s\".*<e1>.*</e1>.*<e2>.*</e2>.*\"$");

    /** Resource the current line iterator reads from. */
    private Resource _current_resource;
    /** Line iterator over the current resource; {@code null} while no resource is open. */
    private CountingLineIterator _line_iter;
    /** Look-ahead line that was read but not yet processed; {@code null} if there is none. */
    private String _last_line;

    /**
     * Initializes the reader and forces sentence annotations on if relation annotations were requested,
     * because every relation references its covering sentence.
     *
     * @param ctx
     *            the UIMA context passed on to the super class
     * @throws ResourceInitializationException
     *             if the super class fails to initialize
     */
    @Override
    public void initialize(UimaContext ctx) throws ResourceInitializationException {
        super.initialize(ctx);
        if (!_create_sentence_annos && _create_relation_annos) {
            LOG.info(
                    "PARAM_CREATE_SENTENCE_ANNOTATIONS is set to true since relation annotations depend on sentence annotations");
            _create_sentence_annos = true;
        }
    }

    /**
     * Fills the given CAS with the next sentence. Opens the next resource if none is currently open.
     * If the opened resource is empty, or no further sentence line can be found in it, the CAS is
     * returned without having been initialized.
     *
     * @param aCAS
     *            the CAS to fill
     * @throws IOException
     *             if the resource cannot be read or the JCas view cannot be obtained
     * @throws CollectionException
     *             declared by the overridden method
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        if (_line_iter == null) {
            _current_resource = nextFile();
            LOG.info("reading resource: [{}]", _current_resource.getResolvedUri());
            _line_iter = new CountingLineIterator(
                    new BufferedReader(new InputStreamReader(_current_resource.getInputStream(), "UTF-8")));
            if (!_line_iter.hasNext()) {
                // empty resource: nothing to parse, release it right away
                _line_iter.close();
                _line_iter = null;
                _last_line = null;
                return;
            }
            _last_line = _line_iter.next();
        }

        try {
            parse(aCAS);
        } catch (CASException e) {
            throw new IOException(e);
        } finally {
            prepare_next();
        }
    }

    /**
     * Closes the current resource once it is fully consumed, i.e. the iterator has no more lines
     * <em>and</em> no sentence line is waiting in the look-ahead buffer. Keeping the exhausted iterator
     * while a sentence is pending makes sure the last sentence of a file is not dropped.
     */
    private void prepare_next() {
        if (_line_iter == null) {
            return;
        }
        if (_line_iter.hasNext() || has_pending_sentence()) {
            return;
        }
        _line_iter.close();
        _line_iter = null;
        _last_line = null;
    }

    /**
     * @return {@code true} if the look-ahead buffer holds a line that matches the sentence format
     */
    private boolean has_pending_sentence() {
        return _last_line != null && SEMEVAL_SENTENCE_PATTERN.matcher(_last_line).matches();
    }

    /**
     * Parses the next sentence, starting with the buffered look-ahead line, into the given CAS.
     * <p>
     * Lines that do not match the sentence format are skipped with a warning until a sentence line is
     * found. The entity tags are stripped from the sentence, the plain text becomes the document text and,
     * depending on the configuration, {@link Sentence}, {@link Entity} and {@link Relation} annotations
     * are created. Afterwards the info / comment lines that belong to the sentence are consumed and the
     * first line of the following record is stored as the new look-ahead line.
     * <p>
     * Errors while building the annotations are logged and do not abort reading; in that case the
     * document text may not have been set.
     *
     * @param aCas
     *            the CAS to fill
     * @throws CASException
     *             if the JCas view of the CAS cannot be obtained
     */
    public void parse(CAS aCas) throws CASException {
        // take the look-ahead line out of the buffer; from now on it counts as consumed
        String line = _last_line;
        _last_line = null;
        if (line == null) {
            if (_line_iter == null || !_line_iter.hasNext()) {
                return;
            }
            line = _line_iter.next();
        }

        while (!SEMEVAL_SENTENCE_PATTERN.matcher(line).matches()) {
            LOG.warn(
                    "Error while parsing line {}: '{}'. Line format does not match the desired semeval sentence input format. Skipping line.",
                    _line_iter.getCurentLineNumber(), line);
            if (!_line_iter.hasNext()) {
                return;
            }
            line = _line_iter.next();
        }

        // new sentence == new cas
        initCas(aCas, _current_resource, String.valueOf(_line_iter.getCurentLineNumber()));
        JCas aJCas = aCas.getJCas();

        LOG.trace("parsing line {}: '{}', treating as new sentence.", _line_iter.getCurentLineNumber(),
                StringUtils.abbreviate(line, 35));
        Relation relation = new Relation(aJCas);
        try {
            int s_begin = line.indexOf("\"");
            int sentence_id = Integer.parseInt(line.substring(0, s_begin).trim());
            s_begin += 1; // first character after the opening quote

            // Offsets are relative to the plain text, hence the lengths of all tags that precede a
            // position are subtracted: "<e1>" = 4, "</e1>" = 5, "<e2>" = 4. The sentence pattern
            // guarantees that an e1 element is followed by an e2 element.
            int e1_begin = line.indexOf("<e1>") - s_begin;
            int e1_end = line.indexOf("</e1>") - s_begin - 4;

            int e2_begin = line.indexOf("<e2>") - s_begin - 4 - 5;
            int e2_end = line.indexOf("</e2>") - s_begin - 4 - 5 - 4;

            int s_end = line.lastIndexOf("\"");
            if (e1_begin < 0 || e1_end < 0 || e2_begin < 0 || e2_end < 0) {
                throw new IllegalStateException(
                        "Could not find occurrences of either <e1>, </e1>, <e2>, </e2> or \".");
            }

            String plaintext_sentence = line.substring(s_begin, s_end).replace("<e1>", "").replace("</e1>", "")
                    .replace("<e2>", "").replace("</e2>", "");
            Sentence sentence_annotation = null;
            if (_create_sentence_annos) {
                sentence_annotation = new Sentence(aJCas, 0, plaintext_sentence.length());
                sentence_annotation.addToIndexes();
            }
            if (_create_relation_annos) {
                Entity e1 = new Entity(aJCas, e1_begin, e1_end);
                e1.addToIndexes();

                Entity e2 = new Entity(aJCas, e2_begin, e2_end);
                e2.addToIndexes();

                relation.setE1(e1);
                relation.setE2(e2);
                relation.setCoveringSentence(sentence_annotation);
                relation.setBegin(0);
                relation.setEnd(plaintext_sentence.length());
                relation.setId(sentence_id);
                relation.addToIndexes();
                // defaults, overwritten below if info / comment lines follow the sentence
                relation.setInfo("");
                relation.setComment("");
            }

            aJCas.setDocumentText(plaintext_sentence);

        } catch (Exception e) {
            // trailing throwable: the message stays as it was, the stack trace is no longer lost
            LOG.warn("Error while parsing line {}: '{}'\n\t{}: {}", _line_iter.getCurentLineNumber(), line,
                    e.getClass().getName(), e.getMessage(), e);
        }

        if (!_line_iter.hasNext()) {
            return;
        }
        line = _line_iter.next();

        // read infos and comments; tracked locally because the relation's info / comment features are
        // pre-set to "" (or are still null), so they cannot tell whether an info line was already seen
        boolean info_seen = false;
        StringBuilder comment = new StringBuilder();
        while (!line.isEmpty() && !SEMEVAL_SENTENCE_PATTERN.matcher(line).matches()) {
            if (!info_seen) {
                LOG.trace("parsing line {}: '{}', treating as info from sentence before.",
                        _line_iter.getCurentLineNumber(), StringUtils.abbreviate(line, 35));
                relation.setInfo(line);
                info_seen = true;
            } else {
                LOG.trace("parsing line {}: '{}', treating as comment from sentence before.",
                        _line_iter.getCurentLineNumber(), StringUtils.abbreviate(line, 35));
                comment.append(line).append("\n");
                relation.setComment(comment.toString());
            }
            if (!_line_iter.hasNext()) {
                return;
            }
            line = _line_iter.next();
        }

        // skip empty lines
        while (line.isEmpty() && _line_iter.hasNext()) {
            line = _line_iter.next();
        }

        // remember the look-ahead line for the next call
        _last_line = line;
    }

    /**
     * @return {@code true} if a sentence line is buffered, the current resource has more lines, or the
     *         super class reports further resources
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        if (_line_iter != null && (has_pending_sentence() || _line_iter.hasNext())) {
            return true;
        }
        return super.hasNext();
    }

}