de.tudarmstadt.lt.n2n.annotators.JoBimPrinter.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.annotators.JoBimPrinter.java

Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.annotators;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.zip.Deflater;
import java.util.zip.GZIPOutputStream;

import jobimtext.holing.extractor.JobimAnnotationExtractor;
import jobimtext.holing.extractor.JobimExtractorConfiguration;
import jobimtext.holing.type.JoBim;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.utilities.types.RepeatedSentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;

/**
 *
 * @author Steffen Remus
 */
public class JoBimPrinter extends JCasConsumer_ImplBase {

    private static Logger LOG = LoggerFactory.getLogger(JoBimPrinter.class);

    public static final String PARAM_TARGET_PRINTSTREAM = "_printstream_as_string";
    @ConfigurationParameter(name = PARAM_TARGET_PRINTSTREAM, mandatory = false)
    private String _printstream_as_string;
    private PrintStream _printstream;
    private boolean _prints_to_sys;

    public static final String PARAM_SORT_OUTPUT = "_sort_output";
    @ConfigurationParameter(name = PARAM_SORT_OUTPUT, mandatory = false, defaultValue = { "false" })
    private boolean _sort_output;

    public static final String PARAM_BASE_EXTRACTOR_CONFIGURATION_FILES = "_extractor_configs";
    @ConfigurationParameter(name = PARAM_BASE_EXTRACTOR_CONFIGURATION_FILES, mandatory = true)
    private File[] _extractor_configs;
    private JobimAnnotationExtractor[] _extractors;

    public static final String PARAM_COVERING_ANNOTATION_TYPE = "_covering_annotation_type";
    @ConfigurationParameter(name = PARAM_COVERING_ANNOTATION_TYPE, mandatory = false)
    private Class<? extends Annotation> _covering_annotation_type;

    public static final String PARAM_WRITE_OLDFORMAT = "_write_old_format";
    @ConfigurationParameter(name = PARAM_WRITE_OLDFORMAT, mandatory = false, defaultValue = { "true" })
    private boolean _write_old_format;

    public static final String PARAM_DETAILED_OUTPUT = "_detailed_output";
    @ConfigurationParameter(name = PARAM_DETAILED_OUTPUT, mandatory = false, defaultValue = { "false" })
    private boolean _detailed_output;

    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        _prints_to_sys = true;
        if (_printstream_as_string == null || "sysout".equals(_printstream_as_string)
                || "System.out".equals(_printstream_as_string) || "stdout".equals(_printstream_as_string))
            _printstream = System.out;
        else if ("syserr".equals(_printstream_as_string) || "System.err".equals(_printstream_as_string)
                || "stderr".equals(_printstream_as_string))
            _printstream = System.err;
        else
            try {
                // try to open and close the stream
                openPrintToFileStream();
                _printstream.close();
            } catch (IOException e) {
                throw new ResourceInitializationException(e);
            }

        try {
            _extractors = new JobimAnnotationExtractor[_extractor_configs.length];
            for (int i = 0; i < _extractor_configs.length; i++)
                _extractors[i] = JobimExtractorConfiguration
                        .getExtractorFromXmlFile(_extractor_configs[i].getAbsolutePath());
        } catch (Exception e) {
            throw new ResourceInitializationException(e);
        }
        if (_covering_annotation_type == null)
            _covering_annotation_type = Sentence.class;

    }

    private void openPrintToFileStream() throws IOException {
        OutputStream os = new FileOutputStream(_printstream_as_string, true);
        if (_printstream_as_string.endsWith(".gz"))
            os = new GZIPOutputStream(os) {
                {
                    def.setLevel(Deflater.BEST_COMPRESSION);
                }
            };
        _printstream = new PrintStream(os);
        _printstream.flush();
        _prints_to_sys = false;
    }

    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        if (!_prints_to_sys)
            try {
                openPrintToFileStream();
            } catch (IOException e) {
                LOG.error(String.format("Could not open printstream %s", _printstream_as_string));
                return;
            }
        if (_detailed_output)
            process_detailed(aJCas);
        else
            process_concise(aJCas);
        _printstream.flush();
        if (!_prints_to_sys)
            _printstream.close();
    }

    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        _printstream.flush();
        if (!_prints_to_sys)
            _printstream.close();
        super.collectionProcessComplete();
    }

    private void process_concise(JCas aJCas) {
        Collection<? extends Annotation> covering_annotations = JCasUtil.select(aJCas, _covering_annotation_type);
        for (Annotation covering_annotation : covering_annotations) {
            long src_id = aJCas.hashCode() + covering_annotation.getBegin();
            int repetitions = 0;
            if (covering_annotation instanceof RepeatedSentence)
                repetitions = ((RepeatedSentence) covering_annotation).getRepetitionCount();
            for (JoBim jb : getJoBims(covering_annotation, _sort_output)) {
                for (JobimAnnotationExtractor extractor : _extractors) {
                    if (_write_old_format)
                        _printstream.println(
                                get_concise_string_old_format(jb, covering_annotation, extractor, repetitions));
                    else
                        _printstream.println(
                                get_concise_string(jb, covering_annotation, src_id, extractor, repetitions));
                }
            }
        }
    }

    public static String get_concise_string(JoBim jb, Annotation covering_annotation, long src_id,
            JobimAnnotationExtractor extractor, int repetitions) {
        return String.format("%2$s%1$s %3$s%1$s %4$s%1$s %5$s%1$s %6$s%1$s %7$s".replace(" ", ""), // space characters are just for better overview
                extractor.getConfiguration().keyValuesDelimiter, // should be a tab
                extractor.extractKey(jb), extractor.extractValues(jb), String.valueOf(src_id), // docid / sentence id
                String.valueOf(jb.getKey().getBegin()), // offset
                String.valueOf(jb.getKey().getEnd()), // end-offset
                repetitions > 0 ? String.valueOf(repetitions + 1) : "");
    }

    public static String get_concise_string_old_format(JoBim jb, Annotation covering_annotation,
            JobimAnnotationExtractor extractor, int repetitions) {
        return String.format("%2$s%1$s %3$s%1$s %4$s".replace(" ", ""), // space characters are just for better overview
                extractor.getConfiguration().keyValuesDelimiter, // should be a tab
                extractor.extractKey(jb), extractor.extractValues(jb),
                repetitions > 0 ? String.valueOf(repetitions + 1) : "");
    }

    private void process_detailed(JCas aJCas) {
        _printstream.format("=== %s begin cas [%s] ===%n", getClass().getSimpleName(),
                StringUtils.abbreviate(aJCas.getDocumentText(), 30).replace("\n", ""));

        Collection<? extends Annotation> covering_annotations = JCasUtil.select(aJCas, _covering_annotation_type);
        for (Annotation covering_annotation : covering_annotations) {
            long src_id = aJCas.hashCode() + covering_annotation.getBegin();
            int repetitions = 0;
            if (covering_annotation instanceof RepeatedSentence)
                repetitions = ((RepeatedSentence) covering_annotation).getRepetitionCount();
            for (JoBim jb : getJoBims(covering_annotation, _sort_output)) {
                for (JobimAnnotationExtractor extractor : _extractors) {
                    _printstream.format("- found JoBim covering '%s':%n", jb.getCoveredText());
                    _printstream.format("   - %-15s %s %n", "key=", extractor.extractKey(jb));
                    _printstream.format("   - %-15s %s %n", "values=", extractor.extractValues(jb));
                    _printstream.format("   - %-15s %s %n", "src_id=", String.valueOf(src_id));
                    _printstream.format("   - %-15s %s %n", "begin_idx=", String.valueOf(jb.getBegin()));
                    _printstream.format("   - %-15s %s %n", "end_idx=", String.valueOf(jb.getEnd()));
                    _printstream.format("   - %-15s %s %n", "repetitions=", String.valueOf(repetitions));
                    _printstream.format("   - %-15s %s %n", "occurrences=", String.valueOf(repetitions + 1));
                    _printstream.format("   - %-15s %s %n", "concise=",
                            _write_old_format
                                    ? get_concise_string_old_format(jb, covering_annotation, extractor, repetitions)
                                    : get_concise_string(jb, covering_annotation, src_id, extractor, repetitions));
                }
            }
        }
        _printstream.format("=== %s end cas ===%n%n", getClass().getSimpleName());
    }

    public static Collection<JoBim> getJoBims(Annotation covering_annotation, boolean sort) {
        Collection<JoBim> jobims = JCasUtil.selectCovered(JoBim.class, covering_annotation);
        if (!sort)
            return jobims;
        List<JoBim> result = new ArrayList<JoBim>(jobims);

        Collections.sort(result, new Comparator<JoBim>() {
            @Override
            public int compare(JoBim o1, JoBim o2) {
                return o1.getBegin() - o2.getBegin();
            }
        });

        return result;
    }

}