Java tutorial
/* * Copyright 2012 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.lt.n2n.annotators; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.zip.Deflater; import java.util.zip.GZIPOutputStream; import jobimtext.holing.extractor.JobimAnnotationExtractor; import jobimtext.holing.extractor.JobimExtractorConfiguration; import jobimtext.holing.type.JoBim; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasConsumer_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.tudarmstadt.lt.utilities.types.RepeatedSentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; /** * * @author Steffen Remus */ public class JoBimPrinter extends JCasConsumer_ImplBase { private static Logger LOG = LoggerFactory.getLogger(JoBimPrinter.class); public static final String PARAM_TARGET_PRINTSTREAM = "_printstream_as_string"; @ConfigurationParameter(name = PARAM_TARGET_PRINTSTREAM, mandatory = false) private String _printstream_as_string; private PrintStream _printstream; private boolean _prints_to_sys; public static final String PARAM_SORT_OUTPUT = "_sort_output"; @ConfigurationParameter(name = PARAM_SORT_OUTPUT, mandatory = false, defaultValue = { "false" }) private boolean _sort_output; public static final String PARAM_BASE_EXTRACTOR_CONFIGURATION_FILES = "_extractor_configs"; @ConfigurationParameter(name = PARAM_BASE_EXTRACTOR_CONFIGURATION_FILES, mandatory = true) private File[] _extractor_configs; private JobimAnnotationExtractor[] _extractors; public static final String PARAM_COVERING_ANNOTATION_TYPE = "_covering_annotation_type"; @ConfigurationParameter(name = PARAM_COVERING_ANNOTATION_TYPE, mandatory = false) private Class<? extends Annotation> _covering_annotation_type; public static final String PARAM_WRITE_OLDFORMAT = "_write_old_format"; @ConfigurationParameter(name = PARAM_WRITE_OLDFORMAT, mandatory = false, defaultValue = { "true" }) private boolean _write_old_format; public static final String PARAM_DETAILED_OUTPUT = "_detailed_output"; @ConfigurationParameter(name = PARAM_DETAILED_OUTPUT, mandatory = false, defaultValue = { "false" }) private boolean _detailed_output; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); _prints_to_sys = true; if (_printstream_as_string == null || "sysout".equals(_printstream_as_string) || "System.out".equals(_printstream_as_string) || "stdout".equals(_printstream_as_string)) _printstream = System.out; else if ("syserr".equals(_printstream_as_string) || "System.err".equals(_printstream_as_string) || "stderr".equals(_printstream_as_string)) _printstream = System.err; else try { // try to open and close the stream openPrintToFileStream(); _printstream.close(); } catch (IOException e) { throw new ResourceInitializationException(e); } try { _extractors = new JobimAnnotationExtractor[_extractor_configs.length]; for (int i = 0; i < _extractor_configs.length; i++) _extractors[i] = JobimExtractorConfiguration .getExtractorFromXmlFile(_extractor_configs[i].getAbsolutePath()); } catch (Exception e) { throw new ResourceInitializationException(e); } if (_covering_annotation_type == null) _covering_annotation_type = Sentence.class; } private void openPrintToFileStream() throws IOException { OutputStream os = new FileOutputStream(_printstream_as_string, true); if (_printstream_as_string.endsWith(".gz")) os = new GZIPOutputStream(os) { { def.setLevel(Deflater.BEST_COMPRESSION); } }; _printstream = new PrintStream(os); _printstream.flush(); _prints_to_sys = false; } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { if (!_prints_to_sys) try { openPrintToFileStream(); } catch (IOException e) { LOG.error(String.format("Could not open printstream %s", _printstream_as_string)); return; } if (_detailed_output) process_detailed(aJCas); else process_concise(aJCas); _printstream.flush(); if (!_prints_to_sys) _printstream.close(); } @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { _printstream.flush(); if (!_prints_to_sys) _printstream.close(); super.collectionProcessComplete(); } private void process_concise(JCas aJCas) { Collection<? extends Annotation> covering_annotations = JCasUtil.select(aJCas, _covering_annotation_type); for (Annotation covering_annotation : covering_annotations) { long src_id = aJCas.hashCode() + covering_annotation.getBegin(); int repetitions = 0; if (covering_annotation instanceof RepeatedSentence) repetitions = ((RepeatedSentence) covering_annotation).getRepetitionCount(); for (JoBim jb : getJoBims(covering_annotation, _sort_output)) { for (JobimAnnotationExtractor extractor : _extractors) { if (_write_old_format) _printstream.println( get_concise_string_old_format(jb, covering_annotation, extractor, repetitions)); else _printstream.println( get_concise_string(jb, covering_annotation, src_id, extractor, repetitions)); } } } } public static String get_concise_string(JoBim jb, Annotation covering_annotation, long src_id, JobimAnnotationExtractor extractor, int repetitions) { return String.format("%2$s%1$s %3$s%1$s %4$s%1$s %5$s%1$s %6$s%1$s %7$s".replace(" ", ""), // space characters are just for better overview extractor.getConfiguration().keyValuesDelimiter, // should be a tab extractor.extractKey(jb), extractor.extractValues(jb), String.valueOf(src_id), // docid / sentence id String.valueOf(jb.getKey().getBegin()), // offset String.valueOf(jb.getKey().getEnd()), // end-offset repetitions > 0 ? String.valueOf(repetitions + 1) : ""); } public static String get_concise_string_old_format(JoBim jb, Annotation covering_annotation, JobimAnnotationExtractor extractor, int repetitions) { return String.format("%2$s%1$s %3$s%1$s %4$s".replace(" ", ""), // space characters are just for better overview extractor.getConfiguration().keyValuesDelimiter, // should be a tab extractor.extractKey(jb), extractor.extractValues(jb), repetitions > 0 ? String.valueOf(repetitions + 1) : ""); } private void process_detailed(JCas aJCas) { _printstream.format("=== %s begin cas [%s] ===%n", getClass().getSimpleName(), StringUtils.abbreviate(aJCas.getDocumentText(), 30).replace("\n", "")); Collection<? extends Annotation> covering_annotations = JCasUtil.select(aJCas, _covering_annotation_type); for (Annotation covering_annotation : covering_annotations) { long src_id = aJCas.hashCode() + covering_annotation.getBegin(); int repetitions = 0; if (covering_annotation instanceof RepeatedSentence) repetitions = ((RepeatedSentence) covering_annotation).getRepetitionCount(); for (JoBim jb : getJoBims(covering_annotation, _sort_output)) { for (JobimAnnotationExtractor extractor : _extractors) { _printstream.format("- found JoBim covering '%s':%n", jb.getCoveredText()); _printstream.format(" - %-15s %s %n", "key=", extractor.extractKey(jb)); _printstream.format(" - %-15s %s %n", "values=", extractor.extractValues(jb)); _printstream.format(" - %-15s %s %n", "src_id=", String.valueOf(src_id)); _printstream.format(" - %-15s %s %n", "begin_idx=", String.valueOf(jb.getBegin())); _printstream.format(" - %-15s %s %n", "end_idx=", String.valueOf(jb.getEnd())); _printstream.format(" - %-15s %s %n", "repetitions=", String.valueOf(repetitions)); _printstream.format(" - %-15s %s %n", "occurrences=", String.valueOf(repetitions + 1)); _printstream.format(" - %-15s %s %n", "concise=", _write_old_format ? get_concise_string_old_format(jb, covering_annotation, extractor, repetitions) : get_concise_string(jb, covering_annotation, src_id, extractor, repetitions)); } } } _printstream.format("=== %s end cas ===%n%n", getClass().getSimpleName()); } public static Collection<JoBim> getJoBims(Annotation covering_annotation, boolean sort) { Collection<JoBim> jobims = JCasUtil.selectCovered(JoBim.class, covering_annotation); if (!sort) return jobims; List<JoBim> result = new ArrayList<JoBim>(jobims); Collections.sort(result, new Comparator<JoBim>() { @Override public int compare(JoBim o1, JoBim o2) { return o1.getBegin() - o2.getBegin(); } }); return result; } }