Java tutorial
/* * Copyright 2012 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.lt.n2n.io; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.tudarmstadt.lt.n2n.types.Entity; import de.tudarmstadt.lt.n2n.types.Relation; import de.tudarmstadt.lt.utilities.IOUtils.CountingLineIterator; import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; /** * * @author Steffen Remus * */ public class SemEvalReader extends ResourceCollectionReaderBase { private final static Logger LOG = LoggerFactory.getLogger(SemEvalReader.class); /** * */ public static final String PARAM_CREATE_RELATION_ANNOTATIONS = "_create_relation_annos"; @ConfigurationParameter(name = PARAM_CREATE_RELATION_ANNOTATIONS, mandatory = true) private boolean _create_relation_annos; public static final String PARAM_CREATE_SENTENCE_ANNOTATIONS = "_create_sentence_annos"; @ConfigurationParameter(name = PARAM_CREATE_SENTENCE_ANNOTATIONS, mandatory = true) private boolean _create_sentence_annos; private Resource _current_resource; private CountingLineIterator _line_iter; private String _last_line; private Pattern _semeval_sentence_pattern = Pattern.compile("^\\d+\\s\".*<e1>.*</e1>.*<e2>.*</e2>.*\"$"); @Override public void initialize(UimaContext ctx) throws ResourceInitializationException { super.initialize(ctx); if (!_create_sentence_annos && _create_relation_annos) { LOG.info( "PARAM_CREATE_SENTENCE_ANNOTATIONS is set to true since relation annotations depend on sentence annotations"); _create_sentence_annos = _create_relation_annos || _create_sentence_annos; } } @Override public void getNext(CAS aCAS) throws IOException, CollectionException { if (_line_iter == null) { _current_resource = nextFile(); LOG.info("reading resource: [{}]", _current_resource.getResolvedUri()); _line_iter = new CountingLineIterator( new BufferedReader(new InputStreamReader(_current_resource.getInputStream(), "UTF-8"))); if (!_line_iter.hasNext()) { _line_iter.close(); _line_iter = null; return; } _last_line = _line_iter.next(); } try { parse(aCAS); } catch (CASException e) { throw new IOException(e); } finally { prepare_next(); } } private void prepare_next() { if (!_line_iter.hasNext()) { _line_iter.close(); _line_iter = null; } } public void parse(CAS aCas) throws CASException { String line = _last_line; while (!_semeval_sentence_pattern.matcher(line).matches()) { LOG.warn( "Error while parsing line {}: '{}'. Line format does not match the desired semeval sentence input format. Skipping line.", _line_iter.getCurentLineNumber(), line); if (!_line_iter.hasNext()) return; line = _line_iter.next(); } // new sentence == new cas initCas(aCas, _current_resource, String.valueOf(_line_iter.getCurentLineNumber())); JCas aJCas = aCas.getJCas(); LOG.trace("parsing line {}: '{}', treating as new sentence.", _line_iter.getCurentLineNumber(), StringUtils.abbreviate(line, 35)); Relation relation = new Relation(aJCas); try { int s_begin = line.indexOf("\""); int sentence_id = Integer.parseInt(line.substring(0, s_begin).trim()); s_begin += 1; int e1_begin = line.indexOf("<e1>") - s_begin; int e1_end = line.indexOf("</e1>") - s_begin - 4; int e2_begin = line.indexOf("<e2>") - s_begin - 4 - 5; int e2_end = line.indexOf("</e2>") - s_begin - 4 - 5 - 4; int s_end = line.lastIndexOf("\""); if (e1_begin < 0 || e1_end < 0 || e2_begin < 0 || e2_end < 0) throw new IllegalStateException( "Could not find occurrences of either <e1>, </e1>, <e2>, </e2> or \"."); String plaintext_sentence = line.substring(s_begin, s_end).replace("<e1>", "").replace("</e1>", "") .replace("<e2>", "").replace("</e2>", ""); Sentence sentence_annotation = null; if (_create_sentence_annos) { sentence_annotation = new Sentence(aJCas, 0, plaintext_sentence.length()); sentence_annotation.addToIndexes(); } if (_create_relation_annos) { Entity e1 = new Entity(aJCas, e1_begin, e1_end); e1.addToIndexes(); Entity e2 = new Entity(aJCas, e2_begin, e2_end); e2.addToIndexes(); relation.setE1(e1); relation.setE2(e2); relation.setCoveringSentence(sentence_annotation); relation.setBegin(0); relation.setEnd(plaintext_sentence.length()); relation.setId(sentence_id); relation.addToIndexes(); relation.setInfo(""); relation.setComment(""); } aJCas.setDocumentText(plaintext_sentence); } catch (Exception e) { LOG.warn("Error while parsing line {}: '{}'\n\t{}: {}", _line_iter.getCurentLineNumber(), line, e.getClass().getName(), e.getMessage()); } if (!_line_iter.hasNext()) return; line = _line_iter.next(); // read infos and comments while (!line.isEmpty() && !_semeval_sentence_pattern.matcher(line).matches()) { if (relation.getInfo() == null) { LOG.trace("parsing line {}: '{}', treating as info from sentence before.", _line_iter.getCurentLineNumber(), StringUtils.abbreviate(line, 35)); relation.setInfo(line); } else { LOG.trace("parsing line {}: '{}', treating as comment from sentence before.", _line_iter.getCurentLineNumber(), StringUtils.abbreviate(line, 35)); relation.setComment(relation.getComment() + line + "\n"); } if (!_line_iter.hasNext()) return; line = _line_iter.next(); } // skip empty lines while (line.isEmpty() && _line_iter.hasNext()) line = _line_iter.next(); _last_line = line; } @Override public boolean hasNext() throws IOException, CollectionException { return _line_iter == null ? super.hasNext() : (_line_iter.hasNext() || super.hasNext()); } }