Java tutorial: creating UIMA annotations from pre-parsed text with PreParsedLineParser
/*
 * Copyright 2013
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.n2n.preparsed.annotators;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 * @author Steffen Remus
 */
public class PreParsedLineParser {

    private static Logger LOG = LoggerFactory.getLogger(PreParsedLineParser.class);

    public PreParsedLineParser() {
        // _posMappingProvider = new MappingProvider();
        // _posMappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/"
        //         + "core/api/lexmorph/tagset/${language}-${pos.tagset}-pos.map");
        // _posMappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName());
        // _posMappingProvider.setDefault("pos.tagset", "default");
        // _posMappingProvider.setOverride(MappingProvider.LOCATION, _posMappingLocation);
        // _posMappingProvider.setOverride(MappingProvider.LANGUAGE, _language);
        // _posMappingProvider.addImport("pos.tagset", modelProvider);
    }

    public void parse(JCas aJCas, boolean create_sentences, boolean create_tokens, boolean create_dependencies,
            boolean create_collapsed_dependencies, boolean create_constituents, boolean write_penntree)
            throws IllegalStateException {
        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        LOG.trace("[{}] Splitting text into lines '{}'.", cas_id, StringUtils.abbreviate(cas_text, 50));
        if (cas_text.trim().isEmpty()) {
            LOG.trace("[{}] Text is empty.", cas_id);
            return;
        }
        List<Integer> line_idxs = new ArrayList<Integer>();
        line_idxs.add(0);
        for (int index = 0; (index = cas_text.indexOf('\n', index) + 1) > 0; line_idxs.add(index))
            ;
        LOG.debug("[{}] Found {} newline characters -> {} lines [{}]", cas_id, line_idxs.size(),
                line_idxs.size() + 1, StringUtils.abbreviate(cas_text, 50));
        if (line_idxs.get(line_idxs.size() - 1) < cas_text.length()) // if cas doesn't end with a new line
            line_idxs.add(cas_text.length());
        for (int i = 0; i < line_idxs.size() - 1; i++) {
            int bline_idx = line_idxs.get(i);
            int eline_idx = line_idxs.get(i + 1);
            parseLine(aJCas, create_sentences, create_tokens, create_dependencies, create_collapsed_dependencies,
                    create_constituents, write_penntree, bline_idx, eline_idx);
        }
    }

    public void parseLine(JCas aJCas, boolean create_sentences, boolean create_tokens, boolean create_dependencies,
            boolean create_collapsed_dependencies, boolean create_constituents, boolean write_penntree,
            int bline_offset, int eline_offset) throws IllegalStateException {
        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        String line_text = cas_text.substring(bline_offset, eline_offset);
        LOG.trace("[{}] Trying to parse line '{}'.", cas_id, StringUtils.abbreviate(line_text, 50));
        if (line_text.trim().isEmpty()) {
            LOG.warn("[{}] Line is empty.", cas_id);
            return;
        }
        List<Integer> tab_idxs = new ArrayList<Integer>();
        for (int index = 0; (index = line_text.indexOf('\t', index) + 1) > 0; tab_idxs.add(index + bline_offset))
            ;
        if (tab_idxs.get(tab_idxs.size() - 1) < line_text.length()) // if line doesn't end with a tab character
            tab_idxs.add(line_text.length() + 1);
        LOG.debug("[{}] Found {} tab characters -> {} columns [{}]", cas_id, tab_idxs.size(), tab_idxs.size() + 1,
                StringUtils.abbreviate(line_text, 50));
        /*
         * 0 = sentence -- currently ignored
         * 1 = token/pos-tag (separated by ' ')
         * 2 = StanfordDependencies (separated by ;;;;;) -- currently ignored
         * 3 = collapsed dependencies with positions (separated by ';' or ',')
         * 4 = penn tree string
         * 5 = if existent should be empty
         */
        String text = cas_text.substring(bline_offset, tab_idxs.size() >= 1 ? tab_idxs.get(0) - 1 : eline_offset); // currently unused
        LOG.trace("[{}] Sentence text '{}'.", cas_id, text);
        int token_list_begin_offset_in_cas = tab_idxs.size() >= 2 ? tab_idxs.get(0) : bline_offset,
                token_list_end_offset_in_cas = tab_idxs.size() >= 2 ? tab_idxs.get(1) - 1 : bline_offset;
        String token_tag_list = tab_idxs.size() >= 2
                ? cas_text.substring(token_list_begin_offset_in_cas, token_list_end_offset_in_cas) : null;
        LOG.trace("[{}] Token+tag-list text '{}'.", cas_id, token_tag_list);
        String dependency_list = tab_idxs.size() >= 3 ? cas_text.substring(tab_idxs.get(1), tab_idxs.get(2) - 1) : null; // currently unused
        LOG.trace("[{}] Dependency-list text '{}'.", cas_id, dependency_list);
        String collapsed_dependency_list = tab_idxs.size() >= 4
                ? cas_text.substring(tab_idxs.get(2), tab_idxs.get(3) - 1) : null;
        LOG.trace("[{}] Collapsed dependency-list text '{}'.", cas_id, collapsed_dependency_list);
        String penn_tree_string = tab_idxs.size() >= 5
                ? cas_text.substring(tab_idxs.get(3), tab_idxs.get(4) - 1).trim() : null; // trim because sometimes there is a trailing tab character
        LOG.trace("[{}] Collapsed penn-tree-string '{}'.", cas_id, penn_tree_string);

        // create token annotations?
        if (create_tokens && StringUtils.isNotBlank(token_tag_list))
            parseTokens(aJCas, token_tag_list, cas_id, token_list_begin_offset_in_cas, token_list_end_offset_in_cas);

        // create collapsed dependency annotations?
        if (create_collapsed_dependencies && StringUtils.isNotBlank(collapsed_dependency_list))
            parseCollapsedDependencies(aJCas, collapsed_dependency_list, cas_id);

        // create penntree string annotation?
        if (write_penntree && StringUtils.isNotBlank(penn_tree_string))
            parsePennTree(aJCas, penn_tree_string, token_list_begin_offset_in_cas, token_list_end_offset_in_cas);

        // create sentence annotation?
        if (create_sentences && StringUtils.isNotBlank(token_tag_list))
            new Sentence(aJCas, token_list_begin_offset_in_cas, token_list_end_offset_in_cas).addToIndexes();

        // TODO: create constituents annotations
        // TODO: create stanford dependencies annotations
    }

    private void parseCollapsedDependencies(JCas aJCas, String collapsed_dependency_list, String cas_id)
            throws IllegalStateException {
        List<Token> tokens = new ArrayList<Token>(JCasUtil.select(aJCas, Token.class));
        Collections.sort(tokens, new Comparator<Token>() {
            @Override
            public int compare(Token o1, Token o2) {
                return o1.getBegin() - o2.getBegin();
            }
        });
        // expected shape per relation: <name>(<gov>-<i>;<dep>-<j>), with ';' or ',' between governor and
        // dependent and 1-based token indices
        for (String collapsed_dep : collapsed_dependency_list.split("\\);")) {
            LOG.trace("[{}] Found collapsed-dependency string '{}'.", cas_id, collapsed_dep);
            int _temp1 = collapsed_dep.indexOf("(");
            if (_temp1 < 0)
                throw new IllegalStateException(String
                        .format("Unexpected collapsed dependency relation: '%s' (%s).", collapsed_dep, cas_id));
            String relation_name = collapsed_dep.substring(0, _temp1).trim();
            int _temp2 = collapsed_dep.indexOf("-", _temp1); // skip until "-num"
            if (_temp2 < 0)
                throw new IllegalStateException(String
                        .format("Unexpected collapsed dependency relation: '%s' (%s).", collapsed_dep, cas_id));
            int _temp3 = collapsed_dep.indexOf(";", _temp2);
            if (_temp3 < 0) {
                _temp3 = collapsed_dep.indexOf(",", _temp2);
                if (_temp3 < 0)
                    throw new IllegalStateException(String
                            .format("Unexpected collapsed dependency relation: '%s' (%s).", collapsed_dep, cas_id));
            }
            String w1 = collapsed_dep.substring(_temp1 + 1, _temp3).trim();
            String w2 = collapsed_dep.substring(_temp2 + 1, collapsed_dep.length()).trim();
            LOG.trace("[{}] Found collapsed-dependency constituents rel='{}', gov='{}', dep='{}'.", cas_id,
                    relation_name, w1, w2);
            int i_w1, i_w2;
            try {
                i_w1 = Integer.parseInt(w1.substring(w1.lastIndexOf('-') + 1)) - 1;
                i_w2 = Integer.parseInt(w2.substring(w2.lastIndexOf('-') + 1)) - 1;
            } catch (NumberFormatException e) {
                throw new IllegalStateException(String.format(
                        "Unexpected collapsed dependency relation: '%s' (%s). Could not parse token index.",
                        collapsed_dep, cas_id));
            }
            if (i_w1 < 0 || i_w2 < 0) {
                LOG.warn("[{}] Found a dependency constituent with index <= 0 in relation '{}'.", cas_id, collapsed_dep);
                continue;
            }
            if (i_w1 >= tokens.size() || i_w2 >= tokens.size()) {
                LOG.warn("[{}] Found a dependency constituent with index > number of available tokens in relation '{}'. ({} or {} >= {})",
                        cas_id, collapsed_dep, i_w1, i_w2, tokens.size());
                continue;
            }
            Token gov = tokens.get(i_w1);
            Token dep = tokens.get(i_w2);
            LOG.trace("[{}] Assuming following tokens for collapsed-dependency constituents gov='{}', dep='{}'.",
                    cas_id, gov.getCoveredText(), dep.getCoveredText());
            // TODO: add the correct dependency type here
            Dependency d = new Dependency(aJCas);
            d.setDependencyType(relation_name);
            d.setGovernor(gov);
            d.setDependent(dep);
            d.setBegin(Math.min(gov.getBegin(), dep.getBegin()));
            d.setEnd(Math.max(gov.getEnd(), dep.getEnd()));
            d.addToIndexes();
        }
    }

    private void parsePennTree(JCas aJCas, String penn_tree_string, int begin_annotation, int end_annotation) {
        PennTree pt = new PennTree(aJCas, begin_annotation, end_annotation);
        pt.setPennTree(penn_tree_string);
        pt.addToIndexes();
    }

    private void parseTokens(JCas aJCas, String token_tag_list, String cas_id, int token_list_begin_offset_in_cas,
            int token_list_end_offset_in_cas) throws IllegalStateException {
        // skip whitespaces
        for (int offset = 0; offset < token_tag_list.length()
                && token_tag_list.charAt(offset) == ' '; offset++, token_list_begin_offset_in_cas++)
            ;
        token_tag_list = token_tag_list.trim(); // remove whitespace at beginning and end
        for (int index = -1; index < token_tag_list.length();) {
            // extract token+tag string
            int token_tag_begin = index + 1;
            index = token_tag_list.indexOf('/', token_tag_begin); // advance index to pos type
            index = token_tag_list.indexOf(' ', index + 1); // advance index to next token/pos pair
            index = index > 0 ? index : token_tag_list.length();
            int token_tag_end = index;
            String token_tag = token_tag_list.substring(token_tag_begin, token_tag_end);
            LOG.trace("[{}] Found token+tag string '{}'.", cas_id, token_tag);
            // extract tokens and tags individually
            int slash_idx = token_tag.lastIndexOf('/');
            if (slash_idx <= 0)
                throw new IllegalStateException(
                        String.format("Unexpected token/pos pair: '%s' (%s).", token_tag, cas_id));
            String tag_str = token_tag.substring(slash_idx + 1, token_tag.length());
            LOG.trace("[{}] Found token='{}' + tag='{}'.", cas_id, token_tag.substring(0, slash_idx), tag_str);
            Token t = new Token(aJCas,
                    token_list_begin_offset_in_cas + token_tag_begin, // begin of token in cas text
                    token_list_begin_offset_in_cas + token_tag_begin + slash_idx // end of token in cas text
            );
            assert (t.getBegin() >= token_list_begin_offset_in_cas) : "Token begin must be in token+tag-list area!";
            assert (t.getEnd() <= token_list_end_offset_in_cas) : "Token end must be in token+tag-list area!";
            // TODO: add the correct pos type here
            POS pos = new POS(aJCas, t.getBegin(), t.getEnd());
            pos.setPosValue(tag_str);
            pos.addToIndexes();
            t.setPos(pos);
            t.addToIndexes();
        }
    }
}
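Because PreParsedLineParser is a plain class rather than a UIMA analysis engine, it can be exercised directly once a CAS carrying a pre-parsed line exists. Below is a minimal usage sketch, assuming the uimaFIT and DKPro Core type-system artifacts are on the classpath; the class name PreParsedLineParserExample, the sample sentence with its POS tags, and the document id are made up for illustration. The example line only fills the sentence and token/POS columns, so the dependency, constituent, and Penn-tree flags stay off.

package de.tudarmstadt.lt.n2n.preparsed.annotators;

import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

public class PreParsedLineParserExample {

    public static void main(String[] args) throws Exception {
        // Hypothetical pre-parsed line: column 0 is the raw sentence, column 1 holds
        // token/POS pairs separated by single spaces (the layout documented in parseLine()).
        String line = "John loves Mary .\tJohn/NNP loves/VBZ Mary/NNP ./.";

        JCas jcas = JCasFactory.createText(line);              // CAS holding the raw line
        DocumentMetaData meta = DocumentMetaData.create(jcas); // parse() reads the document id from here
        meta.setDocumentId("example-doc");                     // made-up id, used only in log output

        // Create Sentence, Token, and POS annotations only; the dependency, constituent,
        // and Penn-tree columns are absent from this line, so those flags remain false.
        new PreParsedLineParser().parse(jcas, true, true, false, false, false, false);

        // Print each token together with the POS value attached to it.
        for (Token t : JCasUtil.select(jcas, Token.class))
            System.out.println(t.getCoveredText() + "\t" + t.getPos().getPosValue());
    }
}

Running the sketch should print each token with its POS value. In a real pipeline the CAS would instead be populated from the pre-parsed corpus files, with the remaining columns present, and handed to parse() with the corresponding flags enabled.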