// Java tutorial
/*
 * Copyright 2012
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.n2n.preparsed.annotators;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.utilities.types.RepeatedSentence;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 * Annotates a CAS whose document text consists of Google syntactic-ngram records,
 * one record per line. Each line holds tab-separated columns
 * (word, context, number-of-occurrences, number-of-occurrences-per-year; the last
 * two columns and the word are optional for annotation purposes), and the context
 * column is a space-separated list of {@code word/postag/deplabel/headindex}
 * tokens. Depending on configuration, the annotator creates
 * {@link RepeatedSentence}, {@link Token}/{@link POS} and {@link Dependency}
 * annotations over the context span.
 *
 * @author Steffen Remus
 **/
public class GoogleSyntacticNgramsAnnotator extends JCasAnnotator_ImplBase {

    private final static Logger LOG = LoggerFactory.getLogger(GoogleSyntacticNgramsAnnotator.class);

    /** Create a {@link RepeatedSentence} annotation per parsed context (default: true). */
    public static final String PARAM_CREATE_SENTENCE_ANNOTATIONS = "_create_sentences";
    @ConfigurationParameter(name = PARAM_CREATE_SENTENCE_ANNOTATIONS, mandatory = true, defaultValue = { "true" })
    private boolean _create_sentences;

    /** Create {@link Token} and {@link POS} annotations per context token (default: true). */
    public static final String PARAM_CREATE_TOKEN_ANNOTATIONS = "_create_tokens";
    @ConfigurationParameter(name = PARAM_CREATE_TOKEN_ANNOTATIONS, mandatory = true, defaultValue = { "true" })
    private boolean _create_tokens;

    /** Create {@link Dependency} annotations between context tokens (default: true). */
    public static final String PARAM_CREATE_DEPENDENCY_ANNOTATIONS = "_create_dependencies";
    @ConfigurationParameter(name = PARAM_CREATE_DEPENDENCY_ANNOTATIONS, mandatory = true, defaultValue = { "true" })
    private boolean _create_dependencies;

    /** If true, rethrow parsing failures instead of only logging a warning (default: false). */
    public static final String PARAM_FAIL_EARLY = "_fail_early";
    @ConfigurationParameter(name = PARAM_FAIL_EARLY, mandatory = false, defaultValue = { "false" })
    private boolean _fail_early;

    /**
     * Parses the CAS text; any exception is logged as a warning and, if
     * {@link #PARAM_FAIL_EARLY} is set, rethrown wrapped in an
     * {@link AnalysisEngineProcessException}.
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        try {
            parseCasLines(aJCas);
        } catch (Exception e) {
            LOG.warn("Could not process cas {}, text: '{}' ({}: {})", DocumentMetaData.get(aJCas).getDocumentId(),
                    StringUtils.abbreviate(aJCas.getDocumentText(), 50), e.getClass().getSimpleName(), e.getMessage());
            if (_fail_early)
                throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Splits the document text at newline characters and parses each line as one
     * syntactic-ngram record via {@link #parseCasLine(JCas, int, int)}.
     */
    void parseCasLines(JCas aJCas) {
        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        LOG.trace("[{}] Splitting text into lines '{}'.", cas_id, StringUtils.abbreviate(cas_text, 50));
        if (cas_text.trim().isEmpty()) {
            LOG.trace("[{}] Text is empty.", cas_id);
            return;
        }
        // collect line begin offsets: 0 plus the position directly after every '\n'
        List<Integer> line_idxs = new ArrayList<Integer>();
        line_idxs.add(0);
        for (int index = 0; (index = cas_text.indexOf('\n', index) + 1) > 0; line_idxs.add(index))
            ;
        LOG.debug("[{}] Found {} newline characters -> {} lines [{}]", cas_id, line_idxs.size(),
                line_idxs.size() + 1, StringUtils.abbreviate(cas_text, 50));
        if (line_idxs.get(line_idxs.size() - 1) < cas_text.length()) // if cas doesn't end with a new line
            line_idxs.add(cas_text.length());
        // each adjacent offset pair [begin, end) delimits one line (end includes the '\n')
        for (int i = 0; i < line_idxs.size() - 1; i++) {
            int bline_idx = line_idxs.get(i);
            int eline_idx = line_idxs.get(i + 1);
            parseCasLine(aJCas, bline_idx, eline_idx);
        }
    }

    /**
     * Parses one record line located at {@code [bline_offset, eline_offset)} in the
     * CAS text. Columns are tab-separated:
     * 0 = word; 1 = context; 2 = number of occurrences; 3 = number of occurrences per year.
     * Missing trailing columns default to the empty string / 0 occurrences.
     */
    void parseCasLine(JCas aJCas, int bline_offset, int eline_offset) {
        DocumentMetaData metadata = DocumentMetaData.get(aJCas);
        String cas_id = metadata.getDocumentId();
        String cas_text = aJCas.getDocumentText();
        String line_text = cas_text.substring(bline_offset, eline_offset);
        LOG.trace("[{}] Trying to parse line '{}'.", cas_id, StringUtils.abbreviate(line_text, 50));
        if (line_text.trim().isEmpty()) {
            LOG.warn("[{}] Line is empty.", cas_id);
            return;
        }
        // absolute (cas-wide) offsets of the position directly after each tab character
        List<Integer> tab_idxs = new ArrayList<Integer>();
        for (int index = 0; (index = line_text.indexOf('\t', index) + 1) > 0; tab_idxs.add(index + bline_offset))
            ;
        // FIX: a tab-less line used to throw IndexOutOfBoundsException (get(-1)), and the
        // sentinel check compared an ABSOLUTE offset against the RELATIVE line length, so
        // the end sentinel — and with it the last column — was silently dropped for every
        // line but the first. Compare consistently in absolute coordinates.
        if (tab_idxs.isEmpty() || tab_idxs.get(tab_idxs.size() - 1) < bline_offset + line_text.length()) // if line doesn't end with a tab character
            tab_idxs.add(bline_offset + line_text.length() + 1);
        LOG.debug("[{}] Found {} tab characters -> {} columns [{}]", cas_id, tab_idxs.size(), tab_idxs.size() + 1,
                StringUtils.abbreviate(line_text, 50));
        // 0 = word; 1 = context; 2 = number of occurrences; 3 = number of occurrences per year
        String word = cas_text.substring(bline_offset, tab_idxs.size() >= 1 ? tab_idxs.get(0) - 1 : eline_offset); // unused
        LOG.trace("[{}] Word '{}'.", cas_id, word);
        int context_begin_offset_in_cas = tab_idxs.size() >= 2 ? tab_idxs.get(0) : bline_offset,
                context_end_offset_in_cas = tab_idxs.size() >= 2 ? tab_idxs.get(1) - 1 : bline_offset;
        String context = tab_idxs.size() >= 2
                ? cas_text.substring(context_begin_offset_in_cas, context_end_offset_in_cas)
                : "";
        LOG.trace("[{}] Context '{}'.", cas_id, context);
        int num_occurrences = Integer
                .parseInt(tab_idxs.size() >= 3 ? cas_text.substring(tab_idxs.get(1), tab_idxs.get(2) - 1) : "0");
        LOG.trace("[{}] Number-of-occurrences '{}'.", cas_id, num_occurrences);
        String num_occurrences_per_year = tab_idxs.size() >= 4
                ? cas_text.substring(tab_idxs.get(2), tab_idxs.get(3) - 1)
                : ""; // unused
        LOG.trace("[{}] Number-of-occurrences-per-year '{}'.", cas_id, num_occurrences_per_year);
        parseContext(aJCas, context, cas_id, context_begin_offset_in_cas, context_end_offset_in_cas, num_occurrences);
    }

    /**
     * Annotates one context column. The context is a space-separated list of
     * {@code word/postag/deplabel/headindex} tokens (head index is 1-based; 0 or an
     * empty field marks the root). Creates a {@link RepeatedSentence} over the
     * context span, one {@link Token}+{@link POS} per context token, and a
     * {@link Dependency} per non-root token, subject to the configuration flags.
     */
    void parseContext(JCas aJCas, String context, String cas_id, int context_begin_offset_in_cas,
            int context_end_offset_in_cas, int num_occurrences) throws IllegalStateException {
        // skip leading whitespaces while keeping the cas offset in sync
        for (int offset = 0; offset < context.length() && context.charAt(offset) == ' '; offset++, context_begin_offset_in_cas++)
            ;
        context = context.trim(); // remove white spaces at beginning and end
        if (context.isEmpty()) {
            LOG.warn("[{}] Context is empty.", cas_id);
            return;
        }
        // create a pseudo sentence; repetition count encodes the remaining occurrences
        if (_create_sentences) {
            RepeatedSentence s = new RepeatedSentence(aJCas, context_begin_offset_in_cas, context_end_offset_in_cas);
            s.setRepetitionCount(num_occurrences - 1);
            s.addToIndexes();
        }
        // with/IN/prep/6 //NNP/dep/3 ever/NNP/pobj/
        // word/postag/deplabel/headindex word/postag/deplabel/headindex ...
        // word/postag/deplabel/headindex contains no '/' and no whitespace characters; head == governor
        // collect token begin offsets (relative to context): 0 plus the position after every space
        List<Integer> space_idxs = new ArrayList<Integer>();
        space_idxs.add(0);
        for (int index = 0; (index = context.indexOf(' ', index) + 1) > 0; space_idxs.add(index))
            ;
        if (space_idxs.get(space_idxs.size() - 1) < context.length()) // if context doesn't end with a space character
            space_idxs.add(context.length() + 1);
        if (!_create_tokens)
            return;
        Token[] tokens = new Token[space_idxs.size() - 1];
        String[][] context_constituents = new String[space_idxs.size() - 1][];
        // first pass: collect tokens
        for (int i = 0; i < space_idxs.size() - 1; i++) {
            int btoken_idx = space_idxs.get(i);
            int etoken_idx = space_idxs.get(i + 1);
            String synt_token = context.substring(btoken_idx, etoken_idx - 1);
            LOG.trace("[{}] Syntactic token '{}'.", cas_id, synt_token);
            String[] splits = synt_token.split("/");
            LOG.trace("[{}] Syntactic token constituents '{}'.", cas_id, Arrays.asList(splits));
            String word = splits[0];
            // FIX: guard against malformed tokens without a '/' — splits[1] used to throw
            // ArrayIndexOutOfBoundsException and abort the whole line
            String postag = splits.length > 1 ? splits[1] : "";
            btoken_idx += context_begin_offset_in_cas; // make offset absolute in the cas
            Token token = new Token(aJCas, btoken_idx, btoken_idx + word.length());
            POS pos = new POS(aJCas, btoken_idx, btoken_idx + word.length());
            pos.setPosValue(postag);
            token.setPos(pos);
            pos.addToIndexes();
            token.addToIndexes();
            tokens[i] = token;
            context_constituents[i] = splits;
        }
        if (!_create_dependencies)
            return;
        // second pass: link each token to its governor
        for (int i = 0; i < tokens.length; i++) {
            Token dep = tokens[i];
            String[] splits = context_constituents[i];
            String deplabel = splits.length > 2 ? splits[2] : "";
            // FIX: a trailing empty head field (e.g. 'ever/NNP/pobj/') is dropped by
            // String.split, so splits[3] used to throw ArrayIndexOutOfBoundsException —
            // which the NumberFormatException handler never caught — aborting the line.
            String primary = splits.length > 3 ? splits[3] : null;
            String fallback = splits[splits.length - 1];
            // FIX: default was 0, which wrongly attached tokens with unparseable head
            // fields to the first token; -1 makes them skip via the root check below
            int headindex = -1;
            try {
                headindex = Integer.parseInt(primary != null ? primary : fallback) - 1;
            } catch (NumberFormatException e) {
                try {
                    headindex = Integer.parseInt(fallback) - 1;
                } catch (NumberFormatException ee) {
                    LOG.error("Could not parse {} or {} as number.", primary, fallback);
                }
            }
            if (headindex < 0 || headindex >= tokens.length) // skip root node pointing to nowhere
                continue;
            Token gov = tokens[headindex];
            Dependency d = new Dependency(aJCas);
            d.setBegin(Math.min(gov.getBegin(), dep.getBegin()));
            d.setEnd(Math.max(gov.getEnd(), dep.getEnd()));
            d.setDependencyType(deplabel);
            d.setGovernor(gov);
            d.setDependent(dep);
            d.addToIndexes();
        }
    }
}