// Java tutorial
/*
 * Copyright 2012
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.lm.service;

import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.lm.PseudoSymbol;
import de.tudarmstadt.lt.lm.util.Properties;
import de.tudarmstadt.lt.utilities.ArrayUtils;
import de.tudarmstadt.lt.utilities.types.FilteredItem;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PUNC;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * A {@link StringProviderMXBean} implementation that turns raw text into
 * n-gram sequences by running a configurable UIMA {@link AnalysisEngine}
 * (aggregated via uimaFIT's {@link AggregateBuilder}) over the input and
 * extracting {@link Sentence}/{@link Token} annotations from the resulting CAS.
 *
 * <p>Processing is serialized on an internal lock because a single engine
 * instance is shared; on CAS processing failure the engine is torn down,
 * rebuilt from its description, and the document is retried once before
 * being skipped.</p>
 *
 * @author Steffen Remus
 **/
public class UimaStringProvider extends AbstractStringProvider implements StringProviderMXBean {

    // NOTE(review): conventionally this would be `private static final` — left as-is.
    private static Logger LOG = LoggerFactory.getLogger(UimaStringProvider.class);

    // The currently active aggregate engine; rebuilt from the description on failure.
    private AnalysisEngine _processing_engine;
    // Description the engine was built from; kept so the engine can be recreated.
    private AnalysisEngineDescription _processing_engine_description;
    // Guards all CAS creation/processing — the engine instance is not shared concurrently.
    private Object _processing_engine_lck = new Object();

    /**
     * Resets the engine description to {@code null}. No default engine is
     * actually created here despite the name — callers must supply one via
     * {@link #setEngineDescription(AnalysisEngineDescription)}.
     *
     * @throws ResourceInitializationException declared for interface compatibility; not thrown here
     */
    public void initDefaultProcessingEngine() throws ResourceInitializationException {
        _processing_engine_description = null;
    }

    /** @return the currently active UIMA analysis engine (may be {@code null} before setup) */
    public AnalysisEngine getEngine() {
        return _processing_engine;
    }

    /**
     * Installs a new processing engine built from the given description.
     * Any previously active engine is shut down gracefully first.
     *
     * @param engine_description description to build the new aggregate engine from
     * @throws Exception if engine construction fails
     */
    public void setEngineDescription(AnalysisEngineDescription engine_description) throws Exception {
        if (_processing_engine != null) {
            // gracefully shutdown the old engine
            LOG.info("Shutting down old cas engine.");
            _processing_engine.batchProcessComplete();
            _processing_engine.collectionProcessComplete();
            _processing_engine.destroy();
        }
        _processing_engine_description = engine_description;
        AggregateBuilder builder = new AggregateBuilder();
        builder.add(engine_description);
        _processing_engine = builder.createAggregate();
    }

    // @Override
    /** @return the description the current engine was built from ({@code null} if unset) */
    public AnalysisEngineDescription getEngineDescription() throws RemoteException {
        return _processing_engine_description;
    }

    /**
     * Computes the n-gram sequences for the given text. A short hex document id
     * is derived from the text's hash code for log correlation.
     *
     * @param text          raw input text
     * @param language_code ISO language code to tag the CAS with
     * @return array of n-gram lists, one entry per extracted n-gram sequence
     * @throws Exception propagated from CAS processing / language model access
     */
    @Override
    public List<String>[] getNgrams(String text, String language_code) throws Exception {
        String docid = "#" + Integer.toHexString(text.hashCode());
        LOG.debug("[{}] Processing text '{}' (length {}).", docid, StringUtils.abbreviate(text, 30), text.length());
        return getNgramsInner(text.trim(), language_code, docid, 0, 0);
    }

    /**
     * Recursive workhorse behind {@link #getNgrams(String, String)}.
     *
     * <p>If the text exceeds the configured split-length threshold, it is split
     * via {@link #applySplitHeuristic(String, int)} and each piece is processed
     * recursively with an incremented {@code splitcount} (which selects an
     * increasingly aggressive split pattern). Otherwise the text is processed
     * through the UIMA engine under the engine lock; a failing CAS run triggers
     * an engine rebuild and exactly one retry (tracked by
     * {@code casExceptionCount}) before the document is skipped.</p>
     *
     * @param text              text (or split fragment) to process
     * @param language_code     language code for the CAS
     * @param docid             log-correlation id; split fragments get a ".i" suffix
     * @param splitcount        current split recursion depth / heuristic aggressiveness
     * @param casExceptionCount number of CAS failures already seen for this fragment
     * @return n-gram lists, or {@code EMPTY_NGRAM_LIST} when CAS creation/processing fails terminally
     */
    public List<String>[] getNgramsInner(String text, String language_code, String docid, int splitcount, int casExceptionCount) throws RemoteException, Exception {
        int maxLengthSplitHeuristic = Properties.maxLengthSplitHeuristic();
        if (maxLengthSplitHeuristic > 0 && text.length() > maxLengthSplitHeuristic) {
            List<String>[] result = null;
            Iterator<String> text_splits_iter = applySplitHeuristic(text, splitcount);
            for (int i = 0; text_splits_iter.hasNext(); i++) {
                String docid_i = String.format("%s.%d", docid, i);
                // Recurse with splitcount+1 so oversized fragments use the next, more aggressive pattern;
                // casExceptionCount restarts at 0 per fragment.
                List<String>[] intermediate_result = getNgramsInner(text_splits_iter.next().trim(), language_code, docid_i, splitcount + 1, 0);
                if (result == null)
                    result = intermediate_result;
                else
                    // NOTE(review): "Concatinated" is a typo in the external utility's API name — left as-is.
                    result = ArrayUtils.getConcatinatedArray(result, intermediate_result);
            }
            return result;
        }
        synchronized (_processing_engine_lck) {
            List<String>[] result = null;
            JCas aJCas = null;
            try {
                aJCas = _processing_engine.newJCas();
            } catch (Throwable t) {
                // Log the whole cause chain, then give up on this fragment.
                for (int i = 0; t != null; i++) {
                    // NOTE(review): the trailing ".)" in this format string looks like a stray paren — confirm before changing.
                    LOG.error(String.format("Failed to initialize cas: %s '%s' (%d-%s: %s).)", docid, StringUtils.abbreviate(text, 50), i, t.getClass().getName(), t.getMessage()), t);
                    t = t.getCause();
                }
                return EMPTY_NGRAM_LIST;
            }
            aJCas.setDocumentText(text);
            DocumentMetaData meta = DocumentMetaData.create(aJCas);
            meta.setDocumentId(docid);
            meta.setDocumentTitle(StringUtils.abbreviate(text, 30));
            aJCas.getCas().setDocumentLanguage(language_code);
            // aJCas.setDocumentLanguage(language_code);
            meta.setLanguage(language_code);
            LOG.debug("[{}] Pre-processing text '{}'...", docid, StringUtils.abbreviate(text, 50));
            try {
                _processing_engine.process(aJCas);
            } catch (Throwable t) {
                // Log the whole cause chain.
                for (int i = 0; t != null; i++) {
                    LOG.error(String.format("Failed to process cas: %s '%s' (%d-%s: %s).)", docid, StringUtils.abbreviate(text, 50), i, t.getClass().getName(), t.getMessage()), t);
                    t = t.getCause();
                }
                // Rebuild the engine from its own description — it may be in a broken state.
                LOG.error("Reactivating cas engine.");
                setEngineDescription(getEngineDescription());
                LOG.error("Cas engine reactivated.");
                // Retry at most once per fragment (post-increment: retries only when count was 0).
                if (casExceptionCount++ < 1) {
                    LOG.error("Reprocessing cas {}.", docid);
                    return getNgramsInner(text, language_code, docid, splitcount, casExceptionCount);
                }
                LOG.error("Cas engine exception counter has reached its limit. Cas {} will be skipped.", docid);
                return EMPTY_NGRAM_LIST;
            }
            result = getNgramsFromSentencesFromCas(aJCas);
            return result;
        }
    }

    // Split patterns ordered by aggressiveness: sentence-final period, period/colon
    // followed by a likely sentence start, then any period+whitespace.
    // NOTE(review): the commas inside [$,A-Z,0-9,\",'] make ',' itself part of each
    // class — possibly unintended, but behavior-affecting; left as-is.
    private final static Pattern[] patterns = new Pattern[] { Pattern.compile("\\.\\s*$"), Pattern.compile("\\w\\.\\s+[$,A-Z,0-9,\",']"), Pattern.compile(":\\s+[$,A-Z,0-9,\",']"), Pattern.compile("\\.\\s") };

    /**
     * Returns an iterator over pieces of {@code text}, split at matches of the
     * pattern selected by {@code aggressiveness}. When all patterns are
     * exhausted ({@code aggressiveness >= patterns.length}), falls back to
     * fixed 1000-character chunks.
     *
     * @param text           text to split
     * @param aggressiveness index into {@link #patterns}; higher splits more eagerly
     * @return lazy iterator over the splits ({@code remove()} unsupported)
     */
    public Iterator<String> applySplitHeuristic(final String text, int aggressiveness) {
        if (aggressiveness >= patterns.length)
            // Last resort: hard-chunk the text into 1000-char slices.
            return new Iterator<String>() {
                int i = 0;

                @Override
                public boolean hasNext() {
                    return i < text.length();
                }

                @Override
                public String next() {
                    int next_i = Math.min(i + 1000, text.length());
                    String result = text.substring(i, next_i);
                    i = next_i;
                    return result;
                }

                @Override
                public void remove() {
                    throw new UnsupportedOperationException("remove() is not supported");
                }
            };
        final Matcher m = patterns[aggressiveness].matcher(text);
        // Pattern-based splitter: each next() yields the text up to (and excluding the
        // last char of) the current match; the final piece runs to end-of-text.
        return new Iterator<String>() {
            int last = 0;
            boolean found = false;

            @Override
            public boolean hasNext() {
                found = m.find();
                return found || last < text.length();
            }

            @Override
            public String next() {
                int current;
                if (found)
                    current = m.end() - 1;
                else
                    current = text.length();
                String res = text.substring(last, current);
                last = current;
                return res;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException("remove() is not supported");
            }
        };
    }

    /**
     * Extracts n-gram sequences from all {@link Sentence} annotations of a
     * processed CAS. Sentences yielding no n-grams are skipped.
     *
     * @param aJCas processed CAS containing sentence/token annotations
     * @return concatenated n-gram lists of all sentences
     * @throws Exception propagated from language model access
     */
    @SuppressWarnings("unchecked")
    public List<String>[] getNgramsFromSentencesFromCas(JCas aJCas) throws Exception {
        DocumentMetaData meta = DocumentMetaData.get(aJCas);
        // Fall back to a hash-derived id when the CAS carries no metadata.
        String docid = (meta != null) ? meta.getDocumentId() : "#" + Integer.toString(aJCas.getDocumentText().hashCode());
        Collection<Sentence> sentences = JCasUtil.select(aJCas, Sentence.class);
        LOG.debug("[{}] Processing {} sentences (document length: {}).", docid, sentences.size(), aJCas.getDocumentText().length());
        List<List<String>> result_list = new LinkedList<List<String>>();
        int i = 0;
        for (Sentence sentence : sentences) {
            LOG.trace("[{}] Processing sentence {}/{}.", docid, ++i, sentences.size());
            List<String>[] ngram_sequence = getNgramsFromSentence(sentence);
            if (ngram_sequence.length == 0)
                continue;
            result_list.addAll(Arrays.asList(ngram_sequence));
        }
        return result_list.toArray(new List[0]);
    }

    /**
     * Converts one sentence annotation into its n-gram sequences via the
     * language model. Sentences shorter than the model order yield
     * {@code EMPTY_NGRAM_LIST}.
     *
     * @param sentence sentence annotation to convert
     * @return n-gram lists for this sentence
     * @throws Exception propagated from language model access
     */
    public List<String>[] getNgramsFromSentence(Sentence sentence) throws Exception {
        List<String> sequence = getSequenceFromSentence(sentence);
        if (sequence.size() < getLanguageModel().getOrder())
            return EMPTY_NGRAM_LIST;
        List<String>[] result = getLanguageModel().getNgramSequence(sequence);
        return result;
    }

    // @Override
    /**
     * Builds the token sequence for a sentence, applying the configured
     * filtering/normalization: filtered items are collapsed to a single
     * {@code "<filtered>"} marker (when merging is enabled), punctuation may be
     * skipped, lemmas may replace surface forms, whitespace is normalized to
     * {@code "_"}, and lowercasing is applied if configured. Optionally prefixes
     * order-1 sequence-start pseudo symbols.
     *
     * <p>Errors during token extraction are logged (full cause chain) and
     * swallowed — the partial sequence built so far is returned.</p>
     *
     * @param sentence sentence annotation to tokenize
     * @return token sequence, or an empty list if fewer tokens than the model order were kept
     * @throws Exception propagated from language model access
     */
    public List<String> getSequenceFromSentence(Sentence sentence) throws Exception {
        boolean handle_sentence_boundaries = Properties.handleBoundaries() == 3; // FIXME:
        List<String> sequence = new ArrayList<String>();
        // Prefix (order - 1) sequence-start symbols when boundary handling mode 3 is active.
        for (int i = 0; i < getLanguageModel().getOrder() - 1 && handle_sentence_boundaries; i++)
            sequence.add(PseudoSymbol.SEQUENCE_START.asString());
        try {
            List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence);
            // Collections.sort(tokens, AnnotationBeginOffsetOrderComparator.INSTANCE);
            int token_count = 0;
            boolean filtered_last_token = false;
            for (Iterator<Token> iter = tokens.iterator(); iter.hasNext();) {
                Token token = iter.next();
                if (!JCasUtil.selectCovered(FilteredItem.class, token).isEmpty()) {
                    // Collapse a run of filtered tokens into one "<filtered>" marker.
                    if (!filtered_last_token && Properties.mergeFilteredItems()) {
                        sequence.add("<filtered>");
                        token_count++;
                        filtered_last_token = true;
                    }
                    continue;
                }
                filtered_last_token = false;
                if (Properties.ignorePunctuation() && token.getPos() instanceof PUNC)
                    continue;
                String word;
                // NOTE(review): lemma substitution is gated on ignorePunctuation() — looks like it
                // should be a dedicated "use lemma" property; confirm intent before changing.
                if (Properties.ignorePunctuation() && token.getLemma() != null && token.getLemma().getValue() != null)
                    word = token.getLemma().getValue();
                else
                    word = token.getCoveredText();
                word = de.tudarmstadt.lt.utilities.StringUtils.trim_and_replace_emptyspace(word, "_");
                if (Properties.useLowercase())
                    word = word.toLowerCase();
                sequence.add(word);
                token_count++;
            }
            // Too few usable tokens for even one n-gram of the model's order.
            if (token_count < getLanguageModel().getOrder())
                return Collections.emptyList();
        } catch (Throwable t) {
            // Best-effort: log the full cause chain and fall through with whatever was collected.
            while (t != null) {
                LOG.error(String.format("Error while processing sentence. %s:%s", t.getClass().getName(), t.getMessage()), t);
                t = t.getCause();
            }
        }
        // sequence.add(PseudoSymbol.SEQUENCE_END.asString());
        return sequence;
    }

    /** Not supported by this provider — tokenization happens inside the UIMA pipeline. */
    @Override
    public List<String> tokenizeSentence_intern(String sentence, String language_code) {
        throw new UnsupportedOperationException("This method is not provided for " + getClass().getSimpleName());
    }

    /** Not supported by this provider. */
    @Override
    public List<String>[] getNgramSequenceFromSentence(List<String> sequence) throws Exception {
        throw new UnsupportedOperationException("This method is not provided for " + getClass().getSimpleName());
    }

    /** Not supported by this provider. */
    @Override
    public List<String>[] getNgramSequence(List<String> sequence) throws Exception {
        throw new UnsupportedOperationException("This method is not provided for " + getClass().getSimpleName());
    }

    /** Not supported by this provider — sentence splitting happens inside the UIMA pipeline. */
    @Override
    public List<String> splitSentences(String text, String language_code) throws Exception {
        throw new UnsupportedOperationException("This method is not provided for " + getClass().getSimpleName());
    }
}