List of usage examples for opennlp.tools.tokenize Tokenizer tokenizePos
Span[] tokenizePos(String s);
From source file: ht.process.Processor.java
public ProcessorResult2 processDocumentV3(ProcessorParams param) { TokenizerModel portugueseTokenizer = (TokenizerModel) servletContext.getAttribute("portugueseTokenizer"); TokenizerModel englishTokenizer = (TokenizerModel) servletContext.getAttribute("englishTokenizer"); ConcurrentHashMap<String, String> portugueseStopwords = (ConcurrentHashMap<String, String>) servletContext .getAttribute("portugueseStopwords"); ConcurrentHashMap<String, String> englishStopwords = (ConcurrentHashMap<String, String>) servletContext .getAttribute("englishStopwords"); ResourceBundle portugueseMessages = (ResourceBundle) servletContext.getAttribute("portugueseMessages"); ResourceBundle englishMessages = (ResourceBundle) servletContext.getAttribute("englishMessages"); String text = param.body;/* w ww. ja va 2 s.co m*/ //String text = LZString.decompressFromUTF16(param.body); Connection connection; try { connection = ((DataSource) servletContext.getAttribute("connPool")).getConnection(); } catch (SQLException ex) { Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex); return null; } HashSet<String> semanticTypes; if (param.semanticTypes == null || param.semanticTypes.isEmpty()) semanticTypes = defaultSemanticTypes; else semanticTypes = param.semanticTypes; ConceptProcessor processor = null; ResourceBundle messages = null; switch (param.language) { case "en": processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes); //messages = englishMessages; break; case "pt": processor = new PortugueseProcessor(connection, portugueseStopwords, portugueseTokenizer, semanticTypes); //messages = portugueseMessages; break; default: processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes); //messages = englishMessages; break; } if (param.contentLanguage == null) param.contentLanguage = "detected"; if ((param.contentLanguage.equals("detected") && param.language.equals("en")) || 
param.contentLanguage.equals("en")) messages = englishMessages; else if ((param.contentLanguage.equals("detected") && param.language.equals("pt")) || param.contentLanguage.equals("pt")) messages = portugueseMessages; else messages = englishMessages; if (param.recognizeOnlyCHV == null) param.recognizeOnlyCHV = true; processor.recognizeOnlyCHV = param.recognizeOnlyCHV; if (param.recognizeWithoutDefinition == null) param.recognizeWithoutDefinition = true; if (param.styFilter == null) param.styFilter = "all"; if (param.styFilter.equals("all")) processor.allAccepted = true; else if (param.styFilter.equals("one")) processor.allAccepted = false; Tokenizer tokenizer = new TokenizerME(processor.tokenizerModel); Span spans[] = tokenizer.tokenizePos(text); //Span[] spansCopy = new Span[spans.length]; //System.arraycopy( spans, 0, spansCopy, 0, spans.length ); //System.out.println("TEXT: " + text); //System.out.println("SPANS: " + spans.length); ArrayList<Change> resultChanges = new ArrayList<>(); for (int i = 0; i < spans.length; i++) { Span initialSpan = spans[i]; Concept bestMatch = processor.processToken(spans, i, text, FORWARD_THRESHOLD); if (bestMatch != null) { //replace "'" so it doesn't break the tooltip html if the definition contains it String definition = processor.getDefinition(bestMatch); if (definition != null) { bestMatch.setDefinition(definition); } } if (bestMatch != null && ((!param.recognizeWithoutDefinition && bestMatch.definition != null) || param.recognizeWithoutDefinition)) { i += bestMatch.words - 1; /* if (lastFound == null) { splitText.add(text.substring(0, initialSpan.getStart())); } else { splitText.add(text.substring(lastFound.span.getEnd(), bestMatch.span.getStart())); } */ String definitionTooltip = replaceConcept(bestMatch, param.language, messages); resultChanges .add(new Change(bestMatch.span.getStart(), bestMatch.span.getEnd(), definitionTooltip)); //lastFound = bestMatch; } } try { connection.close(); } catch (SQLException ex) { 
Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex); } return new ProcessorResult2(resultChanges); }
From source file: org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) { // version with explicit sentence endings to reflect heading / paragraph // structure of an HTML or PDF document converted to text String textWithDots = text.replaceAll("\\n\\n", ".\n"); text = removeNonUtf8CompliantCharacters(text); SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en")); Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots); NameFinderME finder = new NameFinderME(nameFinderModel); Tokenizer tokenizer = openNLP.getTokenizer(language); Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>(); for (int i = 0; i < sentenceSpans.length; i++) { String sentence = sentenceSpans[i].getCoveredText(text).toString().trim(); // build a context by concatenating three sentences to be used for // similarity ranking / disambiguation + contextual snippet in the // extraction structure List<String> contextElements = new ArrayList<String>(); if (i > 0) { CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text); contextElements.add(previousSentence.toString().trim()); }// w w w . j av a2 s . 
co m contextElements.add(sentence.trim()); if (i + 1 < sentenceSpans.length) { CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text); contextElements.add(nextSentence.toString().trim()); } String context = StringUtils.join(contextElements, " "); // extract the names in the current sentence and // keep them store them with the current context Span[] tokenSpans = tokenizer.tokenizePos(sentence); String[] tokens = Span.spansToStrings(tokenSpans, sentence); Span[] nameSpans = finder.find(tokens); double[] probs = finder.probs(); //int lastStartPosition = 0; for (int j = 0; j < nameSpans.length; j++) { String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(), tokenSpans[nameSpans[j].getEnd() - 1].getEnd()); //NOTE: With OpenNLP 1.6 the probability is now stored in the span double prob = nameSpans[j].getProb(); //prob == 0.0 := unspecified Double confidence = prob != 0.0 ? Double.valueOf(prob) : null; if (confidence == null) { //fall back to the old if it is not set. for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) { prob *= probs[k]; } confidence = Double.valueOf(prob); } else if (confidence < 0.5d) { //It looks like as if preceptron based models do return //invalid probabilities. As it is expected the Named Entities //with a probability < 50% are not even returned by finder.find(..) 
//we will just ignore confidence values < 0.5 here confidence = null; } int start = tokenSpans[nameSpans[j].getStart()].getStart(); int absoluteStart = sentenceSpans[i].getStart() + start; int absoluteEnd = absoluteStart + name.length(); NerTag nerTag = config.getNerTag(nameSpans[j].getType()); NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence); List<NameOccurrence> occurrences = nameOccurrences.get(name); if (occurrences == null) { occurrences = new ArrayList<NameOccurrence>(); } occurrences.add(occurrence); nameOccurrences.put(name, occurrences); } } finder.clearAdaptiveData(); log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences); return nameOccurrences; }
From source file: org.dbpedia.spotlight.spot.NESpotter.java
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) { String intext = text.text();/*from ww w .j av a2 s .c om*/ SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); String[] sentences = sentenceDetector.sentDetect(intext); Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext); int[] sentencePositions = new int[sentences.length + 1]; for (int k = 0; k < sentenceEndings.length; k++) { sentencePositions[k] = sentenceEndings[k].getStart(); } NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel); List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>(); Tokenizer tokenizer = new SimpleTokenizer(); for (int i = 0; i < sentences.length; i++) { String sentence = sentences[i]; //LOG.debug("Sentence: " + sentence); // extract the names in the current sentence String[] tokens = tokenizer.tokenize(sentence); Span[] tokenspan = tokenizer.tokenizePos(sentence); Span[] nameSpans = finder.find(tokens); double[] probs = finder.probs(); if (nameSpans != null && nameSpans.length > 0) { //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString()); //System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString()); for (Span span : nameSpans) { StringBuilder buf = new StringBuilder(); //System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd()); for (int j = span.getStart(); j < span.getEnd(); j++) { //System.out.println(tokens[i] + " appended to " + buf.toString()); buf.append(tokens[j]); if (j < span.getEnd() - 1) buf.append(" "); } String surfaceFormStr = buf.toString().trim(); if (surfaceFormStr.contains(".")) { surfaceFormStr = correctPhrase(surfaceFormStr, sentence); } int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart(); int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd(); /* System.out.println("\n\nRR-NE 
Found = " + buf.toString()); System.out.println("Start = " + entStart); System.out.println("End = " + entEnd); System.out.println("Sentence = " + sentence); System.out.println("Text = " + text); */ SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr); SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart); sfocc.features().put("type", new Feature("type", oType.toString())); sfOccurrences.add(sfocc); } } } finder.clearAdaptiveData(); if (LOG.isDebugEnabled()) { LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", ")); } return sfOccurrences; }