List of usage examples for opennlp.tools.util.Span.spansToStrings
public static String[] spansToStrings(Span[] spans, String[] tokens)
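The method returns, for each span, the covered tokens joined into a single string. Below is a minimal, self-contained sketch of the call; the sentence and the hard-coded spans are made up for illustration (real spans would come from a chunker or name finder, as in the examples that follow):

import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

public class SpansToStringsExample {
    public static void main(String[] args) {
        // rule-based tokenizer, so no model file is needed for this sketch
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize("Pierre Vinken joined the board of Elsevier");

        // hypothetical spans; Span(start, end) covers tokens[start] .. tokens[end - 1]
        Span[] spans = { new Span(0, 2, "person"), new Span(6, 7, "organization") };

        // materialize each span as its covered tokens joined by single spaces
        String[] covered = Span.spansToStrings(spans, tokens);
        for (int i = 0; i < covered.length; i++) {
            System.out.println(spans[i].getType() + ": " + covered[i]);
        }
        // prints:
        // person: Pierre Vinken
        // organization: Elsevier
    }
}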
From source file: hrpod.tools.nlp.NLPTools.java
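This helper tokenizes the input, tags it for part of speech, chunks it, and then uses spansToStrings to materialize the chunk spans against the token array.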
public String[] tokenize(String text) {
    String[] chunkStrings = null;
    try {
        TokenizerME wordBreaker = new TokenizerME(getTokenModel());
        POSTaggerME posme = new POSTaggerME(getPosModel());
        ChunkerME chunkerME = new ChunkerME(getChunkerModel());

        // words is the tokenized sentence
        String[] words = wordBreaker.tokenize(text);
        // posTags are the parts of speech of every word in the sentence
        // (the chunker needs this info)
        String[] posTags = posme.tag(words);
        // chunks are the start/end span indices into the words array
        Span[] chunks = chunkerME.chunkAsSpans(words, posTags);
        // chunkStrings are the actual chunks
        chunkStrings = Span.spansToStrings(chunks, words);
        // (a commented-out experiment in the original filtered the "NP" chunks
        // and built n-grams from them)
    } catch (Exception e) {
        logger.error("Error in tokenize", e);
    }
    return chunkStrings;
}
From source file: com.screenslicer.core.nlp.Person.java
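This name extractor cross-checks dictionary lookups against NameFinderME results; spansToStrings converts the finder's spans into candidate name strings before they are filtered into full names and first-name fallbacks.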
public static String extractName(String src, boolean strict, boolean dictionaryOnly) {
    NameFinderME nameFinder = new NameFinderME(nameModel);
    String[] sentences = NlpUtil.sentences(src);
    Collection<String> nlpNames = new HashSet<String>();
    Collection<String> nlpFallbacks = new HashSet<String>();
    Collection<String> dictionaryNames = new HashSet<String>();
    Collection<String> dictionaryFallbacks = new HashSet<String>();
    for (int i = 0; i < sentences.length; i++) {
        String[] tokens = NlpUtil.tokensFromSentence(sentences[i]);
        for (int j = 0; j < tokens.length; j++) {
            String first = tokens[j];
            String last = null;
            if (j + 1 < tokens.length) {
                last = tokens[j + 1];
            }
            if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                dictionaryNames.add(first + " " + last);
            } else if (!strict && isFirstName(first, strict)) {
                dictionaryFallbacks.add(first);
            }
        }
        Span[] spans = nameFinder.find(tokens);
        // the original re-processed the full curNames list once per span;
        // converting the spans to strings a single time per sentence is equivalent
        if (!dictionaryOnly) {
            for (String curName : Span.spansToStrings(spans, tokens)) {
                if (curName.contains(" ") && isFullName(curName, strict)) {
                    nlpNames.add(curName);
                } else if (isFirstName(curName, strict)) {
                    nlpFallbacks.add(curName);
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if ((dictionaryOnly || nlpNames.size() != 1) && dictionaryNames.size() != 1) {
        nlpNames.clear();
        nlpFallbacks.clear();
        dictionaryNames.clear();
        dictionaryFallbacks.clear();
        nameFinder.clearAdaptiveData();
        // second pass with a cruder, regex-based tokenization
        for (int s = 0; s < sentences.length; s++) {
            String[] tokens = sentences[s].split("[\\W\\s]|$|^");
            for (int i = 0; i < tokens.length; i++) {
                String first = tokens[i];
                String last = null;
                if (i + 1 < tokens.length) {
                    last = tokens[i + 1];
                }
                if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                    dictionaryNames.add(first + " " + last);
                } else if (!strict && isFirstName(first, strict)) {
                    dictionaryFallbacks.add(first);
                }
            }
            Span[] spans = nameFinder.find(tokens);
            if (!dictionaryOnly) {
                for (String curName : Span.spansToStrings(spans, tokens)) {
                    if (curName.contains(" ") && isFullName(curName, strict)) {
                        nlpNames.add(curName);
                    } else if (isFirstName(curName, strict)) {
                        nlpFallbacks.add(curName);
                    }
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if (nlpNames.size() == 1) {
        return nlpNames.iterator().next();
    }
    if (nlpFallbacks.size() == 1) {
        return nlpFallbacks.iterator().next();
    }
    if (dictionaryNames.size() == 1) {
        return dictionaryNames.iterator().next();
    }
    if (dictionaryFallbacks.size() == 1) {
        return dictionaryFallbacks.iterator().next();
    }
    return null;
}
From source file: org.sglover.nlp.CoreNLPEntityTagger.java
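This entity tagger runs the full sentence/token/POS/chunk pipeline per sentence and uses spansToStrings to obtain the chunk texts, collecting the noun phrases along the way.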
@Override
protected Entities getEntitiesImpl(String content) {
    Entities namedEntities = Entities.empty();

    SentenceModel sentenceModel = sentenceModels.get("en");
    SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(content);

    TokenizerModel tm = tokenizerModels.get("en");
    TokenizerME wordBreaker = new TokenizerME(tm);

    for (String sentence : sentences) {
        String[] tokens = wordBreaker.tokenize(sentence);
        List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>();

        POSModel posModel = posModels.get("en");
        POSTaggerME posme = new POSTaggerME(posModel);
        String[] posTags = posme.tag(tokens);

        List<String> npTokens = new LinkedList<>();

        ChunkerModel chunkerModel = chunkerModels.get("en");
        ChunkerME chunkerME = new ChunkerME(chunkerModel);
        Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags);
        String[] chunkStrings = Span.spansToStrings(chunks, tokens);
        for (int i = 0; i < chunks.length; i++) {
            String chunkString = chunkStrings[i];
            logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType());
            if (chunks[i].getType().equals("NP")) {
                npTokens.add(chunkString);
            }
        }

        // findEntities(namedEntities, allTextAnnotations,
        //         npTokens.toArray(new String[0]));
        findEntities(namedEntities, allTextAnnotations, tokens);
    }

    return namedEntities;
}
From source file: org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java
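This Apache Stanbol NER engine tokenizes each detected sentence with tokenizePos(..) and uses spansToStrings to turn the resulting token spans into the token array consumed by NameFinderME.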
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");

        // extract the names in the current sentence and
        // store them with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                    tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            // NOTE: with OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            // prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) {
                // fall back to the old method if the span probability is not set
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                // Perceptron-based models can return invalid probabilities. Since
                // named entities with a probability < 50% are not expected to be
                // returned by finder.find(..) at all, confidence values < 0.5 are
                // ignored here.
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd,
                    nerTag.getType(), context, confidence);

            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
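Unlike the other examples, the Span.spansToStrings(tokenSpans, sentence) call above passes the sentence string itself rather than a token array, so it resolves to the spansToStrings(Span[] spans, CharSequence text) overload, which extracts each span's covered text directly from the character sequence.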
From source file: org.apache.tika.parser.geo.topic.NameEntityExtractor.java
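This Apache Tika geo-topic helper runs the name finder over whitespace-split input and uses spansToStrings to collect the matched name strings.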
public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
    String[] in = IOUtils.toString(stream, UTF_8).split(" ");
    Span[] nameE;

    // the name finder is not thread safe:
    // https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
    synchronized (nameFinder) {
        nameE = nameFinder.find(in);
        // the same name finder is reused, so clear adaptive data
        nameFinder.clearAdaptiveData();
    }

    // iterate the extracted span strings directly instead of round-tripping
    // the array through Arrays.toString(..) and re-splitting on commas
    for (String name : Span.spansToStrings(nameE, in)) {
        this.locationNameEntities.add(name.trim());
    }
}