List of usage examples for opennlp.tools.namefind NameFinderME clearAdaptiveData
public void clearAdaptiveData()
From source file:org.sglover.nlp.CoreNLPEntityTagger.java
/**
 * Runs every configured name-finder model over the token sequence and collects the
 * resulting {@code TextAnnotation}s, then resolves conflicts and converts them into
 * named entities.
 *
 * @param namedEntities target container receiving the converted named entities
 * @param allTextAnnotations mutable list the raw annotations are appended to
 * @param tokens the tokenized input text
 */
private void findEntities(Entities namedEntities, List<TextAnnotation> allTextAnnotations, String[] tokens) {
    for (Map.Entry<String, TokenNameFinderModel> finderEntry : tokenNameFinders.entrySet()) {
        String type = finderEntry.getKey();
        NameFinderME finder = new NameFinderME(finderEntry.getValue());
        try {
            Span[] spans = finder.find(tokens);
            double[] probs = finder.probs(spans);
            for (int ni = 0; ni < spans.length; ni++) {
                allTextAnnotations.add(new TextAnnotation(type, spans[ni], probs[ni]));
            }
        } finally {
            // Always reset the finder's document-level adaptive cache so state from
            // this text cannot leak into subsequent documents, even if find() throws.
            finder.clearAdaptiveData();
        }
    }
    if (!allTextAnnotations.isEmpty()) {
        removeConflicts(allTextAnnotations);
    }
    convertTextAnnotationsToNamedEntities(tokens, allTextAnnotations, namedEntities);
}
From source file:com.screenslicer.core.nlp.Person.java
/**
 * Attempts to extract exactly one person name from the given text, combining an
 * OpenNLP {@code NameFinderME} pass with first/last-name dictionary checks.
 * <p>
 * Strategy: a first pass uses NLP sentence/token segmentation; if that does not
 * yield exactly one candidate, a second pass retries with a naive regex
 * tokenization. A unique NLP full name wins, then a unique NLP first name, then
 * the dictionary equivalents.
 *
 * @param src raw text to scan
 * @param strict if {@code true}, apply strict dictionary matching and no first-name fallbacks
 * @param dictionaryOnly if {@code true}, ignore the NLP finder results entirely
 * @return the single extracted name, or {@code null} if no unambiguous name was found
 */
public static String extractName(String src, boolean strict, boolean dictionaryOnly) {
    NameFinderME nameFinder = new NameFinderME(nameModel);
    String[] sentences = NlpUtil.sentences(src);
    Collection<String> nlpNames = new HashSet<String>();
    Collection<String> nlpFallbacks = new HashSet<String>();
    Collection<String> dictionaryNames = new HashSet<String>();
    Collection<String> dictionaryFallbacks = new HashSet<String>();
    for (int i = 0; i < sentences.length; i++) {
        collectCandidates(nameFinder, NlpUtil.tokensFromSentence(sentences[i]), strict, dictionaryOnly,
                nlpNames, nlpFallbacks, dictionaryNames, dictionaryFallbacks);
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if ((dictionaryOnly || nlpNames.size() != 1) && dictionaryNames.size() != 1) {
        // Ambiguous or empty result: retry from scratch with a naive
        // non-word-character tokenization of each sentence.
        nlpNames.clear();
        nlpFallbacks.clear();
        dictionaryNames.clear();
        dictionaryFallbacks.clear();
        nameFinder.clearAdaptiveData();
        for (int s = 0; s < sentences.length; s++) {
            collectCandidates(nameFinder, sentences[s].split("[\\W\\s]|$|^"), strict, dictionaryOnly,
                    nlpNames, nlpFallbacks, dictionaryNames, dictionaryFallbacks);
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if (nlpNames.size() == 1) {
        return nlpNames.iterator().next();
    }
    if (nlpFallbacks.size() == 1) {
        return nlpFallbacks.iterator().next();
    }
    if (dictionaryNames.size() == 1) {
        return dictionaryNames.iterator().next();
    }
    if (dictionaryFallbacks.size() == 1) {
        return dictionaryFallbacks.iterator().next();
    }
    return null;
}

/**
 * Scans one token array for name candidates, adding dictionary matches and
 * (unless {@code dictionaryOnly}) NLP finder matches to the given collections.
 * Extracted from the two formerly duplicated passes in {@link #extractName}.
 */
private static void collectCandidates(NameFinderME nameFinder, String[] tokens, boolean strict,
        boolean dictionaryOnly, Collection<String> nlpNames, Collection<String> nlpFallbacks,
        Collection<String> dictionaryNames, Collection<String> dictionaryFallbacks) {
    for (int j = 0; j < tokens.length; j++) {
        String first = tokens[j];
        String last = null;
        if (j + 1 < tokens.length) {
            last = tokens[j + 1];
        }
        if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
            dictionaryNames.add(first + " " + last);
        } else if (!strict && isFirstName(first, strict)) {
            dictionaryFallbacks.add(first);
        }
    }
    Span[] spans = nameFinder.find(tokens);
    // BUG FIX: the original wrapped this in "for (int j = 0; j < spans.length; j++)"
    // while iterating the FULL spansToStrings(..) list inside — processing every
    // found name spans.length times. One pass over the converted spans suffices;
    // the HashSet targets made the duplicates invisible in the result.
    if (!dictionaryOnly) {
        for (String curName : Span.spansToStrings(spans, tokens)) {
            if (curName.contains(" ") && isFullName(curName, strict)) {
                nlpNames.add(curName);
            } else if (isFirstName(curName, strict)) {
                nlpFallbacks.add(curName);
            }
        }
    }
}
From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java
/** * THis method extracts NamedEntity occurrences by using existing {@link Token}s and * {@link Sentence}s in the parsed {@link AnalysedText}. * @param nameFinderModel the model used to find NamedEntities * @param at the Analysed Text/* w w w . ja v a 2 s .com*/ * @param language the language of the text * @return the found named Entity Occurrences */ protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, AnalysedText at, String language) { // version with explicit sentence endings to reflect heading / paragraph // structure of an HTML or PDF document converted to text NameFinderME finder = new NameFinderME(nameFinderModel); Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>(); List<Section> sentences = new ArrayList<Section>(); //Holds the tokens of the previouse (pos 0) current (pos 1) and next (pos 2) sentence AnalysedTextUtils.appandToList(at.getSentences(), sentences); if (sentences.isEmpty()) { //no sentence annotations sentences.add(at); //process as a single section } for (int i = 0; i < sentences.size(); i++) { String sentence = sentences.get(i).getSpan(); // build a context by concatenating three sentences to be used for // similarity ranking / disambiguation + contextual snippet in the // extraction structure List<String> contextElements = new ArrayList<String>(); contextElements.add(sentence); //three sentences as context String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(), sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd()); // get the tokens, words of the current sentence List<Token> tokens = new ArrayList<Token>(32); List<String> words = new ArrayList<String>(32); for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext();) { Token t = it.next(); tokens.add(t); words.add(t.getSpan()); } Span[] nameSpans = finder.find(words.toArray(new String[words.size()])); double[] probs = finder.probs(); //int 
lastStartPosition = 0; for (int j = 0; j < nameSpans.length; j++) { String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd() - 1).getEnd()); Double confidence = 1.0; for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) { confidence *= probs[k]; } int start = tokens.get(nameSpans[j].getStart()).getStart(); int end = start + name.length(); NerTag nerTag = config.getNerTag(nameSpans[j].getType()); //create the occurrence for writing fise:TextAnnotations NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context, confidence); List<NameOccurrence> occurrences = nameOccurrences.get(name); if (occurrences == null) { occurrences = new ArrayList<NameOccurrence>(); } occurrences.add(occurrence); nameOccurrences.put(name, occurrences); //add also the NerAnnotation to the AnalysedText Chunk chunk = at.addChunk(start, end); //TODO: build AnnotationModel based on the configured Mappings chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence)); } } finder.clearAdaptiveData(); log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences); return nameOccurrences; }
From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) { // version with explicit sentence endings to reflect heading / paragraph // structure of an HTML or PDF document converted to text String textWithDots = text.replaceAll("\\n\\n", ".\n"); text = removeNonUtf8CompliantCharacters(text); SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en")); Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots); NameFinderME finder = new NameFinderME(nameFinderModel); Tokenizer tokenizer = openNLP.getTokenizer(language); Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>(); for (int i = 0; i < sentenceSpans.length; i++) { String sentence = sentenceSpans[i].getCoveredText(text).toString().trim(); // build a context by concatenating three sentences to be used for // similarity ranking / disambiguation + contextual snippet in the // extraction structure List<String> contextElements = new ArrayList<String>(); if (i > 0) { CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text); contextElements.add(previousSentence.toString().trim()); }//ww w . j a va2s . 
c o m contextElements.add(sentence.trim()); if (i + 1 < sentenceSpans.length) { CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text); contextElements.add(nextSentence.toString().trim()); } String context = StringUtils.join(contextElements, " "); // extract the names in the current sentence and // keep them store them with the current context Span[] tokenSpans = tokenizer.tokenizePos(sentence); String[] tokens = Span.spansToStrings(tokenSpans, sentence); Span[] nameSpans = finder.find(tokens); double[] probs = finder.probs(); //int lastStartPosition = 0; for (int j = 0; j < nameSpans.length; j++) { String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(), tokenSpans[nameSpans[j].getEnd() - 1].getEnd()); //NOTE: With OpenNLP 1.6 the probability is now stored in the span double prob = nameSpans[j].getProb(); //prob == 0.0 := unspecified Double confidence = prob != 0.0 ? Double.valueOf(prob) : null; if (confidence == null) { //fall back to the old if it is not set. for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) { prob *= probs[k]; } confidence = Double.valueOf(prob); } else if (confidence < 0.5d) { //It looks like as if preceptron based models do return //invalid probabilities. As it is expected the Named Entities //with a probability < 50% are not even returned by finder.find(..) 
//we will just ignore confidence values < 0.5 here confidence = null; } int start = tokenSpans[nameSpans[j].getStart()].getStart(); int absoluteStart = sentenceSpans[i].getStart() + start; int absoluteEnd = absoluteStart + name.length(); NerTag nerTag = config.getNerTag(nameSpans[j].getType()); NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(), context, confidence); List<NameOccurrence> occurrences = nameOccurrences.get(name); if (occurrences == null) { occurrences = new ArrayList<NameOccurrence>(); } occurrences.add(occurrence); nameOccurrences.put(name, occurrences); } } finder.clearAdaptiveData(); log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences); return nameOccurrences; }
From source file:org.dbpedia.spotlight.spot.NESpotter.java
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) { String intext = text.text();/* w w w. j a va 2 s. co m*/ SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); String[] sentences = sentenceDetector.sentDetect(intext); Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext); int[] sentencePositions = new int[sentences.length + 1]; for (int k = 0; k < sentenceEndings.length; k++) { sentencePositions[k] = sentenceEndings[k].getStart(); } NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel); List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>(); Tokenizer tokenizer = new SimpleTokenizer(); for (int i = 0; i < sentences.length; i++) { String sentence = sentences[i]; //LOG.debug("Sentence: " + sentence); // extract the names in the current sentence String[] tokens = tokenizer.tokenize(sentence); Span[] tokenspan = tokenizer.tokenizePos(sentence); Span[] nameSpans = finder.find(tokens); double[] probs = finder.probs(); if (nameSpans != null && nameSpans.length > 0) { //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString()); //System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString()); for (Span span : nameSpans) { StringBuilder buf = new StringBuilder(); //System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd()); for (int j = span.getStart(); j < span.getEnd(); j++) { //System.out.println(tokens[i] + " appended to " + buf.toString()); buf.append(tokens[j]); if (j < span.getEnd() - 1) buf.append(" "); } String surfaceFormStr = buf.toString().trim(); if (surfaceFormStr.contains(".")) { surfaceFormStr = correctPhrase(surfaceFormStr, sentence); } int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart(); int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd(); /* System.out.println("\n\nRR-NE Found 
= " + buf.toString()); System.out.println("Start = " + entStart); System.out.println("End = " + entEnd); System.out.println("Sentence = " + sentence); System.out.println("Text = " + text); */ SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr); SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart); sfocc.features().put("type", new Feature("type", oType.toString())); sfOccurrences.add(sfocc); } } } finder.clearAdaptiveData(); if (LOG.isDebugEnabled()) { LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", ")); } return sfOccurrences; }