List of usage examples for opennlp.tools.namefind.NameFinderME#find
public Span[] find(String[] tokens)
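Before the examples, a minimal, self-contained sketch of the call pattern (the model path and the sample tokens are assumptions for illustration, not taken from the examples below): find(String[]) takes the tokens of one sentence and returns Span objects whose start/end are token indices, with end exclusive; clearAdaptiveData() resets the finder's document-level adaptive data between documents.

    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.util.Arrays;

    import opennlp.tools.namefind.NameFinderME;
    import opennlp.tools.namefind.TokenNameFinderModel;
    import opennlp.tools.util.Span;

    public class FindExample {
        public static void main(String[] args) throws Exception {
            // The model path is an assumption; any TokenNameFinderModel works.
            try (InputStream modelIn = new FileInputStream("en-ner-person.bin")) {
                TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
                NameFinderME finder = new NameFinderME(model);
                // find(..) expects pre-tokenized input, one sentence at a time.
                String[] tokens = { "Pierre", "Vinken", "is", "61", "years", "old", "." };
                Span[] names = finder.find(tokens);
                for (Span name : names) {
                    // Span start/end are token indices; end is exclusive.
                    System.out.println(name.getType() + ": "
                            + String.join(" ", Arrays.copyOfRange(tokens, name.getStart(), name.getEnd())));
                }
                finder.clearAdaptiveData(); // reset between documents
            }
        }
    }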
From source file:edu.stanford.muse.index.NER.java
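Tokenizes an input file with TokenizerME, runs separate person, location, and organization finders over the same token array, and prints each matched token range: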
public static void testOpenNLP() {
    try {
        String s = Util.readFile("/tmp/in");
        /*
        List<Pair<String, Float>> pairs = NER.namesFromText(s);
        for (Pair<String, Float> p : pairs) {
            System.out.println(p);
        }
        System.out.println("-----");
        */
        InputStream pis = Config.getResourceAsStream("en-ner-person.bin");
        TokenNameFinderModel pmodel = new TokenNameFinderModel(pis);
        InputStream lis = Config.getResourceAsStream("en-ner-location.bin");
        TokenNameFinderModel lmodel = new TokenNameFinderModel(lis);
        InputStream ois = Config.getResourceAsStream("en-ner-organization.bin");
        TokenNameFinderModel omodel = new TokenNameFinderModel(ois);

        InputStream tokenStream = Config.getResourceAsStream("en-token.bin");
        TokenizerModel modelTokenizer = new TokenizerModel(tokenStream);
        TokenizerME tokenizer = new TokenizerME(modelTokenizer);
        Span[] tokSpans = tokenizer.tokenizePos(s); // Util.tokenize(s).toArray(new String[0]);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokSpans.length; i++)
            tokens[i] = s.substring(tokSpans[i].getStart(), tokSpans[i].getEnd());

        NameFinderME pFinder = new NameFinderME(pmodel);
        Span[] pSpans = pFinder.find(tokens);
        NameFinderME lFinder = new NameFinderME(lmodel);
        Span[] lSpans = lFinder.find(tokens);
        NameFinderME oFinder = new NameFinderME(omodel);
        Span[] oSpans = oFinder.find(tokens);

        System.out.println("Names found:");
        for (Span span : pSpans) {
            for (int i = span.getStart(); i < span.getEnd(); i++)
                System.out.print(tokens[i] + " ");
            System.out.println();
        }
        System.out.println("Locations found:");
        for (Span span : lSpans) {
            for (int i = span.getStart(); i < span.getEnd(); i++)
                System.out.print(tokens[i] + " ");
            System.out.println();
        }
        System.out.println("Orgs found:");
        for (Span span : oSpans) {
            for (int i = span.getStart(); i < span.getEnd(); i++)
                System.out.print(tokens[i] + " ");
            System.out.println();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.screenslicer.core.nlp.Person.java
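Combines dictionary-based first/last-name checks with NameFinderME results per sentence, falling back to a crude regex-based token split when neither approach yields exactly one candidate: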
public static String extractName(String src, boolean strict, boolean dictionaryOnly) {
    NameFinderME nameFinder = new NameFinderME(nameModel);
    String[] sentences = NlpUtil.sentences(src);
    Collection<String> nlpNames = new HashSet<String>();
    Collection<String> nlpFallbacks = new HashSet<String>();
    Collection<String> dictionaryNames = new HashSet<String>();
    Collection<String> dictionaryFallbacks = new HashSet<String>();
    for (int i = 0; i < sentences.length; i++) {
        String[] tokens = NlpUtil.tokensFromSentence(sentences[i]);
        for (int j = 0; j < tokens.length; j++) {
            String first = tokens[j];
            String last = null;
            if (j + 1 < tokens.length) {
                last = tokens[j + 1];
            }
            if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                dictionaryNames.add(first + " " + last);
            } else if (!strict && isFirstName(first, strict)) {
                dictionaryFallbacks.add(first);
            }
        }
        Span[] spans = nameFinder.find(tokens);
        // The original looped over the span indices while converting the whole
        // span array to strings on every iteration; converting once is
        // equivalent (the results go into sets) and avoids the rework.
        if (!dictionaryOnly && spans.length > 0) {
            for (String curName : Span.spansToStrings(spans, tokens)) {
                if (curName.contains(" ") && isFullName(curName, strict)) {
                    nlpNames.add(curName);
                } else if (isFirstName(curName, strict)) {
                    nlpFallbacks.add(curName);
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if ((dictionaryOnly || nlpNames.size() != 1) && dictionaryNames.size() != 1) {
        // no unique candidate yet: retry with a crude regex token split
        nlpNames.clear();
        nlpFallbacks.clear();
        dictionaryNames.clear();
        dictionaryFallbacks.clear();
        nameFinder.clearAdaptiveData();
        for (int s = 0; s < sentences.length; s++) {
            String[] tokens = sentences[s].split("[\\W\\s]|$|^");
            for (int i = 0; i < tokens.length; i++) {
                String first = tokens[i];
                String last = null;
                if (i + 1 < tokens.length) {
                    last = tokens[i + 1];
                }
                if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                    dictionaryNames.add(first + " " + last);
                } else if (!strict && isFirstName(first, strict)) {
                    dictionaryFallbacks.add(first);
                }
            }
            Span[] spans = nameFinder.find(tokens);
            if (!dictionaryOnly && spans.length > 0) {
                for (String curName : Span.spansToStrings(spans, tokens)) {
                    if (curName.contains(" ") && isFullName(curName, strict)) {
                        nlpNames.add(curName);
                    } else if (isFirstName(curName, strict)) {
                        nlpFallbacks.add(curName);
                    }
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if (nlpNames.size() == 1) {
        return nlpNames.iterator().next();
    }
    if (nlpFallbacks.size() == 1) {
        return nlpFallbacks.iterator().next();
    }
    if (dictionaryNames.size() == 1) {
        return dictionaryNames.iterator().next();
    }
    if (dictionaryFallbacks.size() == 1) {
        return dictionaryFallbacks.iterator().next();
    }
    return null;
}
From source file:org.sglover.nlp.CoreNLPEntityTagger.java
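Runs every configured TokenNameFinderModel over the tokens, records each span with its probability as a TextAnnotation, and clears the finder's adaptive data in a finally block: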
private void findEntities(Entities namedEntities, List<TextAnnotation> allTextAnnotations, String[] tokens) {
    for (Map.Entry<String, TokenNameFinderModel> finderEntry : tokenNameFinders.entrySet()) {
        String type = finderEntry.getKey();
        NameFinderME finder = new NameFinderME(finderEntry.getValue());
        try {
            Span[] spans = finder.find(tokens);
            double[] probs = finder.probs(spans);
            for (int ni = 0; ni < spans.length; ni++) {
                allTextAnnotations.add(new TextAnnotation(type, spans[ni], probs[ni]));
            }
        } finally {
            finder.clearAdaptiveData();
        }
    }
    if (allTextAnnotations.size() > 0) {
        removeConflicts(allTextAnnotations);
    }
    convertTextAnnotationsToNamedEntities(tokens, allTextAnnotations, namedEntities);
}
From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java
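Uses the Token and Sentence annotations already present on an AnalysedText, multiplies the per-token probabilities into a span confidence, and records each occurrence as both a NameOccurrence and a chunk annotation: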
/**
 * This method extracts NamedEntity occurrences by using existing {@link Token}s and
 * {@link Sentence}s in the parsed {@link AnalysedText}.
 * @param nameFinderModel the model used to find NamedEntities
 * @param at the AnalysedText
 * @param language the language of the text
 * @return the found NamedEntity occurrences
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    // collect the sentences of the analysed text
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) { // no sentence annotations
        sentences.add(at); // process the text as a single section
    }
    for (int i = 0; i < sentences.size(); i++) {
        String sentence = sentences.get(i).getSpan();
        // build a context by concatenating the previous, current and next
        // sentence, used for similarity ranking / disambiguation and as
        // contextual snippet in the extraction structure
        List<String> contextElements = new ArrayList<String>();
        contextElements.add(sentence);
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(),
                sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
        // get the tokens (words) of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext();) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(),
                    tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            // span confidence = product of the per-token probabilities
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            // create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context,
                    confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            // also add the NER annotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            // TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java
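Variant of the same engine that works on a plain string: it detects sentences and tokens itself, prefers the span probability introduced with OpenNLP 1.6, and falls back to multiplying per-token probabilities when the span probability is unset: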
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

        // build a context by concatenating three sentences, used for
        // similarity ranking / disambiguation and as contextual snippet in
        // the extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");

        // extract the names in the current sentence and store them with the
        // current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                    tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            // NOTE: with OpenNLP 1.6 the probability is stored in the span;
            // prob == 0.0 means "unspecified"
            double prob = nameSpans[j].getProb();
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) {
                // fall back to the old way if the span probability is not set
                // (prob must be reset to 1.0 before multiplying; the original
                // multiplied into the unspecified 0.0, which always yields 0.0)
                prob = 1.0;
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                // Perceptron-based models may return invalid probabilities. As
                // named entities with a probability < 50% are not expected to
                // be returned by finder.find(..) at all, confidence values
                // < 0.5 are ignored here.
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(),
                    context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
From source file:org.dbpedia.spotlight.spot.NESpotter.java
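Detects sentences with SentenceDetectorME, tokenizes each with SimpleTokenizer, and converts every name span into a SurfaceFormOccurrence with document-absolute offsets: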
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    // character offset of each sentence start, used to make token offsets
    // absolute within the document
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);
    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    Tokenizer tokenizer = new SimpleTokenizer();
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];
        //LOG.debug("Sentence: " + sentence);

        // extract the names in the current sentence
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();

        if (nameSpans != null && nameSpans.length > 0) {
            for (Span span : nameSpans) {
                // rebuild the surface form from its tokens
                StringBuilder buf = new StringBuilder();
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd();

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }
    }
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}