List of usage examples for opennlp.tools.util.Span.spansToStrings
public static String[] spansToStrings(Span[] spans, String[] tokens)
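The method returns, for each span, the covered tokens joined into a single string. Below is a minimal, self-contained sketch of the call; the sentence and the hard-coded spans are made up for illustration (real spans would come from a chunker or name finder, as in the examples that follow):

import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

public class SpansToStringsExample {
    public static void main(String[] args) {
        // rule-based tokenizer, so no model file is needed for this sketch
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize("Pierre Vinken joined the board of Elsevier");

        // hypothetical spans; Span(start, end) covers tokens[start] .. tokens[end - 1]
        Span[] spans = { new Span(0, 2, "person"), new Span(6, 7, "organization") };

        // materialize each span as its covered tokens joined by single spaces
        String[] covered = Span.spansToStrings(spans, tokens);
        for (int i = 0; i < covered.length; i++) {
            System.out.println(spans[i].getType() + ": " + covered[i]);
        }
        // prints:
        // person: Pierre Vinken
        // organization: Elsevier
    }
}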
From source file: hrpod.tools.nlp.NLPTools.java
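This helper tokenizes the input, tags it for part of speech, chunks it, and then uses spansToStrings to materialize the chunk spans against the token array.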
public String[] tokenize(String text) {
    String[] chunkStrings = null;
    try {
        TokenizerME wordBreaker = new TokenizerME(getTokenModel());
        POSTaggerME posme = new POSTaggerME(getPosModel());
        ChunkerME chunkerME = new ChunkerME(getChunkerModel());

        // words is the tokenized sentence
        String[] words = wordBreaker.tokenize(text);
        // posTags are the parts of speech of every word in the sentence
        // (the chunker needs this info)
        String[] posTags = posme.tag(words);
        // chunks are the start/end span indices into the words array
        Span[] chunks = chunkerME.chunkAsSpans(words, posTags);
        // chunkStrings are the actual chunks
        chunkStrings = Span.spansToStrings(chunks, words);
        // (a commented-out experiment in the original filtered the "NP" chunks
        // and built n-grams from them)
    } catch (Exception e) {
        logger.error("Error in tokenize", e);
    }
    return chunkStrings;
}
From source file: com.screenslicer.core.nlp.Person.java
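This name extractor cross-checks dictionary lookups against NameFinderME results; spansToStrings converts the finder's spans into candidate name strings before they are filtered into full names and first-name fallbacks.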
public static String extractName(String src, boolean strict, boolean dictionaryOnly) {
    NameFinderME nameFinder = new NameFinderME(nameModel);
    String[] sentences = NlpUtil.sentences(src);
    Collection<String> nlpNames = new HashSet<String>();
    Collection<String> nlpFallbacks = new HashSet<String>();
    Collection<String> dictionaryNames = new HashSet<String>();
    Collection<String> dictionaryFallbacks = new HashSet<String>();
    for (int i = 0; i < sentences.length; i++) {
        String[] tokens = NlpUtil.tokensFromSentence(sentences[i]);
        for (int j = 0; j < tokens.length; j++) {
            String first = tokens[j];
            String last = null;
            if (j + 1 < tokens.length) {
                last = tokens[j + 1];
            }
            if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                dictionaryNames.add(first + " " + last);
            } else if (!strict && isFirstName(first, strict)) {
                dictionaryFallbacks.add(first);
            }
        }
        Span[] spans = nameFinder.find(tokens);
        // the original re-processed the full curNames list once per span;
        // converting the spans to strings a single time per sentence is equivalent
        if (!dictionaryOnly) {
            for (String curName : Span.spansToStrings(spans, tokens)) {
                if (curName.contains(" ") && isFullName(curName, strict)) {
                    nlpNames.add(curName);
                } else if (isFirstName(curName, strict)) {
                    nlpFallbacks.add(curName);
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if ((dictionaryOnly || nlpNames.size() != 1) && dictionaryNames.size() != 1) {
        nlpNames.clear();
        nlpFallbacks.clear();
        dictionaryNames.clear();
        dictionaryFallbacks.clear();
        nameFinder.clearAdaptiveData();
        // second pass with a cruder, regex-based tokenization
        for (int s = 0; s < sentences.length; s++) {
            String[] tokens = sentences[s].split("[\\W\\s]|$|^");
            for (int i = 0; i < tokens.length; i++) {
                String first = tokens[i];
                String last = null;
                if (i + 1 < tokens.length) {
                    last = tokens[i + 1];
                }
                if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                    dictionaryNames.add(first + " " + last);
                } else if (!strict && isFirstName(first, strict)) {
                    dictionaryFallbacks.add(first);
                }
            }
            Span[] spans = nameFinder.find(tokens);
            if (!dictionaryOnly) {
                for (String curName : Span.spansToStrings(spans, tokens)) {
                    if (curName.contains(" ") && isFullName(curName, strict)) {
                        nlpNames.add(curName);
                    } else if (isFirstName(curName, strict)) {
                        nlpFallbacks.add(curName);
                    }
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if (nlpNames.size() == 1) {
        return nlpNames.iterator().next();
    }
    if (nlpFallbacks.size() == 1) {
        return nlpFallbacks.iterator().next();
    }
    if (dictionaryNames.size() == 1) {
        return dictionaryNames.iterator().next();
    }
    if (dictionaryFallbacks.size() == 1) {
        return dictionaryFallbacks.iterator().next();
    }
    return null;
}
From source file: org.sglover.nlp.CoreNLPEntityTagger.java
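This entity tagger runs the full sentence/token/POS/chunk pipeline per sentence and uses spansToStrings to obtain the chunk texts, collecting the noun phrases along the way.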
@Override
protected Entities getEntitiesImpl(String content) {
    Entities namedEntities = Entities.empty();

    SentenceModel sentenceModel = sentenceModels.get("en");
    SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(content);

    TokenizerModel tm = tokenizerModels.get("en");
    TokenizerME wordBreaker = new TokenizerME(tm);

    for (String sentence : sentences) {
        String[] tokens = wordBreaker.tokenize(sentence);
        List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>();

        POSModel posModel = posModels.get("en");
        POSTaggerME posme = new POSTaggerME(posModel);
        String[] posTags = posme.tag(tokens);

        List<String> npTokens = new LinkedList<>();

        ChunkerModel chunkerModel = chunkerModels.get("en");
        ChunkerME chunkerME = new ChunkerME(chunkerModel);
        Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags);
        String[] chunkStrings = Span.spansToStrings(chunks, tokens);
        for (int i = 0; i < chunks.length; i++) {
            String chunkString = chunkStrings[i];
            logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType());
            if (chunks[i].getType().equals("NP")) {
                npTokens.add(chunkString);
            }
        }

        // findEntities(namedEntities, allTextAnnotations,
        //         npTokens.toArray(new String[0]));
        findEntities(namedEntities, allTextAnnotations, tokens);
    }

    return namedEntities;
}
From source file: org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java
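This Apache Stanbol NER engine tokenizes each detected sentence with tokenizePos(..) and uses spansToStrings to turn the resulting token spans into the token array consumed by NameFinderME.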
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");

        // extract the names in the current sentence and
        // store them with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                    tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            // NOTE: with OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            // prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) {
                // fall back to the old method if the span probability is not set
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                // Perceptron-based models can return invalid probabilities. Since
                // named entities with a probability < 50% are not expected to be
                // returned by finder.find(..) at all, confidence values < 0.5 are
                // ignored here.
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd,
                    nerTag.getType(), context, confidence);

            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
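Unlike the other examples, the Span.spansToStrings(tokenSpans, sentence) call above passes the sentence string itself rather than a token array, so it resolves to the spansToStrings(Span[] spans, CharSequence text) overload, which extracts each span's covered text directly from the character sequence.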
From source file: org.apache.tika.parser.geo.topic.NameEntityExtractor.java
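This Apache Tika geo-topic helper runs the name finder over whitespace-split input and uses spansToStrings to collect the matched name strings.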
public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
    String[] in = IOUtils.toString(stream, UTF_8).split(" ");
    Span[] nameE;

    // the name finder is not thread safe:
    // https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
    synchronized (nameFinder) {
        nameE = nameFinder.find(in);
        // the same name finder is reused, so clear adaptive data
        nameFinder.clearAdaptiveData();
    }

    // iterate the extracted span strings directly instead of round-tripping
    // the array through Arrays.toString(..) and re-splitting on commas
    for (String name : Span.spansToStrings(nameE, in)) {
        this.locationNameEntities.add(name.trim());
    }
}