Example usage for opennlp.tools.chunker ChunkerME chunkAsSpans

Introduction

In this page you can find the example usage for opennlp.tools.chunker ChunkerME chunkAsSpans.

Prototype

public Span[] chunkAsSpans(String[] toks, String[] tags)

Source Link

Usage

From source file:org.sglover.nlp.CoreNLPEntityTagger.java

@Override
protected Entities getEntitiesImpl(String content) {
    Entities namedEntities = Entities.empty();

    SentenceModel sentenceModel = sentenceModels.get("en");
    SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(content);

    TokenizerModel tm = tokenizerModels.get("en");
    TokenizerME wordBreaker = new TokenizerME(tm);

    for (String sentence : sentences) {
        String[] tokens = wordBreaker.tokenize(sentence);

        List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>();

        POSModel posModel = posModels.get("en");
        POSTaggerME posme = new POSTaggerME(posModel);
        String[] posTags = posme.tag(tokens);

        List<String> npTokens = new LinkedList<>();

        ChunkerModel chunkerModel = chunkerModels.get("en");
        ChunkerME chunkerME = new ChunkerME(chunkerModel);
        Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags);
        String[] chunkStrings = Span.spansToStrings(chunks, tokens);
        for (int i = 0; i < chunks.length; i++) {
            String chunkString = chunkStrings[i];
            logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType());
            if (chunks[i].getType().equals("NP")) {
                npTokens.add(chunkString);
            }// w  w w .  j  av a  2  s .  c o  m
        }

        // findEntities(namedEntities, allTextAnnotations,
        // npTokens.toArray(new String[0]));
        findEntities(namedEntities, allTextAnnotations, tokens);
    }

    return namedEntities;
}

From source file:hrpod.tools.nlp.NLPTools.java

public String[] tokenize(String text) {

    String[] chunkStrings = null;

    try {//from  w  w w.j  a  v a 2  s  .  c o  m
        TokenizerME wordBreaker = new TokenizerME(getTokenModel());
        POSTaggerME posme = new POSTaggerME(getPosModel());
        ChunkerME chunkerME = new ChunkerME(getChunkerModel());

        //words is the tokenized sentence
        String[] words = wordBreaker.tokenize(text);
        //posTags are the parts of speech of every word in the sentence (The chunker needs this info)
        String[] posTags = posme.tag(words);
        //chunks are the start end "spans" indices to the chunks in the words array
        Span[] chunks = chunkerME.chunkAsSpans(words, posTags);
        //chunkStrings are the actual chunks
        chunkStrings = Span.spansToStrings(chunks, words);
        //for (int i = 0; i < chunks.length; i++) {
        //    if (chunks[i].getType().equals("NP")) {
        //        System.out.println("NP: \n\t" + chunkStrings[i]);

        //String[] split = chunkStrings[i].split(" ");
        //List<String> ngrams = ngram(Arrays.asList(split), N, " ");
        //System.out.println("ngrams:");
        //for (String gram : ngrams) {
        //  System.out.println("\t" + gram);
        //}
        //}
        //}
    } catch (Exception e) {
        logger.error("Error in tokenize", e);
    }

    return chunkStrings;

}

From source file:org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java

/**Extracts noun-phrase n-grams from the given piece of input text. 
 * @param text  A Text object containing the input from where to extract NP n-grams
 * @return A list of SurfaceFormOccurrence objects.
 *//*from   w  w  w  .j a  v  a2 s  .c  o  m*/
protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
    String intext = text.text();
    //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n");
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(intext).toString();
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        // System.out.println("\n\nTokens:");
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
            // System.out.println(tokens[i]);
        }
        String[] tags = posTagger.tag(tokens);
        Span[] chunks = chunker.chunkAsSpans(tokens, tags);
        for (Span chunk : chunks) {
            if ("NP".equals(chunk.getType())) {
                //Note: getStart()/getEnd() methods of Chunk spans only give the start and end token indexes of the chunk.
                //The actual Start/End positions of the chunk in the sentence need to be extracted from POS sentenceSpans.
                //They are offsets from the begining of the sentence in question. Need to add the start postion of the sentence
                //to compute the actual start/end offsets from the begining of the input text.
                int begin = tokSpans[chunk.getStart()].getStart();
                int end = tokSpans[chunk.getEnd() - 1].getEnd();
                List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() + -1);
                extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
            }
        }
    }
    return npNgramSFLst;
}