Example usage for opennlp.tools.tokenize TokenizerME tokenizePos

Introduction

On this page you can find example usages of the tokenizePos method of opennlp.tools.tokenize.TokenizerME.

Prototype

public Span[] tokenizePos(String d)

Source Link

Document

Tokenizes the string.

Usage

From source file:edu.stanford.muse.index.NER.java

/**
 * Ad-hoc smoke test for the OpenNLP name finders.
 * <p>
 * Reads the text stored in {@code /tmp/in}, tokenizes it with the
 * {@code en-token.bin} model, runs the person, location and organization
 * name-finder models over the resulting tokens and prints every match to
 * standard output, one match per line.
 * <p>
 * An {@link IOException} raised while reading the input or loading a model is
 * printed to standard error and the method returns normally.
 */
public static void testOpenNLP() {
    try {
        String s = Util.readFile("/tmp/in");

        // Each loader closes its resource stream once the model has been built.
        TokenNameFinderModel pmodel = loadNameFinderModel("en-ner-person.bin");
        TokenNameFinderModel lmodel = loadNameFinderModel("en-ner-location.bin");
        TokenNameFinderModel omodel = loadNameFinderModel("en-ner-organization.bin");
        TokenizerME tokenizer = new TokenizerME(loadTokenizerModel("en-token.bin"));

        // tokenizePos yields character offsets into s; the name finders want the token strings.
        Span[] tokSpans = tokenizer.tokenizePos(s);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokSpans.length; i++) {
            tokens[i] = s.substring(tokSpans[i].getStart(), tokSpans[i].getEnd());
        }

        Span[] pSpans = new NameFinderME(pmodel).find(tokens);
        Span[] lSpans = new NameFinderME(lmodel).find(tokens);
        Span[] oSpans = new NameFinderME(omodel).find(tokens);

        printTokenSpans("Names found:", pSpans, tokens);
        System.out.println();
        printTokenSpans("Locations found:", lSpans, tokens);
        System.out.println();
        printTokenSpans("Orgs found:", oSpans, tokens);
    } catch (IOException e) {
        // Best-effort diagnostic helper: report the failure and carry on.
        e.printStackTrace();
    }
}

/**
 * Loads a name-finder model from the named resource and closes the stream afterwards.
 *
 * @param resource name passed to {@code Config.getResourceAsStream}
 * @return the model built from the resource
 * @throws IOException if the model cannot be read or the stream cannot be closed
 */
private static TokenNameFinderModel loadNameFinderModel(String resource) throws IOException {
    InputStream in = Config.getResourceAsStream(resource);
    try {
        return new TokenNameFinderModel(in);
    } finally {
        // NOTE(review): guards against a null stream for a missing resource — confirm Config's contract.
        if (in != null) {
            in.close();
        }
    }
}

/**
 * Loads a tokenizer model from the named resource and closes the stream afterwards.
 *
 * @param resource name passed to {@code Config.getResourceAsStream}
 * @return the model built from the resource
 * @throws IOException if the model cannot be read or the stream cannot be closed
 */
private static TokenizerModel loadTokenizerModel(String resource) throws IOException {
    InputStream in = Config.getResourceAsStream(resource);
    try {
        return new TokenizerModel(in);
    } finally {
        // NOTE(review): guards against a null stream for a missing resource — confirm Config's contract.
        if (in != null) {
            in.close();
        }
    }
}

/**
 * Prints a heading followed by one line per span, each line listing the span's
 * tokens separated (and followed) by a single space.
 *
 * @param heading line printed before the spans
 * @param spans   token-index spans (start inclusive, end exclusive) into {@code tokens}
 * @param tokens  the token strings the spans refer to
 */
private static void printTokenSpans(String heading, Span[] spans, String[] tokens) {
    System.out.println(heading);
    for (Span span : spans) {
        for (int i = span.getStart(); i < span.getEnd(); i++) {
            System.out.print(tokens[i] + " ");
        }
        System.out.println();
    }
}

From source file:org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java

/**
 * Extracts noun-phrase n-grams from the given piece of input text.
 * <p>
 * The text is split into sentences; each sentence is tokenized, POS-tagged and
 * chunked, and every chunk typed {@code "NP"} is expanded into n-grams that are
 * appended to the result list.
 *
 * @param text  A Text object containing the input from where to extract NP n-grams
 * @return A list of SurfaceFormOccurrence objects.
 */
protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
    String intext = text.text();
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(intext).toString();
        // Offset of this sentence within the whole input text.
        int start = sentSpan.getStart();

        // Token spans are character offsets relative to the sentence, not to the input text.
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }

        String[] tags = posTagger.tag(tokens);
        Span[] chunks = chunker.chunkAsSpans(tokens, tags);
        for (Span chunk : chunks) {
            if ("NP".equals(chunk.getType())) {
                // Chunk spans hold token indexes (end exclusive), so the last token of the
                // chunk is at getEnd() - 1. The sentence offset and the token spans are handed
                // to extractNGrams, presumably so it can turn token indexes into character
                // offsets from the beginning of the input text — verify against extractNGrams.
                List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() - 1);
                extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
            }
        }
    }
    return npNgramSFLst;
}