Usage examples for opennlp.tools.tokenize.TokenizerME#tokenizePos
public Span[] tokenizePos(String d)
From source file: edu.stanford.muse.index.NER.java
public static void testOpenNLP() { try {/*ww w . ja va2 s . com*/ String s = Util.readFile("/tmp/in"); /* List<Pair<String,Float>> pairs = NER.namesFromText(s); for (Pair<String,Float> p: pairs) { System.out.println (p); } System.out.println ("-----"); */ InputStream pis = Config.getResourceAsStream("en-ner-person.bin"); TokenNameFinderModel pmodel = new TokenNameFinderModel(pis); InputStream lis = Config.getResourceAsStream("en-ner-location.bin"); TokenNameFinderModel lmodel = new TokenNameFinderModel(lis); InputStream ois = Config.getResourceAsStream("en-ner-organization.bin"); TokenNameFinderModel omodel = new TokenNameFinderModel(ois); InputStream tokenStream = Config.getResourceAsStream("en-token.bin"); TokenizerModel modelTokenizer = new TokenizerModel(tokenStream); TokenizerME tokenizer = new TokenizerME(modelTokenizer); Span[] tokSpans = tokenizer.tokenizePos(s); // Util.tokenize(s).toArray(new String[0]); String tokens[] = new String[tokSpans.length]; for (int i = 0; i < tokSpans.length; i++) tokens[i] = s.substring(tokSpans[i].getStart(), tokSpans[i].getEnd()); NameFinderME pFinder = new NameFinderME(pmodel); Span[] pSpans = pFinder.find(tokens); NameFinderME lFinder = new NameFinderME(lmodel); Span[] lSpans = lFinder.find(tokens); NameFinderME oFinder = new NameFinderME(omodel); Span[] oSpans = oFinder.find(tokens); System.out.println("Names found:"); for (Span span : pSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } System.out.println("Locations found:"); for (Span span : lSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } System.out.println("Orgs found:"); for (Span span : oSpans) { for (int i = span.getStart(); i < span.getEnd(); i++) System.out.print(tokens[i] + " "); System.out.println(); } } catch (IOException e) { e.printStackTrace(); } }
From source file: org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java
/**Extracts noun-phrase n-grams from the given piece of input text. * @param text A Text object containing the input from where to extract NP n-grams * @return A list of SurfaceFormOccurrence objects. *//*from w w w . j ava2 s . c o m*/ protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) { String intext = text.text(); //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n"); List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>(); SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel); POSTaggerME posTagger = new POSTaggerME((POSModel) posModel); ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel); Span[] sentSpans = sentenceDetector.sentPosDetect(intext); for (Span sentSpan : sentSpans) { String sentence = sentSpan.getCoveredText(intext).toString(); int start = sentSpan.getStart(); Span[] tokSpans = tokenizer.tokenizePos(sentence); String[] tokens = new String[tokSpans.length]; // System.out.println("\n\nTokens:"); for (int i = 0; i < tokens.length; i++) { tokens[i] = tokSpans[i].getCoveredText(sentence).toString(); // System.out.println(tokens[i]); } String[] tags = posTagger.tag(tokens); Span[] chunks = chunker.chunkAsSpans(tokens, tags); for (Span chunk : chunks) { if ("NP".equals(chunk.getType())) { //Note: getStart()/getEnd() methods of Chunk spans only give the start and end token indexes of the chunk. //The actual Start/End positions of the chunk in the sentence need to be extracted from POS sentenceSpans. //They are offsets from the begining of the sentence in question. Need to add the start postion of the sentence //to compute the actual start/end offsets from the begining of the input text. 
int begin = tokSpans[chunk.getStart()].getStart(); int end = tokSpans[chunk.getEnd() - 1].getEnd(); List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() + -1); extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst); } } } return npNgramSFLst; }