Usage examples for opennlp.tools.chunker.ChunkerME#chunkAsSpans
public Span[] chunkAsSpans(String[] toks, String[] tags)
From source file: org.sglover.nlp.CoreNLPEntityTagger.java
@Override protected Entities getEntitiesImpl(String content) { Entities namedEntities = Entities.empty(); SentenceModel sentenceModel = sentenceModels.get("en"); SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel); String[] sentences = sentenceDetector.sentDetect(content); TokenizerModel tm = tokenizerModels.get("en"); TokenizerME wordBreaker = new TokenizerME(tm); for (String sentence : sentences) { String[] tokens = wordBreaker.tokenize(sentence); List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>(); POSModel posModel = posModels.get("en"); POSTaggerME posme = new POSTaggerME(posModel); String[] posTags = posme.tag(tokens); List<String> npTokens = new LinkedList<>(); ChunkerModel chunkerModel = chunkerModels.get("en"); ChunkerME chunkerME = new ChunkerME(chunkerModel); Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags); String[] chunkStrings = Span.spansToStrings(chunks, tokens); for (int i = 0; i < chunks.length; i++) { String chunkString = chunkStrings[i]; logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType()); if (chunks[i].getType().equals("NP")) { npTokens.add(chunkString); }// w w w . j av a 2 s . c o m } // findEntities(namedEntities, allTextAnnotations, // npTokens.toArray(new String[0])); findEntities(namedEntities, allTextAnnotations, tokens); } return namedEntities; }
From source file: hrpod.tools.nlp.NLPTools.java
public String[] tokenize(String text) { String[] chunkStrings = null; try {//from w w w.j a v a 2 s . c o m TokenizerME wordBreaker = new TokenizerME(getTokenModel()); POSTaggerME posme = new POSTaggerME(getPosModel()); ChunkerME chunkerME = new ChunkerME(getChunkerModel()); //words is the tokenized sentence String[] words = wordBreaker.tokenize(text); //posTags are the parts of speech of every word in the sentence (The chunker needs this info) String[] posTags = posme.tag(words); //chunks are the start end "spans" indices to the chunks in the words array Span[] chunks = chunkerME.chunkAsSpans(words, posTags); //chunkStrings are the actual chunks chunkStrings = Span.spansToStrings(chunks, words); //for (int i = 0; i < chunks.length; i++) { // if (chunks[i].getType().equals("NP")) { // System.out.println("NP: \n\t" + chunkStrings[i]); //String[] split = chunkStrings[i].split(" "); //List<String> ngrams = ngram(Arrays.asList(split), N, " "); //System.out.println("ngrams:"); //for (String gram : ngrams) { // System.out.println("\t" + gram); //} //} //} } catch (Exception e) { logger.error("Error in tokenize", e); } return chunkStrings; }
From source file: org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java
/**Extracts noun-phrase n-grams from the given piece of input text. * @param text A Text object containing the input from where to extract NP n-grams * @return A list of SurfaceFormOccurrence objects. *//*from w w w .j a v a2 s .c o m*/ protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) { String intext = text.text(); //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n"); List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>(); SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel); POSTaggerME posTagger = new POSTaggerME((POSModel) posModel); ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel); Span[] sentSpans = sentenceDetector.sentPosDetect(intext); for (Span sentSpan : sentSpans) { String sentence = sentSpan.getCoveredText(intext).toString(); int start = sentSpan.getStart(); Span[] tokSpans = tokenizer.tokenizePos(sentence); String[] tokens = new String[tokSpans.length]; // System.out.println("\n\nTokens:"); for (int i = 0; i < tokens.length; i++) { tokens[i] = tokSpans[i].getCoveredText(sentence).toString(); // System.out.println(tokens[i]); } String[] tags = posTagger.tag(tokens); Span[] chunks = chunker.chunkAsSpans(tokens, tags); for (Span chunk : chunks) { if ("NP".equals(chunk.getType())) { //Note: getStart()/getEnd() methods of Chunk spans only give the start and end token indexes of the chunk. //The actual Start/End positions of the chunk in the sentence need to be extracted from POS sentenceSpans. //They are offsets from the begining of the sentence in question. Need to add the start postion of the sentence //to compute the actual start/end offsets from the begining of the input text. 
int begin = tokSpans[chunk.getStart()].getStart(); int end = tokSpans[chunk.getEnd() - 1].getEnd(); List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() + -1); extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst); } } } return npNgramSFLst; }