Java tutorial
/* * Copyright 2012 DBpedia Spotlight Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org */ package org.dbpedia.spotlight.spot; import opennlp.tools.chunker.ChunkerME; import opennlp.tools.chunker.ChunkerModel; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTaggerME; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.Span; import opennlp.tools.util.model.BaseModel; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dbpedia.spotlight.exceptions.ConfigurationException; import org.dbpedia.spotlight.model.SpotlightConfiguration; import org.dbpedia.spotlight.model.SurfaceForm; import org.dbpedia.spotlight.model.SurfaceFormOccurrence; import org.dbpedia.spotlight.model.Text; import java.util.*; /** * Consider as spots only the expressions marked as: * - named entities by the NER tagger, * - the noun-phrase chunks extracted by a shallow parser, * - all sub-expressions of up to 5 tokens of the noun-phrase chunks. * * This increases the coverage of NESpotter, which tends to annotate very little. * * The main advantage of this approach against the dictionary-based approach that we use now is that * it will not require 8GB of RAM to store a dictionary at runtime. * * TODO requires us to deal with overlaps when generating the annotations * TODO do not break chunks within quotes. * TODO do not get n-grams from NER spots, only from NP chunk spots * * @author Rohana Rajapakse (GOSS Interactive Limited) - implemented the class * @author pablomendes - adapted to integrate with architecture, made stopwords configurable and case insensitive, adjusted logging */ public class OpenNLPNGramSpotter implements Spotter { private final Log LOG = LogFactory.getLog(this.getClass()); protected static BaseModel sentenceModel = null; protected static BaseModel chunkModel = null; protected static BaseModel tokenModel = null; protected static BaseModel posModel = null; protected Set<String> stopWords = SpotlightConfiguration.DEFAULT_STOPWORDS; //String directoryPath = "C:/software/appservers/dbp-spotlight-trunk3/data/models/opennlp/"; //now reading from configuration properties String directoryPath = null; //Need OpenNLP modles. At present they are loaded in the constructor, but they should better be loaded at the startup of //dbpediaSpotlight to avoid re-loading them each time a request arrives. A singleton to hold the models would do. public OpenNLPNGramSpotter(String opennlpmodeldir, String i18nLanguageCode) throws ConfigurationException { //directoryPath = null; //for reading from dependency Jar files String directoryPath = opennlpmodeldir; if (OpenNLPNGramSpotter.sentenceModel == null) { OpenNLPNGramSpotter.sentenceModel = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode + OpenNLPUtil.OpenNlpModels.SentenceModel.filename(), OpenNLPUtil.OpenNlpModels.SentenceModel.toString()); } if (OpenNLPNGramSpotter.chunkModel == null) { OpenNLPNGramSpotter.chunkModel = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode + OpenNLPUtil.OpenNlpModels.ChunkModel.filename(), OpenNLPUtil.OpenNlpModels.ChunkModel.toString()); } if (OpenNLPNGramSpotter.posModel == null) { OpenNLPNGramSpotter.posModel = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode + OpenNLPUtil.OpenNlpModels.POSModel.filename(), OpenNLPUtil.OpenNlpModels.POSModel.toString()); } if (OpenNLPNGramSpotter.tokenModel == null) { OpenNLPNGramSpotter.tokenModel = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode + OpenNLPUtil.OpenNlpModels.TokenizerModel.filename(), OpenNLPUtil.OpenNlpModels.TokenizerModel.toString()); } } @Override public List<SurfaceFormOccurrence> extract(Text text) { //System.out.println("\n\nRR- extract(...) method called! with text: " + intext + "\n\n"); //remove special chars from input text, and keep a list of positions of them n a list. //start/end offsets need to be adjusted after extracting spots from cleaned text. String orgText = text.text(); List<Integer> chars2removeLst = OpenNLPUtil.chars2remove(orgText); String cleanText = OpenNLPUtil.cleanText(orgText, chars2removeLst); Text cleanTextStr = new Text(cleanText); //extracting NounPhrase nGrams List<SurfaceFormOccurrence> npNgrams = extractNPNGrams(cleanTextStr); /* System.out.println("\n\nAll NGrams of sentence:"); System.out.println(intext + "\n"); for( SurfaceFormOccurrence ng: npNgrams) { System.out.println(ng.surfaceForm() + " [" + ng.textOffset() + "]"); } */ if (npNgrams != null && !npNgrams.isEmpty()) { //lets correct the offsets for (SurfaceFormOccurrence ng : npNgrams) { int offset_clean = ng.textOffset(); int offset_org = OpenNLPUtil.computeOffset(orgText, offset_clean, chars2removeLst); ng.setTextOffset(offset_org); //System.out.println(ng.surfaceForm() + " [" + ng.textOffset() + "]"); } return npNgrams; } else { return (List) new ArrayList<String>(); } } String name = "OpenNLPNGramSpotter"; @Override public String getName() { return name; } @Override public void setName(String n) { this.name = n; } /**Extracts noun-phrase n-grams from the given piece of input text. * @param text A Text object containing the input from where to extract NP n-grams * @return A list of SurfaceFormOccurrence objects. */ protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) { String intext = text.text(); //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n"); List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>(); SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel); POSTaggerME posTagger = new POSTaggerME((POSModel) posModel); ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel); Span[] sentSpans = sentenceDetector.sentPosDetect(intext); for (Span sentSpan : sentSpans) { String sentence = sentSpan.getCoveredText(intext).toString(); int start = sentSpan.getStart(); Span[] tokSpans = tokenizer.tokenizePos(sentence); String[] tokens = new String[tokSpans.length]; // System.out.println("\n\nTokens:"); for (int i = 0; i < tokens.length; i++) { tokens[i] = tokSpans[i].getCoveredText(sentence).toString(); // System.out.println(tokens[i]); } String[] tags = posTagger.tag(tokens); Span[] chunks = chunker.chunkAsSpans(tokens, tags); for (Span chunk : chunks) { if ("NP".equals(chunk.getType())) { //Note: getStart()/getEnd() methods of Chunk spans only give the start and end token indexes of the chunk. //The actual Start/End positions of the chunk in the sentence need to be extracted from POS sentenceSpans. //They are offsets from the begining of the sentence in question. Need to add the start postion of the sentence //to compute the actual start/end offsets from the begining of the input text. int begin = tokSpans[chunk.getStart()].getStart(); int end = tokSpans[chunk.getEnd() - 1].getEnd(); List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() + -1); extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst); } } } return npNgramSFLst; } public void extractNGrams(List<Map<String, Integer>> ngrampos, int start, Text text, Span[] tokSpans, List<SurfaceFormOccurrence> sfOccurrences) { String intext = text.text(); for (Map<String, Integer> mapelem : ngrampos) { int starttokenidx = mapelem.get("start"); int endtokenidx = mapelem.get("end"); //restrict to max 3-word phrases int noftkens = endtokenidx - starttokenidx; boolean ignorephrase = false; int begin = start + tokSpans[starttokenidx].getStart(); int end = start + tokSpans[endtokenidx].getEnd(); String txtform = intext.substring(begin, end); //ignore empty phrases if (txtform.trim().length() == 0) { //System.out.println("empty txtform"); continue; } //Ignore phrases that contain more than 3-terms. It is unlikely that such long phrases to hit any resources. //TODO Need to experiment with the cut-off value though. if (noftkens > 2) ignorephrase = true; //ignore phrases starting with a stopword int starttkn_begin = start + tokSpans[starttokenidx].getStart(); int starttkn_end = start + tokSpans[starttokenidx].getEnd(); String starttknTxt = intext.substring(starttkn_begin, starttkn_end); if (isStopWord(starttknTxt)) ignorephrase = true; //ignore phrases ending with a stopword int endtkn_begin = start + tokSpans[endtokenidx].getStart(); int endtkn_end = start + tokSpans[endtokenidx].getEnd(); String endtknTxt = intext.substring(endtkn_begin, endtkn_end); if (isStopWord(endtknTxt)) ignorephrase = true; if (!ignorephrase) { NGram ng = new NGram(txtform, begin, end); SurfaceForm surfaceForm = new SurfaceForm(ng.getTextform()); assert !ng.getTextform().isEmpty(); SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, ng.getStart()); if (surfaceForm.name().trim().length() > 0 && !sfOccurrences.contains(sfocc)) { sfOccurrences.add(sfocc); } } } } /**Generates a list of start/end tokens (indexes) of all sub=phrases/n-grams, given start and end token indexes * e.g. if start token index and end token index are 5 and 7 (means token 5,6 and 7 makes up a noun phrase) * then generate [5], [5,6], [5,6,7], [6], [6,7] and [7] as sub-phrases (n-grams) of the the original phrase. * @param startpos * @param endpos * @return A list of Maps. A Map element has only two keys "start" and "end". */ public List<Map<String, Integer>> extractNGramPos(int startpos, int endpos) { List<Map<String, Integer>> ngrampos1 = new ArrayList<Map<String, Integer>>(); if (startpos <= endpos) { for (int i = startpos; i <= endpos; i++) { for (int j = i; j <= endpos; j++) { int start = i; int end = j; Map<String, Integer> posmap = new HashMap<String, Integer>(); posmap.put("start", start); posmap.put("end", end); ngrampos1.add(posmap); } } } return ngrampos1; } /**Uses the stopWords from Lucene (StopAnalyzer.ENGLISH_STOP_WORDS_SET) to find if a given piece of text is * a stopword. * @param word * @return true if the input text is a stopword. */ private boolean isStopWord(String word) { boolean ret = false; ret = stopWords.contains(word.toLowerCase()); return ret; } }