Usage examples for the opennlp.tools.tokenize.SimpleTokenizer constructor SimpleTokenizer().
// No-argument constructor of SimpleTokenizer; marked deprecated, so new code should avoid calling it.
// NOTE(review): the replacement is presumably the shared SimpleTokenizer.INSTANCE — confirm against the OpenNLP version in use.
@Deprecated
public SimpleTokenizer()
From source file: org.dbpedia.spotlight.spot.NESpotter.java
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) { String intext = text.text();// w ww . ja va2s .co m SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); String[] sentences = sentenceDetector.sentDetect(intext); Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext); int[] sentencePositions = new int[sentences.length + 1]; for (int k = 0; k < sentenceEndings.length; k++) { sentencePositions[k] = sentenceEndings[k].getStart(); } NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel); List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>(); Tokenizer tokenizer = new SimpleTokenizer(); for (int i = 0; i < sentences.length; i++) { String sentence = sentences[i]; //LOG.debug("Sentence: " + sentence); // extract the names in the current sentence String[] tokens = tokenizer.tokenize(sentence); Span[] tokenspan = tokenizer.tokenizePos(sentence); Span[] nameSpans = finder.find(tokens); double[] probs = finder.probs(); if (nameSpans != null && nameSpans.length > 0) { //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString()); //System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString()); for (Span span : nameSpans) { StringBuilder buf = new StringBuilder(); //System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd()); for (int j = span.getStart(); j < span.getEnd(); j++) { //System.out.println(tokens[i] + " appended to " + buf.toString()); buf.append(tokens[j]); if (j < span.getEnd() - 1) buf.append(" "); } String surfaceFormStr = buf.toString().trim(); if (surfaceFormStr.contains(".")) { surfaceFormStr = correctPhrase(surfaceFormStr, sentence); } int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart(); int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd(); /* System.out.println("\n\nRR-NE Found = " 
+ buf.toString()); System.out.println("Start = " + entStart); System.out.println("End = " + entEnd); System.out.println("Sentence = " + sentence); System.out.println("Text = " + text); */ SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr); SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart); sfocc.features().put("type", new Feature("type", oType.toString())); sfOccurrences.add(sfocc); } } } finder.clearAdaptiveData(); if (LOG.isDebugEnabled()) { LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", ")); } return sfOccurrences; }