Usage examples for the method opennlp.tools.sentdetect.SentenceDetectorME.sentDetect
public String[] sentDetect(String s)
From source file:com.screenslicer.core.nlp.NlpUtil.java
public static String[] sentences(String src) { if (CommonUtil.isEmpty(src)) { return new String[0]; }//from w w w . j ava2 s. c om SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel); return sentenceDetector.sentDetect(src); }
From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/** * Utility offered to other elements of the pipeline for text tokenizing. * * @param text the text to tokenize//from ww w.j a v a 2 s .c om * @param language the language of the input text * @return an array containing the tokenized text. */ public static String[] tokenizeText(String text, String language) { setup(); // Split the text into sentences SentenceModel sentModel = getSentenceModel(language + "-sent"); SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel); String sentences[] = sentenceDetector.sentDetect(text); // Get the right models TokenizerModel tokenModel = getTokenizerModel(language + "-token"); // Iterate through sentences and produce the distilled objects, // i.e. a sentence object with pos-tagged and stemmed tokens. List<String> tokenizedText = new ArrayList<>(); for (String sentenceString : sentences) { // Tokenize the sentence Tokenizer tokenizer = new TokenizerME(tokenModel); String tokens[] = tokenizer.tokenize(sentenceString); for (String token : tokens) { tokenizedText.add(token); } } return tokenizedText.toArray(new String[tokenizedText.size()]); }
From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/** * Annotates the document using the Apache OpenNLP tools. * * @param component the component to annotate. *///from w w w . j av a2 s .com @Override public void annotate(Blackboard blackboard, DocumentComponent component) { // set up the annotator setup(); // Language tag used to retrieve the datasets String langTag = component.getLanguage().getLanguage(); // Split the text into sentences SentenceModel sentModel = getSentenceModel(langTag + "-sent"); SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel); String sentences[] = sentenceDetector.sentDetect(component.getText()); // Get the right models TokenizerModel tokenModel = getTokenizerModel(langTag + "-token"); POSModel POSModel = getPOSTaggerModel(langTag + "-pos-maxent"); // Iterate through sentences and produce the distilled objects, // i.e. a sentence object with pos-tagged and stemmed tokens. for (String sentenceString : sentences) { // the distilled sentence object Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++); sentence.setLanguage(component.getLanguage()); // Tokenize the sentence Tokenizer tokenizer = new TokenizerME(tokenModel); String tokens[] = tokenizer.tokenize(sentenceString); // POS tag the tokens POSTaggerME tagger = new POSTaggerME(POSModel); String tags[] = tagger.tag(tokens); // put the features detected by OpenNLP in the distiller's // sentence for (int i = 0; i < tokens.length; i++) { Token t = new Token(tokens[i]); t.setPoS(tags[i]); sentence.addToken(t); } // for ((DocumentComposite) component).addComponent(sentence); } // for (String sentenceString : sentences) }
From source file:org.dbpedia.spotlight.spot.NESpotter.java
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) { String intext = text.text();//from w ww . j a va 2 s . co m SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel); String[] sentences = sentenceDetector.sentDetect(intext); Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext); int[] sentencePositions = new int[sentences.length + 1]; for (int k = 0; k < sentenceEndings.length; k++) { sentencePositions[k] = sentenceEndings[k].getStart(); } NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel); List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>(); Tokenizer tokenizer = new SimpleTokenizer(); for (int i = 0; i < sentences.length; i++) { String sentence = sentences[i]; //LOG.debug("Sentence: " + sentence); // extract the names in the current sentence String[] tokens = tokenizer.tokenize(sentence); Span[] tokenspan = tokenizer.tokenizePos(sentence); Span[] nameSpans = finder.find(tokens); double[] probs = finder.probs(); if (nameSpans != null && nameSpans.length > 0) { //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString()); //System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString()); for (Span span : nameSpans) { StringBuilder buf = new StringBuilder(); //System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd()); for (int j = span.getStart(); j < span.getEnd(); j++) { //System.out.println(tokens[i] + " appended to " + buf.toString()); buf.append(tokens[j]); if (j < span.getEnd() - 1) buf.append(" "); } String surfaceFormStr = buf.toString().trim(); if (surfaceFormStr.contains(".")) { surfaceFormStr = correctPhrase(surfaceFormStr, sentence); } int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart(); int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd(); /* System.out.println("\n\nRR-NE 
Found = " + buf.toString()); System.out.println("Start = " + entStart); System.out.println("End = " + entEnd); System.out.println("Sentence = " + sentence); System.out.println("Text = " + text); */ SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr); SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart); sfocc.features().put("type", new Feature("type", oType.toString())); sfOccurrences.add(sfocc); } } } finder.clearAdaptiveData(); if (LOG.isDebugEnabled()) { LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", ")); } return sfOccurrences; }