List of usage examples for opennlp.tools.tokenize.Tokenizer.tokenize(String)
String[] tokenize(String s);
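Before the project examples, a minimal self-contained sketch of the call: load a trained TokenizerModel, wrap it in a TokenizerME, and tokenize a string. The file name en-token.bin is an assumption; any trained tokenizer model works.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public final class TokenizeSketch {
    public static void main(String[] args) throws IOException {
        try (InputStream in = new FileInputStream("en-token.bin")) {
            TokenizerModel model = new TokenizerModel(in);
            Tokenizer tokenizer = new TokenizerME(model);
            String[] tokens = tokenizer.tokenize("Mr. Smith lives in Washington, D.C.");
            for (String token : tokens) {
                System.out.println(token);
            }
        }
    }
}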
From source file: com.screenslicer.core.nlp.NlpUtil.java
public static String[] tokensFromSentence(String sentence) {
    Tokenizer tokenizer = new TokenizerME(tokenModel);
    return tokenizer.tokenize(sentence);
}
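tokensFromSentence discards character offsets. When a caller needs to map tokens back into the original string, the Tokenizer interface also offers tokenizePos, which returns Spans instead of strings; a sketch under the same model-file assumption as above:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

public final class TokenOffsetsSketch {
    public static void main(String[] args) throws IOException {
        try (InputStream in = new FileInputStream("en-token.bin")) {
            TokenizerME tokenizer = new TokenizerME(new TokenizerModel(in));
            String sentence = "Tokens keep their offsets.";
            Span[] spans = tokenizer.tokenizePos(sentence);
            for (Span span : spans) {
                // getCoveredText slices the original sentence by the span's offsets
                System.out.println(span + " -> " + span.getCoveredText(sentence));
            }
        }
    }
}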
From source file: it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/**
 * Utility offered to other elements of the pipeline for text tokenizing.
 *
 * @param text the text to tokenize
 * @param language the language of the input text
 * @return an array containing the tokenized text.
 */
public static String[] tokenizeText(String text, String language) {
    setup();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(language + "-sent");
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String[] sentences = sentenceDetector.sentDetect(text);

    // Get the right tokenizer model for the requested language
    TokenizerModel tokenModel = getTokenizerModel(language + "-token");

    // Tokenize each sentence and collect the tokens in document order
    List<String> tokenizedText = new ArrayList<>();
    for (String sentenceString : sentences) {
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String[] tokens = tokenizer.tokenize(sentenceString);
        for (String token : tokens) {
            tokenizedText.add(token);
        }
    }
    return tokenizedText.toArray(new String[tokenizedText.size()]);
}
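A hypothetical call to this utility; the "en" language key, and the assumption that setup() can resolve models registered as "en-sent" and "en-token", belong to this sketch, not to the project:

public final class TokenizeTextDemo {
    public static void main(String[] args) {
        // "en" must match however setup() keys its models ("en-sent", "en-token"); an assumption here.
        String[] tokens = OpenNlpBootstrapperAnnotator.tokenizeText(
                "The pipeline splits sentences first. Then it tokenizes them.", "en");
        System.out.println(tokens.length + " tokens");
    }
}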
From source file: com.tamingtext.classifier.maxent.TestMaxent.java
private static void runTest(File[] inputFiles, DocumentCategorizer categorizer, Tokenizer tokenizer,
        ResultAnalyzer resultAnalyzer) throws FileNotFoundException, IOException {
    String line;
    //<start id="maxent.examples.test.execute"/>
    for (File ff : inputFiles) {
        BufferedReader in = new BufferedReader(new FileReader(ff));
        while ((line = in.readLine()) != null) {
            String[] parts = line.split("\t");
            if (parts.length != 2)
                continue;
            String docText = parts[1]; //<co id="tmt.preprocess"/>
            String[] tokens = tokenizer.tokenize(docText);
            double[] probs = categorizer.categorize(tokens); //<co id="tmt.categorize"/>
            String label = categorizer.getBestCategory(probs);
            int bestIndex = categorizer.getIndex(label);
            double score = probs[bestIndex];
            ClassifierResult result //<co id="tmt.collect"/>
                    = new ClassifierResult(label, score);
            resultAnalyzer.addInstance(parts[0], result);
        }
        in.close();
    }
    System.err.println(resultAnalyzer.toString()); //<co id="tmt.summarize"/>
    /*<calloutlist>
     * <callout arearefs="tmt.preprocess">Preprocess text</callout>
     * <callout arearefs="tmt.categorize">Categorize</callout>
     * <callout arearefs="tmt.collect">Analyze Results</callout>
     * <callout arearefs="tmt.summarize">Present Results</callout>
     * </calloutlist>*/
    //<end id="maxent.examples.test.execute"/>
}
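For context, a hedged sketch of the categorization step this loop performs, in isolation. The model file names en-doccat.bin and en-token.bin are assumptions standing in for whatever TestMaxent loads before calling runTest:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public final class CategorizeOneLine {
    public static void main(String[] args) throws IOException {
        DoccatModel doccatModel;
        TokenizerModel tokenModel;
        try (InputStream in = new FileInputStream("en-doccat.bin")) {
            doccatModel = new DoccatModel(in);
        }
        try (InputStream in = new FileInputStream("en-token.bin")) {
            tokenModel = new TokenizerModel(in);
        }
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        DocumentCategorizerME categorizer = new DocumentCategorizerME(doccatModel);

        // mirror the loop body above: tokenize, score, take the best label
        String[] tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog.");
        double[] probs = categorizer.categorize(tokens);
        String label = categorizer.getBestCategory(probs);
        System.out.println(label + " @ " + probs[categorizer.getIndex(label)]);
    }
}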
From source file: com.civis.utils.opennlp.models.address.AddressFinderMe.java
/**
 * {@inheritDoc}
 */
@Override
public List<AddressSpan> find(String text, TokenizerModel tokenizerModel) {
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);
    String[] tokens = tokenizer.tokenize(text);
    return find(tokens);
}
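The method above follows the usual OpenNLP division of labor: the caller supplies a TokenizerModel, which is immutable and safe to share, while the method builds a fresh TokenizerME per call, since the ME instances are not thread-safe. A sketch of that pattern (model file name assumed):

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public final class SharedModelSketch {
    public static void main(String[] args) throws IOException {
        TokenizerModel model;
        try (InputStream in = new FileInputStream("en-token.bin")) {
            model = new TokenizerModel(in); // load once, share freely
        }
        // one TokenizerME per call (or per thread); the ME instances are not thread-safe
        String[] first = new TokenizerME(model).tokenize("First call.");
        String[] second = new TokenizerME(model).tokenize("Second call.");
        System.out.println(first.length + " + " + second.length + " tokens");
    }
}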
From source file: it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/**
 * Annotates the document using the Apache OpenNLP tools.
 *
 * @param component the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {
    // set up the annotator
    setup();

    // Language tag used to retrieve the datasets
    String langTag = component.getLanguage().getLanguage();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(langTag + "-sent");
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String[] sentences = sentenceDetector.sentDetect(component.getText());

    // Get the right models
    TokenizerModel tokenModel = getTokenizerModel(langTag + "-token");
    POSModel posModel = getPOSTaggerModel(langTag + "-pos-maxent");

    // Iterate through sentences and produce the distilled objects,
    // i.e. a sentence object with pos-tagged tokens.
    for (String sentenceString : sentences) {

        // the distilled sentence object
        Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++);
        sentence.setLanguage(component.getLanguage());

        // Tokenize the sentence
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String[] tokens = tokenizer.tokenize(sentenceString);

        // POS tag the tokens
        POSTaggerME tagger = new POSTaggerME(posModel);
        String[] tags = tagger.tag(tokens);

        // put the features detected by OpenNLP in the distiller's sentence
        for (int i = 0; i < tokens.length; i++) {
            Token t = new Token(tokens[i]);
            t.setPoS(tags[i]);
            sentence.addToken(t);
        }

        ((DocumentComposite) component).addComponent(sentence);
    }
}
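One note on the loop above: it re-instantiates TokenizerME and POSTaggerME for every sentence even though the models are fetched only once. Because the model objects are immutable, a common refactor is to build the tools once per document; a sketch (the class, method, and variable names are this sketch's own):

import java.util.ArrayList;
import java.util.List;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public final class PerDocumentTools {
    // Build the ME tools once per document instead of once per sentence;
    // the model objects are immutable and safe to share, while the ME
    // instances should not cross threads.
    static List<String[]> tagSentences(String[] sentences, TokenizerModel tokenModel, POSModel posModel) {
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        POSTaggerME tagger = new POSTaggerME(posModel);
        List<String[]> tagged = new ArrayList<>();
        for (String sentence : sentences) {
            tagged.add(tagger.tag(tokenizer.tokenize(sentence)));
        }
        return tagged;
    }
}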
From source file: org.dbpedia.spotlight.spot.NESpotter.java
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);
    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<>();
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; // shared instance; the no-arg constructor is deprecated
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];

        // extract the names in the current sentence
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs(); // confidence scores for the spans (unused here)

        if (nameSpans != null && nameSpans.length > 0) {
            for (Span span : nameSpans) {
                // join the span's tokens with single spaces
                StringBuilder buf = new StringBuilder();
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                // map token offsets back to character offsets in the full text
                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd();

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }
    }
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}
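The manual StringBuilder join above reproduces what OpenNLP's own helper does: Span.spansToStrings joins each span's covered tokens with single spaces. A small self-contained illustration:

import opennlp.tools.util.Span;

public final class SpanJoinExample {
    public static void main(String[] args) {
        String[] tokens = {"Barack", "Obama", "visited", "Berlin", "."};
        Span[] nameSpans = {new Span(0, 2), new Span(3, 4)};
        // one string per span, tokens joined with single spaces
        String[] surfaceForms = Span.spansToStrings(nameSpans, tokens);
        for (String s : surfaceForms) {
            System.out.println(s); // "Barack Obama", then "Berlin"
        }
    }
}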
From source file: os.Controller.java
public String tokenize(String teks) throws InvalidFormatException, IOException {
    // try-with-resources closes the model stream even if loading fails
    try (InputStream is = new FileInputStream("en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize(teks);
        // join with single spaces (the original concatenation left a stray leading space)
        return String.join(" ", tokens);
    }
}
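A hypothetical caller, assuming en-token.bin sits in the working directory exactly as the method itself does:

public final class ControllerDemo {
    public static void main(String[] args) throws Exception {
        String joined = new os.Controller().tokenize("Hello, world!");
        System.out.println(joined); // likely "Hello , world !"
    }
}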