List of usage examples for the opennlp.tools.tokenize.TokenizerME constructor
public TokenizerME(TokenizerModel model)
From source file:NLP.java
/**
 * Loads the French POS-tagger and tokenizer models from the directory
 * containing the application's code source (JAR/classes directory).
 *
 * @throws FileNotFoundException if a model file is missing
 * @throws IOException           if a model cannot be read
 * @throws URISyntaxException    if the code-source location is not a valid URI
 */
public NLP() throws FileNotFoundException, IOException, URISyntaxException {
    itemsList = new HashMap<String, String>();
    // Resolve the directory of the running code so model files are found
    // relative to the application, not the process working directory.
    String file = (new File(NLP.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()))
            .toString();
    String path = (new File(file).getParentFile().getPath()).toString();
    // Fix: use the platform separator instead of a hard-coded "\\" so the
    // model paths also resolve on non-Windows systems.
    model = new POSModelLoader().load(new File(path + File.separator + "fr-pos.bin"));
    perfMon = new PerformanceMonitor(System.err, "sent");
    tagger = new POSTaggerME(model);
    try (InputStream is = new FileInputStream(path + File.separator + "fr-token.bin")) {
        tokenizer = new TokenizerME(new TokenizerModel(is));
    } catch (Exception e) {
        // NOTE(review): load failure leaves 'tokenizer' null and only prints
        // the exception — confirm callers tolerate a null tokenizer.
        System.out.println(e);
    }
}
From source file:hrpod.tools.nlp.NLPTools.java
public String[] tokenize(String text) { String[] chunkStrings = null; try {/*from w ww . j av a 2 s . c om*/ TokenizerME wordBreaker = new TokenizerME(getTokenModel()); POSTaggerME posme = new POSTaggerME(getPosModel()); ChunkerME chunkerME = new ChunkerME(getChunkerModel()); //words is the tokenized sentence String[] words = wordBreaker.tokenize(text); //posTags are the parts of speech of every word in the sentence (The chunker needs this info) String[] posTags = posme.tag(words); //chunks are the start end "spans" indices to the chunks in the words array Span[] chunks = chunkerME.chunkAsSpans(words, posTags); //chunkStrings are the actual chunks chunkStrings = Span.spansToStrings(chunks, words); //for (int i = 0; i < chunks.length; i++) { // if (chunks[i].getType().equals("NP")) { // System.out.println("NP: \n\t" + chunkStrings[i]); //String[] split = chunkStrings[i].split(" "); //List<String> ngrams = ngram(Arrays.asList(split), N, " "); //System.out.println("ngrams:"); //for (String gram : ngrams) { // System.out.println("\t" + gram); //} //} //} } catch (Exception e) { logger.error("Error in tokenize", e); } return chunkStrings; }
From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java
@Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); sentenceModelProvider = new CasConfigurableProviderBase<SentenceDetectorME>() { {//from www . j av a 2 s . c om setDefault(VERSION, "20120616.0"); setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-${language}-${variant}"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + "sentence-${language}-${variant}.bin"); setDefault(VARIANT, "maxent"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, variant); } @Override protected SentenceDetectorME produceResource(URL aUrl) throws IOException { InputStream is = null; try { is = aUrl.openStream(); SentenceModel model = new SentenceModel(is); return new SentenceDetectorME(model); } finally { closeQuietly(is); } } }; tokenModelProvider = new CasConfigurableProviderBase<TokenizerME>() { { setDefault(VERSION, "1.5"); setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-${language}-${variant}"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + "token-${language}-${variant}.bin"); setDefault(VARIANT, "maxent"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, variant); } @Override protected TokenizerME produceResource(URL aUrl) throws IOException { InputStream is = null; try { is = aUrl.openStream(); TokenizerModel model = new TokenizerModel(is); return new TokenizerME(model); } finally { closeQuietly(is); } } }; }
From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/** * Annotates the document using the Apache OpenNLP tools. * * @param component the component to annotate. *//* w ww. j a va2 s . co m*/ @Override public void annotate(Blackboard blackboard, DocumentComponent component) { // set up the annotator setup(); // Language tag used to retrieve the datasets String langTag = component.getLanguage().getLanguage(); // Split the text into sentences SentenceModel sentModel = getSentenceModel(langTag + "-sent"); SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel); String sentences[] = sentenceDetector.sentDetect(component.getText()); // Get the right models TokenizerModel tokenModel = getTokenizerModel(langTag + "-token"); POSModel POSModel = getPOSTaggerModel(langTag + "-pos-maxent"); // Iterate through sentences and produce the distilled objects, // i.e. a sentence object with pos-tagged and stemmed tokens. for (String sentenceString : sentences) { // the distilled sentence object Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++); sentence.setLanguage(component.getLanguage()); // Tokenize the sentence Tokenizer tokenizer = new TokenizerME(tokenModel); String tokens[] = tokenizer.tokenize(sentenceString); // POS tag the tokens POSTaggerME tagger = new POSTaggerME(POSModel); String tags[] = tagger.tag(tokens); // put the features detected by OpenNLP in the distiller's // sentence for (int i = 0; i < tokens.length; i++) { Token t = new Token(tokens[i]); t.setPoS(tags[i]); sentence.addToken(t); } // for ((DocumentComposite) component).addComponent(sentence); } // for (String sentenceString : sentences) }
From source file:com.screenslicer.core.nlp.NlpUtil.java
/**
 * Splits a sentence into tokens using the shared tokenizer model.
 *
 * @param sentence the sentence to tokenize
 * @return the sentence's tokens
 */
public static String[] tokensFromSentence(String sentence) {
    TokenizerME wordBreaker = new TokenizerME(tokenModel);
    return wordBreaker.tokenize(sentence);
}
From source file:com.civis.utils.opennlp.models.address.AddressFinderMe.java
/** * {@inheritDoc}/*w w w .j a va 2 s . c o m*/ */ @Override public List<AddressSpan> find(String text, TokenizerModel tokenizerModel) { Tokenizer tokenizer = new TokenizerME(tokenizerModel); String[] tokens = tokenizer.tokenize(text); return find(tokens); }
From source file:edu.stanford.muse.index.NER.java
public synchronized static void initialize() throws ClassCastException, IOException, ClassNotFoundException { if (pFinder != null) return;/* www . jav a 2s. c om*/ long startTimeMillis = System.currentTimeMillis(); log.info("Initializing NER models"); try { InputStream pis = Config.getResourceAsStream("models/en-ner-person.bin"); TokenNameFinderModel pmodel = new TokenNameFinderModel(pis); pFinder = new NameFinderME(pmodel); InputStream lis = Config.getResourceAsStream("models/en-ner-location.bin"); TokenNameFinderModel lmodel = new TokenNameFinderModel(lis); lFinder = new NameFinderME(lmodel); InputStream ois = Config.getResourceAsStream("models/en-ner-organization.bin"); TokenNameFinderModel omodel = new TokenNameFinderModel(ois); oFinder = new NameFinderME(omodel); } //dont bother about this, instead try not to use it catch (Exception e) { Util.print_exception(e, log); } try { InputStream modelIn = Config.getResourceAsStream("models/en-sent.bin"); SentenceModel model = new SentenceModel(modelIn); sFinder = new SentenceDetectorME(model); InputStream tokenStream = Config.getResourceAsStream("models/en-token.bin"); TokenizerModel modelTokenizer = new TokenizerModel(tokenStream); tokenizer = new TokenizerME(modelTokenizer); } catch (Exception e) { Util.print_exception(e); } long endTimeMillis = System.currentTimeMillis(); log.info("Done initializing NER model in " + Util.commatize(endTimeMillis - startTimeMillis) + "ms"); }
From source file:ht.process.Processor.java
/**
 * Processes a document: tokenizes its body with the language-appropriate
 * OpenNLP tokenizer, recognizes concepts token-by-token and returns the list
 * of text changes (tooltip replacements) to apply to the document.
 *
 * NOTE(review): returns null when a pooled JDBC connection cannot be
 * obtained — callers must null-check. Also note that missing parameter
 * fields are defaulted by writing back into {@code param}.
 *
 * @param param the request parameters (body text, language, filters)
 * @return the recognized changes, or null on connection failure
 */
public ProcessorResult2 processDocumentV3(ProcessorParams param) {
    // Servlet-scoped shared resources: tokenizer models, stop-word maps and
    // localized message bundles for the two supported languages.
    TokenizerModel portugueseTokenizer = (TokenizerModel) servletContext.getAttribute("portugueseTokenizer");
    TokenizerModel englishTokenizer = (TokenizerModel) servletContext.getAttribute("englishTokenizer");
    ConcurrentHashMap<String, String> portugueseStopwords = (ConcurrentHashMap<String, String>) servletContext
            .getAttribute("portugueseStopwords");
    ConcurrentHashMap<String, String> englishStopwords = (ConcurrentHashMap<String, String>) servletContext
            .getAttribute("englishStopwords");
    ResourceBundle portugueseMessages = (ResourceBundle) servletContext.getAttribute("portugueseMessages");
    ResourceBundle englishMessages = (ResourceBundle) servletContext.getAttribute("englishMessages");

    String text = param.body;
    //String text = LZString.decompressFromUTF16(param.body);

    // Borrow a connection from the pool; bail out (null result) on failure.
    Connection connection;
    try {
        connection = ((DataSource) servletContext.getAttribute("connPool")).getConnection();
    } catch (SQLException ex) {
        Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }

    // Fall back to the default semantic-type filter when none was supplied.
    HashSet<String> semanticTypes;
    if (param.semanticTypes == null || param.semanticTypes.isEmpty())
        semanticTypes = defaultSemanticTypes;
    else
        semanticTypes = param.semanticTypes;

    // Pick the language-specific concept processor; English is the default.
    ConceptProcessor processor = null;
    ResourceBundle messages = null;
    switch (param.language) {
    case "en":
        processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes);
        //messages = englishMessages;
        break;
    case "pt":
        processor = new PortugueseProcessor(connection, portugueseStopwords, portugueseTokenizer,
                semanticTypes);
        //messages = portugueseMessages;
        break;
    default:
        processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes);
        //messages = englishMessages;
        break;
    }

    // The message bundle follows the requested content language, or the
    // detected document language when contentLanguage is "detected";
    // English is the fallback.
    if (param.contentLanguage == null)
        param.contentLanguage = "detected";
    if ((param.contentLanguage.equals("detected") && param.language.equals("en"))
            || param.contentLanguage.equals("en"))
        messages = englishMessages;
    else if ((param.contentLanguage.equals("detected") && param.language.equals("pt"))
            || param.contentLanguage.equals("pt"))
        messages = portugueseMessages;
    else
        messages = englishMessages;

    // Default the remaining optional flags and push them into the processor.
    if (param.recognizeOnlyCHV == null)
        param.recognizeOnlyCHV = true;
    processor.recognizeOnlyCHV = param.recognizeOnlyCHV;
    if (param.recognizeWithoutDefinition == null)
        param.recognizeWithoutDefinition = true;
    if (param.styFilter == null)
        param.styFilter = "all";
    if (param.styFilter.equals("all"))
        processor.allAccepted = true;
    else if (param.styFilter.equals("one"))
        processor.allAccepted = false;

    // Tokenize with positions so each match can be mapped back to the
    // original character offsets in the text.
    Tokenizer tokenizer = new TokenizerME(processor.tokenizerModel);
    Span spans[] = tokenizer.tokenizePos(text);
    //Span[] spansCopy = new Span[spans.length];
    //System.arraycopy( spans, 0, spansCopy, 0, spans.length );
    //System.out.println("TEXT: " + text);
    //System.out.println("SPANS: " + spans.length);

    ArrayList<Change> resultChanges = new ArrayList<>();
    for (int i = 0; i < spans.length; i++) {
        Span initialSpan = spans[i];
        // Try to match a (possibly multi-word) concept starting at token i.
        Concept bestMatch = processor.processToken(spans, i, text, FORWARD_THRESHOLD);
        if (bestMatch != null) {
            // Replace "'" so it doesn't break the tooltip html if the
            // definition contains it.
            String definition = processor.getDefinition(bestMatch);
            if (definition != null) {
                bestMatch.setDefinition(definition);
            }
        }
        if (bestMatch != null && ((!param.recognizeWithoutDefinition && bestMatch.definition != null)
                || param.recognizeWithoutDefinition)) {
            // Skip the tokens consumed by this multi-word match.
            i += bestMatch.words - 1;
            /*
            if (lastFound == null) {
            splitText.add(text.substring(0, initialSpan.getStart()));
            } else {
            splitText.add(text.substring(lastFound.span.getEnd(), bestMatch.span.getStart()));
            }
            */
            String definitionTooltip = replaceConcept(bestMatch, param.language, messages);
            resultChanges
                    .add(new Change(bestMatch.span.getStart(), bestMatch.span.getEnd(), definitionTooltip));
            //lastFound = bestMatch;
        }
    }

    // Return the borrowed connection to the pool.
    try {
        connection.close();
    } catch (SQLException ex) {
        Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex);
    }
    return new ProcessorResult2(resultChanges);
}
From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/** * Utility offered to other elements of the pipeline for text tokenizing. * * @param text the text to tokenize//from w w w .j a v a 2 s .c o m * @param language the language of the input text * @return an array containing the tokenized text. */ public static String[] tokenizeText(String text, String language) { setup(); // Split the text into sentences SentenceModel sentModel = getSentenceModel(language + "-sent"); SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel); String sentences[] = sentenceDetector.sentDetect(text); // Get the right models TokenizerModel tokenModel = getTokenizerModel(language + "-token"); // Iterate through sentences and produce the distilled objects, // i.e. a sentence object with pos-tagged and stemmed tokens. List<String> tokenizedText = new ArrayList<>(); for (String sentenceString : sentences) { // Tokenize the sentence Tokenizer tokenizer = new TokenizerME(tokenModel); String tokens[] = tokenizer.tokenize(sentenceString); for (String token : tokens) { tokenizedText.add(token); } } return tokenizedText.toArray(new String[tokenizedText.size()]); }
From source file:org.sglover.nlp.CoreNLPEntityTagger.java
@Override protected Entities getEntitiesImpl(String content) { Entities namedEntities = Entities.empty(); SentenceModel sentenceModel = sentenceModels.get("en"); SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel); String[] sentences = sentenceDetector.sentDetect(content); TokenizerModel tm = tokenizerModels.get("en"); TokenizerME wordBreaker = new TokenizerME(tm); for (String sentence : sentences) { String[] tokens = wordBreaker.tokenize(sentence); List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>(); POSModel posModel = posModels.get("en"); POSTaggerME posme = new POSTaggerME(posModel); String[] posTags = posme.tag(tokens); List<String> npTokens = new LinkedList<>(); ChunkerModel chunkerModel = chunkerModels.get("en"); ChunkerME chunkerME = new ChunkerME(chunkerModel); Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags); String[] chunkStrings = Span.spansToStrings(chunks, tokens); for (int i = 0; i < chunks.length; i++) { String chunkString = chunkStrings[i]; logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType()); if (chunks[i].getType().equals("NP")) { npTokens.add(chunkString); }/*from w w w .j a v a 2 s .c om*/ } // findEntities(namedEntities, allTextAnnotations, // npTokens.toArray(new String[0])); findEntities(namedEntities, allTextAnnotations, tokens); } return namedEntities; }