List of usage examples for opennlp.tools.postag POSTaggerME POSTaggerME
public POSTaggerME(POSModel model)
From source file:hrpod.tools.nlp.NLPTools.java
public ArrayList getTokens(String txt) { ArrayList<String> wordList = null; try {//from w ww .ja va 2s .com POSTaggerME posme = new POSTaggerME(getPosModel()); String words[] = tokenize(txt);//tokenize into words and phrases wordList = new StopWordRemoval().removeStopWords(words); //remove stop words //String[] posTags = posme.tag(wordList.toArray(new String[0])); logger.info("DONE"); } catch (Exception e) { logger.error("ERROR in GetTokens", e); } finally { } return wordList; }
From source file:hrpod.tools.nlp.NLPTools.java
public ArrayList getStemmedTokens(String txt) { ArrayList<String> wordList = null; try {/*from w w w . j ava 2 s .c om*/ POSTaggerME posme = new POSTaggerME(getPosModel()); String words[] = tokenize(txt);//tokenize into words and phrases wordList = new StopWordRemoval().removeStopWords(words); //remove stop words wordList = stemmer(wordList);//stem words //String[] posTags = posme.tag(wordList.toArray(new String[0])); logger.info("DONE"); } catch (Exception e) { logger.error("ERROR in GetTokens", e); } finally { } return wordList; }
From source file:NLP.java
/**
 * Initializes the French POS tagger and tokenizer by loading the model
 * files {@code fr-pos.bin} and {@code fr-token.bin} from the directory
 * that contains the application's code location.
 *
 * NOTE(review): resolving the code location via getProtectionDomain()
 * assumes the app runs from a plain directory/JAR on disk — confirm for
 * the target deployment.
 *
 * @throws FileNotFoundException if a model file is missing
 * @throws IOException           if a model file cannot be read
 * @throws URISyntaxException    if the code location is not a valid URI
 */
public NLP() throws FileNotFoundException, IOException, URISyntaxException {
    itemsList = new HashMap<String, String>();
    File codeLocation = new File(
            NLP.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath());
    String path = codeLocation.getParentFile().getPath();
    // Use File(parent, child) instead of the original hard-coded "\\"
    // separator so the lookup also works on non-Windows platforms.
    model = new POSModelLoader().load(new File(path, "fr-pos.bin"));
    perfMon = new PerformanceMonitor(System.err, "sent");
    tagger = new POSTaggerME(model);
    try (InputStream is = new FileInputStream(new File(path, "fr-token.bin"))) {
        tokenizer = new TokenizerME(new TokenizerModel(is));
    } catch (Exception e) {
        System.out.println(e);
    }
}
From source file:de.dfki.km.perspecting.obie.experiments.PhraseExperiment.java
/** * @throws java.lang.Exception/*w w w . j a va 2 s .c o m*/ */ @BeforeClass public static void setUpBeforeClass() throws Exception { pool.setUser($DATABASE_SERVER_USER); pool.setPassword($DATABASE_SERVER_PW); pool.setPortNumber($DATABASE_SERVER_PORT); pool.setDatabaseName($DATABASE); pool.setServerName($DATABASE_SERVER); pool.setMaxConnections(100); kb = new PostgresKB(pool.getConnection(), $DATABASE, new URI("http://test.de")); pipeline = new Pipeline(kb); LanguageIdentification languageClassification = new LanguageIdentification(Language.EN); WordSegmenter wordTokenizer = new WordSegmenter(); SentenceSegmenter sentenceTokenizer = new SentenceSegmenter(); POSModel posModel = new POSModel(Scoobie.class.getResourceAsStream("pos/en/en-pos-maxent.bin")); POSTagging posTagger = new POSTagging(new POSTaggerME(posModel)); ProperNameRecognition nounPhraseChunker = new ProperNameRecognition( new CRFNounPhraseChunkerModel(Scoobie.class.getResourceAsStream("npc/en/EN.crf"))); SuffixArrayBuilder suffixArrayBuilder = new SuffixArrayBuilder(100, new LiteralHashing(4)); RDFLiteralSpotting entityRecognizer = new RDFLiteralSpotting(); InstanceRecognition subjectResolver = new InstanceRecognition(); pipeline.configure(languageClassification, wordTokenizer, sentenceTokenizer, posTagger, nounPhraseChunker, suffixArrayBuilder, entityRecognizer, new DummyTask(), new DummyTask(), subjectResolver, new DummyTask(), new DummyTask(), new DummyTask(), new DummyTask()); }
From source file:de.dfki.km.perspecting.obie.experiments.ProperNameExperiment.java
/** * @throws java.lang.Exception/* w w w.j a v a 2 s. c o m*/ */ public static void setUp(String databaseServer, String dataBase) throws Exception { pool.setUser($DATABASE_SERVER_USER); pool.setPassword($DATABASE_SERVER_PW); pool.setPortNumber($DATABASE_SERVER_PORT); pool.setDatabaseName(dataBase); pool.setServerName(databaseServer); pool.setMaxConnections(100); kb = new PostgresKB(pool.getConnection(), dataBase, new URI("http://test.de")); pipeline = new Pipeline(kb); LanguageIdentification languageClassification = new LanguageIdentification(Language.EN); WordSegmenter wordTokenizer = new WordSegmenter(); SentenceSegmenter sentenceTokenizer = new SentenceSegmenter(); POSModel posModel = new POSModel(Scoobie.class.getResourceAsStream("pos/en/en-pos-maxent.bin")); POSTagging posTagger = new POSTagging(new POSTaggerME(posModel)); ProperNameRecognition nounPhraseChunker = new ProperNameRecognition( new CRFNounPhraseChunkerModel(Scoobie.class.getResourceAsStream("npc/en/EN.crf"))); SuffixArrayBuilder suffixArrayBuilder = new SuffixArrayBuilder(100, new LiteralHashing(4)); RDFLiteralSpotting entityRecognizer = new RDFLiteralSpotting(); pipeline.configure(languageClassification, wordTokenizer, sentenceTokenizer, posTagger, nounPhraseChunker, suffixArrayBuilder, entityRecognizer, new DummyTask(), new DummyTask(), new DummyTask(), new DummyTask(), new DummyTask(), new DummyTask(), new DummyTask()); }
From source file:hrpod.tools.nlp.NLPTools.java
public String[] tokenize(String text) { String[] chunkStrings = null; try {//w w w. j a v a 2 s . co m TokenizerME wordBreaker = new TokenizerME(getTokenModel()); POSTaggerME posme = new POSTaggerME(getPosModel()); ChunkerME chunkerME = new ChunkerME(getChunkerModel()); //words is the tokenized sentence String[] words = wordBreaker.tokenize(text); //posTags are the parts of speech of every word in the sentence (The chunker needs this info) String[] posTags = posme.tag(words); //chunks are the start end "spans" indices to the chunks in the words array Span[] chunks = chunkerME.chunkAsSpans(words, posTags); //chunkStrings are the actual chunks chunkStrings = Span.spansToStrings(chunks, words); //for (int i = 0; i < chunks.length; i++) { // if (chunks[i].getType().equals("NP")) { // System.out.println("NP: \n\t" + chunkStrings[i]); //String[] split = chunkStrings[i].split(" "); //List<String> ngrams = ngram(Arrays.asList(split), N, " "); //System.out.println("ngrams:"); //for (String gram : ngrams) { // System.out.println("\t" + gram); //} //} //} } catch (Exception e) { logger.error("Error in tokenize", e); } return chunkStrings; }
From source file:com.geocode.service.impl.AddressServiceImpl.java
/**
 * Loads the English maximum-entropy POS model from the classpath and
 * initializes the tagger after bean construction.
 *
 * NOTE(review): resolving a classpath resource through getResource().toURI()
 * and File only works when the resource is NOT packaged inside a JAR —
 * confirm this service runs from an exploded directory, or switch to
 * getResourceAsStream.
 *
 * @throws URISyntaxException if the resource URL cannot be converted to a URI
 */
@PostConstruct
public void init() throws URISyntaxException {
    model = new POSModelLoader()
            .load(new File(this.getClass().getResource(basePath + "en-pos-maxent.bin").toURI()));
    tagger = new POSTaggerME(model);
    // Seed the working list with the configured base entries.
    list.addAll(baseList);
    CounterHelper.readCounter();
}
From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/** * Annotates the document using the Apache OpenNLP tools. * * @param component the component to annotate. *//* ww w .j a v a2 s . c o m*/ @Override public void annotate(Blackboard blackboard, DocumentComponent component) { // set up the annotator setup(); // Language tag used to retrieve the datasets String langTag = component.getLanguage().getLanguage(); // Split the text into sentences SentenceModel sentModel = getSentenceModel(langTag + "-sent"); SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel); String sentences[] = sentenceDetector.sentDetect(component.getText()); // Get the right models TokenizerModel tokenModel = getTokenizerModel(langTag + "-token"); POSModel POSModel = getPOSTaggerModel(langTag + "-pos-maxent"); // Iterate through sentences and produce the distilled objects, // i.e. a sentence object with pos-tagged and stemmed tokens. for (String sentenceString : sentences) { // the distilled sentence object Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++); sentence.setLanguage(component.getLanguage()); // Tokenize the sentence Tokenizer tokenizer = new TokenizerME(tokenModel); String tokens[] = tokenizer.tokenize(sentenceString); // POS tag the tokens POSTaggerME tagger = new POSTaggerME(POSModel); String tags[] = tagger.tag(tokens); // put the features detected by OpenNLP in the distiller's // sentence for (int i = 0; i < tokens.length; i++) { Token t = new Token(tokens[i]); t.setPoS(tags[i]); sentence.addToken(t); } // for ((DocumentComposite) component).addComponent(sentence); } // for (String sentenceString : sentences) }
From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger.java
/**
 * Initializes the UIMA component: wires up a model provider that lazily
 * resolves the OpenNLP tagger model (from the configured location or the
 * default DKPro classpath layout) and a mapping provider that maps the
 * model's tagset to DKPro POS types.
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);

    modelProvider = new CasConfigurableProviderBase<POSTagger>() {
        // Instance initializer: defaults first, then user overrides.
        {
            setDefault(VERSION, "20120616.0");
            setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
            setDefault(ARTIFACT_ID,
                    "de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-${language}-${variant}");
            setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                    + "tagger-${language}-${variant}.bin");
            setDefault(VARIANT, "maxent");

            setOverride(LOCATION, modelLocation);
            setOverride(LANGUAGE, language);
            setOverride(VARIANT, variant);
        }

        /**
         * Loads the POS model from the resolved URL, optionally logging the
         * model's sorted tagset, and wraps it in a ready-to-use tagger.
         */
        @Override
        protected POSTagger produceResource(URL aUrl) throws IOException {
            InputStream is = null;
            try {
                is = aUrl.openStream();
                POSModel model = new POSModel(is);

                if (printTagSet) {
                    // Collect and sort every outcome (tag) the model knows.
                    List<String> tags = new ArrayList<String>();
                    for (int i = 0; i < model.getPosModel().getNumOutcomes(); i++) {
                        tags.add(model.getPosModel().getOutcome(i));
                    }
                    Collections.sort(tags);

                    StringBuilder sb = new StringBuilder();
                    sb.append("Model contains [").append(tags.size()).append("] tags: ");
                    for (String tag : tags) {
                        sb.append(tag);
                        sb.append(" ");
                    }
                    getContext().getLogger().log(INFO, sb.toString());
                }

                return new POSTaggerME(model);
            } finally {
                closeQuietly(is);
            }
        }
    };

    // Maps model tagset entries to DKPro POS types via a per-language map.
    mappingProvider = new MappingProvider();
    mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/"
            + "core/api/lexmorph/tagset/${language}-${tagger.tagset}-tagger.map");
    mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName());
    mappingProvider.setDefault("tagger.tagset", "default");
    mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation);
    mappingProvider.setOverride(MappingProvider.LANGUAGE, language);
    mappingProvider.addImport("tagger.tagset", modelProvider);
}
From source file:de.dfki.km.perspecting.obie.experiments.RelevanceRatingExperiment.java
/**
 * Configures the database connection pool and assembles an extraction
 * pipeline whose final stage rates entity relevance with every
 * combination (of size >= 2) of nine rating metrics.
 *
 * @param $DATABASE_SERVER host name of the Postgres server
 * @param $DATABASE        name of the database to connect to
 * @param corpus           text corpus used by the IDF-based rating metric
 * @throws Exception if the database connection or a model resource cannot
 *                   be opened
 */
public static void setUp(String $DATABASE_SERVER, String $DATABASE, TextCorpus corpus) throws Exception {
    pool.setUser($DATABASE_SERVER_USER);
    pool.setPassword($DATABASE_SERVER_PW);
    pool.setPortNumber($DATABASE_SERVER_PORT);
    pool.setDatabaseName($DATABASE);
    pool.setServerName($DATABASE_SERVER);
    pool.setMaxConnections(100);

    kb = new PostgresKB(pool.getConnection(), $DATABASE, new URI("http://test.de"));
    pipeline = new Pipeline(kb);

    // Pipeline stages: language ID, segmentation, POS tagging, chunking.
    LanguageIdentification languageClassification = new LanguageIdentification(Language.EN);
    WordSegmenter wordTokenizer = new WordSegmenter();
    SentenceSegmenter sentenceTokenizer = new SentenceSegmenter();
    // NOTE(review): this resource stream is never closed — presumably
    // acceptable for a one-shot experiment setup; verify.
    POSModel posModel = new POSModel(Scoobie.class.getResourceAsStream("pos/en/en-pos-maxent.bin"));
    POSTagging posTagger = new POSTagging(new POSTaggerME(posModel));
    ProperNameRecognition nounPhraseChunker = new ProperNameRecognition(
            new CRFNounPhraseChunkerModel($SCOOBIE_HOME + $DATABASE_DBPEDIA_en2 + "/npc/en/EN.crf"));
    SuffixArrayBuilder suffixArrayBuilder = new SuffixArrayBuilder(100, new LiteralHashing(4));
    RDFLiteralSpotting namedEntityRecognizer = new RDFLiteralSpotting();
    InstanceRecognition instanceResolver = new InstanceRecognition();
    EntityDisambiguation instanceDisambiguator = new EntityDisambiguation(
            new AmbiguityResolver[] { new DegreeBasedResolver() });
    KnownFactsRetrieval factRetrieval = new KnownFactsRetrieval();

    // Enumerate all subsets of the 9 metric indices {0..8} that contain at
    // least two elements: each i in [0, 2^9) is read as a 9-bit mask (via a
    // zero-padded binary string) whose set bits select metric indices.
    ArrayList<int[]> l = new ArrayList<int[]>();
    int max = (int) Math.pow(2, 9);
    for (int i = 0; i < max; i++) {
        String binary = Integer.toBinaryString(i);
        // Left-pad the binary string to exactly 9 characters.
        String prefix = "";
        for (int pad = 0; pad < 9 - binary.length(); pad++) {
            prefix += "0";
        }
        binary = prefix + binary;
        TIntHashSet s = new TIntHashSet();
        for (int j = 0; j < 9; j++) {
            if (j < binary.length() && binary.charAt(j) == '1') {
                s.add(j);
            }
        }
        // Only keep combinations of two or more metrics.
        if (s.size() > 1)
            l.add(s.toArray());
    }

    // Metric order must match the indices produced by the mask loop above.
    RelevanceRating relevanceRating = new RelevanceRating(new RatingMetric[] {
            new AuthorityBasedRating(), // 0
            new HubBasedRating(), // 1
            new PageRankBasedRating(), // 2
            new DegreeBasedRating(), // 3
            new CapacityBasedRating(), // 4
            new RandomRating(), // 5
            new PositionBasedRating(), // 6
            new TermFrequencyBasedRating(), // 7
            new InverseDocumentFrequencyBasedRating(corpus,
                    new File(corpus.getCorpus().getAbsolutePath() + "/index/")) }, // 8
            l.toArray(new int[l.size()][]));

    pipeline.configure(languageClassification, wordTokenizer, sentenceTokenizer, posTagger,
            nounPhraseChunker, suffixArrayBuilder, namedEntityRecognizer, new DummyTask(),
            new DummyTask(), instanceResolver, instanceDisambiguator, factRetrieval,
            relevanceRating, new DummyTask());
}