Example usage for opennlp.tools.postag POSTaggerME POSTaggerME

List of usage examples for opennlp.tools.postag POSTaggerME POSTaggerME

Introduction

On this page you can find an example of usage for opennlp.tools.postag POSTaggerME POSTaggerME.

Prototype

public POSTaggerME(POSModel model) 

Source Link

Document

Initializes the current instance with the provided model.

Usage

From source file:hrpod.tools.nlp.NLPTools.java

/**
 * Tokenizes the given text and removes stop words.
 *
 * @param txt raw input text.
 * @return the surviving tokens, or {@code null} if tokenization failed
 *         (the exception is logged, not rethrown).
 */
public ArrayList getTokens(String txt) {
    ArrayList<String> wordList = null;
    try {
        // Fixed: removed an unused POSTaggerME that loaded the POS model on every
        // call but was never used; also removed the empty finally block.
        String[] words = tokenize(txt); // tokenize into words and phrases
        wordList = new StopWordRemoval().removeStopWords(words); // remove stop words
        logger.info("DONE");
    } catch (Exception e) {
        logger.error("ERROR in GetTokens", e);
    }
    return wordList;
}

From source file:hrpod.tools.nlp.NLPTools.java

/**
 * Tokenizes the given text, removes stop words, and stems the remaining tokens.
 *
 * @param txt raw input text.
 * @return the stemmed tokens, or {@code null} if processing failed
 *         (the exception is logged, not rethrown).
 */
public ArrayList getStemmedTokens(String txt) {
    ArrayList<String> wordList = null;
    try {
        // Fixed: removed an unused POSTaggerME that loaded the POS model on every
        // call but was never used; also removed the empty finally block.
        String[] words = tokenize(txt); // tokenize into words and phrases
        wordList = new StopWordRemoval().removeStopWords(words); // remove stop words
        wordList = stemmer(wordList); // stem the surviving words
        logger.info("DONE");
    } catch (Exception e) {
        // Fixed: the message previously said "GetTokens", mislabeling the failing method.
        logger.error("ERROR in GetStemmedTokens", e);
    }
    return wordList;
}

From source file:NLP.java

/**
 * Loads the French POS and tokenizer models from the directory that contains
 * the application's code source (i.e. the directory next to the jar/classes).
 *
 * @throws FileNotFoundException if a model file is missing.
 * @throws IOException if a model cannot be read.
 * @throws URISyntaxException if the code-source location is not a valid URI.
 */
public NLP() throws FileNotFoundException, IOException, URISyntaxException {
    itemsList = new HashMap<String, String>();

    // Directory containing the running jar/classes; the model files live beside it.
    String file = (new File(NLP.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()))
            .toString();
    String path = (new File(file).getParentFile().getPath()).toString();

    // Fixed: build paths with File(parent, child) instead of a hard-coded "\\"
    // separator, which only resolved correctly on Windows.
    model = new POSModelLoader().load(new File(path, "fr-pos.bin"));
    perfMon = new PerformanceMonitor(System.err, "sent");
    tagger = new POSTaggerME(model);

    try (InputStream is = new FileInputStream(new File(path, "fr-token.bin"))) {
        tokenizer = new TokenizerME(new TokenizerModel(is));
    } catch (Exception e) {
        // NOTE(review): a tokenizer-load failure is only printed, leaving `tokenizer`
        // null — confirm callers tolerate that, or rethrow here instead.
        System.out.println(e);
    }
}

From source file:de.dfki.km.perspecting.obie.experiments.PhraseExperiment.java

/**
 * @throws java.lang.Exception/*w  w  w  .  j  a  va 2  s .c  o m*/
 */
@BeforeClass
public static void setUpBeforeClass() throws Exception {

    pool.setUser($DATABASE_SERVER_USER);
    pool.setPassword($DATABASE_SERVER_PW);
    pool.setPortNumber($DATABASE_SERVER_PORT);
    pool.setDatabaseName($DATABASE);
    pool.setServerName($DATABASE_SERVER);
    pool.setMaxConnections(100);

    kb = new PostgresKB(pool.getConnection(), $DATABASE, new URI("http://test.de"));
    pipeline = new Pipeline(kb);

    LanguageIdentification languageClassification = new LanguageIdentification(Language.EN);
    WordSegmenter wordTokenizer = new WordSegmenter();
    SentenceSegmenter sentenceTokenizer = new SentenceSegmenter();

    POSModel posModel = new POSModel(Scoobie.class.getResourceAsStream("pos/en/en-pos-maxent.bin"));
    POSTagging posTagger = new POSTagging(new POSTaggerME(posModel));
    ProperNameRecognition nounPhraseChunker = new ProperNameRecognition(
            new CRFNounPhraseChunkerModel(Scoobie.class.getResourceAsStream("npc/en/EN.crf")));

    SuffixArrayBuilder suffixArrayBuilder = new SuffixArrayBuilder(100, new LiteralHashing(4));
    RDFLiteralSpotting entityRecognizer = new RDFLiteralSpotting();
    InstanceRecognition subjectResolver = new InstanceRecognition();

    pipeline.configure(languageClassification, wordTokenizer, sentenceTokenizer, posTagger, nounPhraseChunker,
            suffixArrayBuilder, entityRecognizer, new DummyTask(), new DummyTask(), subjectResolver,
            new DummyTask(), new DummyTask(), new DummyTask(), new DummyTask());

}

From source file:de.dfki.km.perspecting.obie.experiments.ProperNameExperiment.java

/**
 * @throws java.lang.Exception/* w w  w.j a v  a  2 s. c  o  m*/
 */

public static void setUp(String databaseServer, String dataBase) throws Exception {

    pool.setUser($DATABASE_SERVER_USER);
    pool.setPassword($DATABASE_SERVER_PW);
    pool.setPortNumber($DATABASE_SERVER_PORT);
    pool.setDatabaseName(dataBase);
    pool.setServerName(databaseServer);
    pool.setMaxConnections(100);

    kb = new PostgresKB(pool.getConnection(), dataBase, new URI("http://test.de"));
    pipeline = new Pipeline(kb);

    LanguageIdentification languageClassification = new LanguageIdentification(Language.EN);
    WordSegmenter wordTokenizer = new WordSegmenter();
    SentenceSegmenter sentenceTokenizer = new SentenceSegmenter();

    POSModel posModel = new POSModel(Scoobie.class.getResourceAsStream("pos/en/en-pos-maxent.bin"));
    POSTagging posTagger = new POSTagging(new POSTaggerME(posModel));
    ProperNameRecognition nounPhraseChunker = new ProperNameRecognition(
            new CRFNounPhraseChunkerModel(Scoobie.class.getResourceAsStream("npc/en/EN.crf")));

    SuffixArrayBuilder suffixArrayBuilder = new SuffixArrayBuilder(100, new LiteralHashing(4));
    RDFLiteralSpotting entityRecognizer = new RDFLiteralSpotting();
    pipeline.configure(languageClassification, wordTokenizer, sentenceTokenizer, posTagger, nounPhraseChunker,
            suffixArrayBuilder, entityRecognizer, new DummyTask(), new DummyTask(), new DummyTask(),
            new DummyTask(), new DummyTask(), new DummyTask(), new DummyTask());

}

From source file:hrpod.tools.nlp.NLPTools.java

/**
 * Tokenizes the text, POS-tags the tokens, chunks them, and returns the chunk
 * strings.
 *
 * @param text raw input text.
 * @return the chunk strings, or {@code null} if any stage failed
 *         (the exception is logged, not rethrown).
 */
public String[] tokenize(String text) {

    String[] chunkStrings = null;

    try {
        TokenizerME wordBreaker = new TokenizerME(getTokenModel());
        POSTaggerME posme = new POSTaggerME(getPosModel());
        ChunkerME chunkerME = new ChunkerME(getChunkerModel());

        // words is the tokenized sentence
        String[] words = wordBreaker.tokenize(text);
        // posTags: part of speech of every word (the chunker needs this info)
        String[] posTags = posme.tag(words);
        // chunks: start/end span indices into the words array
        Span[] chunks = chunkerME.chunkAsSpans(words, posTags);
        // materialize the spans as the actual chunk strings
        chunkStrings = Span.spansToStrings(chunks, words);
        // Fixed: removed a large block of dead, commented-out n-gram code.
    } catch (Exception e) {
        logger.error("Error in tokenize", e);
    }

    // NOTE(review): returns null on failure — callers must null-check; consider
    // returning an empty array instead if no caller distinguishes the two.
    return chunkStrings;

}

From source file:com.geocode.service.impl.AddressServiceImpl.java

@PostConstruct
// Loads the English maxent POS model from the classpath (basePath is a field
// defined elsewhere in this class) and builds the tagger, then restores state
// via the project's CounterHelper.
public void init() throws URISyntaxException {
    model = new POSModelLoader()
            .load(new File(this.getClass().getResource(basePath + "en-pos-maxent.bin").toURI()));
    tagger = new POSTaggerME(model);
    // presumably `baseList` seeds a working list used by the service — confirm against the fields.
    list.addAll(baseList);
    CounterHelper.readCounter();
}

From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java

/**
 * Annotates the document using the Apache OpenNLP tools: sentence detection,
 * tokenization, and POS tagging. Each detected sentence is added to the
 * component as a distilled {@code Sentence} with POS-tagged tokens.
 *
 * @param blackboard the annotation blackboard (not read directly here).
 * @param component the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {

    // set up the annotator
    setup();

    // Language tag used to retrieve the datasets
    String langTag = component.getLanguage().getLanguage();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(langTag + "-sent");
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String[] sentences = sentenceDetector.sentDetect(component.getText());

    // Get the right models
    TokenizerModel tokenModel = getTokenizerModel(langTag + "-token");
    // Fixed: renamed the local from "POSModel" (which shadowed its own type name).
    POSModel posModel = getPOSTaggerModel(langTag + "-pos-maxent");

    // Fixed: hoisted out of the loop — TokenizerME and POSTaggerME were previously
    // rebuilt for every sentence, which repeats expensive model initialization.
    Tokenizer tokenizer = new TokenizerME(tokenModel);
    POSTaggerME tagger = new POSTaggerME(posModel);

    // Iterate through sentences and produce the distilled objects,
    // i.e. a sentence object with pos-tagged tokens.
    for (String sentenceString : sentences) {

        // the distilled sentence object
        Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++);
        sentence.setLanguage(component.getLanguage());

        // Tokenize the sentence, then POS tag the tokens
        String[] tokens = tokenizer.tokenize(sentenceString);
        String[] tags = tagger.tag(tokens);

        // put the features detected by OpenNLP in the distiller's sentence
        for (int i = 0; i < tokens.length; i++) {
            Token t = new Token(tokens[i]);
            t.setPoS(tags[i]);
            sentence.addToken(t);
        } // for

        ((DocumentComposite) component).addComponent(sentence);

    } // for (String sentenceString : sentences)
}

From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger.java

@Override
// UIMA lifecycle entry point: builds the model provider (which lazily loads the
// OpenNLP POS model from a resolved location) and the tag-mapping provider.
public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);

    modelProvider = new CasConfigurableProviderBase<POSTagger>() {
        {
            // Defaults describe where tagger models are published; setOverride lets
            // the component's configuration parameters take precedence.
            setDefault(VERSION, "20120616.0");
            setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
            setDefault(ARTIFACT_ID,
                    "de.tudarmstadt.ukp.dkpro.core.opennlp-model-tagger-${language}-${variant}");

            setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                    + "tagger-${language}-${variant}.bin");
            setDefault(VARIANT, "maxent");

            setOverride(LOCATION, modelLocation);
            setOverride(LANGUAGE, language);
            setOverride(VARIANT, variant);
        }

        @Override
        // Loads the POS model from the resolved URL and wraps it in a tagger.
        // The stream is closed in the finally block via closeQuietly.
        protected POSTagger produceResource(URL aUrl) throws IOException {
            InputStream is = null;
            try {
                is = aUrl.openStream();
                POSModel model = new POSModel(is);

                // Optionally log the model's full tag set, sorted, for diagnostics.
                if (printTagSet) {
                    List<String> tags = new ArrayList<String>();
                    for (int i = 0; i < model.getPosModel().getNumOutcomes(); i++) {
                        tags.add(model.getPosModel().getOutcome(i));
                    }
                    Collections.sort(tags);

                    StringBuilder sb = new StringBuilder();
                    sb.append("Model contains [").append(tags.size()).append("] tags: ");

                    for (String tag : tags) {
                        sb.append(tag);
                        sb.append(" ");
                    }
                    getContext().getLogger().log(INFO, sb.toString());
                }

                return new POSTaggerME(model);
            } finally {
                closeQuietly(is);
            }
        }
    };

    // Maps model-specific POS tags to the DKPro type system; the tagger.tagset
    // placeholder is imported from the model provider's metadata.
    mappingProvider = new MappingProvider();
    mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/"
            + "core/api/lexmorph/tagset/${language}-${tagger.tagset}-tagger.map");
    mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName());
    mappingProvider.setDefault("tagger.tagset", "default");
    mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation);
    mappingProvider.setOverride(MappingProvider.LANGUAGE, language);
    mappingProvider.addImport("tagger.tagset", modelProvider);

}

From source file:de.dfki.km.perspecting.obie.experiments.RelevanceRatingExperiment.java

/**
 * Experiment setup: configures the connection pool, opens the knowledge base,
 * and wires a pipeline whose relevance-rating stage is evaluated over every
 * combination (size &gt;= 2) of nine rating metrics.
 *
 * @param $DATABASE_SERVER host name of the Postgres server.
 * @param $DATABASE database name to connect to.
 * @param corpus text corpus used by the IDF-based rating metric.
 * @throws Exception if the database connection or a model resource cannot be opened.
 */
public static void setUp(String $DATABASE_SERVER, String $DATABASE, TextCorpus corpus) throws Exception {

    pool.setUser($DATABASE_SERVER_USER);
    pool.setPassword($DATABASE_SERVER_PW);
    pool.setPortNumber($DATABASE_SERVER_PORT);
    pool.setDatabaseName($DATABASE);
    pool.setServerName($DATABASE_SERVER);
    pool.setMaxConnections(100);

    kb = new PostgresKB(pool.getConnection(), $DATABASE, new URI("http://test.de"));
    pipeline = new Pipeline(kb);

    LanguageIdentification languageClassification = new LanguageIdentification(Language.EN);
    WordSegmenter wordTokenizer = new WordSegmenter();
    SentenceSegmenter sentenceTokenizer = new SentenceSegmenter();

    // Fixed: close the model stream once loaded. POSModel(InputStream) reads the
    // model eagerly, so the original code leaked the stream.
    POSModel posModel;
    try (java.io.InputStream posIn = Scoobie.class.getResourceAsStream("pos/en/en-pos-maxent.bin")) {
        posModel = new POSModel(posIn);
    }
    POSTagging posTagger = new POSTagging(new POSTaggerME(posModel));

    ProperNameRecognition nounPhraseChunker = new ProperNameRecognition(
            new CRFNounPhraseChunkerModel($SCOOBIE_HOME + $DATABASE_DBPEDIA_en2 + "/npc/en/EN.crf"));

    SuffixArrayBuilder suffixArrayBuilder = new SuffixArrayBuilder(100, new LiteralHashing(4));
    RDFLiteralSpotting namedEntityRecognizer = new RDFLiteralSpotting();
    InstanceRecognition instanceResolver = new InstanceRecognition();
    EntityDisambiguation instanceDisambiguator = new EntityDisambiguation(
            new AmbiguityResolver[] { new DegreeBasedResolver() });

    KnownFactsRetrieval factRetrieval = new KnownFactsRetrieval();

    // Enumerate every subset of the 9 rating metrics via 9-bit masks, keeping
    // only subsets with at least two members.
    ArrayList<int[]> l = new ArrayList<int[]>();

    int max = (int) Math.pow(2, 9);
    for (int i = 0; i < max; i++) {
        String binary = Integer.toBinaryString(i);
        // Fixed: left-pad to width 9 with a StringBuilder instead of repeated
        // String concatenation in a loop.
        StringBuilder padded = new StringBuilder();
        for (int pad = 0; pad < 9 - binary.length(); pad++) {
            padded.append('0');
        }
        binary = padded.append(binary).toString();

        // Collect the indices of the set bits (metric positions) of this mask.
        TIntHashSet s = new TIntHashSet();
        for (int j = 0; j < 9; j++) {
            if (j < binary.length() && binary.charAt(j) == '1') {
                s.add(j);
            }
        }
        if (s.size() > 1)
            l.add(s.toArray());
    }

    RelevanceRating relevanceRating = new RelevanceRating(new RatingMetric[] { new AuthorityBasedRating(), // 0
            new HubBasedRating(), // 1
            new PageRankBasedRating(), // 2
            new DegreeBasedRating(), // 3
            new CapacityBasedRating(), // 4
            new RandomRating(), // 5
            new PositionBasedRating(), // 6
            new TermFrequencyBasedRating(), // 7
            new InverseDocumentFrequencyBasedRating(corpus,
                    new File(corpus.getCorpus().getAbsolutePath() + "/index/")) }, // 8

            l.toArray(new int[l.size()][]));

    pipeline.configure(languageClassification, wordTokenizer, sentenceTokenizer, posTagger, nounPhraseChunker,
            suffixArrayBuilder, namedEntityRecognizer, new DummyTask(), new DummyTask(), instanceResolver,
            instanceDisambiguator, factRetrieval, relevanceRating, new DummyTask());

}