List of usage examples for the opennlp.tools.tokenize.TokenizerME constructor
public TokenizerME(TokenizerModel model)
From source file:NLP.java
/**
 * Loads the French POS-tagger and tokenizer models from the directory
 * containing the application's code source (JAR/classes directory).
 *
 * @throws FileNotFoundException if a model file is missing
 * @throws IOException           if a model cannot be read
 * @throws URISyntaxException    if the code-source location is not a valid URI
 */
public NLP() throws FileNotFoundException, IOException, URISyntaxException {
    itemsList = new HashMap<String, String>();
    // Resolve the directory of the running code so model files are found
    // relative to the application, not the process working directory.
    String file = (new File(NLP.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()))
            .toString();
    String path = (new File(file).getParentFile().getPath()).toString();
    // Fix: use the platform separator instead of a hard-coded "\\" so the
    // model paths also resolve on non-Windows systems.
    model = new POSModelLoader().load(new File(path + File.separator + "fr-pos.bin"));
    perfMon = new PerformanceMonitor(System.err, "sent");
    tagger = new POSTaggerME(model);
    try (InputStream is = new FileInputStream(path + File.separator + "fr-token.bin")) {
        tokenizer = new TokenizerME(new TokenizerModel(is));
    } catch (Exception e) {
        // NOTE(review): load failure leaves 'tokenizer' null and only prints
        // the exception — confirm callers tolerate a null tokenizer.
        System.out.println(e);
    }
}
From source file:hrpod.tools.nlp.NLPTools.java
public String[] tokenize(String text) { String[] chunkStrings = null; try {/*from w ww . j av a 2 s . c om*/ TokenizerME wordBreaker = new TokenizerME(getTokenModel()); POSTaggerME posme = new POSTaggerME(getPosModel()); ChunkerME chunkerME = new ChunkerME(getChunkerModel()); //words is the tokenized sentence String[] words = wordBreaker.tokenize(text); //posTags are the parts of speech of every word in the sentence (The chunker needs this info) String[] posTags = posme.tag(words); //chunks are the start end "spans" indices to the chunks in the words array Span[] chunks = chunkerME.chunkAsSpans(words, posTags); //chunkStrings are the actual chunks chunkStrings = Span.spansToStrings(chunks, words); //for (int i = 0; i < chunks.length; i++) { // if (chunks[i].getType().equals("NP")) { // System.out.println("NP: \n\t" + chunkStrings[i]); //String[] split = chunkStrings[i].split(" "); //List<String> ngrams = ngram(Arrays.asList(split), N, " "); //System.out.println("ngrams:"); //for (String gram : ngrams) { // System.out.println("\t" + gram); //} //} //} } catch (Exception e) { logger.error("Error in tokenize", e); } return chunkStrings; }
From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java
@Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); sentenceModelProvider = new CasConfigurableProviderBase<SentenceDetectorME>() { {//from www . j av a 2 s . c om setDefault(VERSION, "20120616.0"); setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-${language}-${variant}"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + "sentence-${language}-${variant}.bin"); setDefault(VARIANT, "maxent"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, variant); } @Override protected SentenceDetectorME produceResource(URL aUrl) throws IOException { InputStream is = null; try { is = aUrl.openStream(); SentenceModel model = new SentenceModel(is); return new SentenceDetectorME(model); } finally { closeQuietly(is); } } }; tokenModelProvider = new CasConfigurableProviderBase<TokenizerME>() { { setDefault(VERSION, "1.5"); setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-${language}-${variant}"); setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/" + "token-${language}-${variant}.bin"); setDefault(VARIANT, "maxent"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, variant); } @Override protected TokenizerME produceResource(URL aUrl) throws IOException { InputStream is = null; try { is = aUrl.openStream(); TokenizerModel model = new TokenizerModel(is); return new TokenizerME(model); } finally { closeQuietly(is); } } }; }
From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/** * Annotates the document using the Apache OpenNLP tools. * * @param component the component to annotate. *//* w ww. j a va2 s . co m*/ @Override public void annotate(Blackboard blackboard, DocumentComponent component) { // set up the annotator setup(); // Language tag used to retrieve the datasets String langTag = component.getLanguage().getLanguage(); // Split the text into sentences SentenceModel sentModel = getSentenceModel(langTag + "-sent"); SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel); String sentences[] = sentenceDetector.sentDetect(component.getText()); // Get the right models TokenizerModel tokenModel = getTokenizerModel(langTag + "-token"); POSModel POSModel = getPOSTaggerModel(langTag + "-pos-maxent"); // Iterate through sentences and produce the distilled objects, // i.e. a sentence object with pos-tagged and stemmed tokens. for (String sentenceString : sentences) { // the distilled sentence object Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++); sentence.setLanguage(component.getLanguage()); // Tokenize the sentence Tokenizer tokenizer = new TokenizerME(tokenModel); String tokens[] = tokenizer.tokenize(sentenceString); // POS tag the tokens POSTaggerME tagger = new POSTaggerME(POSModel); String tags[] = tagger.tag(tokens); // put the features detected by OpenNLP in the distiller's // sentence for (int i = 0; i < tokens.length; i++) { Token t = new Token(tokens[i]); t.setPoS(tags[i]); sentence.addToken(t); } // for ((DocumentComposite) component).addComponent(sentence); } // for (String sentenceString : sentences) }
From source file:com.screenslicer.core.nlp.NlpUtil.java
/**
 * Splits a sentence into tokens using the shared tokenizer model.
 *
 * @param sentence the sentence to tokenize
 * @return the sentence's tokens
 */
public static String[] tokensFromSentence(String sentence) {
    TokenizerME wordBreaker = new TokenizerME(tokenModel);
    return wordBreaker.tokenize(sentence);
}
From source file:com.civis.utils.opennlp.models.address.AddressFinderMe.java
/** * {@inheritDoc}/*w w w .j a va 2 s . c o m*/ */ @Override public List<AddressSpan> find(String text, TokenizerModel tokenizerModel) { Tokenizer tokenizer = new TokenizerME(tokenizerModel); String[] tokens = tokenizer.tokenize(text); return find(tokens); }
From source file:edu.stanford.muse.index.NER.java
public synchronized static void initialize() throws ClassCastException, IOException, ClassNotFoundException { if (pFinder != null) return;/* www . jav a 2s. c om*/ long startTimeMillis = System.currentTimeMillis(); log.info("Initializing NER models"); try { InputStream pis = Config.getResourceAsStream("models/en-ner-person.bin"); TokenNameFinderModel pmodel = new TokenNameFinderModel(pis); pFinder = new NameFinderME(pmodel); InputStream lis = Config.getResourceAsStream("models/en-ner-location.bin"); TokenNameFinderModel lmodel = new TokenNameFinderModel(lis); lFinder = new NameFinderME(lmodel); InputStream ois = Config.getResourceAsStream("models/en-ner-organization.bin"); TokenNameFinderModel omodel = new TokenNameFinderModel(ois); oFinder = new NameFinderME(omodel); } //dont bother about this, instead try not to use it catch (Exception e) { Util.print_exception(e, log); } try { InputStream modelIn = Config.getResourceAsStream("models/en-sent.bin"); SentenceModel model = new SentenceModel(modelIn); sFinder = new SentenceDetectorME(model); InputStream tokenStream = Config.getResourceAsStream("models/en-token.bin"); TokenizerModel modelTokenizer = new TokenizerModel(tokenStream); tokenizer = new TokenizerME(modelTokenizer); } catch (Exception e) { Util.print_exception(e); } long endTimeMillis = System.currentTimeMillis(); log.info("Done initializing NER model in " + Util.commatize(endTimeMillis - startTimeMillis) + "ms"); }
From source file:ht.process.Processor.java
/**
 * Processes a document: tokenizes its body with the language-appropriate
 * OpenNLP tokenizer, recognizes concepts token-by-token and returns the list
 * of text changes (tooltip replacements) to apply to the document.
 *
 * NOTE(review): returns null when a pooled JDBC connection cannot be
 * obtained — callers must null-check. Also note that missing parameter
 * fields are defaulted by writing back into {@code param}.
 *
 * @param param the request parameters (body text, language, filters)
 * @return the recognized changes, or null on connection failure
 */
public ProcessorResult2 processDocumentV3(ProcessorParams param) {
    // Servlet-scoped shared resources: tokenizer models, stop-word maps and
    // localized message bundles for the two supported languages.
    TokenizerModel portugueseTokenizer = (TokenizerModel) servletContext.getAttribute("portugueseTokenizer");
    TokenizerModel englishTokenizer = (TokenizerModel) servletContext.getAttribute("englishTokenizer");
    ConcurrentHashMap<String, String> portugueseStopwords = (ConcurrentHashMap<String, String>) servletContext
            .getAttribute("portugueseStopwords");
    ConcurrentHashMap<String, String> englishStopwords = (ConcurrentHashMap<String, String>) servletContext
            .getAttribute("englishStopwords");
    ResourceBundle portugueseMessages = (ResourceBundle) servletContext.getAttribute("portugueseMessages");
    ResourceBundle englishMessages = (ResourceBundle) servletContext.getAttribute("englishMessages");

    String text = param.body;
    //String text = LZString.decompressFromUTF16(param.body);

    // Borrow a connection from the pool; bail out (null result) on failure.
    Connection connection;
    try {
        connection = ((DataSource) servletContext.getAttribute("connPool")).getConnection();
    } catch (SQLException ex) {
        Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }

    // Fall back to the default semantic-type filter when none was supplied.
    HashSet<String> semanticTypes;
    if (param.semanticTypes == null || param.semanticTypes.isEmpty())
        semanticTypes = defaultSemanticTypes;
    else
        semanticTypes = param.semanticTypes;

    // Pick the language-specific concept processor; English is the default.
    ConceptProcessor processor = null;
    ResourceBundle messages = null;
    switch (param.language) {
    case "en":
        processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes);
        //messages = englishMessages;
        break;
    case "pt":
        processor = new PortugueseProcessor(connection, portugueseStopwords, portugueseTokenizer,
                semanticTypes);
        //messages = portugueseMessages;
        break;
    default:
        processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes);
        //messages = englishMessages;
        break;
    }

    // The message bundle follows the requested content language, or the
    // detected document language when contentLanguage is "detected";
    // English is the fallback.
    if (param.contentLanguage == null)
        param.contentLanguage = "detected";
    if ((param.contentLanguage.equals("detected") && param.language.equals("en"))
            || param.contentLanguage.equals("en"))
        messages = englishMessages;
    else if ((param.contentLanguage.equals("detected") && param.language.equals("pt"))
            || param.contentLanguage.equals("pt"))
        messages = portugueseMessages;
    else
        messages = englishMessages;

    // Default the remaining optional flags and push them into the processor.
    if (param.recognizeOnlyCHV == null)
        param.recognizeOnlyCHV = true;
    processor.recognizeOnlyCHV = param.recognizeOnlyCHV;
    if (param.recognizeWithoutDefinition == null)
        param.recognizeWithoutDefinition = true;
    if (param.styFilter == null)
        param.styFilter = "all";
    if (param.styFilter.equals("all"))
        processor.allAccepted = true;
    else if (param.styFilter.equals("one"))
        processor.allAccepted = false;

    // Tokenize with positions so each match can be mapped back to the
    // original character offsets in the text.
    Tokenizer tokenizer = new TokenizerME(processor.tokenizerModel);
    Span spans[] = tokenizer.tokenizePos(text);
    //Span[] spansCopy = new Span[spans.length];
    //System.arraycopy( spans, 0, spansCopy, 0, spans.length );
    //System.out.println("TEXT: " + text);
    //System.out.println("SPANS: " + spans.length);

    ArrayList<Change> resultChanges = new ArrayList<>();
    for (int i = 0; i < spans.length; i++) {
        Span initialSpan = spans[i];
        // Try to match a (possibly multi-word) concept starting at token i.
        Concept bestMatch = processor.processToken(spans, i, text, FORWARD_THRESHOLD);
        if (bestMatch != null) {
            // Replace "'" so it doesn't break the tooltip html if the
            // definition contains it.
            String definition = processor.getDefinition(bestMatch);
            if (definition != null) {
                bestMatch.setDefinition(definition);
            }
        }
        if (bestMatch != null && ((!param.recognizeWithoutDefinition && bestMatch.definition != null)
                || param.recognizeWithoutDefinition)) {
            // Skip the tokens consumed by this multi-word match.
            i += bestMatch.words - 1;
            /*
            if (lastFound == null) {
            splitText.add(text.substring(0, initialSpan.getStart()));
            } else {
            splitText.add(text.substring(lastFound.span.getEnd(), bestMatch.span.getStart()));
            }
            */
            String definitionTooltip = replaceConcept(bestMatch, param.language, messages);
            resultChanges
                    .add(new Change(bestMatch.span.getStart(), bestMatch.span.getEnd(), definitionTooltip));
            //lastFound = bestMatch;
        }
    }

    // Return the borrowed connection to the pool.
    try {
        connection.close();
    } catch (SQLException ex) {
        Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex);
    }
    return new ProcessorResult2(resultChanges);
}
From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java
/** * Utility offered to other elements of the pipeline for text tokenizing. * * @param text the text to tokenize//from w w w .j a v a 2 s .c o m * @param language the language of the input text * @return an array containing the tokenized text. */ public static String[] tokenizeText(String text, String language) { setup(); // Split the text into sentences SentenceModel sentModel = getSentenceModel(language + "-sent"); SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel); String sentences[] = sentenceDetector.sentDetect(text); // Get the right models TokenizerModel tokenModel = getTokenizerModel(language + "-token"); // Iterate through sentences and produce the distilled objects, // i.e. a sentence object with pos-tagged and stemmed tokens. List<String> tokenizedText = new ArrayList<>(); for (String sentenceString : sentences) { // Tokenize the sentence Tokenizer tokenizer = new TokenizerME(tokenModel); String tokens[] = tokenizer.tokenize(sentenceString); for (String token : tokens) { tokenizedText.add(token); } } return tokenizedText.toArray(new String[tokenizedText.size()]); }
From source file:org.sglover.nlp.CoreNLPEntityTagger.java
@Override protected Entities getEntitiesImpl(String content) { Entities namedEntities = Entities.empty(); SentenceModel sentenceModel = sentenceModels.get("en"); SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel); String[] sentences = sentenceDetector.sentDetect(content); TokenizerModel tm = tokenizerModels.get("en"); TokenizerME wordBreaker = new TokenizerME(tm); for (String sentence : sentences) { String[] tokens = wordBreaker.tokenize(sentence); List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>(); POSModel posModel = posModels.get("en"); POSTaggerME posme = new POSTaggerME(posModel); String[] posTags = posme.tag(tokens); List<String> npTokens = new LinkedList<>(); ChunkerModel chunkerModel = chunkerModels.get("en"); ChunkerME chunkerME = new ChunkerME(chunkerModel); Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags); String[] chunkStrings = Span.spansToStrings(chunks, tokens); for (int i = 0; i < chunks.length; i++) { String chunkString = chunkStrings[i]; logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType()); if (chunks[i].getType().equals("NP")) { npTokens.add(chunkString); }/*from w w w .j a v a 2 s .c om*/ } // findEntities(namedEntities, allTextAnnotations, // npTokens.toArray(new String[0])); findEntities(namedEntities, allTextAnnotations, tokens); } return namedEntities; }