Example usage for opennlp.tools.tokenize TokenizerME TokenizerME

List of usage examples for opennlp.tools.tokenize TokenizerME TokenizerME

Introduction

On this page you can find example usages of opennlp.tools.tokenize TokenizerME TokenizerME.

Prototype

public TokenizerME(TokenizerModel model) 

Source Link

Usage

From source file:NLP.java

/**
 * Loads the French POS-tagger and tokenizer models from the directory
 * containing this application's code source (i.e. next to the jar).
 *
 * @throws IOException if a model file cannot be read; also rethrown when the
 *         tokenizer model fails to load instead of being silently swallowed,
 *         since the instance would be unusable without it
 * @throws URISyntaxException if the code-source location cannot be converted
 */
public NLP() throws FileNotFoundException, IOException, URISyntaxException {
    itemsList = new HashMap<String, String>();

    // Resolve the directory that holds this class's code source.
    String file = (new File(NLP.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()))
            .toString();
    String path = (new File(file).getParentFile().getPath()).toString();

    // File(parent, child) instead of a hard-coded "\\" so this also works
    // on non-Windows platforms.
    model = new POSModelLoader().load(new File(path, "fr-pos.bin"));
    perfMon = new PerformanceMonitor(System.err, "sent");
    tagger = new POSTaggerME(model);

    try (InputStream is = new FileInputStream(new File(path, "fr-token.bin"))) {
        tokenizer = new TokenizerME(new TokenizerModel(is));
    } catch (IOException e) {
        // Fail loudly: a missing/corrupt tokenizer model is fatal for this
        // instance, and the constructor already declares IOException.
        throw new IOException("Failed to load tokenizer model fr-token.bin", e);
    }
}

From source file:hrpod.tools.nlp.NLPTools.java

/**
 * Tokenizes the given text and returns the chunk strings produced by the
 * OpenNLP tokenizer → POS tagger → chunker pipeline.
 *
 * @param text the text to process
 * @return the chunk strings; an empty array (never {@code null}) if the
 *         pipeline fails
 */
public String[] tokenize(String text) {

    // Default to an empty array so callers never have to null-check.
    String[] chunkStrings = new String[0];

    try {
        TokenizerME wordBreaker = new TokenizerME(getTokenModel());
        POSTaggerME posme = new POSTaggerME(getPosModel());
        ChunkerME chunkerME = new ChunkerME(getChunkerModel());

        // words is the tokenized sentence
        String[] words = wordBreaker.tokenize(text);
        // posTags are the parts of speech of every word (the chunker needs this)
        String[] posTags = posme.tag(words);
        // chunks are the start/end "spans" — indices into the words array
        Span[] chunks = chunkerME.chunkAsSpans(words, posTags);
        // chunkStrings are the actual chunk texts
        chunkStrings = Span.spansToStrings(chunks, words);
    } catch (Exception e) {
        logger.error("Error in tokenize", e);
    }

    return chunkStrings;
}

From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java

/**
 * Sets up the lazily-resolved model providers for sentence detection and
 * tokenization. Model coordinates default to the DKPro Core OpenNLP model
 * artifacts and may be overridden via the component's configuration
 * parameters (location, language, variant).
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);

    sentenceModelProvider = new CasConfigurableProviderBase<SentenceDetectorME>() {
        {
            setDefault(VERSION, "20120616.0");
            setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
            setDefault(ARTIFACT_ID,
                    "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-${language}-${variant}");

            setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                    + "sentence-${language}-${variant}.bin");
            setDefault(VARIANT, "maxent");

            setOverride(LOCATION, modelLocation);
            setOverride(LANGUAGE, language);
            setOverride(VARIANT, variant);
        }

        @Override
        protected SentenceDetectorME produceResource(URL aUrl) throws IOException {
            // try-with-resources replaces the manual null-init/closeQuietly
            // pattern and guarantees the stream is closed on every path.
            try (InputStream is = aUrl.openStream()) {
                return new SentenceDetectorME(new SentenceModel(is));
            }
        }
    };

    tokenModelProvider = new CasConfigurableProviderBase<TokenizerME>() {
        {
            setDefault(VERSION, "1.5");
            setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
            setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-${language}-${variant}");

            setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                    + "token-${language}-${variant}.bin");
            setDefault(VARIANT, "maxent");

            setOverride(LOCATION, modelLocation);
            setOverride(LANGUAGE, language);
            setOverride(VARIANT, variant);
        }

        @Override
        protected TokenizerME produceResource(URL aUrl) throws IOException {
            try (InputStream is = aUrl.openStream()) {
                return new TokenizerME(new TokenizerModel(is));
            }
        }
    };
}

From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java

/**
 * Annotates the document using the Apache OpenNLP tools: splits the text
 * into sentences, tokenizes and POS-tags each one, and attaches the
 * resulting Sentence/Token objects to the component.
 *
 * @param blackboard the blackboard the annotation runs on
 * @param component the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {

    // set up the annotator
    setup();

    // Language tag used to retrieve the datasets
    String langTag = component.getLanguage().getLanguage();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(langTag + "-sent");

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String sentences[] = sentenceDetector.sentDetect(component.getText());

    // Get the right models
    TokenizerModel tokenModel = getTokenizerModel(langTag + "-token");
    POSModel POSModel = getPOSTaggerModel(langTag + "-pos-maxent");

    // Hoisted out of the loop: building a tokenizer/tagger per sentence is
    // wasteful — one instance serves every sentence of the document.
    Tokenizer tokenizer = new TokenizerME(tokenModel);
    POSTaggerME tagger = new POSTaggerME(POSModel);

    // Iterate through sentences and produce the distilled objects,
    // i.e. a sentence object with pos-tagged tokens.
    for (String sentenceString : sentences) {

        // the distilled sentence object
        Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++);
        sentence.setLanguage(component.getLanguage());

        // Tokenize the sentence
        String tokens[] = tokenizer.tokenize(sentenceString);

        // POS tag the tokens
        String tags[] = tagger.tag(tokens);

        // put the features detected by OpenNLP in the distiller's sentence
        for (int i = 0; i < tokens.length; i++) {
            Token t = new Token(tokens[i]);
            t.setPoS(tags[i]);
            sentence.addToken(t);
        } // for

        ((DocumentComposite) component).addComponent(sentence);

    } // for (String sentenceString : sentences)
}

From source file:com.screenslicer.core.nlp.NlpUtil.java

/**
 * Splits a single sentence into tokens using the shared tokenizer model.
 *
 * @param sentence the sentence to tokenize
 * @return the tokens of the sentence, in order of appearance
 */
public static String[] tokensFromSentence(String sentence) {
    // A TokenizerME is cheap to build from an already-loaded model.
    return new TokenizerME(tokenModel).tokenize(sentence);
}

From source file:com.civis.utils.opennlp.models.address.AddressFinderMe.java

/**
 * {@inheritDoc}/*w  w  w .j a  va 2  s  .  c  o  m*/
 */
@Override
public List<AddressSpan> find(String text, TokenizerModel tokenizerModel) {
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);
    String[] tokens = tokenizer.tokenize(text);
    return find(tokens);
}

From source file:edu.stanford.muse.index.NER.java

/**
 * Loads the OpenNLP NER, sentence-detector and tokenizer models once.
 * Idempotent: returns immediately if the person finder is already set.
 * Model-loading failures are logged and tolerated (best-effort), so the
 * application can still run without the affected finders.
 */
public synchronized static void initialize() throws ClassCastException, IOException, ClassNotFoundException {
    if (pFinder != null)
        return;
    long startTimeMillis = System.currentTimeMillis();
    log.info("Initializing NER models");

    // try-with-resources: the original leaked all six model streams.
    try (InputStream pis = Config.getResourceAsStream("models/en-ner-person.bin");
            InputStream lis = Config.getResourceAsStream("models/en-ner-location.bin");
            InputStream ois = Config.getResourceAsStream("models/en-ner-organization.bin")) {
        pFinder = new NameFinderME(new TokenNameFinderModel(pis));
        lFinder = new NameFinderME(new TokenNameFinderModel(lis));
        oFinder = new NameFinderME(new TokenNameFinderModel(ois));
    }
    // dont bother about this, instead try not to use it
    catch (Exception e) {
        Util.print_exception(e, log);
    }
    try (InputStream modelIn = Config.getResourceAsStream("models/en-sent.bin");
            InputStream tokenStream = Config.getResourceAsStream("models/en-token.bin")) {
        sFinder = new SentenceDetectorME(new SentenceModel(modelIn));
        tokenizer = new TokenizerME(new TokenizerModel(tokenStream));
    } catch (Exception e) {
        // Two-arg form for consistency with the catch above.
        Util.print_exception(e, log);
    }

    long endTimeMillis = System.currentTimeMillis();
    log.info("Done initializing NER model in " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
}

From source file:ht.process.Processor.java

/**
 * Runs concept recognition over the request body: tokenizes the text with
 * the language-appropriate model, matches token spans against the concept
 * database, and returns the list of tooltip replacements.
 *
 * @param param the request parameters (body, language, filters)
 * @return the recognized changes, or {@code null} if no DB connection
 *         could be obtained
 */
public ProcessorResult2 processDocumentV3(ProcessorParams param) {

    TokenizerModel portugueseTokenizer = (TokenizerModel) servletContext.getAttribute("portugueseTokenizer");
    TokenizerModel englishTokenizer = (TokenizerModel) servletContext.getAttribute("englishTokenizer");

    ConcurrentHashMap<String, String> portugueseStopwords = (ConcurrentHashMap<String, String>) servletContext
            .getAttribute("portugueseStopwords");
    ConcurrentHashMap<String, String> englishStopwords = (ConcurrentHashMap<String, String>) servletContext
            .getAttribute("englishStopwords");

    ResourceBundle portugueseMessages = (ResourceBundle) servletContext.getAttribute("portugueseMessages");
    ResourceBundle englishMessages = (ResourceBundle) servletContext.getAttribute("englishMessages");

    String text = param.body;

    Connection connection;
    try {
        connection = ((DataSource) servletContext.getAttribute("connPool")).getConnection();
    } catch (SQLException ex) {
        Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }

    // try/finally guarantees the pooled connection is returned even when
    // processing throws (the original leaked it on any exception).
    try {
        HashSet<String> semanticTypes;
        if (param.semanticTypes == null || param.semanticTypes.isEmpty())
            semanticTypes = defaultSemanticTypes;
        else
            semanticTypes = param.semanticTypes;

        // Pick the processor for the UI language; English is the fallback.
        ConceptProcessor processor = null;
        ResourceBundle messages = null;

        switch (param.language) {
        case "en":
            processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes);
            break;
        case "pt":
            processor = new PortugueseProcessor(connection, portugueseStopwords, portugueseTokenizer,
                    semanticTypes);
            break;
        default:
            processor = new EnglishProcessor(connection, englishStopwords, englishTokenizer, semanticTypes);
            break;
        }

        if (param.contentLanguage == null)
            param.contentLanguage = "detected";

        // Tooltip messages follow the content language (possibly detected),
        // defaulting to English.
        if ((param.contentLanguage.equals("detected") && param.language.equals("en"))
                || param.contentLanguage.equals("en"))
            messages = englishMessages;
        else if ((param.contentLanguage.equals("detected") && param.language.equals("pt"))
                || param.contentLanguage.equals("pt"))
            messages = portugueseMessages;
        else
            messages = englishMessages;

        if (param.recognizeOnlyCHV == null)
            param.recognizeOnlyCHV = true;

        processor.recognizeOnlyCHV = param.recognizeOnlyCHV;

        if (param.recognizeWithoutDefinition == null)
            param.recognizeWithoutDefinition = true;

        if (param.styFilter == null)
            param.styFilter = "all";

        if (param.styFilter.equals("all"))
            processor.allAccepted = true;
        else if (param.styFilter.equals("one"))
            processor.allAccepted = false;

        Tokenizer tokenizer = new TokenizerME(processor.tokenizerModel);

        // Character-offset spans so replacements map back into the text.
        Span spans[] = tokenizer.tokenizePos(text);

        ArrayList<Change> resultChanges = new ArrayList<>();

        for (int i = 0; i < spans.length; i++) {

            Span initialSpan = spans[i];

            Concept bestMatch = processor.processToken(spans, i, text, FORWARD_THRESHOLD);

            if (bestMatch != null) {
                // replace "'" so it doesn't break the tooltip html if the
                // definition contains it
                String definition = processor.getDefinition(bestMatch);
                if (definition != null) {
                    bestMatch.setDefinition(definition);
                }
            }

            if (bestMatch != null && ((!param.recognizeWithoutDefinition && bestMatch.definition != null)
                    || param.recognizeWithoutDefinition)) {
                // Skip over the remaining tokens of a multi-word match.
                i += bestMatch.words - 1;

                String definitionTooltip = replaceConcept(bestMatch, param.language, messages);
                resultChanges
                        .add(new Change(bestMatch.span.getStart(), bestMatch.span.getEnd(), definitionTooltip));
            }
        }

        return new ProcessorResult2(resultChanges);
    } finally {
        try {
            connection.close();
        } catch (SQLException ex) {
            Logger.getLogger(Processor.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}

From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java

/**
 * Utility offered to other elements of the pipeline for text tokenizing.
 *
 * @param text the text to tokenize
 * @param language the language of the input text
 * @return an array containing the tokenized text.
 */
public static String[] tokenizeText(String text, String language) {

    setup();

    // Split the text into sentences; the tokenizer then runs per sentence.
    SentenceModel sentModel = getSentenceModel(language + "-sent");

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String sentences[] = sentenceDetector.sentDetect(text);

    // Get the right model and build the tokenizer ONCE — the original
    // constructed a new TokenizerME for every sentence.
    TokenizerModel tokenModel = getTokenizerModel(language + "-token");
    Tokenizer tokenizer = new TokenizerME(tokenModel);

    List<String> tokenizedText = new ArrayList<>();

    for (String sentenceString : sentences) {
        for (String token : tokenizer.tokenize(sentenceString)) {
            tokenizedText.add(token);
        }
    }
    return tokenizedText.toArray(new String[tokenizedText.size()]);
}

From source file:org.sglover.nlp.CoreNLPEntityTagger.java

@Override
protected Entities getEntitiesImpl(String content) {
    Entities namedEntities = Entities.empty();

    SentenceModel sentenceModel = sentenceModels.get("en");
    SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(content);

    TokenizerModel tm = tokenizerModels.get("en");
    TokenizerME wordBreaker = new TokenizerME(tm);

    for (String sentence : sentences) {
        String[] tokens = wordBreaker.tokenize(sentence);

        List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>();

        POSModel posModel = posModels.get("en");
        POSTaggerME posme = new POSTaggerME(posModel);
        String[] posTags = posme.tag(tokens);

        List<String> npTokens = new LinkedList<>();

        ChunkerModel chunkerModel = chunkerModels.get("en");
        ChunkerME chunkerME = new ChunkerME(chunkerModel);
        Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags);
        String[] chunkStrings = Span.spansToStrings(chunks, tokens);
        for (int i = 0; i < chunks.length; i++) {
            String chunkString = chunkStrings[i];
            logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType());
            if (chunks[i].getType().equals("NP")) {
                npTokens.add(chunkString);
            }/*from   w w  w .j  a  v a  2  s .c om*/
        }

        // findEntities(namedEntities, allTextAnnotations,
        // npTokens.toArray(new String[0]));
        findEntities(namedEntities, allTextAnnotations, tokens);
    }

    return namedEntities;
}