Example usage for opennlp.tools.tokenize Tokenizer tokenize

List of usage examples for opennlp.tools.tokenize Tokenizer tokenize

Introduction

In this page you can find the example usage for opennlp.tools.tokenize Tokenizer tokenize.

Prototype

String[] tokenize(String s);

Source Link

Document

Splits a string into its atomic parts

Usage

From source file:com.screenslicer.core.nlp.NlpUtil.java

public static String[] tokensFromSentence(String sentence) {
    // Build a maxent tokenizer from the shared model and split the sentence
    // into its atomic tokens in a single expression.
    return new TokenizerME(tokenModel).tokenize(sentence);
}

From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java

/**
 * Utility offered to other elements of the pipeline for text tokenizing.
 *
 * @param text the text to tokenize
 * @param language the language of the input text
 * @return an array containing the tokenized text.
 */
public static String[] tokenizeText(String text, String language) {

    setup();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(language + "-sent");

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String sentences[] = sentenceDetector.sentDetect(text);

    // Get the right model and build the tokenizer ONCE — it is loop-invariant,
    // and the original constructed a new TokenizerME for every sentence.
    TokenizerModel tokenModel = getTokenizerModel(language + "-token");
    Tokenizer tokenizer = new TokenizerME(tokenModel);

    // Tokenize each sentence and collect all tokens in document order.
    List<String> tokenizedText = new ArrayList<>();
    for (String sentenceString : sentences) {
        for (String token : tokenizer.tokenize(sentenceString)) {
            tokenizedText.add(token);
        }
    }
    return tokenizedText.toArray(new String[tokenizedText.size()]);
}

From source file:com.tamingtext.classifier.maxent.TestMaxent.java

/**
 * Reads tab-separated ("label\ttext") test files, categorizes each line's
 * text, and feeds each predicted result to the analyzer, printing the
 * aggregate summary to stderr when done.
 *
 * @param inputFiles files with one "category&lt;TAB&gt;document" pair per line
 * @param categorizer trained document categorizer
 * @param tokenizer tokenizer used to preprocess each document's text
 * @param resultAnalyzer accumulates per-instance classification results
 * @throws FileNotFoundException if an input file cannot be opened
 * @throws IOException on read errors
 */
private static void runTest(File[] inputFiles, DocumentCategorizer categorizer, Tokenizer tokenizer,
        ResultAnalyzer resultAnalyzer) throws FileNotFoundException, IOException {
    //<start id="maxent.examples.test.execute"/>
    for (File ff : inputFiles) {
        // try-with-resources: the original leaked the reader whenever an
        // exception was thrown before reaching in.close().
        try (BufferedReader in = new BufferedReader(new FileReader(ff))) {
            String line;
            while ((line = in.readLine()) != null) {
                // Expect exactly "category<TAB>text"; silently skip malformed lines.
                String[] parts = line.split("\t");
                if (parts.length != 2)
                    continue;

                String docText = parts[1]; //<co id="tmt.preprocess"/>
                String[] tokens = tokenizer.tokenize(docText);

                double[] probs = categorizer.categorize(tokens); //<co id="tmt.categorize"/>
                String label = categorizer.getBestCategory(probs);
                int bestIndex = categorizer.getIndex(label);
                double score = probs[bestIndex];

                ClassifierResult result //<co id="tmt.collect"/>
                        = new ClassifierResult(label, score);
                resultAnalyzer.addInstance(parts[0], result);
            }
        }
    }

    System.err.println(resultAnalyzer.toString()); //<co id="tmt.summarize"/>
    /*<calloutlist>
     * <callout arearefs="tmt.preprocess">Preprocess text</callout>
     * <callout arearefs="tmt.categorize">Categorize</callout>
     * <callout arearefs="tmt.collect">Analyze Results</callout>
     * <callout arearefs="tmt.summarize">Present Results</callout>
     * </calloutlist>*/
    //<end id="maxent.examples.test.execute"/>
}

From source file:com.civis.utils.opennlp.models.address.AddressFinderMe.java

/**
 * {@inheritDoc}/*from   ww w .  j a  v a2  s  .com*/
 */
@Override
public List<AddressSpan> find(String text, TokenizerModel tokenizerModel) {
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);
    String[] tokens = tokenizer.tokenize(text);
    return find(tokens);
}

From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java

/**
 * Annotates the document using the Apache OpenNLP tools: splits the
 * component's text into sentences, tokenizes and POS-tags each one, and
 * attaches the resulting {@code Sentence} objects to the component.
 *
 * @param blackboard the shared pipeline blackboard (unused here but part of
 *                   the annotator interface)
 * @param component the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {

    // set up the annotator
    setup();

    // Language tag used to retrieve the datasets
    String langTag = component.getLanguage().getLanguage();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(langTag + "-sent");

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String sentences[] = sentenceDetector.sentDetect(component.getText());

    // Get the right models. Renamed the local from "POSModel" — it shadowed
    // its own type name.
    TokenizerModel tokenModel = getTokenizerModel(langTag + "-token");
    POSModel posModel = getPOSTaggerModel(langTag + "-pos-maxent");

    // Build the tokenizer and tagger ONCE: both are loop-invariant, and the
    // original re-constructed them for every sentence.
    Tokenizer tokenizer = new TokenizerME(tokenModel);
    POSTaggerME tagger = new POSTaggerME(posModel);

    // Iterate through sentences and produce the distilled objects,
    // i.e. a sentence object with pos-tagged tokens.
    for (String sentenceString : sentences) {

        // the distilled sentence object
        Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++);
        sentence.setLanguage(component.getLanguage());

        // Tokenize and POS-tag the sentence
        String tokens[] = tokenizer.tokenize(sentenceString);
        String tags[] = tagger.tag(tokens);

        // put the features detected by OpenNLP in the distiller's sentence
        for (int i = 0; i < tokens.length; i++) {
            Token t = new Token(tokens[i]);
            t.setPoS(tags[i]);
            sentence.addToken(t);
        }
        ((DocumentComposite) component).addComponent(sentence);

    } // for (String sentenceString : sentences)
}

From source file:org.dbpedia.spotlight.spot.NESpotter.java

/**
 * Runs an OpenNLP name finder over the text, sentence by sentence, and
 * returns each detected name as a SurfaceFormOccurrence positioned at its
 * absolute character offset in the original text, tagged with {@code oType}.
 */
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();
    // Detect sentences and record each sentence's start offset so that
    // token spans (sentence-relative) can be mapped back to document offsets.
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    Tokenizer tokenizer = new SimpleTokenizer();
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];

        // extract the names in the current sentence; tokenizePos gives the
        // character spans needed for offset reconstruction below
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs(); // NOTE(review): probs is never read — confirm whether confidences were meant to be used

        if (nameSpans != null && nameSpans.length > 0) {
            for (Span span : nameSpans) {
                // Rebuild the entity's surface form by joining its tokens with
                // single spaces.
                StringBuilder buf = new StringBuilder();
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                // NOTE(review): correctPhrase presumably repairs forms broken
                // around periods/abbreviations — confirm against its definition.
                if (surfaceFormStr.contains(".")) {
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                // Absolute document offsets: sentence start + token offset
                // within the sentence.
                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd(); // NOTE(review): entEnd is computed but unused

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }

    }
    // Clear the name finder's document-level adaptive cache so state from
    // this text does not leak into the next call.
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}

From source file:os.Controller.java

/**
 * Tokenizes the given text with the English maxent tokenizer model loaded
 * from "en-token.bin" in the working directory.
 *
 * @param teks the text to tokenize
 * @return the tokens joined into one string, each prefixed by a single space
 *         (note: the result has a leading space, preserving legacy behavior)
 * @throws InvalidFormatException if the model file is malformed
 * @throws IOException if the model file cannot be read
 */
public String tokenize(String teks) throws InvalidFormatException, IOException {
    // try-with-resources: the original leaked the stream whenever model
    // loading or tokenization threw before is.close().
    try (InputStream is = new FileInputStream("en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);

        String[] tokens = tokenizer.tokenize(teks);

        // StringBuilder instead of O(n^2) string concatenation; output is
        // byte-identical to the original (" " before every token).
        StringBuilder result = new StringBuilder();
        for (String token : tokens) {
            result.append(' ').append(token);
        }
        return result.toString();
    }
}