Example usage for opennlp.tools.tokenize SimpleTokenizer INSTANCE

Introduction

On this page you can find example usages of opennlp.tools.tokenize SimpleTokenizer INSTANCE.

Prototype

SimpleTokenizer INSTANCE

To view the source code for opennlp.tools.tokenize SimpleTokenizer INSTANCE, click the Source Link.

Usage

From source file:com.tamingtext.classifier.maxent.TrainMaxent.java

/**
 * Creates a trainer that uses the given tokenizer.
 *
 * @param tokenizer the tokenizer to use; when {@code null},
 *        {@link SimpleTokenizer#INSTANCE} is used instead
 */
public TrainMaxent(Tokenizer tokenizer) {
    // A caller-supplied tokenizer must be stored as well; only a null
    // argument falls back to the shared simple tokenizer.
    this.tokenizer = (tokenizer != null) ? tokenizer : SimpleTokenizer.INSTANCE;
}

From source file:com.tamingtext.classifier.maxent.TrainMaxent.java

/**
 * Trains a document categorizer from the files found under {@code source}
 * and writes the resulting model to {@code destination}.
 *
 * <p>Training uses a fixed cutoff of 5 and 100 iterations, with a
 * name-finder and a bag-of-words feature generator.
 *
 * @param source path handed to {@code FileUtil.buildFileList} to collect the training files
 * @param destination path of the file the trained model is serialized to
 * @throws IOException if reading the training data or writing the model fails
 */
public void train(String source, String destination) throws IOException {
    //<start id="maxent.examples.train.setup"/> 
    File[] inputFiles = FileUtil.buildFileList(new File(source));
    File modelFile = new File(destination);

    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/>
    CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer);

    int cutoff = 5;
    int iterations = 100;
    NameFinderFeatureGenerator nffg //<co id="tm.fg"/>
            = new NameFinderFeatureGenerator();
    BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator();

    DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/>

    // The output file is opened only after training succeeded, and the
    // stream is closed explicitly so the file handle is released even when
    // serialization fails; serialize is not relied upon to close it.
    FileOutputStream modelOut = new FileOutputStream(modelFile);
    try {
        model.serialize(modelOut);
    } finally {
        modelOut.close();
    }

    /*<calloutlist>
    <callout arearefs="tm.tok">Create data stream</callout>
    <callout arearefs="tm.fg">Set up features generators</callout> 
    <callout arearefs="tm.train">Train categorizer</callout>  
    </calloutlist>*/
    //<end id="maxent.examples.train.setup"/>
}

From source file:com.tamingtext.classifier.maxent.TestMaxent.java

/**
 * Loads a document categorization model from {@code modelFile}, collects the
 * categories it knows, and runs the categorizer test over {@code inputFiles}.
 *
 * @param inputFiles the files passed on to {@code runTest}
 * @param modelFile the file the {@link DoccatModel} is read from
 * @throws FileNotFoundException if {@code modelFile} cannot be opened
 * @throws IOException if reading the model fails
 */
private static void execute(File[] inputFiles, File modelFile) throws IOException, FileNotFoundException {
    //<start id="maxent.examples.test.setup"/> 
    NameFinderFeatureGenerator nffg //<co id="tmx.feature"/>
            = new NameFinderFeatureGenerator();
    BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator();

    InputStream modelStream = //<co id="tmx.modelreader"/>
            new FileInputStream(modelFile);
    DoccatModel model;
    try {
        model = new DoccatModel(modelStream);
    } finally {
        // The stream is only used to construct the model; release the file
        // handle here, whether or not loading succeeded.
        // NOTE(review): assumes DoccatModel reads the stream eagerly in its
        // constructor - confirm against the OpenNLP version in use.
        modelStream.close();
    }
    DocumentCategorizer categorizer //<co id="tmx.categorizer"/>
            = new DocumentCategorizerME(model, nffg, bowfg);
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

    // Gather every category name the categorizer can emit.
    int catCount = categorizer.getNumberOfCategories();
    Collection<String> categories = new ArrayList<String>(catCount);
    for (int i = 0; i < catCount; i++) {
        categories.add(categorizer.getCategory(i));
    }
    ResultAnalyzer resultAnalyzer = //<co id="tmx.results"/>
            new ResultAnalyzer(categories, "unknown");
    runTest(inputFiles, categorizer, tokenizer, resultAnalyzer); //<co id="tmx.run"/>
    /*<calloutlist>
    <callout arearefs="tmx.feature">Setup Feature Generators</callout>
    <callout arearefs="tmx.modelreader">Load Model</callout>
    <callout arearefs="tmx.categorizer">Create Categorizer</callout>
    <callout arearefs="tmx.results">Prepare Result Analyzer</callout>
    <callout arearefs="tmx.run">Execute Test</callout>
    </calloutlist>*/
    //<end id="maxent.examples.test.setup"/>
}

From source file:org.apache.stanbol.commons.opennlp.OpenNLP.java

/**
 * Getter for the Tokenizer of a given language. This first tries to
 * create an {@link TokenizerME} instance if the required 
 * {@link TokenizerModel} for the parsed language is available. if such a
 * model is not available it returns the {@link SimpleTokenizer} instance.
 * @param language the language or <code>null</code> to build a 
 * {@link SimpleTokenizer}/*from w w w.j  a v a  2  s  .  com*/
 * @return the {@link Tokenizer} for the parsed language.
 */
public Tokenizer getTokenizer(String language) {
    Tokenizer tokenizer = null;
    if (language != null) {
        try {
            TokenizerModel model = getTokenizerModel(language);
            if (model != null) {
                tokenizer = new TokenizerME(model);
            }
        } catch (InvalidFormatException e) {
            log.warn("Unable to load Tokenizer Model for " + language + ": "
                    + "Will use Simple Tokenizer instead", e);
        } catch (IOException e) {
            log.warn("Unable to load Tokenizer Model for " + language + ": "
                    + "Will use Simple Tokenizer instead", e);
        }
    }
    if (tokenizer == null) {
        log.debug("Use Simple Tokenizer for language {}", language);
        tokenizer = SimpleTokenizer.INSTANCE;
    } else {
        log.debug("Use ME Tokenizer for language {}", language);
    }
    return tokenizer;
}