Example usage for the opennlp.tools.doccat.BagOfWordsFeatureGenerator constructor BagOfWordsFeatureGenerator()

Introduction

On this page you can find example usage of the BagOfWordsFeatureGenerator() constructor of the class opennlp.tools.doccat.BagOfWordsFeatureGenerator.

Prototype

public BagOfWordsFeatureGenerator()

Source Link

Usage

From source file:com.tamingtext.classifier.maxent.TrainMaxent.java

/**
 * Trains a document categorizer on the files collected from {@code source}
 * and serializes the resulting model to {@code destination}.
 *
 * @param source      path passed to {@code FileUtil.buildFileList} to collect the training files
 * @param destination path of the file the trained model is written to
 * @throws IOException if the model file cannot be opened or written, or if
 *                     training fails with an I/O error
 */
public void train(String source, String destination) throws IOException {
    //<start id="maxent.examples.train.setup"/> 
    File[] inputFiles = FileUtil.buildFileList(new File(source));
    File modelFile = new File(destination);

    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/>
    CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer);

    // Training parameters handed straight to DocumentCategorizerME.train.
    int cutoff = 5;
    int iterations = 100;
    NameFinderFeatureGenerator nffg //<co id="tm.fg"/>
            = new NameFinderFeatureGenerator();
    BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator();

    DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/>

    // Close the output stream even when serialization fails, so the file
    // handle is not leaked and buffered bytes reach the disk.
    FileOutputStream modelOut = new FileOutputStream(modelFile);
    try {
        model.serialize(modelOut);
    } finally {
        modelOut.close();
    }

    /*<calloutlist>
    <callout arearefs="tm.tok">Create data stream</callout>
    <callout arearefs="tm.fg">Set up features generators</callout> 
    <callout arearefs="tm.train">Train categorizer</callout>  
    </calloutlist>*/
    //<end id="maxent.examples.train.setup"/>
}

From source file:com.tamingtext.classifier.maxent.TestMaxent.java

/**
 * Loads a document categorization model from {@code modelFile}, builds a
 * categorizer with the name-finder and bag-of-words feature generators, and
 * runs the test over {@code inputFiles}, collecting results in a
 * {@code ResultAnalyzer}.
 *
 * @param inputFiles files handed to {@code runTest} for evaluation
 * @param modelFile  file containing the serialized model
 * @throws FileNotFoundException if {@code modelFile} cannot be opened
 * @throws IOException           if reading the model or running the test fails
 */
private static void execute(File[] inputFiles, File modelFile) throws IOException, FileNotFoundException {
    //<start id="maxent.examples.test.setup"/> 
    NameFinderFeatureGenerator nffg //<co id="tmx.feature"/>
            = new NameFinderFeatureGenerator();
    BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator();

    DoccatModel model;
    InputStream modelStream = //<co id="tmx.modelreader"/>
            new FileInputStream(modelFile);
    try {
        model = new DoccatModel(modelStream);
    } finally {
        // The stream is only needed while the model is being constructed;
        // close it on success and on failure so the file handle is not leaked.
        modelStream.close();
    }
    DocumentCategorizer categorizer //<co id="tmx.categorizer"/>
            = new DocumentCategorizerME(model, nffg, bowfg);
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

    // Gather every category name known to the categorizer for the analyzer.
    int catCount = categorizer.getNumberOfCategories();
    Collection<String> categories = new ArrayList<String>(catCount);
    for (int i = 0; i < catCount; i++) {
        categories.add(categorizer.getCategory(i));
    }
    ResultAnalyzer resultAnalyzer = //<co id="tmx.results"/>
            new ResultAnalyzer(categories, "unknown");
    runTest(inputFiles, categorizer, tokenizer, resultAnalyzer); //<co id="tmx.run"/>
    /*<calloutlist>
    <callout arearefs="tmx.feature">Setup Feature Generators</callout>
    <callout arearefs="tmx.modelreader">Load Model</callout>
    <callout arearefs="tmx.categorizer">Create Categorizer</callout>
    <callout arearefs="tmx.results">Prepare Result Analyzer</callout>
    <callout arearefs="tmx.run">Execute Test</callout>
    </calloutlist>*/
    //<end id="maxent.examples.test.setup"/>
}