List of usage examples for the opennlp.tools.doccat.BagOfWordsFeatureGenerator constructor BagOfWordsFeatureGenerator()
public BagOfWordsFeatureGenerator()
From source file: com.tamingtext.classifier.maxent.TrainMaxent.java
public void train(String source, String destination) throws IOException { //<start id="maxent.examples.train.setup"/> File[] inputFiles = FileUtil.buildFileList(new File(source)); File modelFile = new File(destination); Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/> CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer); int cutoff = 5; int iterations = 100; NameFinderFeatureGenerator nffg //<co id="tm.fg"/> = new NameFinderFeatureGenerator(); BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator(); DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/> model.serialize(new FileOutputStream(modelFile)); /*<calloutlist> <callout arearefs="tm.tok">Create data stream</callout> <callout arearefs="tm.fg">Set up features generators</callout> <callout arearefs="tm.train">Train categorizer</callout> </calloutlist>*//*from w w w.ja v a2 s. co m*/ //<end id="maxent.examples.train.setup"/> }
From source file: com.tamingtext.classifier.maxent.TestMaxent.java
private static void execute(File[] inputFiles, File modelFile) throws IOException, FileNotFoundException { //<start id="maxent.examples.test.setup"/> NameFinderFeatureGenerator nffg //<co id="tmx.feature"/> = new NameFinderFeatureGenerator(); BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator(); InputStream modelStream = //<co id="tmx.modelreader"/> new FileInputStream(modelFile); DoccatModel model = new DoccatModel(modelStream); DocumentCategorizer categorizer //<co id="tmx.categorizer"/> = new DocumentCategorizerME(model, nffg, bowfg); Tokenizer tokenizer = SimpleTokenizer.INSTANCE; int catCount = categorizer.getNumberOfCategories(); Collection<String> categories = new ArrayList<String>(catCount); for (int i = 0; i < catCount; i++) { categories.add(categorizer.getCategory(i)); }// w w w .jav a 2s . c o m ResultAnalyzer resultAnalyzer = //<co id="tmx.results"/> new ResultAnalyzer(categories, "unknown"); runTest(inputFiles, categorizer, tokenizer, resultAnalyzer); //<co id="tmx.run"/> /*<calloutlist> <callout arearefs="tmx.feature">Setup Feature Generators</callout> <callout arearefs="tmx.modelreader">Load Model</callout> <callout arearefs="tmx.categorizer">Create Categorizer</callout> <callout arearefs="tmx.results">Prepare Result Analyzer</callout> <callout arearefs="tmx.run">Execute Test</callout> </calloutlist>*/ //<end id="maxent.examples.test.setup"/> }