List of usage examples for opennlp.tools.tokenize.SimpleTokenizer.INSTANCE
SimpleTokenizer INSTANCE
To view the source code for opennlp.tools.tokenize.SimpleTokenizer.INSTANCE, click the Source Link.
From source file:com.tamingtext.classifier.maxent.TrainMaxent.java
/**
 * Creates a trainer configured with the given tokenizer.
 *
 * @param tokenizer the tokenizer to store on this instance; when {@code null},
 *                  {@link SimpleTokenizer#INSTANCE} is used instead
 */
public TrainMaxent(Tokenizer tokenizer) {
    // Previously the field was only assigned in the null case, so a
    // caller-supplied tokenizer was silently discarded.
    this.tokenizer = (tokenizer != null) ? tokenizer : SimpleTokenizer.INSTANCE;
}
From source file:com.tamingtext.classifier.maxent.TrainMaxent.java
public void train(String source, String destination) throws IOException { //<start id="maxent.examples.train.setup"/> File[] inputFiles = FileUtil.buildFileList(new File(source)); File modelFile = new File(destination); Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/> CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer); int cutoff = 5; int iterations = 100; NameFinderFeatureGenerator nffg //<co id="tm.fg"/> = new NameFinderFeatureGenerator(); BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator(); DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/> model.serialize(new FileOutputStream(modelFile)); /*<calloutlist> <callout arearefs="tm.tok">Create data stream</callout> <callout arearefs="tm.fg">Set up features generators</callout> <callout arearefs="tm.train">Train categorizer</callout> </calloutlist>*///w w w .j a v a2 s. com //<end id="maxent.examples.train.setup"/> }
From source file:com.tamingtext.classifier.maxent.TestMaxent.java
private static void execute(File[] inputFiles, File modelFile) throws IOException, FileNotFoundException { //<start id="maxent.examples.test.setup"/> NameFinderFeatureGenerator nffg //<co id="tmx.feature"/> = new NameFinderFeatureGenerator(); BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator(); InputStream modelStream = //<co id="tmx.modelreader"/> new FileInputStream(modelFile); DoccatModel model = new DoccatModel(modelStream); DocumentCategorizer categorizer //<co id="tmx.categorizer"/> = new DocumentCategorizerME(model, nffg, bowfg); Tokenizer tokenizer = SimpleTokenizer.INSTANCE; int catCount = categorizer.getNumberOfCategories(); Collection<String> categories = new ArrayList<String>(catCount); for (int i = 0; i < catCount; i++) { categories.add(categorizer.getCategory(i)); }/* w w w . j a v a 2 s.co m*/ ResultAnalyzer resultAnalyzer = //<co id="tmx.results"/> new ResultAnalyzer(categories, "unknown"); runTest(inputFiles, categorizer, tokenizer, resultAnalyzer); //<co id="tmx.run"/> /*<calloutlist> <callout arearefs="tmx.feature">Setup Feature Generators</callout> <callout arearefs="tmx.modelreader">Load Model</callout> <callout arearefs="tmx.categorizer">Create Categorizer</callout> <callout arearefs="tmx.results">Prepare Result Analyzer</callout> <callout arearefs="tmx.run">Execute Test</callout> </calloutlist>*/ //<end id="maxent.examples.test.setup"/> }
From source file:org.apache.stanbol.commons.opennlp.OpenNLP.java
/** * Getter for the Tokenizer of a given language. This first tries to * create an {@link TokenizerME} instance if the required * {@link TokenizerModel} for the parsed language is available. if such a * model is not available it returns the {@link SimpleTokenizer} instance. * @param language the language or <code>null</code> to build a * {@link SimpleTokenizer}/*from w w w.j a v a 2 s . com*/ * @return the {@link Tokenizer} for the parsed language. */ public Tokenizer getTokenizer(String language) { Tokenizer tokenizer = null; if (language != null) { try { TokenizerModel model = getTokenizerModel(language); if (model != null) { tokenizer = new TokenizerME(model); } } catch (InvalidFormatException e) { log.warn("Unable to load Tokenizer Model for " + language + ": " + "Will use Simple Tokenizer instead", e); } catch (IOException e) { log.warn("Unable to load Tokenizer Model for " + language + ": " + "Will use Simple Tokenizer instead", e); } } if (tokenizer == null) { log.debug("Use Simple Tokenizer for language {}", language); tokenizer = SimpleTokenizer.INSTANCE; } else { log.debug("Use ME Tokenizer for language {}", language); } return tokenizer; }