com.tamingtext.classifier.maxent.TestMaxent.java Source code

Java tutorial

Introduction

Here is the source code for com.tamingtext.classifier.maxent.TestMaxent.java

Source

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.classifier.maxent;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;

import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizer;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.mahout.classifier.ClassifierResult;
import org.apache.mahout.classifier.ResultAnalyzer;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tamingtext.util.FileUtil;

public class TestMaxent {

    private static final Logger log = LoggerFactory.getLogger(TestMaxent.class);

    /**
     * @param args
     */
    public static void main(String[] args) throws IOException {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option helpOpt = DefaultOptionCreator.helpOption();

        Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
                .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
                .withDescription("The input directory").withShortName("i").create();

        Option modelOpt = obuilder.withLongName("model").withRequired(true)
                .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
                .withDescription("The directory containing the index model").withShortName("m").create();

        Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(inputDirOpt).withOption(modelOpt)
                .create();

        try {
            Parser parser = new Parser();

            parser.setGroup(group);
            parser.setHelpOption(helpOpt);
            CommandLine cmdLine = parser.parse(args);
            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }

            String inputPath = (String) cmdLine.getValue(inputDirOpt);
            File f = new File(inputPath);
            if (!f.isDirectory()) {
                throw new IllegalArgumentException(f + " is not a directory or does not exit");
            }
            File[] inputFiles = FileUtil.buildFileList(f);

            File modelDir = new File((String) cmdLine.getValue(modelOpt));
            execute(inputFiles, modelDir);
        } catch (OptionException e) {
            log.error("Error while parsing options", e);
        }

    }

    private static void execute(File[] inputFiles, File modelFile) throws IOException, FileNotFoundException {
        //<start id="maxent.examples.test.setup"/> 
        NameFinderFeatureGenerator nffg //<co id="tmx.feature"/>
                = new NameFinderFeatureGenerator();
        BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator();

        InputStream modelStream = //<co id="tmx.modelreader"/>
                new FileInputStream(modelFile);
        DoccatModel model = new DoccatModel(modelStream);
        DocumentCategorizer categorizer //<co id="tmx.categorizer"/>
                = new DocumentCategorizerME(model, nffg, bowfg);
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

        int catCount = categorizer.getNumberOfCategories();
        Collection<String> categories = new ArrayList<String>(catCount);
        for (int i = 0; i < catCount; i++) {
            categories.add(categorizer.getCategory(i));
        }
        ResultAnalyzer resultAnalyzer = //<co id="tmx.results"/>
                new ResultAnalyzer(categories, "unknown");
        runTest(inputFiles, categorizer, tokenizer, resultAnalyzer); //<co id="tmx.run"/>
        /*<calloutlist>
        <callout arearefs="tmx.feature">Setup Feature Generators</callout>
        <callout arearefs="tmx.modelreader">Load Model</callout>
        <callout arearefs="tmx.categorizer">Create Categorizer</callout>
        <callout arearefs="tmx.results">Prepare Result Analyzer</callout>
        <callout arearefs="tmx.run">Execute Test</callout>
        </calloutlist>*/
        //<end id="maxent.examples.test.setup"/>
    }

    private static void runTest(File[] inputFiles, DocumentCategorizer categorizer, Tokenizer tokenizer,
            ResultAnalyzer resultAnalyzer) throws FileNotFoundException, IOException {
        String line;
        //<start id="maxent.examples.test.execute"/>
        for (File ff : inputFiles) {
            BufferedReader in = new BufferedReader(new FileReader(ff));
            while ((line = in.readLine()) != null) {
                String[] parts = line.split("\t");
                if (parts.length != 2)
                    continue;

                String docText = parts[1]; //<co id="tmt.preprocess"/>
                String[] tokens = tokenizer.tokenize(docText);

                double[] probs = categorizer.categorize(tokens); //<co id="tmt.categorize"/>
                String label = categorizer.getBestCategory(probs);
                int bestIndex = categorizer.getIndex(label);
                double score = probs[bestIndex];

                ClassifierResult result //<co id="tmt.collect"/>
                        = new ClassifierResult(label, score);
                resultAnalyzer.addInstance(parts[0], result);
            }
            in.close();
        }

        System.err.println(resultAnalyzer.toString()); //<co id="tmt.summarize"/>
        /*<calloutlist>
         * <callout arearefs="tmt.preprocess">Preprocess text</callout>
         * <callout arearefs="tmt.categorize">Categorize</callout>
         * <callout arearefs="tmt.collect">Analyze Results</callout>
         * <callout arearefs="tmt.summarize">Present Results</callout>
         * </calloutlist>*/
        //<end id="maxent.examples.test.execute"/>
    }
}