com.tamingtext.classifier.bayes.ClassifyDocument.java Source code

Java tutorial

Introduction

Here is the source code for com.tamingtext.classifier.bayes.ClassifyDocument.java

Source

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.classifier.bayes;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.mahout.classifier.ClassifierResult;

import org.apache.mahout.classifier.bayes.Algorithm;
import org.apache.mahout.classifier.bayes.BayesAlgorithm;
import org.apache.mahout.classifier.bayes.BayesParameters;
import org.apache.mahout.classifier.bayes.ClassifierContext;
import org.apache.mahout.classifier.bayes.Datastore;
import org.apache.mahout.classifier.bayes.InMemoryBayesDatastore;
import org.apache.mahout.classifier.bayes.InvalidDatastoreException;
import org.apache.mahout.common.CommandLineUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Simple utility to demonstrate classifying a document using the Mahout Bayes classifier. Uses the
 * Lucene StandardAnalyzer for tokenization.
 *
 * <p>Required options are {@code --input} (the file to classify) and {@code --model} (the directory
 * holding the trained model); {@code --help} prints the option summary.
 */
public class ClassifyDocument {

    private static final Logger log = LoggerFactory.getLogger(ClassifyDocument.class);

    /**
     * Parses the command line, loads the Bayes model into memory, tokenizes the input file and writes
     * one {@code label<TAB>score} line per classifier result to standard error.
     *
     * <p>Option-parsing, I/O and datastore failures are logged rather than propagated.
     *
     * @param args command-line arguments
     * @throws IllegalArgumentException if the input path is not a file or the model path is not a
     *         directory
     */
    public static void main(String[] args) {
        log.info("Command-line arguments: {}", Arrays.toString(args));

        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option inputOpt = obuilder.withLongName("input").withRequired(true)
                .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
                .withDescription("Input file").withShortName("i").create();

        Option modelOpt = obuilder.withLongName("model").withRequired(true)
                .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
                .withDescription("Model to use when classifying data").withShortName("m").create();

        Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
                .create();

        Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt)
                .create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            CommandLine cmdLine = parser.parse(args);

            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }

            File inputFile = new File(cmdLine.getValue(inputOpt).toString());

            if (!inputFile.isFile()) {
                throw new IllegalArgumentException(inputFile + " does not exist or is not a file");
            }

            File modelDir = new File(cmdLine.getValue(modelOpt).toString());

            if (!modelDir.isDirectory()) {
                throw new IllegalArgumentException(modelDir + " does not exist or is not a directory");
            }

            // Load the trained model from disk into an in-memory datastore.
            BayesParameters p = new BayesParameters();
            p.set("basePath", modelDir.getCanonicalPath());
            Datastore ds = new InMemoryBayesDatastore(p);
            Algorithm a = new BayesAlgorithm();
            ClassifierContext ctx = new ClassifierContext(a, ds);
            ctx.initialize();

            String[] document = tokenize(inputFile);

            // "unknown" and 5 are passed through to ClassifierContext#classifyDocument
            // (presumably the fallback category and the number of results requested).
            ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5);

            for (ClassifierResult r : cr) {
                System.err.println(r.getLabel() + "\t" + r.getScore());
            }
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
        } catch (IOException e) {
            log.error("IOException", e);
        } catch (InvalidDatastoreException e) {
            log.error("InvalidDataStoreException", e);
        }
    }

    /**
     * Reads {@code inputFile} as UTF-8 and splits it into terms with the Lucene StandardAnalyzer.
     *
     * <p>The reader, token stream and analyzer are always closed, even when tokenization fails.
     *
     * @param inputFile the file to tokenize
     * @return the terms produced by the analyzer, in stream order
     * @throws IOException if the file cannot be opened or read
     */
    private static String[] tokenize(File inputFile) throws IOException {
        ArrayList<String> tokens = new ArrayList<String>(1000);

        //TODO: make the analyzer configurable
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        try {
            InputStreamReader reader = new InputStreamReader(new FileInputStream(inputFile), "UTF-8");
            try {
                TokenStream ts = analyzer.tokenStream(null, reader);
                try {
                    // Look the attribute up once; the same instance is updated on every incrementToken().
                    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                    // Follow the TokenStream consumer workflow: reset, consume, end, close.
                    ts.reset();
                    while (ts.incrementToken()) {
                        tokens.add(term.toString());
                    }
                    ts.end();
                } finally {
                    ts.close();
                }
            } finally {
                // Close explicitly so the file handle is released even if the stream was never created.
                reader.close();
            }
        } finally {
            analyzer.close();
        }

        return tokens.toArray(new String[tokens.size()]);
    }
}