implements a simple text classifier in Java using WEKA FilteredClassifier - Java Machine Learning AI

Java examples for Machine Learning AI:weka

Description

implements a simple text classifier in Java using WEKA FilteredClassifier

Demo Code

/**
 * A Java class that implements a simple language identifier, based on WEKA.
 * It requires a serialized model of the type FilteredClassifier.
 * WEKA is available at: http://www.cs.waikato.ac.nz/ml/weka/
 * Copyright (C) 2013 Jose Maria Gomez Hidalgo - http://www.esp.uem.es/jmgomez
 *
 * This program is free software: you can redistribute it and/or modify
 * it for any purpose.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

import weka.core.*;
import weka.classifiers.meta.FilteredClassifier;
import java.util.List;
import java.util.ArrayList;
import java.io.*;

/**
 * This class implements a simple text classifier in Java using WEKA.
 * It loads a file with the text to classify, and the model that has been
 * learnt with the learn.sh script.
 *
 * <p>Typical usage is the sequence {@link #load(String)},
 * {@link #loadModel(String)}, {@link #makeInstance()} and
 * {@link #classify()}, as shown in {@link #main(String[])}. Every step
 * reports its outcome on standard output; failure causes are additionally
 * written to standard error.
 *
 * @author Jose Maria Gomez Hidalgo - http://www.esp.uem.es/jmgomez
 * @see MyFilteredLearner
 */
public class LanguageIdentifier {

    /**
     * Text whose language is to be guessed; {@code null} until
     * {@link #load(String)} succeeds.
     */
    String text;
    /**
     * Single-instance dataset wrapping {@link #text}; {@code null} until
     * {@link #makeInstance()} succeeds.
     */
    Instances instances;
    /**
     * Deserialized classifier; {@code null} until
     * {@link #loadModel(String)} succeeds.
     */
    FilteredClassifier classifier;

    /**
     * Loads the text to be classified.
     *
     * <p>Every line of the file is appended to the text preceded by a single
     * space, so line breaks become spaces and the resulting text starts with
     * a space. On failure a message is printed and the previously loaded
     * text, if any, is left untouched.
     *
     * <p>Note: {@link FileReader} decodes the file using the JVM's default
     * charset.
     *
     * @param fileName The name of the file that stores the text.
     */
    public void load(String fileName) {
        // StringBuilder avoids re-copying the whole text for every line read.
        StringBuilder buffer = new StringBuilder();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                buffer.append(' ').append(line);
            }
        } catch (IOException e) {
            reportProblem("Problem found when reading: " + fileName, e.toString());
            return;
        }
        text = buffer.toString();
        System.out.println("===== Loaded text data: " + fileName + " =====");
        System.out.println(text);
    }

    /**
     * Loads the serialized model to be used as classifier.
     *
     * <p>The file must contain a Java-serialized {@link FilteredClassifier}.
     * On failure a message is printed and the previously loaded model, if
     * any, is left untouched.
     *
     * <p>SECURITY NOTE: this uses Java native deserialization, which can
     * execute code embedded in the stream. Only load model files that come
     * from a trusted source.
     *
     * @param fileName The name of the file that stores the serialized model.
     */
    public void loadModel(String fileName) {
        Object loaded;
        // try-with-resources closes the stream even when readObject() fails.
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(fileName))) {
            loaded = in.readObject();
        } catch (IOException | ClassNotFoundException | RuntimeException e) {
            // RuntimeException is kept in the list so that a corrupt stream is
            // still reported as a read problem instead of aborting the program.
            reportProblem("Problem found when reading: " + fileName, e.toString());
            return;
        }
        // Explicit type check instead of relying on a ClassCastException.
        if (!(loaded instanceof FilteredClassifier)) {
            reportProblem("Problem found when reading: " + fileName,
                    "file does not contain a FilteredClassifier");
            return;
        }
        classifier = (FilteredClassifier) loaded;
        System.out.println("===== Loaded model: " + fileName + " =====");
    }

    /**
     * Creates the instance to be classified, from the text that has been read.
     *
     * <p>Builds a fresh dataset with two attributes (the nominal class
     * {@code language_class} at index 0 and the string attribute
     * {@code text}) holding exactly one instance whose class value is
     * missing. If no text has been loaded, a message is printed and nothing
     * is built.
     */
    public void makeInstance() {
        if (text == null) {
            reportProblem("Problem found when creating the instance",
                    "no text has been loaded");
            return;
        }

        // First attribute: the class. Presumably these labels (and their
        // order) must match the ones the model was trained with -- confirm
        // against the training data.
        List<String> classValues = new ArrayList<>(3);
        classValues.add("EN");
        classValues.add("FR");
        classValues.add("SP");
        Attribute classAttribute = new Attribute("language_class", classValues);

        // Second attribute: the text. A null value list makes it a string attribute.
        Attribute textAttribute = new Attribute("text", (List<String>) null);

        // Instances requires a concrete ArrayList<Attribute>, hence no List here.
        ArrayList<Attribute> attributes = new ArrayList<>(2);
        attributes.add(classAttribute);
        attributes.add(textAttribute);

        // Build instance set with room for just one instance; class is attribute 0.
        instances = new Instances("Test relation", attributes, 1);
        instances.setClassIndex(0);

        // The instance needs its dataset before a string value can be stored.
        DenseInstance instance = new DenseInstance(2);
        instance.setDataset(instances);
        instance.setValue(textAttribute, text);
        instances.add(instance);

        System.out.println("===== Instance created with reference dataset =====");
        System.out.println(instances);
    }

    /**
     * Performs the classification of the instance.
     * Output is done at the command-line.
     *
     * <p>Requires a successful {@link #loadModel(String)} and
     * {@link #makeInstance()}; otherwise a problem message is printed.
     */
    public void classify() {
        if (classifier == null || instances == null || instances.numInstances() == 0) {
            reportProblem("Problem found when classifying the text",
                    "model or instance not available");
            return;
        }
        try {
            double pred = classifier.classifyInstance(instances.instance(0));
            System.out.println("===== Classified instance =====");
            // The prediction is the index of the label within the class attribute.
            System.out.println("Class predicted: "
                    + instances.classAttribute().value((int) pred));
        } catch (Exception e) {
            // classifyInstance is declared to throw Exception, so the broad catch is required.
            reportProblem("Problem found when classifying the text", e.toString());
        }
    }

    /**
     * Prints a problem message on standard output and its cause on standard error.
     *
     * @param message User-facing message, printed on standard output.
     * @param detail Description of the cause, printed on standard error.
     */
    private static void reportProblem(String message, String detail) {
        System.out.println(message);
        System.err.println("  Cause: " + detail);
    }

    /**
     * Main method. It is an example of the usage of this class.
     * @param args Command-line arguments: fileData and fileModel.
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.out.println("Usage: java LanguageIdentifier <fileData> <fileModel>");
            return;
        }
        LanguageIdentifier identifier = new LanguageIdentifier();
        identifier.load(args[0]);
        identifier.loadModel(args[1]);
        identifier.makeInstance();
        identifier.classify();
    }
}

Related Tutorials