org.ml.classifier.TextDirectoryToArff.java Source code

Java tutorial

Introduction

Here is the source code for org.ml.classifier.TextDirectoryToArff.java

Source

package org.ml.classifier;
/*
 *    TextDirectoryToArff.java
 *    Copyright (C) 2002 Richard Kirkby
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import java.io.File;
import java.util.Vector;

import org.apache.log4j.Logger;

import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayesMultinomial;
import weka.classifiers.meta.FilteredClassifier;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.TextDirectoryLoader;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Standardize;
import weka.filters.unsupervised.attribute.StringToWordVector;
import weka.filters.unsupervised.instance.SparseToNonSparse;

/**
 * Loads a directory of text documents (one sub-directory per class label) into a
 * Weka {@link Instances} dataset via {@link TextDirectoryLoader}, then runs a
 * previously trained and serialized {@link FilteredClassifier} against a handful
 * of test instances, printing the actual class, the predicted class index, and
 * the per-instance prediction time.
 *
 * <p>Usage: {@code TextDirectoryToArff} (all paths are taken from the constants
 * below; adjust them for your environment).
 *
 * @author Richard Kirkby (rkirkby at cs.waikato.ac.nz)
 * @version 1.0
 */
public class TextDirectoryToArff {

    private static final Logger LOGGER = Logger.getLogger(TextDirectoryToArff.class);

    // Hard-coded data locations — edit these for your environment.
    static final String rootDir = "c:/work/analytics-testdata";
    static final String MODEL = rootDir + "/weka/testmodel.model";
    static final String TRAINING_FILES = rootDir + "/20news-bydate-train";
    static final String TESTING_FILES = rootDir + "/20news-bydate-test";

    /**
     * Converts the documents under {@code directoryPath} into a Weka dataset.
     * Each immediate sub-directory is treated as a class label and each file in
     * it as one instance whose string attribute holds the file contents.
     *
     * @param directoryPath root directory of the text corpus
     * @return the loaded dataset (class attribute set by the loader)
     * @throws Exception if the directory cannot be read by the loader
     */
    public Instances createDataset(String directoryPath) throws Exception {
        TextDirectoryLoader loader = new TextDirectoryLoader();
        loader.setDirectory(new File(directoryPath));
        // Keep only the contents + class attributes; drop the filename attribute.
        loader.setOutputFilename(false);
        return loader.getDataSet();
    }

    /**
     * Loads the test corpus, deserializes the trained model from {@link #MODEL}
     * (a FilteredClassifier — StringToWordVector + NaiveBayesMultinomial —
     * produced by an earlier training run), and spot-checks a few instances.
     *
     * @param args unused; paths come from the class constants
     */
    public static void main(String[] args) {
        TextDirectoryToArff tdta = new TextDirectoryToArff();
        try {
            Instances testData = tdta.createDataset(TESTING_FILES);

            // Read the previously trained model from disk.
            FilteredClassifier fcl = (FilteredClassifier) SerializationHelper.read(MODEL);

            // Spot-check a handful of test instances and time each prediction.
            int[] sampleIndices = { 5, 7, 9, 100, 345, 1000, 1500, 7500 };

            for (int idx : sampleIndices) {
                // Guard against indices beyond the loaded test set — the list is
                // hard-coded and the corpus size is not.
                if (idx >= testData.numInstances()) {
                    LOGGER.warn("Skipping index " + idx + ": test set has only "
                            + testData.numInstances() + " instances");
                    continue;
                }
                System.out.println("Actual: "
                        + testData.instance(idx).stringValue(testData.classIndex()));
                // Time only the classification call itself, not the printing.
                long start = System.currentTimeMillis();
                double predicted = fcl.classifyInstance(testData.instance(idx));
                long end = System.currentTimeMillis();
                System.out.println(predicted);
                System.out.println("\n Time: " + (end - start) + " ms");
            }

        } catch (Exception e) {
            // Log message AND stack trace; e.getMessage() alone may be null and
            // loses the trace, and printStackTrace() bypasses the logger.
            LOGGER.error("Classification run failed", e);
        }
    }
}