// Java tutorial: Weka text-directory classification example.
package org.ml.classifier;

/*
 * TextDirectoryToArff.java
 * Copyright (C) 2002 Richard Kirkby
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import java.io.File;
import java.util.Vector;

import org.apache.log4j.Logger;

import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayesMultinomial;
import weka.classifiers.meta.FilteredClassifier;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.TextDirectoryLoader;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Standardize;
import weka.filters.unsupervised.attribute.StringToWordVector;
import weka.filters.unsupervised.instance.SparseToNonSparse;

/**
 * Loads a directory tree of text documents into a Weka dataset and
 * classifies a fixed sample of test instances with a previously trained,
 * serialized {@link FilteredClassifier}.
 *
 * <p>Each immediate sub-directory of the data directory is treated as a
 * class label; every file inside it becomes one string instance.
 *
 * @author Richard Kirkby (rkirkby at cs.waikato.ac.nz)
 * @version 1.0
 */
public class TextDirectoryToArff {

    private static final Logger LOGGER = Logger.getLogger(TextDirectoryToArff.class);

    /** Root of the local analytics test-data checkout. */
    static final String rootDir = "c:/work/analytics-testdata";
    /** Serialized classifier model, produced by an offline training run. */
    static final String MODEL = rootDir + "/weka/testmodel.model";
    /** 20-newsgroups training documents (one sub-directory per class). */
    static final String TRAINING_FILES = rootDir + "/20news-bydate-train";
    /** 20-newsgroups test documents (one sub-directory per class). */
    static final String TESTING_FILES = rootDir + "/20news-bydate-test";

    /**
     * Converts a directory of text documents into a Weka dataset using
     * {@link TextDirectoryLoader}.
     *
     * @param directoryPath root directory holding one sub-directory per class
     * @return the loaded dataset; each instance carries the document text and
     *         its class label (file names are not included)
     * @throws Exception if the loader cannot read the directory
     */
    public Instances createDataset(String directoryPath) throws Exception {
        TextDirectoryLoader loader = new TextDirectoryLoader();
        loader.setDirectory(new File(directoryPath));
        // Keep only class + contents; do not add a filename attribute.
        loader.setOutputFilename(false);
        return loader.getDataSet();
    }

    /**
     * Loads the serialized model, classifies a fixed sample of test
     * instances, and prints the actual label, the predicted label, and the
     * per-instance classification time.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        TextDirectoryToArff tdta = new TextDirectoryToArff();
        try {
            Instances testData = tdta.createDataset(TESTING_FILES);

            // NOTE(review): the model at MODEL was trained offline — a
            // FilteredClassifier wrapping a StringToWordVector filter and a
            // NaiveBayesMultinomial base classifier, built on TRAINING_FILES
            // and written with SerializationHelper.write(MODEL, fcl).
            FilteredClassifier fcl = (FilteredClassifier) SerializationHelper.read(MODEL);

            // Arbitrary sample of test-set instance indices to classify.
            int[] sampleIndices = { 5, 7, 9, 100, 345, 1000, 1500, 7500 };
            for (int idx : sampleIndices) {
                // Guard against a test set smaller than the hard-coded indices.
                if (idx >= testData.numInstances()) {
                    LOGGER.warn("Skipping index " + idx + ": test set has only "
                            + testData.numInstances() + " instances");
                    continue;
                }
                System.out.println("Actual: "
                        + testData.instance(idx).stringValue(testData.classIndex()));
                long start = System.currentTimeMillis();
                double prediction = fcl.classifyInstance(testData.instance(idx));
                long end = System.currentTimeMillis();
                // classifyInstance returns a class index; print the label.
                System.out.println("Predicted: "
                        + testData.classAttribute().value((int) prediction));
                System.out.println("\n Time: " + (end - start) + " ms");
            }
        } catch (Exception e) {
            // Log the full stack trace, not just the (possibly null) message.
            LOGGER.error("Classification run failed", e);
        }
    }
}