com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features.java Source code

Java tutorial

Introduction

Here is the source code for com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features.java

Source

// *********************************************************************************************************
// *********************************************************************************************************
// Naive Bayes Classifier for continuous value features in Hadoop (many mappers-one reducer)
// (the distribution of each features is assumed to be Gaussian. Therefore, output of reduce function will be 
//  mean and variance of each feature.)
// 
// Running
//     "bin/hadoop jar NaiveBayesClassifierMapReduce_Continuous_Features.jar ... 
//                 com.ML_Hadoop.NaiveBayesClassifierMapReduce_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features ...
//                 <input_data_path> <output_path> <# of classes, by default=1> <# of features, by default=1>"
//
//     To stop confusion, it is recommended to remove "/user/hduser/naive_bayes_continuous.txt" before starting a new job by 
//     "bin/hadoop dfs -rmr /user/hduser/naive_bayes_continuous.txt"
//
// Inputs 
//     <input_data_path>  input text files/directory. Each line in the text file is as follows 
//                        (a sample: http://code.google.com/p/ml-hadoop/downloads/detail?name=naivebayesdata.txt&can=2&q=#makechanges)
//                        [class_id, feature-1, feature-2, ...,  feature-N]
//                         1,-0.134323,-0.124812,1,-0.495327,-0.296429,-0.980676,-0.3829,-1,-1 
//     # of classes       The number of classes/groups/labels in final training data. By default its value is 1.
//     # of features      The number of features in input data. feature_size = (number of items in a line of input text file).
//
// Outputs 
//     "/user/hduser/naive_bayes_continuous.txt"     Keeps <mean, variance> of each feature, computed from training data
//
// Examples
//     "bin/hadoop jar NaiveBayesClassifierMapReduce.jar ...
//      com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features ...
//      /user/hduser/naivebayesdata.txt /user/hduser/Naive_output 8 9"
//
// Author
//   Nikzad Babaii Rizvandi <nikzad.b(at)gmail.com>
//
// License
//   The program is free to use for non-commercial academic purposes,
//   but for course works, you must understand what is going inside to use.
//   The program can be used, modified, or re-distributed for any purposes
//   if you or one of your group understand codes. Please note that the author
//   does not guarantee the code works properly for all applications; therefore 
//   the author does not accept any responsibility on any damage/lost by using
//   this code.
//
// Changes
//   18/02/2013  First Edition
// *********************************************************************************************************
// *********************************************************************************************************

package com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class NaiveBayesClassifierMapReduce_Continuous_Features {

    /**
     * @param args
     * @throws IOException 
     * @throws ClassNotFoundException 
     * @throws InterruptedException 
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        int number_of_classes = 1;
        int number_of_features = 1;
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Job job = new Job(conf, "NaiveBayesClassifierMapReduce_Continuous_Features");
        job.setJarByClass(NaiveBayesClassifierMapReduce_Continuous_Features.class);

        conf = job.getConfiguration(); // This line is mandatory. 

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatArrayWritable.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(MapArrayWritable.class);

        job.setMapperClass(NaiveBayesClassifierMap_Continuous_Features.class);
        job.setReducerClass(NaiveBayesClassifierReduce_Continuous_Features.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);
        FileOutputFormat.setOutputPath(job, out);
        number_of_classes = Integer.parseInt(args[2]);
        number_of_features = Integer.parseInt(args[3]);
        conf.setInt("number_of_classes", number_of_classes);
        conf.setInt("number_of_features", number_of_features);

        try {
            job.waitForCompletion(true);

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}