com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierReduce_Continuous_Features.java Source code

Java tutorial

Introduction

Here is the source code for com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierReduce_Continuous_Features.java

Source

// *********************************************************************************************************
// *********************************************************************************************************
// Reduce function of Naive Bayes Classifier with Gaussian distribution in Hadoop
// Author
//   Nikzad Babaii Rizvandi <nikzad.b(at)gmail.com>
//
// License
//   The program is free to use for non-commercial academic purposes,
//   but for course works, you must understand what is going inside to use.
//   The program can be used, modified, or re-distributed for any purposes
//   if you or one of your group understand codes. Please note that the author
//   does not guarantee the code works properly for all applications; therefore 
//   the author does not accept any responsibility on any damage/lost by using
//   this code.
//
// Changes
//   18/02/2013  First Edition
// *********************************************************************************************************
// *********************************************************************************************************

package com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.util.ArrayList;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce side of the Gaussian Naive Bayes training job.
 *
 * <p>Each reduce key is a class index. For that class the reducer merges the
 * per-feature partial statistics emitted upstream ({@code num_x_local},
 * {@code mu_x_local}, {@code sigma_x2}) into one mean and one spread value per
 * feature. Nothing is emitted through the {@code Context}; the merged statistics
 * are kept in memory and written to a fixed text file in {@link #cleanup}.
 *
 * <p>NOTE(review): {@link #cleanup} deletes and rewrites one hard-coded path, so
 * if the job runs more than one reduce task the last task to finish presumably
 * wins -- confirm the job is configured with a single reducer.
 */
public class NaiveBayesClassifierReduce_Continuous_Features
        extends Reducer<LongWritable, MapArrayWritable, LongWritable, FloatWritable> {

    /** Location of the text file that {@link #cleanup} writes. */
    private static final String MODEL_OUTPUT_URI = "/user/hduser/naive_bayes_continuous.txt";

    // Keys read from the incoming per-feature maps.
    private static final String KEY_NUM_X_LOCAL = "num_x_local";
    private static final String KEY_SIGMA_X2 = "sigma_x2";
    private static final String KEY_MU_X_LOCAL = "mu_x_local";

    // Keys under which the merged per-class statistics are stored.
    private static final String KEY_CLASS_MU = "class_id_mu";
    private static final String KEY_CLASS_STD = "class_id_std";

    private int number_of_classes, number_of_features;

    /** Slot {@code c} holds the per-feature statistics of class {@code c}, or {@code null} if never reduced. */
    private ArrayList<MapWritable[]> probablity_info_output = new ArrayList<MapWritable[]>();

    /**
     * Reads {@code number_of_classes} and {@code number_of_features} from the job
     * configuration (both default to 1) and reserves one empty slot per class.
     *
     * @param context task context supplying the job configuration
     */
    @Override
    public void setup(Context context) {
        number_of_classes = context.getConfiguration().getInt("number_of_classes", 1);
        number_of_features = context.getConfiguration().getInt("number_of_features", 1);
        for (int i = 0; i < number_of_classes; i++) {
            probablity_info_output.add(null);
        }
    }

    /**
     * Writes the collected statistics of every class to {@link #MODEL_OUTPUT_URI},
     * replacing any existing file at that path.
     *
     * <p>The write is best-effort: a failure is reported on {@code System.err} and
     * does not fail the task. The writer is always closed, including on failure.
     * A class for which {@link #reduce} was never called, or which had no samples,
     * is written with {@code null} placeholders so that every class section has
     * the same number of lines.
     *
     * @param context task context supplying the job configuration
     * @throws IOException declared by the overridden method; failures raised while
     *         writing are caught and logged here
     */
    @Override
    protected void cleanup(Context context) throws IOException {
        Path path = new Path(MODEL_OUTPUT_URI);
        Text muKey = new Text(KEY_CLASS_MU);
        Text stdKey = new Text(KEY_CLASS_STD);

        BufferedWriter writer = null;
        try {
            FileSystem fs = FileSystem.get(URI.create(MODEL_OUTPUT_URI), context.getConfiguration());
            if (fs.exists(path)) {
                fs.delete(path, true);
            }
            // Explicit charset so the output does not depend on the platform default.
            writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true), "UTF-8"));
            writer.write("class_id,     mu(mean),     std\n");
            writer.write("-------------------------------\n");
            for (int i = 0; i < number_of_classes; i++) {
                // Still null when this reducer never received a key for class i.
                MapWritable[] stats = probablity_info_output.get(i);
                writer.write("-------- Class-" + i + "-------\n");
                for (int j = 0; j < number_of_features; j++) {
                    FloatWritable mu = (stats == null) ? null : (FloatWritable) stats[j].get(muKey);
                    FloatWritable std = (stats == null) ? null : (FloatWritable) stats[j].get(stdKey);
                    writer.write(mu + ",  ");
                    writer.write(std + "\n");
                }
                writer.write("\n");
            }
        } catch (Exception e) {
            // Deliberately broad: writing the summary file is best-effort and must not
            // fail the task, but the real cause is reported instead of being hidden.
            System.err.println("Failed to write " + MODEL_OUTPUT_URI + ": " + e);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    System.err.println("Failed to close " + MODEL_OUTPUT_URI + ": " + e);
                }
            }
        }
    }

    /**
     * Merges all partial statistics of one class into a mean and a spread value
     * per feature and stores them in the slot of that class.
     *
     * <p>For feature {@code i}, with {@code N} the sum of all {@code num_x_local}:
     * <pre>
     *   class_id_mu  = sum(mu_x_local * num_x_local) / N
     *   class_id_std = sum(sigma_x2) / N - class_id_mu^2
     * </pre>
     * If there are no features, or the total count of feature 0 is zero, the maps
     * of this class are left empty.
     *
     * <p>NOTE(review): no square root is taken, so if {@code sigma_x2} is a sum of
     * squared values (presumably -- confirm against the mapper) the value stored
     * as {@code class_id_std} is a variance. It is left unchanged because the
     * consumers of the output file are not visible here.
     *
     * @param key     class index; must be a valid index into the per-class slots
     * @param values  partial statistics, one array of per-feature maps per value
     * @param context task context (unused; this reducer emits no records)
     * @throws IOException declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override // necessary otherwise it runs default reduce()
    public void reduce(LongWritable key, Iterable<MapArrayWritable> values, Context context)
            throws IOException, InterruptedException {

        int classIndex = (int) key.get();

        // Primitive arrays start at 0.0f, so no explicit initialisation pass is needed.
        float[] sumSigmaX2 = new float[number_of_features];
        float[] weightedMeanSum = new float[number_of_features]; // sum of mu(k) * N(k)
        float[] totalCount = new float[number_of_features];      // N = N1 + N2 + ... + Nk

        Text countKey = new Text(KEY_NUM_X_LOCAL);
        Text sigmaKey = new Text(KEY_SIGMA_X2);
        Text meanKey = new Text(KEY_MU_X_LOCAL);

        for (MapArrayWritable val : values) {
            for (int i = 0; i < number_of_features; i++) {
                MapWritable partial = (MapWritable) val.get()[i];
                float count = ((FloatWritable) partial.get(countKey)).get();
                sumSigmaX2[i] += ((FloatWritable) partial.get(sigmaKey)).get();
                float mean = ((FloatWritable) partial.get(meanKey)).get();

                weightedMeanSum[i] += mean * count;
                totalCount[i] += count;
            }
        }

        // Every slot needs its own map instance, even when it ends up empty.
        MapWritable[] classStats = new MapWritable[number_of_features];
        for (int i = 0; i < number_of_features; i++) {
            classStats[i] = new MapWritable();
        }

        // Short-circuit && keeps totalCount[0] from being read when there are no
        // features. The guard looks only at feature 0, as the statistics of all
        // features are presumably built from the same samples.
        boolean hasSamples = number_of_features > 0 && totalCount[0] != 0;
        if (hasSamples) {
            for (int i = 0; i < number_of_features; i++) {
                float classMean = weightedMeanSum[i] / totalCount[i];
                float classSpread = sumSigmaX2[i] / totalCount[i] - (classMean * classMean);
                classStats[i].put(new Text(KEY_CLASS_MU), new FloatWritable(classMean));
                classStats[i].put(new Text(KEY_CLASS_STD), new FloatWritable(classSpread));
            }
        }

        probablity_info_output.set(classIndex, classStats);
    }
}