com.ML_Hadoop.K_meansClustering.K_meansClusteringMapReduce.java Source code

Java tutorial

Introduction

Here is the source code for com.ML_Hadoop.K_meansClustering.K_meansClusteringMapReduce.java

Source

// *********************************************************************************************************
// *********************************************************************************************************
// K-means clustering in Hadoop (many mappers-one reducer)
// 
// Running
//     "bin/hadoop jar K_meansClusteringMapReduce.jar com.ML_Hadoop.K_meansClustering.K_meansClusteringMapReduce ...
//                 <input_data_path> <output_path> <# of clusters, by default=2> <# of iteration, by default=30> ...
//                 <# of features, by default=2>"
//
//     To stop confusion, it is recommended to remove "/user/hduser/K-means" and "/user/hduser/k_mean.txt" before starting a new job by 
//     "bin/hadoop dfs -rmr /user/hduser/K-means"
//
// Inputs 
//     <input_data_path>  input text files/directory. Each line in the text file is as follows 
//                        (a sample: http://cs.joensuu.fi/sipu/datasets/pathbased.txt  - note that the third column is labels):
//                        [yreal <TAB> feature-1 <TAB> feature-2 <TAB> ...<TAB> feature-N]
//                          151      0.038   0.05   ...      0.061 
//     # of clusters      The number of clusters/groups/labels in final clustering. By default its value is 2.
//     # of iteration     The number of iterations. Its value is 30 by default.
//     # of features      The number of features in input data. feature_size = (number of items in a line of input text file).
//
// Outputs 
//     "/user/hduser/K-means"     Keeps the history of values of clusters' centroids for each iteration.
//
//     "/user/hduser/k_mean.txt"  Keeps the latest update of centroids. 
//
// Examples
//     "bin/hadoop jar K_meansClusteringMapReduce.jar com.ML_Hadoop.K_meansClustering.K_meansClusteringMapReduce ...
//                 /user/hduser/pathbased.txt /user/hduser/output-KMEANS 3 4 2"
// Author
//   Nikzad Babaii Rizvandi <nikzad.b(at)gmail.com>
//
// License
//   The program is free to use for non-commercial academic purposes,
//   but for course works, you must understand what is going inside to use.
//   The program can be used, modified, or re-distributed for any purposes
//   if you or one of your group understand codes. Please note that the author
//   does not guarantee the code works properly for all applications; therefore 
//   the author does not accept any responsibility on any damage/lost by using
//   this code.
//
// Changes
//   11/02/2013  First Edition
// *********************************************************************************************************
// *********************************************************************************************************

package com.ML_Hadoop.K_meansClustering;

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class K_meansClusteringMapReduce {

    public static void main(String[] args) throws Exception {
        int iteration = 0, num_of_iteration = 30;
        int feature_size = 2;
        FileSystem fs;
        int number_of_clusters = 2;

        do {
            Configuration conf = new Configuration();
            fs = FileSystem.get(conf);

            Job job = new Job(conf, "K_meansClusteringMapReduce");
            job.setJarByClass(K_meansClusteringMapReduce.class);

            conf = job.getConfiguration(); // This line is mandatory. 

            job.setOutputKeyClass(LongWritable.class);
            job.setOutputValueClass(FloatArrayWritable.class);

            job.setMapperClass(K_meansClusteringMap.class);
            job.setReducerClass(K_meansClusteringReduce.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            job.setNumReduceTasks(1); // set number of reducers to one.

            FileInputFormat.addInputPath(job, new Path(args[0]));
            Path out = new Path(args[1]);
            if (fs.exists(out))
                fs.delete(out, true);

            FileOutputFormat.setOutputPath(job, out);
            number_of_clusters = Integer.parseInt(args[2]);
            num_of_iteration = Integer.parseInt(args[3]);
            feature_size = Integer.parseInt(args[4]);

            conf.setInt("number_of_clusters", number_of_clusters);
            conf.setInt("feature_size", feature_size);
            conf.setInt("current_iteration_num", iteration);

            try {
                job.waitForCompletion(true);
                iteration++;
            } catch (IOException e) {
                e.printStackTrace();
            }
        } while (iteration < num_of_iteration);

    }
}