DAAL.Kmeans.java Source code


Introduction

Here is the source code for DAAL.Kmeans.java, the Hadoop MapReduce driver for Intel DAAL's distributed K-Means clustering sample. The driver submits an initialization job that produces the starting centroids, then runs a fixed number of compute iterations, each as a separate MapReduce job.

Source

/* file: Kmeans.java */
/*
 //  Copyright(C) 2014-2015 Intel Corporation. All Rights Reserved.
 //
 //  The source code, information  and  material ("Material") contained herein is
 //  owned  by Intel Corporation or its suppliers or licensors, and title to such
 //  Material remains  with Intel Corporation  or its suppliers or licensors. The
 //  Material  contains proprietary information  of  Intel or  its  suppliers and
 //  licensors. The  Material is protected by worldwide copyright laws and treaty
 //  provisions. No  part  of  the  Material  may  be  used,  copied, reproduced,
 //  modified, published, uploaded, posted, transmitted, distributed or disclosed
 //  in any way  without Intel's  prior  express written  permission. No  license
 //  under  any patent, copyright  or  other intellectual property rights  in the
 //  Material  is  granted  to  or  conferred  upon  you,  either  expressly,  by
 //  implication, inducement,  estoppel or  otherwise.  Any  license  under  such
 //  intellectual  property  rights must  be express  and  approved  by  Intel in
 //  writing.
 //
 //  *Third Party trademarks are the property of their respective owners.
 //
 //  Unless otherwise  agreed  by Intel  in writing, you may not remove  or alter
 //  this  notice or  any other notice embedded  in Materials by Intel or Intel's
 //  suppliers or licensors in any way.
 //
 ////////////////////////////////////////////////////////////////////////////////
 //  Content:
 //     Java sample of K-Means clustering in the distributed processing mode
 ////////////////////////////////////////////////////////////////////////////////
 */

package DAAL;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import java.net.URI;

import com.intel.daal.data_management.data.*;
import com.intel.daal.data_management.data_source.*;
import com.intel.daal.services.*;

/* Implements Tool so that generic options such as -libjars can be passed at job submission */
public class Kmeans extends Configured implements Tool {

    /* Number of K-Means iterations; each iteration runs as a separate MapReduce job */
    private static final int nIterations = 5;

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Kmeans(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();

        /* Put the DAAL native library and its threading runtimes (TBB, Intel OpenMP) into the
           distributed cache; the "#name" URI fragments create symlinks in each task's working directory */
        DistributedCache.createSymlink(conf);
        DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libJavaAPI.so#libJavaAPI.so"), conf);
        DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libtbb.so.2#libtbb.so.2"), conf);
        DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libiomp5.so#libiomp5.so"), conf);

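        /* Initialization job: run the distributed K-Means initialization step
           (KmeansInitStep1Mapper / KmeansInitStep2Reducer) to produce the starting
           centroids and store them under /Hadoop/Kmeans/initResults */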
        Job initJob = new Job(conf, "K-means Init Job");

        FileInputFormat.setInputPaths(initJob, new Path(args[0]));
        FileOutputFormat.setOutputPath(initJob, new Path("/Hadoop/Kmeans/initResults"));

        initJob.setMapperClass(KmeansInitStep1Mapper.class);
        initJob.setReducerClass(KmeansInitStep2Reducer.class);
        initJob.setInputFormatClass(TextInputFormat.class);
        initJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        initJob.setOutputKeyClass(IntWritable.class);
        initJob.setOutputValueClass(WriteableData.class);
        initJob.setJarByClass(Kmeans.class);
        if (!initJob.waitForCompletion(true)) {
            return 1;
        }

        /* Exit code returned to ToolRunner: 0 if the last compute job succeeded */
        int exitCode = 1;

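        /* Compute phase: each iteration runs a separate MapReduce job over the input
           data (args[0]); intermediate centroids are written to /Hadoop/Kmeans/ResultsIter<i>,
           and the final iteration writes its results to the user-supplied output path (args[1]) */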
        for (int i = 0; i < nIterations; i++) {
            Job job = new Job(conf, "K-means compute Job");
            FileInputFormat.setInputPaths(job, new Path(args[0]));

            if (i == nIterations - 1) {
                FileOutputFormat.setOutputPath(job, new Path(args[1]));
            } else {
                FileOutputFormat.setOutputPath(job, new Path("/Hadoop/Kmeans/ResultsIter" + i));
            }
            job.setMapperClass(KmeansStep1Mapper.class);
            job.setReducerClass(KmeansStep2Reducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            job.setOutputKeyClass(IntWritable.class);
            job.setOutputValueClass(WriteableData.class);
            job.setJarByClass(Kmeans.class);

            if (job.waitForCompletion(true)) {
                exitCode = 0;
            } else {
                exitCode = 1;
            }
        }

        return exitCode;
    }
}
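
Usage

Because the driver goes through ToolRunner, the DAAL Java API jar can be supplied with the generic -libjars option when the job is submitted. Assuming the compiled classes are packaged into Kmeans.jar and the DAAL jar is named daal.jar (both names, and the input/output paths below, are illustrative rather than part of the sample), the job could be launched with something like:

hadoop jar Kmeans.jar DAAL.Kmeans -libjars daal.jar /Hadoop/Kmeans/input /Hadoop/Kmeans/output

The first argument is the HDFS input directory (args[0]) and the second is the final output directory (args[1]). The native libraries referenced in the distributed-cache calls (libJavaAPI.so, libtbb.so.2, libiomp5.so) are expected to already be present under /Hadoop/Libraries in HDFS.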