weka.distributed.hadoop.KMeansClustererHadoopJob.java Source code


Introduction

Here is the source code for weka.distributed.hadoop.KMeansClustererHadoopJob.java, a Hadoop job that builds a k-means clusterer using either random centroid or k-means|| initialization.
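
Before the full listing, here is a minimal usage sketch. It is not part of the original file: the option flags match those parsed by this class's setOptions()/listOptions() methods, but the inherited runJob() method, the getClusterer() accessor declared by ClustererProducer, and the HDFS/JobTracker connection and input/output options handled by the HadoopJob superclass are assumptions, since they do not appear in this listing.

import weka.clusterers.Clusterer;
import weka.distributed.hadoop.KMeansClustererHadoopJob;

public class KMeansJobExample {
    public static void main(String[] args) throws Exception {
        KMeansClustererHadoopJob job = new KMeansClustererHadoopJob();

        // Flags below are parsed directly by KMeansClustererHadoopJob.setOptions();
        // HDFS/JobTracker connection details and input/output paths would also be
        // needed in practice and are passed through to the superclass and sub-jobs
        // (assumed, not shown in this listing).
        job.setOptions(new String[] {
            "-num-clusters", "5",       // number of clusters to find
            "-num-runs", "3",           // parallel runs; the best one becomes the model
            "-num-iterations", "30",    // maximum k-means iterations
            "-randomize",               // shuffle the input into chunks first
            "-num-chunks", "10",
            "-model-file-name", "kmeans.model"
        });

        // runJob() is assumed to come from the HadoopJob/DistributedJob hierarchy
        if (job.runJob()) {
            Clusterer model = job.getClusterer(); // ClustererProducer accessor (assumed)
            System.out.println(model);
        }
    }
}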

Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    KMeansClustererHadoopJob
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.distributed.hadoop;

import distributed.core.DistributedJob;
import distributed.core.DistributedJobConfig;
import distributed.hadoop.AbstractHadoopJobConfig;
import distributed.hadoop.HDFSConfig;
import distributed.hadoop.HDFSUtils;
import distributed.hadoop.MapReduceJobConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import weka.clusterers.CentroidSketch;
import weka.clusterers.ClusterUtils;
import weka.clusterers.Clusterer;
import weka.clusterers.PreconstructedFilteredClusterer;
import weka.clusterers.PreconstructedKMeans;
import weka.clusterers.SimpleKMeans;
import weka.core.CommandlineRunnable;
import weka.core.Environment;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.NormalizableDistance;
import weka.core.Option;
import weka.core.SelectedTag;
import weka.core.Utils;
import weka.distributed.CSVToARFFHeaderMapTask;
import weka.distributed.CSVToARFFHeaderReduceTask;
import weka.distributed.DistributedWekaException;
import weka.distributed.KMeansMapTask;
import weka.distributed.KMeansReduceTask;
import weka.filters.Filter;
import weka.gui.beans.ClustererProducer;
import weka.gui.beans.TextProducer;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

/**
 * Hadoop job for building a k-means clusterer using either random centroid or
 * k-means|| initialization.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class KMeansClustererHadoopJob extends HadoopJob
        implements CommandlineRunnable, TextProducer, ClustererProducer {

    /**
     * For serialization
     */
    private static final long serialVersionUID = -4063045814370310397L;

    /**
     * The subdirectory of the output directory that this job saves its results to
     */
    protected static final String OUTPUT_SUBDIR = "/kmeans";

    /** Name of the output model */
    protected String m_modelName = "outputModel.model";

    /** ARFF job */
    protected ArffHeaderHadoopJob m_arffHeaderJob = new ArffHeaderHadoopJob();

    /** Randomized data chunk job */
    protected RandomizedDataChunkHadoopJob m_randomizeJob = new RandomizedDataChunkHadoopJob();

    /** Full path to the final model in HDFS */
    protected String m_hdfsPathToAggregatedClusterer = "";

    /** Number of nodes in the user's Hadoop cluster. Default = 1 */
    protected String m_numNodesAvailable = "1";

    /**
     * True if the job to create randomly shuffled (and stratified) data chunks is
     * to be run before building the model
     */
    protected boolean m_randomize;

    /**
     * Number of data chunks to create (either this or numInstancesPerDataChunk
     * should be specified)
     */
    protected String m_numDataChunks = "";

    /**
     * Number of instances per data chunk (determines how many data chunks are
     * created)
     */
    protected String m_numInstancesPerDataChunk = "";

    /** The configuration for the randomized data chunk creation phase */
    protected MapReduceJobConfig m_randomizeConfig = new MapReduceJobConfig();

    /** The final clusterer produced by this job */
    protected Clusterer m_finalClusterer;

    /** Maximum number of iterations to run */
    protected String m_numIterations = "20";

    /** Number of k-means runs to perform in parallel (default = 1) */
    protected String m_numRuns = "1";

    /** Number of clusters to find */
    protected String m_numClusters = "2";

    /**
     * The random seed to use with data randomization and k-means|| initialization
     */
    protected String m_randomSeed = "1";

    /** Number of iterations for the k-means|| initialization */
    protected String m_kMeansParallelInitSteps = "5";

    /** Close enough to have converged? */
    protected double m_convergenceTolerance = 1e-4;

    /** Options for the randomize/stratify job */
    protected String m_randomizeJobOpts = "";

    /** Options for the k-means map task */
    protected String m_kMeansMapTaskOpts = "";

    /** Options for the ARFF job */
    protected String m_wekaCsvToArffMapTaskOpts = "";

    /**
     * Holds priming data for distance function (if k-means|| initialization is
     * run)
     */
    protected Instances m_distanceFunctionPrimingData;

    /** Initialize with randomly selected centroids rather than the k-means|| routine? */
    protected boolean m_initializeWithRandomCenters;

    /**
     * Whether to display standard deviations of centroids in textual output of
     * final model
     */
    protected boolean m_displayStdDevs;

    /**
     * Constructor
     */
    public KMeansClustererHadoopJob() {
        super("KMeans clusterer builder job",
                "Build a k-means clusterer with either standard " + "initialization or k-means|| initialization");

        m_mrConfig.setMapperClass(KMeansHadoopMapper.class.getName());
        m_mrConfig.setReducerClass(KMeansHadoopReducer.class.getName());
    }

    /**
     * Help information
     * 
     * @return the help information for this job
     */
    public String globalInfo() {
        return "Learns a k-means clustering using either standard random initialization "
                + "or k-means|| initialization";
    }

    /**
     * Tip text for this property
     * 
     * @return the tip text for this property
     */
    public String convergenceToleranceTipText() {
        return "Tolerance for convergence";
    }

    /**
     * Set the convergence tolerance
     * 
     * @param tol the convergence tolerance
     */
    public void setConvergenceTolerance(double tol) {
        m_convergenceTolerance = tol;
    }

    /**
     * Get the convergence tolerance
     * 
     * @return the convergence tolerance
     */
    public double getConvergenceTolerance() {
        return m_convergenceTolerance;
    }

    /**
     * Tip text for this property
     * 
     * @return the tip text for this property
     */
    public String initWithRandomCentroidsTipText() {
        return "Initialize with randomly selected centroids rather than use the "
                + "k-means|| initialization procedure";
    }

    /**
     * Set whether to initialize with randomly selected centroids rather than
     * using the k-means|| initialization procedure.
     * 
     * @param init true if randomly selected initial centroids are to be used
     */
    public void setInitWithRandomCentroids(boolean init) {
        m_initializeWithRandomCenters = init;
    }

    /**
     * Get whether to initialize with randomly selected centroids rather than
     * using the k-means|| initialization procedure.
     * 
     * @return true if randomly selected initial centroids are to be used
     */
    public boolean getInitWithRandomCentroids() {
        return m_initializeWithRandomCenters;
    }

    /**
     * Tip text for this property
     * 
     * @return the tip text for this property
     */
    public String numNodesInClusterTipText() {
        return "The number of nodes in the Hadoop cluster - "
                + "used when determining the number of reducers to run";
    }

    /**
     * Set the number of nodes in the Hadoop cluster
     * 
     * @param n the number of nodes in the Hadoop cluster
     */
    public void setNumNodesInCluster(String n) {
        m_numNodesAvailable = n;
    }

    /**
     * Get the number of nodes in the Hadoop cluster
     * 
     * @return the number of nodes in the Hadoop cluster
     */
    public String getNumNodesInCluster() {
        return m_numNodesAvailable;
    }

    /**
     * Tip text for this property
     * 
     * @return tip text for this property
     */
    public String randomlyShuffleDataNumChunksTipText() {
        return "The number of randomly shuffled data chunks to create. Use in "
                + "conjunction with createRandomizedDataChunks";
    }

    /**
     * Set the number of randomly shuffled data chunks to create. Use in
     * conjunction with createRandomizedDataChunks.
     * 
     * @param chunks the number of chunks to create.
     */
    public void setRandomlyShuffleDataNumChunks(String chunks) {
        m_numDataChunks = chunks;
    }

    /**
     * Get the number of randomly shuffled data chunks to create. Use in
     * conjunction with createRandomizedDataChunks.
     * 
     * @return the number of chunks to create.
     */
    public String getRandomlyShuffleDataNumChunks() {
        return m_numDataChunks;
    }

    /**
     * Tip text for this property
     * 
     * @return tip text for this property
     */
    public String modelFileNameTipText() {
        return "The name only (not full path) that the model should be saved to in the output directory";
    }

    /**
     * Set the name only for the model file
     * 
     * @param m the name only (not full path) that the model should be saved to
     */
    public void setModelFileName(String m) {
        m_modelName = m;
    }

    /**
     * Get the name only for the model file
     * 
     * @return the name only (not full path) that the model should be saved to
     */
    public String getModelFileName() {
        return m_modelName;
    }

    /**
     * Tip text for this property
     * 
     * @return the tip text for this property
     */
    public String randomlyShuffleDataTipText() {
        return "Randomly shuffle the order of the input data";
    }

    /**
     * Set whether to randomly shuffle the order of the instances in the input
     * data before clustering
     * 
     * @param r true if the data should be randomly shuffled
     */
    public void setRandomlyShuffleData(boolean r) {
        m_randomize = r;
    }

    /**
     * Get whether to randomly shuffle the order of the instances in the input
     * data before clustering
     * 
     * @return true if the data should be randomly shuffled
     */
    public boolean getRandomlyShuffleData() {
        return m_randomize;
    }

    /**
     * Tip text for this property.
     * 
     * @return the tip text for this property
     */
    public String numClustersTipText() {
        return "The number of clusters to find";
    }

    /**
     * Set the number of clusters to find
     * 
     * @param numClusters the number of clusters to find
     */
    public void setNumClusters(String numClusters) {
        m_numClusters = numClusters;
    }

    /**
     * Get the number of clusters to find
     * 
     * @return the number of clusters to find
     */
    public String getNumClusters() {
        return m_numClusters;
    }

    /**
     * Tip text for this property.
     * 
     * @return the tip text for this property
     */
    public String numRunsTipText() {
        return "The number of k-means runs to perform in parallel (best run is selected as final model)";
    }

    /**
     * Set the number of k-means runs to perform in parallel
     * 
     * @param numRuns the number of k-means runs to perform in parallel
     */
    public void setNumRuns(String numRuns) {
        m_numRuns = numRuns;
    }

    /**
     * Get the number of k-means runs to perform in parallel
     * 
     * @return the number of k-means runs to perform in parallel
     */
    public String getNumRuns() {
        return m_numRuns;
    }

    /**
     * Tip text for this property.
     * 
     * @return the tip text for this property
     */
    public String numIterationsTipText() {
        return "The maximum number of k-means iterations to perform";
    }

    /**
     * Set the maximum number of k-means iterations to perform
     * 
     * @param numIts the maximum number of iterations to perform
     */
    public void setNumIterations(String numIts) {
        m_numIterations = numIts;
    }

    /**
     * Get the maximum number of k-means iterations to perform
     * 
     * @return the maximum number of iterations to perform
     */
    public String getNumIterations() {
        return m_numIterations;
    }

    /**
     * Tip text for this property.
     * 
     * @return the tip text for this property
     */
    public String randomSeedTipText() {
        return "Seed for random number generation";
    }

    /**
     * Set the seed for random number generation
     * 
     * @param seed the seed for the random number generator
     */
    public void setRandomSeed(String seed) {
        m_randomSeed = seed;
    }

    /**
     * Get the seed for random number generation
     * 
     * @return the seed for the random number generator
     */
    public String getRandomSeed() {
        return m_randomSeed;
    }

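    /**
     * Set the options for the k-means map task
     * 
     * @param opts the options for the k-means map task
     */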
    protected void setKMeansMapTaskOpts(String opts) {
        m_kMeansMapTaskOpts = opts;
    }

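    /**
     * Get the options for the k-means map task
     * 
     * @return the options for the k-means map task
     */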
    protected String getKMeansMapTaskOpts() {
        return m_kMeansMapTaskOpts;
    }

    /**
     * Tip text for this property.
     * 
     * @return the tip text for this property
     */
    public String kMeansParallelInitStepsTipText() {
        return "The number of iterations of the k-means|| initialization routine to perform. "
                + "Only applies if initialization with random centroids has not been turned on.";
    }

    /**
     * Set the number of iterations of the k-means|| initialization routine to
     * perform
     * 
     * @param steps the number of iterations of the k-means|| init routine to
     *          perform
     */
    public void setKMeansParallelInitSteps(String steps) {
        m_kMeansParallelInitSteps = steps;
    }

    /**
     * Get the number of iterations of the k-means|| initialization routine to
     * perform
     * 
     * @return the number of iterations of the k-means|| init routine to perform
     */
    public String getKMeansParallelInitSteps() {
        return m_kMeansParallelInitSteps;
    }

    /**
     * Set the options for the randomize/stratify task
     * 
     * @param opts the options for the randomize task
     */
    public void setRandomizeJobOptions(String opts) {
        m_randomizeJobOpts = opts;
    }

    /**
     * Get the options for the randomize/stratify task
     * 
     * @return the options for the randomize task
     */
    public String getRandomizedJobOptions() {
        return m_randomizeJobOpts;
    }

    /**
     * Get the options to the header job
     * 
     * @return options to the header job
     */
    public String getCSVMapTaskOptions() {
        return m_wekaCsvToArffMapTaskOpts;
    }

    /**
     * Set the options to the header job
     * 
     * @param opts options to the header job
     */
    public void setCSVMapTaskOptions(String opts) {
        m_wekaCsvToArffMapTaskOpts = opts;
    }

    /**
     * Tip text for this property
     * 
     * @return the tip text for this property
     */
    public String displayCentroidStdDevsTipText() {
        return "Display centroid standard deviations in textual output of model";
    }

    /**
     * Set whether to display the standard deviations of centroids in textual
     * output of the model
     * 
     * @param d true if standard deviations are to be displayed
     */
    public void setDisplayCentroidStdDevs(boolean d) {
        m_displayStdDevs = d;
    }

    /**
     * Get whether to display the standard deviations of centroids in textual
     * output of the model
     * 
     * @return true if standard deviations are to be displayed
     */
    public boolean getDisplayCentroidStdDevs() {
        return m_displayStdDevs;
    }

    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();

        result.add(new Option("\tCreate data splits with the order of the input instances\n\t"
                + "shuffled randomly. Use in conjunction with -num-chunks.", "randomize", 0, "-randomize"));
        result.add(
                new Option("\tNumber of randomized data chunks. Use in conjunction with\n\t" + "-randomized-chunks",
                        "num-chunks", 1, "-num-chunks <integer>"));

        result.add(new Option(
                "\tName of output model file. Model will be\n\t"
                        + "written to output-path/k-means/model/<model name>",
                "model-file-name", 1, "-model-file-name <model-name>"));

        result.add(new Option("\tNumber of clusters to find (default = 2)", "num-clusters", 1,
                "-num-clusters <integer>"));

        result.add(new Option("\tMax number of k-means iterations (default = 20)", "num-iterations", 1,
                "-num-iterations <integer>"));

        result.add(new Option(
                "\tNumber of separately initialized runs of k-means to\n\t" + "perform in parallel (default = 1)",
                "num-runs", 1, "-num-runs <integer>"));

        result.add(
                new Option("\tTolerance for convergence (default = 1e-4)", "tolerance", 1, "-tolerance <double>"));

        result.add(new Option("\tInitialize with randomly selected centroids instead\n\t"
                + "of running k-means|| initialization.", "init-random", 0, "-init-random"));

        result.add(new Option("\tDisplay std. deviations for centroids", "V", 0, "-V"));

        result.add(new Option("\tRandom seed (default 1).", "seed", 1, "-seed <integer>"));

        result.add(
                new Option("\tNumber of nodes available in cluster (default = 1).", "num-nodes", 1, "-num-nodes"));

        KMeansMapTask tempMapTask = new KMeansMapTask();
        Enumeration<Option> mapOpts = tempMapTask.listOptions();
        while (mapOpts.hasMoreElements()) {
            result.add(mapOpts.nextElement());
        }

        RandomizedDataChunkHadoopJob tempRJob = new RandomizedDataChunkHadoopJob();
        Enumeration<Option> randOpts = tempRJob.listOptions();
        while (randOpts.hasMoreElements()) {
            result.add(randOpts.nextElement());
        }

        return result.elements();
    }

    @Override
    public void setOptions(String[] options) throws Exception {
        String modelFileName = Utils.getOption("model-file-name", options);
        if (!DistributedJobConfig.isEmpty(modelFileName)) {
            setModelFileName(modelFileName);
        }

        setRandomlyShuffleData(Utils.getFlag("randomize", options));

        setInitWithRandomCentroids(Utils.getFlag("init-random", options));

        String numDataChunks = Utils.getOption("num-chunks", options);
        setRandomlyShuffleDataNumChunks(numDataChunks);

        String temp = Utils.getOption("num-clusters", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setNumClusters(temp);
        }
        temp = Utils.getOption("num-iterations", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setNumIterations(temp);
        }
        temp = Utils.getOption("num-runs", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setNumRuns(temp);
        }
        temp = Utils.getOption("seed", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setRandomSeed(temp);
        }
        temp = Utils.getOption("tolerance", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setConvergenceTolerance(Double.parseDouble(temp));
        }

        String numNodes = Utils.getOption("num-nodes", options);
        setNumNodesInCluster(numNodes);
        setDisplayCentroidStdDevs(Utils.getFlag('V', options));

        KMeansMapTask tempKTask = new KMeansMapTask();
        tempKTask.setOptions(options);
        String mapOpts = Utils.joinOptions(tempKTask.getOptions());
        // if (!DistributedJobConfig.isEmpty(mapOpts)) {
        setKMeansMapTaskOpts(mapOpts);
        // }

        String[] optionsCopy = options.clone();

        super.setOptions(options);

        // options for the randomize job
        m_randomizeJob.setOptions(optionsCopy.clone());
        String optsToRandomize = Utils.joinOptions(m_randomizeJob.getOptions());
        if (!DistributedJobConfig.isEmpty(optsToRandomize)) {
            setRandomizeJobOptions(optsToRandomize);
        }

        // options for the ARFF header job
        m_arffHeaderJob.setOptions(optionsCopy);
        String optsToCSVTask = Utils.joinOptions(m_arffHeaderJob.getOptions());
        if (!DistributedJobConfig.isEmpty(optsToCSVTask)) {
            setCSVMapTaskOptions(optsToCSVTask);
        }
    }

    /**
     * Get the options for this job only
     * 
     * @return the options for this job only
     */
    public String[] getJobOptionsOnly() {
        List<String> options = new ArrayList<String>();

        options.add("-model-file-name");
        options.add(getModelFileName());

        if (getRandomlyShuffleData()) {
            options.add("-randomize");

            if (!DistributedJobConfig.isEmpty(getRandomlyShuffleDataNumChunks())) {
                options.add("-num-chunks");
                options.add(getRandomlyShuffleDataNumChunks());
            }
        }

        if (getInitWithRandomCentroids()) {
            options.add("-init-random");
        }

        if (getDisplayCentroidStdDevs()) {
            options.add("-V");
        }

        options.add("-num-clusters");
        options.add(getNumClusters());
        options.add("-num-iterations");
        options.add(getNumIterations());
        options.add("-num-runs");
        options.add(getNumRuns());
        options.add("-seed");
        options.add(getRandomSeed());
        options.add("-tolerance");
        options.add("" + getConvergenceTolerance());

        if (!DistributedJobConfig.isEmpty(getNumNodesInCluster())) {
            options.add("-num-nodes");
            options.add(getNumNodesInCluster());
        }

        return options.toArray(new String[options.size()]);
    }

    @Override
    public String[] getOptions() {

        List<String> options = new ArrayList<String>();
        for (String opt : getJobOptionsOnly()) {
            options.add(opt);
        }

        if (!DistributedJobConfig.isEmpty(getKMeansMapTaskOpts())) {
            try {
                String[] kMeansOpts = Utils.splitOptions(getKMeansMapTaskOpts());
                for (String s : kMeansOpts) {
                    options.add(s);
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        if (!DistributedJobConfig.isEmpty(getCSVMapTaskOptions())) {
            try {
                String[] csvOpts = Utils.splitOptions(getCSVMapTaskOptions());

                for (String s : csvOpts) {
                    options.add(s);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        if (!DistributedJobConfig.isEmpty(getRandomizedJobOptions())) {
            try {
                String[] csvOpts = Utils.splitOptions(getRandomizedJobOptions());

                for (String s : csvOpts) {
                    options.add(s);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        return options.toArray(new String[options.size()]);
    }

    /**
     * Initializes and runs the phase that creates randomly shuffled data chunks
     * from the input file(s).
     * 
     * @param header the header of the training data
     * @return true if the job is successful
     * @throws DistributedWekaException if a problem occurs
     * @throws IOException if a problem occurs
     */
    protected boolean initializeAndRunRandomizeDataJob(Instances header)
            throws DistributedWekaException, IOException {

        if (!getRandomlyShuffleData()) {
            return true;
        }

        if (m_env == null) {
            m_env = Environment.getSystemWide();
        }

        logMessage("Checking to see if randomize data chunk job is needed...");
        statusMessage("Checking to see if randomize data chunk job is needed...");
        m_randomizeJob.setEnvironment(m_env);
        m_randomizeJob.setLog(getLog());
        m_randomizeJob.setStatusMessagePrefix(m_statusMessagePrefix);

        if (!DistributedJobConfig.isEmpty(getRandomSeed())) {
            m_randomizeJob.setRandomSeed(environmentSubstitute(getRandomSeed()));
        }

        m_randomizeJob.setNumRandomizedDataChunks(getRandomlyShuffleDataNumChunks());

        // make sure that the class attribute does not get set by default!
        m_randomizeJob.setDontDefaultToLastAttIfClassNotSpecified(true);

        if (!m_randomizeJob.runJob()) {
            statusMessage("Unable to continue - randomized data chunk job failed!");
            logMessage("Unable to continue - randomized data chunk job failed!");
            return false;
        }

        // unset any mapredMaxSplitSize here because we
        // now have the number of maps determined by the number of
        // data chunks generated by the randomization job
        m_mrConfig.setMapredMaxSplitSize("");

        // alter the input path to point to the output
        // directory of the randomize job.
        // String randomizeOutputPath = m_randomizeConfig.getOutputPath();
        String randomizeOutputPath = m_randomizeJob.getRandomizedChunkOutputPath();
        m_mrConfig.setInputPaths(randomizeOutputPath);

        return true;
    }

    /**
     * Initialize and run the ARFF header creation job (if necessary).
     * 
     * @return true if the job was successful
     * @throws DistributedWekaException if a problem occurs
     * @throws IOException if a problem occurs
     */
    protected boolean initializeAndRunArffJob() throws DistributedWekaException, IOException {

        if (m_env == null) {
            m_env = Environment.getSystemWide();
        }

        // Run the ARFF header job first

        logMessage("Executing ARFF Job....");
        statusMessage("Executing ARFF Job...");
        m_arffHeaderJob.setEnvironment(m_env);
        m_arffHeaderJob.setLog(getLog());
        m_arffHeaderJob.setStatusMessagePrefix(m_statusMessagePrefix);

        if (!m_arffHeaderJob.runJob()) {
            statusMessage("Unable to continue - creating the ARFF header failed!");
            logMessage("Unable to continue - creating the ARFF header failed!");
            return false;
        }

        return true;
    }

    /**
     * Stage intermediate sketches by moving them from the output directory to a
     * staging directory and then add them to the distributed cache ready for the
     * next iteration
     * 
     * @param conf the configuration to use
     * @param outputPath the path in HDFS where the intermediate sketches were
     *          dumped by the previous iteration
     * @param numRuns the number of runs
     * @throws IOException if a problem occurs
     */
    protected void stageIntermediateSketches(Configuration conf, String outputPath, int numRuns)
            throws IOException {

        for (int i = 0; i < numRuns; i++) {
            String fullOutputPath = outputPath + "/" + KMeansCentroidSketchHadoopMapper.SKETCH_FILE_PREFIX + i;
            String stagingPath = HDFSUtils.WEKA_TEMP_DISTRIBUTED_CACHE_FILES
                    + KMeansCentroidSketchHadoopMapper.SKETCH_FILE_PREFIX + i;
            HDFSUtils.moveInHDFS(fullOutputPath, stagingPath, m_mrConfig.getHDFSConfig(), m_env);
            HDFSUtils.addFileToDistributedCache(m_mrConfig.getHDFSConfig(), conf, stagingPath, m_env);
        }
    }

    /**
     * Stage the ARFF header and add to the distributed cache
     * 
     * @param pathToHeader path to the header in HDFS
     * @param hdfsConfig the HDFS config to use
     * @param conf the configuration to use
     * @return just the filename part of the ARFF header path
     * @throws IOException if a problem occurs
     */
    protected String stageArffHeader(String pathToHeader, HDFSConfig hdfsConfig, Configuration conf)
            throws IOException {
        HDFSUtils.addFileToDistributedCache(hdfsConfig, conf, pathToHeader, m_env);
        String fileNameOnly = pathToHeader.substring(pathToHeader.lastIndexOf("/") + 1, pathToHeader.length());

        return fileNameOnly;
    }

    /**
     * Reads the k-means centroid sketches from the output directory.
     * 
     * @param conf the configuration to use
     * @param outputPath the output path where the centroids were written
     * @param numRuns the number of runs being performed
     * @return an array of centroid sketches
     * @throws IOException if a problem occurs
     */
    protected CentroidSketch[] getSketchesFromHDFS(Configuration conf, String outputPath, int numRuns)
            throws IOException {

        CentroidSketch[] results = new CentroidSketch[numRuns];
        FileSystem fs = FileSystem.get(conf);
        for (int i = 0; i < numRuns; i++) {
            String fullOutputPath = outputPath + "/" + KMeansCentroidSketchHadoopMapper.SKETCH_FILE_PREFIX + i;
            Path p = new Path(fullOutputPath);
            InputStream is = fs.open(p);
            BufferedInputStream bis = new BufferedInputStream(is);
            ObjectInputStream ois = new ObjectInputStream(bis);

            try {
                CentroidSketch sketchForRun = (CentroidSketch) ois.readObject();
                results[i] = sketchForRun;
            } catch (ClassNotFoundException ex) {
                throw new IOException(ex);
            } finally {
                is.close();
            }
        }

        return results;
    }

    /**
     * Reads the k-means reduce results from the output directory
     * 
     * @param conf the configuration to use
     * @param outputPath the output path where the centroids were written
     * @param numRuns the number of runs being performed
     * @param converged an array indicating which runs have converged already
     *          (thus we don't expect a result in the output directory for them)
     * @return an array of KMeansReduceTask objects
     * @throws IOException if a problem occurs
     */
    protected KMeansReduceTask[] getKMeansReducesFromHDFS(Configuration conf, String outputPath, int numRuns,
            boolean[] converged) throws IOException {

        KMeansReduceTask[] results = new KMeansReduceTask[numRuns];
        FileSystem fs = FileSystem.get(conf);
        for (int i = 0; i < numRuns; i++) {
            if (!converged[i]) {
                String fullOutputPath = outputPath + "/" + KMeansHadoopReducer.KMEANS_REDUCE_FILE_PREFIX + i;
                Path p = new Path(fullOutputPath);
                InputStream is = fs.open(p);
                BufferedInputStream bis = new BufferedInputStream(is);
                ObjectInputStream ois = new ObjectInputStream(bis);

                try {
                    KMeansReduceTask reduceForRun = (KMeansReduceTask) ois.readObject();
                    results[i] = reduceForRun;
                } catch (ClassNotFoundException ex) {
                    throw new IOException(ex);
                } finally {
                    is.close();
                }
            }
        }

        return results;
    }

    /**
     * Perform one iteration of the k-means algorithm
     * 
     * @param numRuns number of runs of k-means being performed in parallel
     * @param iterationNum the current iteration number
     * @param conf the configuration object to use
     * @param kMeansConfig the MapReduceJobConfig for the k-means iteration
     * @param baseMapTasks an array of KMeansMapTasks to use as the start points
     *          of the iteration
     * @param arffHeaderFileName the name of the arff header to use
     * @param jobName the job name
     * @throws DistributedWekaException if a problem occurs
     * @throws IOException if a problem occurs
     */
    protected void performKMeansIteration(int numRuns, int iterationNum, Configuration conf,
            MapReduceJobConfig kMeansConfig, KMeansMapTask[] baseMapTasks, String arffHeaderFileName,
            String jobName) throws DistributedWekaException, IOException {

        HDFSConfig hdfsConfig = kMeansConfig.getHDFSConfig();

        for (int i = 0; i < numRuns; i++) {
            // stage each and add to distributed cache
            HDFSUtils.serializeObjectToDistributedCache(baseMapTasks[i], hdfsConfig, conf,
                    KMeansHadoopMapper.KMEANS_MAP_FILE_PREFIX + i, m_env);
        }

        kMeansConfig.setMapperClass(KMeansHadoopMapper.class.getName());
        kMeansConfig.setReducerClass(KMeansHadoopReducer.class.getName());
        kMeansConfig.setUserSuppliedProperty(KMeansHadoopMapper.KMEANS_MAP_TASK_OPTIONS,
                environmentSubstitute(getKMeansMapTaskOpts()) + " -arff-header " + arffHeaderFileName
                        + " -num-runs " + numRuns + " -iteration " + iterationNum);
        kMeansConfig.setUserSuppliedProperty(KMeansHadoopReducer.KMEANS_WRITE_PATH, kMeansConfig.getOutputPath());

        Job job = null;
        try {
            job = kMeansConfig.configureForHadoop(jobName, conf, m_env);
        } catch (ClassNotFoundException e) {
            throw new DistributedWekaException(e);
        }

        cleanOutputDirectory(job);
        statusMessage("Submitting k-means pass: " + (iterationNum + 1));
        logMessage("Submitting k-means pass: " + (iterationNum + 1));

        if (!runJob(job)) {
            throw new DistributedWekaException(
                    "k-means iteration: " + (iterationNum + 1) + " failed " + "- check logs on Hadoop");
        }
    }

    /**
     * Make the final PreconstructedKMeans clusterer to wrap the centroids and
     * stats found during map-reduce.
     * 
     * @param best the best result from the runs of k-means that were performed in
     *          parallel
     * @param preprocess any pre-processing filters applied
     * @param initialStartingPoints the initial starting centroids
     * @param finalNumIterations the final number of iterations performed
     * @return a final clusterer object
     * @throws DistributedWekaException if a problem occurs
     */
    protected Clusterer makeFinalClusterer(KMeansReduceTask best, Filter preprocess,
            Instances initialStartingPoints, int finalNumIterations) throws DistributedWekaException {

        Clusterer finalClusterer = null;
        PreconstructedKMeans finalKMeans = new PreconstructedKMeans();
        // global priming data for the distance function (this will be in
        // the transformed space if we're using preprocessing filters)
        Instances globalPrimingData = best.getGlobalDistanceFunctionPrimingData();
        NormalizableDistance dist = new EuclideanDistance();
        dist.setInstances(globalPrimingData);
        finalKMeans.setClusterCentroids(best.getCentroidsForRun());
        finalKMeans.setFinalNumberOfIterations(finalNumIterations + 1);
        if (initialStartingPoints != null) {
            finalKMeans.setInitialStartingPoints(initialStartingPoints);
        }
        try {
            finalKMeans.setDistanceFunction(dist);
            finalKMeans.setClusterStats(best.getAggregatedCentroidSummaries());
        } catch (Exception e) {
            throw new DistributedWekaException(e);
        }

        if (!getInitWithRandomCentroids()) {
            finalKMeans.setInitializationMethod(
                    new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
        }

        finalKMeans.setDisplayStdDevs(getDisplayCentroidStdDevs());

        finalClusterer = finalKMeans;

        if (preprocess != null) {
            PreconstructedFilteredClusterer fc = new PreconstructedFilteredClusterer();
            fc.setFilter(preprocess);
            fc.setClusterer(finalKMeans);
            finalClusterer = fc;
        }

        return finalClusterer;
    }

    /**
     * Perform all k-means iterations
     * 
     * @param numRuns the number of runs of k-means to perform in parallel
     * @param conf the configuration object to use
     * @param startPoints the initial starting centroids
     * @throws DistributedWekaException if a problem occurs
     * @throws IOException if a problem occurs
     */
    protected void runKMeansIterations(int numRuns, Configuration conf, List<Instances> startPoints)
            throws DistributedWekaException, IOException {

        // make a copy of the start points
        List<Instances> initializationPoints = new ArrayList<Instances>();
        for (Instances i : startPoints) {
            initializationPoints.add(i);
        }

        int numIterations = 20;
        String numIterationsS = getNumIterations();
        if (!DistributedJobConfig.isEmpty(numIterationsS)) {
            try {
                numIterations = Integer.parseInt(environmentSubstitute(numIterationsS));
            } catch (NumberFormatException ex) {
                // ignore
            }
        }

        KMeansReduceTask bestResult = null;
        int bestRunNum = -1;
        int finalNumIterations = -1;

        Instances headerWithSummary = m_arffHeaderJob.getFinalHeader();
        // add the aggregated ARFF header to the distributed cache
        String arffHeaderFileName = environmentSubstitute(m_arffHeaderJob.getAggregatedHeaderPath());
        arffHeaderFileName = stageArffHeader(arffHeaderFileName, m_mrConfig.getHDFSConfig(), conf);

        KMeansMapTask[] mapTasks = new KMeansMapTask[numRuns];
        for (int i = 0; i < numRuns; i++) {
            try {
                mapTasks[i] = new KMeansMapTask();
                mapTasks[i].setOptions(Utils.splitOptions(environmentSubstitute(getKMeansMapTaskOpts())));

                mapTasks[i].init(headerWithSummary);
            } catch (Exception e) {
                throw new DistributedWekaException(e);
            }

            mapTasks[i].setCentroids(startPoints.get(i));
            if (m_distanceFunctionPrimingData != null) {
                mapTasks[i].setDummyDistancePrimingData(m_distanceFunctionPrimingData);
            }
        }

        boolean[] converged = new boolean[numRuns];
        int numConverged = 0;

        for (int i = 0; i < numIterations; i++) {
            String jobName = "k-means - iteration " + (i + 1) + " :"
                    + environmentSubstitute(Utils.joinOptions(getJobOptionsOnly())) + " "
                    + environmentSubstitute(getKMeansMapTaskOpts()) + " -arff-header " + arffHeaderFileName;

            performKMeansIteration(numRuns, i, conf, m_mrConfig, mapTasks, arffHeaderFileName, jobName);

            // fetch reduce results from HDFS and check convergence...
            KMeansReduceTask[] statsForIteration = getKMeansReducesFromHDFS(conf, m_mrConfig.getOutputPath(),
                    numRuns, converged);

            for (int j = 0; j < numRuns; j++) {
                if (i == 0 && m_distanceFunctionPrimingData == null) {
                    // just finished the first iteration, so we'll have global mins/maxes
                    // available for the (potentially) filtered training data
                    mapTasks[j].setDummyDistancePrimingData(
                            statsForIteration[j].getGlobalDistanceFunctionPrimingData());

                    logDebug("Setting dummy distance priming data:\n"
                            + statsForIteration[j].getGlobalDistanceFunctionPrimingData());
                }

                if (!converged[j]) {
                    Instances newCentersForRun = statsForIteration[j].getCentroidsForRun();

                    logDebug("Centers for run " + j + " iteration: " + (i + 1) + "\n" + newCentersForRun);
                    logDebug("Total within cluster error for run " + j + ": "
                            + statsForIteration[j].getTotalWithinClustersError());

                    if (i < numIterations - 1) {
                        // check for convergence - if we dropped a centroid (because it
                        // became empty) then we'll check for convergence in the next
                        // iteration
                        if (newCentersForRun.numInstances() == startPoints.get(j).numInstances()) {
                            boolean changed = false;
                            double totalDist = 0;
                            for (int k = 0; k < newCentersForRun.numInstances(); k++) {
                                double dist = mapTasks[j].distance(newCentersForRun.instance(k),
                                        startPoints.get(j).instance(k));
                                logDebug("Run " + j + " cluster " + k + " convergence distance: " + dist);
                                totalDist += dist;

                                if (dist > m_convergenceTolerance) {
                                    changed = true;
                                    if (i < 2) {
                                        break;
                                    }
                                }
                            }

                            if (!changed) {
                                logMessage("Run: " + j + " converged in " + (i + 1)
                                        + " iterations. Total within cluster error: "
                                        + statsForIteration[j].getTotalWithinClustersError());
                                // List<Instances> centroidSummaries =
                                // statsForIteration[j].getAggregatedCentroidSummaries();
                                // if (true) {
                                // for (Instances sum : centroidSummaries) {
                                // System.err.println(sum);
                                // }
                                // }
                                converged[j] = true;
                                numConverged++;

                                if (bestResult == null) {
                                    bestResult = statsForIteration[j];
                                    bestRunNum = j;
                                    finalNumIterations = bestResult.getIterationNumber();
                                } else {
                                    if (statsForIteration[j].getTotalWithinClustersError() < bestResult
                                            .getTotalWithinClustersError()) {
                                        bestResult = statsForIteration[j];
                                        bestRunNum = j;
                                        finalNumIterations = bestResult.getIterationNumber();
                                    }
                                }
                            } else if (i > 2 && bestResult != null) {
                                // try to stop slowly converging runs - that will probably
                                // never beat the current best - from dragging the job out
                                double remainingIts = numIterations - i;

                                // TODO should probably keep a running average of the
                                // improvement in squared error per run
                                double projectedImprovement = remainingIts * totalDist;
                                double currentSqErr = statsForIteration[j].getTotalWithinClustersError();
                                if ((bestResult.getTotalWithinClustersError()
                                        + m_convergenceTolerance) < (currentSqErr - projectedImprovement)) {
                                    // doesn't look like this run will catch up to the current
                                    // best...
                                    logDebug("Aborting run " + j + " as its current within clust. error ("
                                            + currentSqErr + ") " + "is unlikely to beat the current best run ("
                                            + bestResult.getTotalWithinClustersError() + ") within " + remainingIts
                                            + " iterations");
                                    converged[j] = true;
                                    numConverged++;
                                }
                            }
                        }
                    }

                    // update start-points with new centers
                    startPoints.set(j, newCentersForRun);
                    mapTasks[j].setCentroids(newCentersForRun);
                }
            }

            // check for convergence of *all* runs and break
            if (numConverged == numRuns || i == numIterations - 1) {
                for (int j = 0; j < numRuns; j++) {
                    if (statsForIteration[j] != null) {
                        if (bestResult == null) {
                            bestResult = statsForIteration[j];
                            bestRunNum = j;
                            finalNumIterations = bestResult.getIterationNumber();
                        } else {
                            if (statsForIteration[j].getTotalWithinClustersError() < bestResult
                                    .getTotalWithinClustersError()) {
                                bestResult = statsForIteration[j];
                                bestRunNum = j;
                                finalNumIterations = bestResult.getIterationNumber();
                            }
                        }
                    }
                }
                break;
            }
        }

        m_finalClusterer = makeFinalClusterer(bestResult, mapTasks[0].getPreprocessingFilters(),
                initializationPoints.get(bestRunNum), finalNumIterations);

        logMessage(m_finalClusterer.toString());
        writeFinalClustererToHDFS(conf, CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary));
    }

    /**
     * Writes the final clusterer as a serialized model to the output directory in
     * HDFS.
     * 
     * @param conf the configuration object to use
     * @param headerNoSummary the header of the training data
     * @throws IOException if a problem occurs
     */
    protected void writeFinalClustererToHDFS(Configuration conf, Instances headerNoSummary) throws IOException {
        if (m_finalClusterer != null) {
            statusMessage("Writing k-means model to job output directory...");
            logMessage("Writing k-means model to job output directory");

            String outputDir = m_mrConfig.getOutputPath();
            ObjectOutputStream oos = null;
            try {
                FileSystem fs = FileSystem.get(conf);

                Path p = new Path(outputDir + "/" + getModelFileName());
                FSDataOutputStream dos = fs.create(p);

                oos = new ObjectOutputStream(new BufferedOutputStream(dos));
                oos.writeObject(m_finalClusterer);
            } finally {
                if (oos != null) {
                    oos.flush();
                    oos.close();
                }
            }

            if (headerNoSummary != null) {
                // now write the header
                statusMessage("Writing ARFF header to job output directory...");
                logMessage("Writing ARFF header to job output directory");
                String p = outputDir + "/" + getModelFileName().replace(".model", "").replace(".MODEL", "")
                        + "_arffHeader.arff";
                CSVToArffHeaderHadoopReducer.writeHeaderToDestination(headerNoSummary, p, conf);
            }
        }
    }

    /**
     * Run the k-means|| initialization job
     * 
     * @param numRuns the number of runs
     * @param numClusters the number of clusters
     * @return a list of Instances objects
     * @throws DistributedWekaException if a problem occurs
     * @throws IOException if a problem occurs
     */
    protected List<Instances> initializeWithKMeansParallel(int numRuns, int numClusters)
            throws DistributedWekaException, IOException {

        int numSteps = 2;
        if (!DistributedJobConfig.isEmpty(getKMeansParallelInitSteps())) {
            try {
                numSteps = Integer.parseInt(environmentSubstitute(getKMeansParallelInitSteps()));
            } catch (NumberFormatException ex) {
                // don't fuss
            }
        }

        int randomSeed = 1;
        if (!DistributedJobConfig.isEmpty(getRandomSeed())) {
            try {
                randomSeed = Integer.parseInt(environmentSubstitute(getRandomSeed()));
            } catch (NumberFormatException ex) {
                // don't fuss
            }
        }

        Instances headerWithSummary = m_arffHeaderJob.getFinalHeader();

        // Step 1: start with 1 randomly chosen point for each run
        List<Instances> randomSingleCenters = initializeWithRandomCenters(numRuns, 1);

        // Create a single KMeansMap task (just for data filtering purposes)
        KMeansMapTask forFilteringOnly = new KMeansMapTask();

        try {
            forFilteringOnly.setOptions(Utils.splitOptions(environmentSubstitute(getKMeansMapTaskOpts())));

            // initialize sketches
            forFilteringOnly.init(headerWithSummary);
        } catch (Exception ex) {
            throw new DistributedWekaException(ex);
        }

        // initial distance function to use with the sketches
        NormalizableDistance distanceFunc = forFilteringOnly.getDistanceFunction();

        // Create iteration 0 CentroidSketches, serialize and place in
        // the distributed cache
        CentroidSketch[] initialSketches = new CentroidSketch[numRuns];
        for (int i = 0; i < numRuns; i++) {
            try {
                Instances transformedStartSketch = randomSingleCenters.get(i);
                // forFilteringOnly.applyFilters(randomSingleCenters.get(i));

                initialSketches[i] = new CentroidSketch(transformedStartSketch, distanceFunc, 2 * numClusters,
                        randomSeed + i);
            } catch (Exception ex) {
                throw new DistributedWekaException(ex);
            }
        }

        // serialize sketches into tmp distributed cache staging location in
        // HDFS
        HDFSConfig hdfsConfig = m_mrConfig.getHDFSConfig();
        Configuration conf = new Configuration();
        try {
            for (int i = 0; i < numRuns; i++) {
                HDFSUtils.serializeObjectToDistributedCache(initialSketches[i], hdfsConfig, conf, "sketch_run" + i,
                        m_env);
            }
        } catch (IOException e) {
            throw new DistributedWekaException(e);
        }

        MapReduceJobConfig kMeansParallelConfig = new MapReduceJobConfig();

        // set the base connection-based options
        try {
            kMeansParallelConfig.setOptions(getOptions());
        } catch (Exception e1) {
            throw new DistributedWekaException(e1);
        }
        kMeansParallelConfig.setMapperClass(KMeansCentroidSketchHadoopMapper.class.getName());
        kMeansParallelConfig.setReducerClass(KMeansCentroidSketchHadoopReducer.class.getName());

        kMeansParallelConfig.setNumberOfReducers(m_mrConfig.getNumberOfReducers());

        // set the input path in case we are using randomized chunks
        kMeansParallelConfig.setInputPaths(m_mrConfig.getInputPaths());

        // save the sketches into a subdirectory
        kMeansParallelConfig.setOutputPath(m_mrConfig.getOutputPath() + "/sketch");
        kMeansParallelConfig.setUserSuppliedProperty(KMeansCentroidSketchHadoopReducer.SKETCH_WRITE_PATH,
                kMeansParallelConfig.getOutputPath());

        String arffHeaderFileName = environmentSubstitute(m_arffHeaderJob.getAggregatedHeaderPath());
        arffHeaderFileName = stageArffHeader(arffHeaderFileName, kMeansParallelConfig.getHDFSConfig(), conf);

        kMeansParallelConfig.setUserSuppliedProperty(
                KMeansCentroidSketchHadoopMapper.CENTROID_SKETCH_MAP_TASK_OPTIONS,
                environmentSubstitute(Utils.joinOptions(getJobOptionsOnly())) + " "
                        + environmentSubstitute(getKMeansMapTaskOpts()) + " -first-iteration -arff-header "
                        + arffHeaderFileName);

        kMeansParallelConfig.setUserSuppliedProperty(
                CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
                environmentSubstitute(getCSVMapTaskOptions()));

        addWekaLibrariesToClasspath(conf);

        // run the sketch iterations
        Job job = null;
        for (int i = 0; i < numSteps; i++) {
            if (i == 1) {
                // no longer the first iteration
                kMeansParallelConfig.setUserSuppliedProperty(
                        KMeansCentroidSketchHadoopMapper.CENTROID_SKETCH_MAP_TASK_OPTIONS,
                        environmentSubstitute(Utils.joinOptions(getJobOptionsOnly())) + " "
                                + environmentSubstitute(getKMeansMapTaskOpts()) + " -arff-header "
                                + arffHeaderFileName);
            }

            String jobName = "k-means|| initialization job - iteration: " + (i + 1) + " " + kMeansParallelConfig
                    .getUserSuppliedProperty(KMeansCentroidSketchHadoopMapper.CENTROID_SKETCH_MAP_TASK_OPTIONS);

            try {
                job = kMeansParallelConfig.configureForHadoop(jobName, conf, m_env);
            } catch (ClassNotFoundException e) {
                throw new DistributedWekaException(e);
            }

            cleanOutputDirectory(job);

            statusMessage("Submitting iteration (" + (i + 1) + ") of job: k-means|| initialization");
            logMessage("Submitting iteration (" + (i + 1) + ") of job: k-means|| initialization");

            if (!runJob(job)) {
                throw new DistributedWekaException("k-means|| initialization failed " + "- check logs on Hadoop");
            }

            if (i < numSteps - 1) {
                // now need to move output sketches to staging ready for next
                // iteration
                statusMessage("Staging intermediate centroid sketches ready for " + "iteration " + (i + 2));
                logMessage("Staging intermediate centroid sketches ready for " + "iteration " + (i + 2));
                stageIntermediateSketches(conf, kMeansParallelConfig.getOutputPath(), numRuns);
            }
        }

        CentroidSketch[] finalSketches = getSketchesFromHDFS(conf, kMeansParallelConfig.getOutputPath(), numRuns);
        Instances globalPrimingData = finalSketches[0].getDistanceFunction().getInstances();
        if (globalPrimingData.numInstances() != 2) {
            throw new DistributedWekaException("Was expecting a two instance (global priming data)"
                    + " dataset to be set in the distance function in each sketch!");
        }

        // Configure some KMeans map tasks ready to be used to assign training
        // instances to sketch candidate centers
        KMeansMapTask[] onePassMaps = new KMeansMapTask[numRuns];
        for (int i = 0; i < numRuns; i++) {
            try {
                onePassMaps[i] = new KMeansMapTask();
                onePassMaps[i].setOptions(Utils.splitOptions(environmentSubstitute(getKMeansMapTaskOpts())));

                onePassMaps[i].init(headerWithSummary);
            } catch (Exception e) {
                throw new DistributedWekaException(e);
            }

            onePassMaps[i].setCentroids(finalSketches[i].getCurrentSketch());
            onePassMaps[i].setDummyDistancePrimingData(globalPrimingData);
        }

        String jobName = "k-means|| initialization job - computing sketch membership: " + kMeansParallelConfig
                .getUserSuppliedProperty(KMeansCentroidSketchHadoopMapper.CENTROID_SKETCH_MAP_TASK_OPTIONS);

        performKMeansIteration(numRuns, 0, conf, kMeansParallelConfig, onePassMaps, arffHeaderFileName, jobName);

        // now retrieve the KMeansReduceTasks that hold the clustering stats
        // for the centroid sketch centers and compute the final centers!
        boolean[] converged = new boolean[numRuns];
        KMeansReduceTask[] statsForSketches = getKMeansReducesFromHDFS(conf, kMeansParallelConfig.getOutputPath(),
                numRuns, converged);
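        // Standard k-means|| wrap-up (as the method name below suggests): weight each
        // sketch candidate by the number of training instances assigned to it and then
        // cluster the weighted candidates locally down to numClusters final start
        // points for each run.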
        List<Instances> finalStartingPointsForRuns = ClusterUtils.weightSketchesAndClusterToFinalStartPoints(
                numRuns, numClusters, finalSketches, statsForSketches, getDebug());

        logDebug("Final starting points for run: 0\n" + finalStartingPointsForRuns.get(0));

        m_distanceFunctionPrimingData = globalPrimingData;
        logDebug("Distance function priming data:\n" + m_distanceFunctionPrimingData);

        return finalStartingPointsForRuns;
    }

    /**
     * If the data has been randomly shuffled into n chunks then this selects
     * randomly chosen centers from the shuffled chunk files. If the data hasn't
     * been randomly shuffled then candidate rows are read sequentially from the
     * data file(s) in the input directory.
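     * <p>
     * Worked example (numbers chosen purely for illustration): with numRuns = 3
     * and numClusters = 4, up to 2 * 3 * 4 = 24 candidate rows are read (twice as
     * many as strictly needed, to help avoid duplicate start points), and an
     * exception is thrown if fewer than 3 * 4 = 12 candidates could be collected.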
     * 
     * @param numRuns the number of runs of k-means
     * @param numClusters the number of clusters
     * @return a list of centers (as Instances objects)
     * @throws DistributedWekaException if a problem occurs
     */
    protected List<Instances> initializeWithRandomCenters(int numRuns, int numClusters)
            throws DistributedWekaException {

        String csvConfig = getCSVMapTaskOptions();
        CSVToARFFHeaderMapTask csvTask = new CSVToARFFHeaderMapTask();
        Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(m_arffHeaderJob.getFinalHeader());
        Configuration conf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);

        List<Instance> candidateList = new ArrayList<Instance>();
        int numRowsToGet = 2 * numRuns * numClusters;
        boolean ok = false;

        try {
            csvTask.setOptions(Utils.splitOptions(csvConfig));
            csvTask.initParserOnly(CSVToARFFHeaderMapTask.instanceHeaderToAttributeNameList(headerNoSummary));
        } catch (Exception e) {
            throw new DistributedWekaException(e);
        }
        if (getRandomlyShuffleData()) {
            String randomizedOutputPath = m_randomizeJob.getRandomizedChunkOutputPath();
            try {
                FileSystem fs = FileSystem.get(conf);
                // FileStatus[] contents = fs.listStatus(new
                // Path(randomizedOutputPath));

                int chunkNum = 0;

                while (!ok) {
                    Path chunk = new Path(randomizedOutputPath + "/chunk" + chunkNum + "-r-00000");
                    if (!fs.exists(chunk)) {
                        if (chunkNum == 0) {
                            // something bad has happened - there doesn't seem to be any
                            // chunk files
                            throw new DistributedWekaException("Unable to find any chunk files in the "
                                    + "randomize job's output directory: " + randomizedOutputPath);
                        }
                        break; // run out of chunks
                    }
                    FSDataInputStream di = fs.open(chunk);
                    BufferedReader br = null;
                    try {
                        br = new BufferedReader(new InputStreamReader(di));

                        // get a few more than we need in order to avoid
                        // duplicates (hopefully)
                        int count = 0;
                        String line = null;
                        while ((line = br.readLine()) != null && count < numRowsToGet) {
                            String[] parsed = csvTask.parseRowOnly(line);
                            Instance inst = csvTask.makeInstance(headerNoSummary, false, parsed, false);
                            candidateList.add(inst);
                            count++;
                        }

                        if (count == numRowsToGet) {
                            ok = true;
                        } else {
                            chunkNum++;
                        }
                        br.close();
                        br = null;
                    } catch (Exception ex) {
                        throw new DistributedWekaException(ex);
                    } finally {
                        if (br != null) {
                            br.close();
                        }
                    }
                }
            } catch (IOException ex) {
                throw new DistributedWekaException(ex);
            }
        } else {
            String inS = m_mrConfig.getInputPaths();
            String[] inputPaths = inS.split(",");
            BufferedReader br = null;
            try {
                FileSystem fs = FileSystem.get(conf);
                int count = 0;
                for (String inPath : inputPaths) {
                    FileStatus[] contents = fs.listStatus(new Path(inPath));
                    for (FileStatus s : contents) {
                        String nameOnly = s.getPath().toString();
                        nameOnly = nameOnly.substring(nameOnly.lastIndexOf("/") + 1, nameOnly.length());
                        if (!nameOnly.startsWith(".") && !nameOnly.startsWith("_") && fs.isFile(s.getPath())) {
                            FSDataInputStream di = fs.open(s.getPath());

                            br = new BufferedReader(new InputStreamReader(di));
                            String line = null;
                            while ((line = br.readLine()) != null && count < numRowsToGet) {
                                String[] parsed = csvTask.parseRowOnly(line);
                                Instance inst = csvTask.makeInstance(headerNoSummary, false, parsed, false);
                                candidateList.add(inst);
                                count++;
                            }

                            if (count == numRowsToGet) {
                                ok = true;
                            }
                            // always close the reader before moving on or breaking
                            // out, so an early exit does not leave the stream open
                            br.close();
                            br = null;
                            if (ok) {
                                break;
                            }
                        }
                    }
                    if (ok) {
                        // enough candidate rows have been collected
                        break;
                    }
                }
            } catch (Exception ex) {
                throw new DistributedWekaException(ex);
            } finally {
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException e) {
                        throw new DistributedWekaException(e);
                    }
                }
            }
        }

        if (candidateList.size() < numRuns * numClusters) {
            throw new DistributedWekaException("Was unable to obtain enough initial start points " + "for "
                    + numRuns + " runs with " + numClusters + " start points each.");
        }

        // make sure that start points and header have been through any filters
        KMeansMapTask forFilteringOnly = new KMeansMapTask();
        try {
            forFilteringOnly.setOptions(Utils.splitOptions(environmentSubstitute(getKMeansMapTaskOpts())));

            // initialize the filtering-only map task with the full header (summary
            // attributes included) so that its filters can be applied below
            forFilteringOnly.init(m_arffHeaderJob.getFinalHeader());

            for (int i = 0; i < candidateList.size(); i++) {
                Instance filtered = forFilteringOnly.applyFilters(candidateList.get(i));
                candidateList.set(i, filtered);
            }

            headerNoSummary = forFilteringOnly.applyFilters(headerNoSummary);
        } catch (Exception ex) {
            throw new DistributedWekaException(ex);
        }

        List<Instances> startPoints = KMeansMapTask.assignStartPointsFromList(numRuns, numClusters, candidateList,
                headerNoSummary);

        return startPoints;
    }

    @Override
    public boolean runJob() throws DistributedWekaException {
        boolean success = true;
        ClassLoader orig = Thread.currentThread().getContextClassLoader();
        try {
            Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
            try {
                setJobStatus(JobStatus.RUNNING);

                if (!initializeAndRunArffJob()) {
                    return false;
                }

                if (!initializeAndRunRandomizeDataJob(m_arffHeaderJob.getFinalHeader())) {
                    return false;
                }

                String outputPath = m_mrConfig.getOutputPath();
                outputPath += OUTPUT_SUBDIR;
                outputPath = environmentSubstitute(outputPath);
                m_mrConfig.setOutputPath(outputPath);

                // the reducer will write the final aggregated clusterer to here
                outputPath += "/" + environmentSubstitute(getModelFileName());
                m_hdfsPathToAggregatedClusterer = outputPath;

                Configuration conf = new Configuration();

                // Need these for row parsing via open-csv
                m_mrConfig.setUserSuppliedProperty(CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
                        environmentSubstitute(getCSVMapTaskOptions()));

                try {
                    installWekaLibrariesInHDFS(conf);
                } catch (IOException ex) {
                    setJobStatus(JobStatus.FAILED);
                    throw new DistributedWekaException(ex);
                }

                int nRuns = 1;
                String numRuns = getNumRuns();
                if (!DistributedJobConfig.isEmpty(numRuns)) {
                    try {
                        nRuns = Integer.parseInt(environmentSubstitute(getNumRuns()));
                    } catch (NumberFormatException e) {
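                        // ignore and fall back to the default of a single run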
                    }
                }
                int numNodes = 1;
                String nNodes = getNumNodesInCluster();
                if (!DistributedJobConfig.isEmpty(nNodes)) {
                    try {
                        numNodes = Integer.parseInt(nNodes);
                    } catch (NumberFormatException e) {
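                        // ignore and keep the default of a single node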
                    }
                }

                String taskMaxKey = AbstractHadoopJobConfig.isHadoop2()
                        ? MapReduceJobConfig.HADOOP2_TASKTRACKER_REDUCE_TASKS_MAXIMUM
                        : MapReduceJobConfig.HADOOP_TASKTRACKER_REDUCE_TASKS_MAXIMUM;
                String reduceTasksMaxPerNode = conf.get(taskMaxKey);
                int reduceMax = 2;

                // allow our configuration to override the defaults for the cluster
                String userMaxOverride = m_mrConfig
                        .getUserSuppliedProperty(MapReduceJobConfig.HADOOP_TASKTRACKER_REDUCE_TASKS_MAXIMUM);
                if (DistributedJobConfig.isEmpty(userMaxOverride)) {
                    // try the Hadoop 2 version
                    userMaxOverride = m_mrConfig
                            .getUserSuppliedProperty(MapReduceJobConfig.HADOOP2_TASKTRACKER_REDUCE_TASKS_MAXIMUM);
                }
                if (!DistributedJobConfig.isEmpty(userMaxOverride)) {
                    reduceTasksMaxPerNode = environmentSubstitute(userMaxOverride);
                }

                if (!DistributedJobConfig.isEmpty(reduceTasksMaxPerNode)) {
                    reduceMax = Integer.parseInt(environmentSubstitute(reduceTasksMaxPerNode));
                }
                int numReducers = Math.min(nRuns, reduceMax * numNodes);
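                // Illustrative arithmetic (numbers are hypothetical): with 10 runs,
                // 4 nodes and a per-node reduce-task maximum of 2, this requests
                // min(10, 2 * 4) = 8 reducers.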
                if (numReducers > 1) {
                    logMessage("Setting number of reducers for clustering job to: " + numReducers);

                    m_mrConfig.setNumberOfReducers("" + numReducers);
                }

                int numClusters = 2;
                if (!DistributedJobConfig.isEmpty(getNumClusters())) {
                    String nCl = environmentSubstitute(getNumClusters());
                    try {
                        numClusters = Integer.parseInt(nCl);
                    } catch (NumberFormatException e) {
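                        // ignore and keep the default of 2 clusters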
                    }
                }

                // k-means|| initialization for k-means
                if (!m_initializeWithRandomCenters) {
                    List<Instances> startPoints = initializeWithKMeansParallel(nRuns, numClusters);
                    runKMeansIterations(nRuns, conf, startPoints);
                } else {
                    // random initialization for k-means
                    List<Instances> startPoints = initializeWithRandomCenters(nRuns, numClusters);

                    logDebug("Randomly selected starting points for run 0\n" + startPoints.get(0).toString());
                    runKMeansIterations(nRuns, conf, startPoints);
                }

            } catch (IOException e) {
                throw new DistributedWekaException(e);
            }
        } finally {
            Thread.currentThread().setContextClassLoader(orig);
        }

        return success;
    }

    @Override
    public Clusterer getClusterer() {
        return m_finalClusterer;
    }

    @Override
    public Instances getTrainingHeader() {
        if (m_arffHeaderJob != null) {
            Instances result = m_arffHeaderJob.getFinalHeader();
            if (result != null) {
                try {
                    return CSVToARFFHeaderReduceTask.stripSummaryAtts(result);
                } catch (DistributedWekaException e) {
                    e.printStackTrace();
                }
            }
        }

        return null;
    }

    @Override
    public String getText() {
        return m_finalClusterer != null ? m_finalClusterer.toString() : "Clusterer not built yet!";
    }

    @Override
    public void stopJob() {
        super.stopJob();

        if (m_arffHeaderJob != null) {
            m_arffHeaderJob.stopJob();
        }

        if (m_randomizeJob != null) {
            m_randomizeJob.stopJob();
        }
    }

    /**
     * Main method for executing this job from the command line
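     * <p>
     * A minimal programmatic sketch, using only methods defined in this class
     * (exception handling omitted; the option strings are whatever this job's
     * option handling accepts, pass -h on the command line to list them):
     *
     * <pre>
     * KMeansClustererHadoopJob job = new KMeansClustererHadoopJob();
     * job.setOptions(args); // same option format as the command line
     * if (job.runJob()) {
     *     System.out.println(job.getText()); // textual description of the clusterer
     * }
     * </pre>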
     * 
     * @param args arguments to the job
     */
    public static void main(String[] args) {

        KMeansClustererHadoopJob job = new KMeansClustererHadoopJob();
        job.run(job, args);
    }

    @Override
    public void run(Object toRun, String[] args) {

        if (!(toRun instanceof KMeansClustererHadoopJob)) {
            throw new IllegalArgumentException("Object to run is not a KMeansClustererHadoopJob!");
        }

        try {
            KMeansClustererHadoopJob job = (KMeansClustererHadoopJob) toRun;

            if (Utils.getFlag('h', args)) {
                String help = DistributedJob.makeOptionsStr(job);
                System.err.println(help);
                System.exit(1);
            }

            job.setOptions(args);
            job.runJob();

            // if (!DistributedJobConfig.isEmpty(getText())) {
            // System.out.println(getText());
            // }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}