org.apache.hama.examples.Kmeans.java Source code

Introduction

Here is the source code for org.apache.hama.examples.Kmeans.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hama.examples;

import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hama.HamaConfiguration;
import org.apache.hama.bsp.BSPJob;
import org.apache.hama.commons.io.VectorWritable;
import org.apache.hama.ml.kmeans.KMeansBSP;

/**
 * Uses the {@link KMeansBSP} class to run a Kmeans Clustering with BSP. You can
 * provide your own input, or generate some random input for benchmarking.
 * 
 * For your own input, you can supply a text file that contains a tab separated
 * sequence of doubles on each line. The first k-vectors are used as the seed
 * centers.
 * 
 * For random input, just supply the "-g" option followed by the number of
 * vectors to generate and the dimension of the vectors.
 * 
 * You must pass always an input directory and an output path, as well as how
 * many iterations the algorithm should run (it will also stop if the centers
 * won't move anymore).
 * 
 * The centers are stored in the given output path under
 * center/center_output.seq. This is a center sequencefile with
 * {@link VectorWritable} as key and {@link NullWritable} as value. You can read
 * it with the normal FS cat utility, but you have to add the hama-ml jar to the
 * lib directory of Hadoop, so it can find the vector classes.
 * 
 * The assignments from an index (the order of the center in the above sequence
 * file matters!, also starting from 0!) to a vector can be found in the output
 * path as text file.
 * 
 */
public class Kmeans {

    /** Usage line printed whenever the command line cannot be interpreted. */
    private static final String USAGE =
            "USAGE: <INPUT_PATH> <OUTPUT_PATH> <MAXITERATIONS> <K (how many centers)> [-g <COUNT> <DIMENSION OF VECTORS>]";

    /**
     * Command line entry point: configures and runs a {@link KMeansBSP} job and
     * prints the first few lines of its output.
     *
     * Accepted argument layouts:
     * <ul>
     * <li>4 arguments: input path, output path, max iterations, k. The input
     * must be an existing file.</li>
     * <li>7 arguments: the same four, followed by "-g", the number of vectors
     * to generate and their dimension.</li>
     * </ul>
     * Any other layout prints the usage line and returns.
     *
     * @param args the command line arguments as described above
     * @throws IllegalArgumentException if k is greater than the number of
     *         vectors to generate
     * @throws Exception if a numeric argument cannot be parsed, or if file
     *         system access or job execution fails
     */
    public static void main(String[] args) throws Exception {
        // The three trailing arguments are only meaningful as a "-g" group; a
        // 7-argument call without "-g" is a usage error rather than something
        // to ignore silently.
        boolean generate = args.length == 7 && "-g".equals(args[4]);
        if (args.length != 4 && !generate) {
            System.out.println(USAGE);
            return;
        }
        HamaConfiguration conf = new HamaConfiguration();

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);

        // The seed centers are kept next to the input: beside the file if the
        // input is a file, otherwise inside the input directory.
        Path centerBase = fs.isFile(in) ? in.getParent() : in;
        Path center = new Path(centerBase, "center/cen.seq");
        Path centerOut = new Path(out, "center/center_output.seq");
        conf.set(KMeansBSP.CENTER_IN_PATH, center.toString());
        conf.set(KMeansBSP.CENTER_OUT_PATH, centerOut.toString());

        int iterations = Integer.parseInt(args[2]);
        conf.setInt(KMeansBSP.MAX_ITERATIONS_KEY, iterations);
        int k = Integer.parseInt(args[3]);

        if (generate) {
            int count = Integer.parseInt(args[5]);
            if (k > count) {
                throw new IllegalArgumentException("K can't be greater than n!");
            }
            int dimension = Integer.parseInt(args[6]);
            System.out.println("N: " + count + " Dimension: " + dimension + " Iterations: " + iterations);
            if (!fs.isFile(in)) {
                in = new Path(in, "input.seq");
            }
            // prepare the input, like deleting old versions and creating centers
            KMeansBSP.prepareInput(count, k, dimension, conf, in, center, out, fs);
        } else {
            if (!fs.isFile(in)) {
                System.out.println("Cannot read text input file: " + in.toString());
                return;
            }
            // Set the last argument to TRUE if first column is required to be the key
            in = KMeansBSP.prepareInputText(k, conf, in, center, out, fs, true);
        }

        BSPJob job = KMeansBSP.createJob(conf, in, out, true);

        long startTime = System.currentTimeMillis();
        // Submit the job and block until it is done. Do not try to display
        // output when the job reports failure.
        if (!job.waitForCompletion(true)) {
            System.err.println("Job failed, no output to display.");
            return;
        }
        System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        System.out.println("\nHere are a few lines of output:");
        // NOTE(review): the trailing 4 is presumably the number of lines to read - confirm against KMeansBSP.readOutput
        List<String> results = KMeansBSP.readOutput(conf, out, fs, 4);
        for (String line : results) {
            System.out.println(line);
        }
        System.out.println("...");
    }
}