Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hama.examples;

import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hama.HamaConfiguration;
import org.apache.hama.bsp.BSPJob;
import org.apache.hama.commons.io.VectorWritable;
import org.apache.hama.ml.kmeans.KMeansBSP;

/**
 * Uses the {@link KMeansBSP} class to run a k-means clustering with BSP. You
 * can provide your own input, or generate some random input for benchmarking.
 *
 * <p>
 * For your own input, supply a text file that contains a tab separated
 * sequence of doubles on each line. The first k vectors are used as the seed
 * centers. In this mode the input path must be an existing file.
 *
 * <p>
 * For random input, append "-g" followed by the number of vectors to generate
 * and the dimension of the vectors. If the input path is not an existing file,
 * the generated data is written to {@code input.seq} below it.
 *
 * <p>
 * You must always pass an input path and an output path, as well as how many
 * iterations the algorithm should run (it will also stop if the centers won't
 * move anymore) and the number of centers k.
 *
 * <p>
 * The seed center path is configured as {@code center/cen.seq}, next to the
 * input file (or below the input directory). The resulting centers are stored
 * in the given <b>output</b> path under {@code center/center_output.seq}. This
 * is a center sequencefile with {@link VectorWritable} as key and
 * {@link NullWritable} as value. You can read it with the normal FS cat
 * utility, but you have to add the hama-ml jar to the lib directory of Hadoop,
 * so it can find the vector classes.
 *
 * <p>
 * The assignments from an index (the order of the centers in the above
 * sequence file matters, and it starts from 0) to a vector can be found in the
 * output path as a text file.
 */
public class Kmeans {

  /** Command line help; "-g" and its two operands are optional as a group. */
  private static final String USAGE =
      "USAGE: <INPUT_PATH> <OUTPUT_PATH> <MAXITERATIONS> <K (how many centers)> [-g <COUNT> <DIMENSION OF VECTORS>]";

  /** Flag that switches the example to randomly generated input. */
  private static final String GENERATE_FLAG = "-g";

  /**
   * Parses the command line, prepares the input and the seed centers, runs the
   * k-means BSP job and prints the first few lines of its output.
   *
   * @param args {@code <INPUT_PATH> <OUTPUT_PATH> <MAXITERATIONS> <K>},
   *          optionally followed by {@code -g <COUNT> <DIMENSION>}
   * @throws IllegalArgumentException if k is greater than the number of
   *           vectors to generate
   * @throws Exception if a numeric argument cannot be parsed, or if file
   *           system access or the job itself fails with an exception
   */
  public static void main(String[] args) throws Exception {
    // Exactly two shapes are valid: 4 arguments (text input) or 7 arguments
    // whose fifth one is "-g" (generated input). Anything else -- including 7
    // arguments with a different flag, which used to be silently treated as
    // text input -- prints the usage.
    boolean generateInput = args.length == 7 && GENERATE_FLAG.equals(args[4]);
    if (args.length != 4 && !generateInput) {
      System.out.println(USAGE);
      return;
    }

    HamaConfiguration conf = new HamaConfiguration();
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileSystem fs = FileSystem.get(conf);

    // Seed centers live next to an input file, or inside an input directory.
    Path centerBase = fs.isFile(in) ? in.getParent() : in;
    Path center = new Path(centerBase, "center/cen.seq");
    Path centerOut = new Path(out, "center/center_output.seq");
    conf.set(KMeansBSP.CENTER_IN_PATH, center.toString());
    conf.set(KMeansBSP.CENTER_OUT_PATH, centerOut.toString());

    int iterations = Integer.parseInt(args[2]);
    conf.setInt(KMeansBSP.MAX_ITERATIONS_KEY, iterations);

    int k = Integer.parseInt(args[3]);
    if (generateInput) {
      int count = Integer.parseInt(args[5]);
      if (k > count) {
        throw new IllegalArgumentException("K can't be greater than n!");
      }
      int dimension = Integer.parseInt(args[6]);
      System.out.println("N: " + count + " Dimension: " + dimension
          + " Iterations: " + iterations);
      if (!fs.isFile(in)) {
        in = new Path(in, "input.seq");
      }
      // prepare the input, like deleting old versions and creating centers
      KMeansBSP.prepareInput(count, k, dimension, conf, in, center, out, fs);
    } else {
      if (!fs.isFile(in)) {
        System.out.println("Cannot read text input file: " + in);
        return;
      }
      // Set the last argument to TRUE if first column is required to be the key
      in = KMeansBSP.prepareInputText(k, conf, in, center, out, fs, true);
    }

    BSPJob job = KMeansBSP.createJob(conf, in, out, true);

    long startTime = System.currentTimeMillis();
    // Submit the job and block until it is done. A false result is treated as
    // a failed job: there is no point in previewing its output then.
    if (!job.waitForCompletion(true)) {
      System.err.println("Job failed; skipping output preview.");
      return;
    }
    System.out.println("Job Finished in "
        + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    System.out.println("\nHere are a few lines of output:");
    List<String> results = KMeansBSP.readOutput(conf, out, fs, 4);
    for (String line : results) {
      System.out.println(line);
    }
    System.out.println("...");
  }
}