Java tutorial: ClusterDocIDDumper. The listing below is a Hadoop Tool/Mapper from the Behemoth project that dumps a SequenceFile mapping document IDs to cluster IDs from Mahout clusteredPoints output.
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package com.digitalpebble.behemoth.mahout.util;

import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.behemoth.BehemothConfiguration;

/**
 * Generates a SequenceFile containing a mapping clusterID / doc ID which can
 * be used later to inject the cluster refs into a BehemothDoc SeqFile.
 **/
public class ClusterDocIDDumper extends Configured implements Tool,
        Mapper<IntWritable, WeightedVectorWritable, Text, Text> {

    private transient static Logger log = LoggerFactory
            .getLogger(ClusterDocIDDumper.class);

    public static void main(String[] args) {
        int res;
        try {
            res = ToolRunner.run(BehemothConfiguration.create(),
                    new ClusterDocIDDumper(), args);
        } catch (Exception e) {
            log.error("ClusterDocIDDumper failed", e);
            res = -1;
        }
        System.exit(res);
    }

    public int run(String[] args) throws Exception {
        Options options = new Options();
        // automatically generate the help statement
        HelpFormatter formatter = new HelpFormatter();
        // create the parser
        CommandLineParser parser = new GnuParser();

        options.addOption("h", "help", false, "print this message");
        options.addOption("i", "input", true, "input clusteredPoints");
        options.addOption("o", "output", true, "output doc cluster IDs");

        // parse the command line arguments
        CommandLine line = null;
        try {
            line = parser.parse(options, args);
            if (line.hasOption("help")) {
                formatter.printHelp("ClusterDocIDDumper", options);
                return 0;
            }
            if (!line.hasOption("o") || !line.hasOption("i")) {
                formatter.printHelp("ClusterDocIDDumper", options);
                return -1;
            }
        } catch (ParseException e) {
            formatter.printHelp("ClusterDocIDDumper", options);
            return -1;
        }

        Path inPath = new Path(line.getOptionValue("i"));
        Path outPath = new Path(line.getOptionValue("o"));

        // extracts the string representations from the vectors
        int retVal = extract(inPath, outPath);
        if (retVal != 0) {
            // clean up partial output on failure
            HadoopUtil.delete(getConf(), outPath);
            return retVal;
        }
        return 0;
    }

    public int extract(Path input, Path output) throws IOException {
        JobConf job = new JobConf(getConf());
        // job.setJobName(this.getClass().getName());
        job.setJarByClass(this.getClass());

        FileInputFormat.addInputPath(job, input);
        job.setInputFormat(SequenceFileInputFormat.class);
        // map-only job: every clustered point is emitted directly
        job.setNumReduceTasks(0);
        job.setMapperClass(ClusterDocIDDumper.class);

        FileOutputFormat.setOutputPath(job, output);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        RunningJob rj = JobClient.runJob(job);
        if (!rj.isSuccessful())
            return -1;
        return 0;
    }

    public void configure(JobConf conf) {
        setConf(conf);
    }

    public void close() throws IOException {
    }

    public void map(IntWritable key, WeightedVectorWritable value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        Vector v = value.getVector();
        if (v instanceof NamedVector) {
            String name = ((NamedVector) v).getName();
            // key is the cluster ID, name is the document ID carried by the NamedVector
            if (name != null && name.length() > 2)
                output.collect(new Text(name), new Text(key.toString()));
            else
                reporter.incrCounter("ClusterDocIDDumper", "Missing name", 1);
        } else {
            reporter.incrCounter("ClusterDocIDDumper", "Unnamed vector", 1);
        }
    }
}
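
Because main delegates to ToolRunner, the job is launched from the command line with the -i and -o options defined in run(), e.g. hadoop jar <behemoth-mahout job jar> com.digitalpebble.behemoth.mahout.util.ClusterDocIDDumper -i <clusteredPoints> -o <docClusterIDs>; the jar name depends on how the project is built.

The job's output is a SequenceFile whose keys are document names (Text) and whose values are cluster IDs (Text). Below is a minimal sketch of how such a file could be read back with the standard Hadoop SequenceFile.Reader; the class name DocClusterIDReader, the output directory docClusterIDs and the part file name part-00000 are illustrative assumptions, not part of the original code.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

/** Illustrative reader for the doc ID / cluster ID SequenceFile. */
public class DocClusterIDReader {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical output location; a real run would use the path passed via -o
        Path part = new Path("docClusterIDs/part-00000");

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, part, conf);
        try {
            Text docId = new Text();     // key written by the mapper: the NamedVector name
            Text clusterId = new Text(); // value written by the mapper: the cluster ID
            while (reader.next(docId, clusterId)) {
                System.out.println(docId + "\t" + clusterId);
            }
        } finally {
            reader.close();
        }
    }
}

Reading the pairs back this way is how the mapping can later be joined against a BehemothDoc SequenceFile to inject the cluster references, as described in the class Javadoc.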