// Java tutorial
/************************************************** Copyright (C) 2014 Raptis Dimos <raptis.dimos@yahoo.gr> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************/ import java.io.IOException; import java.util.*; import org.apache.hadoop.fs.Path; import org.apache.hadoop.conf.*; import org.apache.hadoop.io.*; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import java.util.HashSet; public class PopularURLs { public static class Map extends Mapper<LongWritable, Text, Text, Text> { private Text url = new Text(); private Text user_id = new Text(); public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); //columns seperated with tab delimeter StringTokenizer tokenizer = new StringTokenizer(line, "\t"); if (tokenizer.countTokens() == 5) { String user_id_str = tokenizer.nextToken(); tokenizer.nextToken(); tokenizer.nextToken(); tokenizer.nextToken(); String url_str = tokenizer.nextToken(); user_id.set(user_id_str); url.set(url_str); if (!user_id.equals("AnonID")) context.write(url, user_id); } } } public static class Reduce extends Reducer<Text, Text, Text, 
IntWritable> { public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { HashSet<Text> users = new HashSet<Text>(); for (Text val : values) users.add(val); if (users.size() >= 10) { context.write(key, new IntWritable(users.size())); } } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = new Job(conf, "popularurls"); job.setJarByClass(PopularURLs.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); } }