/*
 * Cloud9: A Hadoop toolkit for working with big data
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.honghongie;

import java.io.IOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.net.URI;
import java.io.File;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import tl.lin.data.map.HMapStIW;
import tl.lin.data.pair.PairOfStrings;

/**
 * <p>
 * Implementation of the "stripes" algorithm for computing co-occurrence matrices from a large
 * text collection.
 * This algorithm is described in Chapter 3 of "Data-Intensive Text Processing with MapReduce"
 * by Lin & Dyer. Here the stripes representation is used to compute pointwise mutual
 * information (PMI) for pairs of words that co-occur on the same line of the collection.
 * </p>
 *
 * @author Lingzi Hong
 */
public class StripesPMI extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(StripesPMI.class);

  // Total number of input lines. NOTE: this static counter is only meaningful when the mappers
  // and reducers run in the same JVM (e.g., local mode); on a cluster each task has its own copy.
  private static int lines = 0;

  // First job: count, for every word, the number of lines it appears on.
  private static class Map_First extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final Text WORD = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      lines = lines + 1;

      String[] terms = line.split("\\s+");

      // Get the unique set of terms on the line.
      ArrayList<String> list = new ArrayList<String>();
      for (int i = 0; i < terms.length; i++) {
        if (!list.contains(terms[i])) {
          list.add(terms[i]);
        }
      }

      for (int i = 0; i < list.size(); i++) {
        String word = list.get(i);
        // Skip empty tokens.
        if (word.length() == 0) continue;
        WORD.set(word);
        context.write(WORD, ONE);
      }
    }
  }

  // The combiner's input types must match the mapper's output types exactly.
  protected static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    private static final IntWritable SUM = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      Iterator<IntWritable> iter = values.iterator();
      while (iter.hasNext()) {
        sum += iter.next().get();
      }
      SUM.set(sum);
      context.write(key, SUM);
    }
  }

  protected static class Reduce_First extends Reducer<Text, IntWritable, Text, IntWritable> {
    private static final IntWritable SUM = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      Iterator<IntWritable> iter = values.iterator();
      while (iter.hasNext()) {
        sum += iter.next().get();
      }
      SUM.set(sum);
      context.write(key, SUM);
    }
  }

  /* Second job: for every word, build a stripe of co-occurring words and their counts. */
  private static class Map_Second extends Mapper<LongWritable, Text, Text, HMapStIW> {
    private static final HMapStIW MAP = new HMapStIW();
    private static final Text KEY = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      String[] terms = line.split("\\s+");

      // Get the unique set of terms on the line.
      ArrayList<String> list = new ArrayList<String>();
      for (int i = 0; i < terms.length; i++) {
        if (!list.contains(terms[i])) {
          list.add(terms[i]);
        }
      }

      for (int i = 0; i < list.size(); i++) {
        String word = list.get(i);
        // Skip empty tokens.
        if (word.length() == 0) continue;

        MAP.clear();
        for (int j = 0; j < list.size(); j++) {
          // Skip the word itself.
          if (j == i) continue;
          // Skip empty tokens.
          String tt = list.get(j);
          if (tt.length() == 0) continue;
          MAP.increment(tt);
        }

        KEY.set(word);
        context.write(KEY, MAP);
        // System.out.println(word);
        // System.out.println(MAP.toString());
      }
    }
  }
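  /*
   * The reducer below sums all stripes for a word w, and for every word u that co-occurs with w
   * on ten or more lines it emits
   *
   *   PMI(w, u) = log10( N * c(w, u) / (c(w) * c(u)) )
   *             = log10( c(w, u) / (c(w) * c(u)) ) + log10(N)
   *
   * where c(w, u) is the number of lines on which w and u co-occur, c(w) and c(u) are the
   * per-word line counts produced by the first job, and N is the total number of lines.
   * As a hypothetical example: with N = 1000 lines, c(w) = 20, c(u) = 50, and c(w, u) = 10,
   * PMI(w, u) = log10(1000 * 10 / (20 * 50)) = log10(10) = 1.
   */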
  protected static class Reduce_Second extends Reducer<Text, HMapStIW, PairOfStrings, FloatWritable> {
    private static final FloatWritable VALUE = new FloatWritable();
    private static final PairOfStrings PAIR = new PairOfStrings();
    // Per-word line counts loaded from the output of the first job.
    Map<String, Float> singlewordmap = new HashMap<String, Float>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      if (context.getCacheFiles() != null && context.getCacheFiles().length > 0) {
        URI mappingFileUri = context.getCacheFiles()[0];
        if (mappingFileUri != null) {
          // System.out.println("mappingFileUri is not null*********");
          // The cached file is available in the task's working directory under its own name.
          String filetext = FileUtils.readFileToString(new File("part-r-00000"));
          String[] words = filetext.split("\\s+");
          // System.out.println(words[0] + "*********" + words[1]);
          int cnt = 0, len = words.length;
          String word;
          float wordcnt;
          while (cnt + 1 < len) {
            // Trim whitespace at the start and end of the token.
            word = words[cnt].trim();
            // System.out.println("*********" + words[cnt + 1]);
            wordcnt = Float.parseFloat(words[cnt + 1].trim());
            singlewordmap.put(word, wordcnt);
            cnt += 2;
          }
        }
      } else {
        System.out.println(">>>>>> NO CACHE FILES AT ALL");
      }
    }

    @Override
    public void reduce(Text key, Iterable<HMapStIW> values, Context context)
        throws IOException, InterruptedException {
      // System.out.println("code 1*****************");
      // Element-wise sum of all stripes for this word.
      Iterator<HMapStIW> iter = values.iterator();
      HMapStIW map = new HMapStIW();
      while (iter.hasNext()) {
        map.plus(iter.next());
      }

      float leftwordcnt = 0;
      float rightwordcnt = 0;
      Iterator itervalue = map.keySet().iterator();
      while (itervalue.hasNext()) {
        String tt = (String) itervalue.next();
        int cv = map.get(tt);
        if (cv > 9) { // ten or more co-occurrences
          if (singlewordmap.containsKey(key.toString()))
            leftwordcnt = singlewordmap.get(key.toString());
          if (singlewordmap.containsKey(tt))
            rightwordcnt = singlewordmap.get(tt);
          if (leftwordcnt * rightwordcnt == 0)
            System.out.println("one word may not be there");
          float respmi = (float) Math.log10(1.0 * cv / (leftwordcnt * rightwordcnt))
              + (float) Math.log10(lines);
          PAIR.set(key.toString(), tt);
          VALUE.set(respmi);
          context.write(PAIR, VALUE);
        }
      }
    }
  }

  /**
   * Creates an instance of this tool.
   */
  public StripesPMI() {}

  private static final String INPUT = "input";
  private static final String OUTPUT = "output";
  private static final String NUM_REDUCERS = "numReducers";

  /**
   * Runs this tool.
   */
  @SuppressWarnings({ "static-access" })
  public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output path").create(OUTPUT));
    // options.addOption(OptionBuilder.withArgName("num").hasArg()
    //     .withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
        .withDescription("number of reducers").create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
      System.out.println("args: " + Arrays.toString(args));
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
        ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    // int window = cmdline.hasOption(WINDOW) ?
    //     Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;
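    // The driver chains two jobs. The first job runs with a single reducer and writes per-word
    // line counts (word<TAB>count) to the fixed path temp/part-r-00000; that file is then handed
    // to the second job through the distributed cache, and the second job emits (word pair, PMI)
    // records to the output path.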
    LOG.info("Tool: " + StripesPMI.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    // LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);

    // JobConf conf = new JobConf(PairsPMI.class);

    // First job.
    // Job job1 = new Job(conf, "join1");
    Configuration conf1 = getConf();
    Job job1 = Job.getInstance(conf1);
    job1.setJobName(StripesPMI.class.getSimpleName());
    job1.setJarByClass(StripesPMI.class);
    job1.setNumReduceTasks(1);

    // File paths of job 1.
    // Delete the intermediate directory if it exists.
    Path dir = new Path("temp");
    FileSystem.get(getConf()).delete(dir, true);

    FileInputFormat.setInputPaths(job1, new Path(inputPath));
    FileOutputFormat.setOutputPath(job1, new Path("temp"));

    job1.setMapperClass(Map_First.class);
    job1.setCombinerClass(MyCombiner.class);
    job1.setReducerClass(Reduce_First.class);

    job1.setMapOutputKeyClass(Text.class);          // map output key
    job1.setMapOutputValueClass(IntWritable.class); // map output value
    job1.setOutputKeyClass(Text.class);             // reduce output key
    job1.setOutputValueClass(IntWritable.class);    // reduce output value

    // ControlledJob ctrljob1 = new ControlledJob(conf);
    // ctrljob1.setJob(job1);

    long startTime1 = System.currentTimeMillis();
    job1.waitForCompletion(true);
    System.out.println("Job Finished in "
        + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");

    // Begin job 2.
    // Configuration conf2 = getConf();
    Job job2 = Job.getInstance(getConf());
    job2.setJobName(StripesPMI.class.getSimpleName());
    job2.setJarByClass(StripesPMI.class);
    job2.setNumReduceTasks(reduceTasks);

    // Delete the output directory if it exists.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // File paths of job 2.
    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.addCacheFile(new URI("temp/part-r-00000"));

    job2.setMapperClass(Map_Second.class);
    job2.setReducerClass(Reduce_Second.class);

    job2.setMapOutputKeyClass(Text.class);           // map output key
    job2.setMapOutputValueClass(HMapStIW.class);     // map output value
    job2.setOutputKeyClass(PairOfStrings.class);     // reduce output key
    job2.setOutputValueClass(FloatWritable.class);   // reduce output value

    long startTime2 = System.currentTimeMillis();
    job2.waitForCompletion(true);
    System.out.println("Job Finished in "
        + (System.currentTimeMillis() - startTime2) / 1000.0 + " seconds");
    System.out.println("Total Job Finished in "
        + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");
    System.out.println("total number of lines: " + lines);

    return 0;
  }

  /**
   * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
   */
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new StripesPMI(), args);
  }
}
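/*
 * Usage sketch (the jar name and input/output paths below are illustrative assumptions, not part
 * of this file): after packaging the project, the tool can be launched with, e.g.,
 *
 *   hadoop jar target/honghongie-assignments.jar edu.umd.honghongie.StripesPMI \
 *     -input data/bible+shakes.nopunc -output pmi-stripes -numReducers 4
 *
 * or programmatically:
 *
 *   ToolRunner.run(new StripesPMI(), new String[] {
 *       "-input", "data/bible+shakes.nopunc", "-output", "pmi-stripes", "-numReducers", "4" });
 *
 * Note that the intermediate word counts always go to the fixed path "temp", so concurrent runs
 * of this tool would overwrite each other's intermediate output.
 */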