/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.io.benchmark;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

import edu.umd.cloud9.io.pair.PairOfInts;

/**
 * <p>
 * Benchmark for comparing Hadoop sorting with and without the WritableComparator optimization.
 * Task is sorting the one million PairOfInts created by {@link GenerateRandomPairsOfInts}.
 * </p>
 *
 * <p>
 * Comparison of sort speed with and without WritableComparator optimization, on Hadoop 0.17.2 in
 * local mode, Java 1.5, MacBookPro (2.6 GHz, 2GB RAM) running Windows XP/Cygwin. Benchmark
 * conducted 10/16/2008. Running times reported in seconds.
 * </p>
 *
 * <table border="1" cellpadding="5">
 * <tr>
 * <td><b>Trial</b></td>
 * <td><b>Without Optimization</b></td>
 * <td><b>With Optimization</b></td>
 * </tr>
 * <tr><td> 1</td><td>36.406</td><td>21.344</td></tr>
 * <tr><td> 2</td><td>35.562</td><td>21.407</td></tr>
 * <tr><td> 3</td><td>36.532</td><td>22.453</td></tr>
 * <tr><td> 4</td><td>36.39 </td><td>22.484</td></tr>
 * <tr><td> 5</td><td>36.453</td><td>21.375</td></tr>
 * <tr><td> 6</td><td>35.5  </td><td>22.484</td></tr>
 * <tr><td> 7</td><td>36.391</td><td>22.562</td></tr>
 * <tr><td> 8</td><td>36.323</td><td>22.484</td></tr>
 * <tr><td> 9</td><td>35.906</td><td>22.422</td></tr>
 * <tr><td>10</td><td>36.453</td><td>22.344</td></tr>
 * <tr>
 * <td><b>mean</b></td>
 * <td>36.19 [35.95, 36.43]</td>
 * <td>22.14 [21.81, 22.46]</td>
 * </tr>
 * </table>
 *
 * <p>
 * Numbers in square brackets denote 95% confidence intervals.
 * </p>
 */
public class HadoopSortRandomPairsOfInts {

  private HadoopSortRandomPairsOfInts() {
  }

  /**
   * Runs this benchmark.
   */
  public static void main(String[] args) throws IOException {
    String inputPath = "random-pairs.seq";
    String outputPath = "random-pairs.sorted";
    int numMapTasks = 1;
    int numReduceTasks = 1;

    JobConf conf = new JobConf(HadoopSortRandomPairsOfInts.class);
    conf.setJobName("SortRandomPairsOfInts");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(PairOfInts.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime;
    double duration;

    startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    duration = (System.currentTimeMillis() - startTime) / 1000.0;

    System.out.println("Job took " + duration + " seconds");
  }
}
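The "With Optimization" column in the class Javadoc refers to registering a raw-byte WritableComparator for the sort key, which lets Hadoop compare serialized keys during the sort and shuffle without deserializing them into objects. Cloud9's PairOfInts already defines and registers its own comparator, so the class below is only a minimal sketch of what such a comparator can look like, assuming the key is serialized as two consecutive 4-byte ints (left element first, then right); the class name PairOfIntsRawComparator is hypothetical and not part of the library.

// Illustrative sketch only -- not Cloud9's actual implementation.
// Assumes PairOfInts is serialized as two consecutive 4-byte ints.
package edu.umd.cloud9.io.benchmark;

import org.apache.hadoop.io.WritableComparator;

import edu.umd.cloud9.io.pair.PairOfInts;

public class PairOfIntsRawComparator extends WritableComparator {

  public PairOfIntsRawComparator() {
    super(PairOfInts.class);
  }

  // Compare keys directly from the serialized byte buffers, avoiding object
  // creation and deserialization during the sort.
  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    int left1 = readInt(b1, s1);       // first int of key 1
    int left2 = readInt(b2, s2);       // first int of key 2
    if (left1 != left2) {
      return left1 < left2 ? -1 : 1;
    }
    int right1 = readInt(b1, s1 + 4);  // second int of key 1
    int right2 = readInt(b2, s2 + 4);  // second int of key 2
    return right1 == right2 ? 0 : (right1 < right2 ? -1 : 1);
  }

  static {
    // Registering the comparator tells Hadoop to use raw-byte comparison for
    // this key type. By convention this static block lives in the Writable
    // class itself (here, PairOfInts) so it runs when the key class is loaded.
    WritableComparator.define(PairOfInts.class, new PairOfIntsRawComparator());
  }
}

Note that the input SequenceFile, random-pairs.seq, is expected to have been created beforehand by running GenerateRandomPairsOfInts, as described in the class Javadoc.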