// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; //import java.util.StringTokenizer; import java.util.regex.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.commons.lang.StringUtils; /** * This is an example Hadoop Map/Reduce application. * It reads the text input files, breaks each line into words * and counts them. The output is a locally sorted list of words and the * count of how often they occurred. 
* * To run: bin/hadoop jar build/hadoop-examples.jar wordcount * [-m <i>maps</i>] [-r <i>reduces</i>] <i>in-dir</i> <i>out-dir</i> */ public class NgramMatrixBuilder extends Configured implements Tool { /** * Counts the words in each line. * For each line of input, break the line into words and emit them as * (<b>word</b>, <b>1</b>). */ public static class MapClass extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { private Text ngram_key = new Text(); private IntWritable one = new IntWritable(1); private static int ngram_size = 3; public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); ngram_generators(key, line, output); } private void ngram_generators(LongWritable key, String l, OutputCollector<Text, IntWritable> output) throws IOException { Pattern pattern = Pattern.compile("(\\S.+?[.!?])(?=\\s+|$)"); Matcher matcher = pattern.matcher(l); String sentence; while (matcher.find()) { sentence = matcher.group(); switch (sentence.charAt(sentence.length() - 1)) { case '.': case '!': case '?': sentence = sentence.substring(0, sentence.length() - 1); break; default: break; } get_phrases(key, sentence, output); } } private void get_phrases(LongWritable key, String s, OutputCollector<Text, IntWritable> output) throws IOException { Pattern pattern = Pattern.compile( "\\b(a|all|an|am|and|are|as|at|be|but|by|can|do|for|from|had|has|have|he|his|how|if|in|is|it|its|it's|my|no|not|of|on|or|our|so|that|the|their|these|they|this|to|us|was|we|were|when|where|which|who|with|would|you|A|All|An|Am|And|Are|As|At|Be|But|By|Can|Do|For|From|Had|Has|Have|He|His|How|If|In|Is|It|Its|It's|My|No|Not|Of|On|Or|Our|So|That|The|Their|These|They|This|To|Us|Was|We|Were|When|Where|Which|Who|With|Would|You)\\b|[,]"); Matcher matcher = pattern.matcher(s); String phrase; int start = 0; while (matcher.find()) { phrase = s.substring(start, matcher.start()); if 
(StringUtils.trim(phrase).length() > 1) map_ngrams(key, StringUtils.trim(phrase), output); start = matcher.end(); } } private List<String> get_ngrams(int n, String str) { List<String> ngrams = new ArrayList<String>(); String[] words = str.split(" "); for (int i = 0; i < words.length - n + 1; i++) ngrams.add(concat(words, i, i + n)); return ngrams; } private String concat(String[] words, int start, int end) { StringBuilder sb = new StringBuilder(); for (int i = start; i < end; i++) sb.append((i > start ? " " : "") + words[i]); return sb.toString(); } private void map_ngrams(LongWritable key, String p, OutputCollector<Text, IntWritable> output) throws IOException { for (int n = 1; n <= ngram_size; n++) { for (String ngram : get_ngrams(n, p)) { if (StringUtils.trim(ngram).length() > 1) { ngram_key.set(key.toString() + "@" + StringUtils.trim(ngram)); output.collect(ngram_key, one); } } } } } /** * A reducer class that just emits the sum of the input values. */ public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { IntWritable value; int sum = 0; while (values.hasNext()) { sum += values.next().get(); } value = new IntWritable(sum); output.collect(key, value); } } static int printUsage() { System.out.println("wordcount [-m <maps>] [-r <reduces>] <input> <output>"); ToolRunner.printGenericCommandUsage(System.out); return -1; } /** * The main driver for word count map/reduce program. * Invoke this method to submit the map/reduce job. * @throws IOException When there is communication problems with the * job tracker. 
*/ public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), NgramMatrixBuilder.class); conf.setJobName("ngrammatrixbuilder"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MapClass.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2."); return printUsage(); } TextInputFormat.setInputPaths(conf, other_args.get(0)); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); JobClient.runJob(conf); return 0; } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new NgramMatrixBuilder(), args); System.exit(res); } }