// Java tutorial
/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package jobimtext.thesaurus.distributional.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import jobimtext.thesaurus.distributional.hadoop.SumReducer.IntSumReducer;
import jobimtext.thesaurus.distributional.hadoop.util.HadoopUtil;
import jobimtext.thesaurus.distributional.hadoop.util.StringUtil;

/**
 * This MapReducer trims whitespaces from each word in the value. It is
 * applicable for the context input files in the format:
 *
 * Value: string \t string ..
* * Key: int * * @author Martin Riedl * */ public class CleanContext { /** * The mapper takes the feature and its word list (all the words having this * feature) as input It then compares all words pair-wise and outputs 1/n, * with n being the number of words. * */ @SuppressWarnings("deprecation") public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { int linesize; // split line into tokens // the first is the word feature, the rest are all // word having this feature String[] line = value.toString().split("\t"); String newLine = ""; linesize = line.length; for (int i = 0; i < linesize - 1; i++) { String l = line[i]; String ln = StringUtil.trim(l); int li = l.lastIndexOf("#"); String w = ln.substring(0, li); String p = ln.substring(li + 1); String wt = StringUtil.trim(w); if (wt.length() == 0) { System.out.println("Neglected: ^" + l + "$" + "\t" + "^" + ln + "$" + "\t" + line); return; } newLine += wt + "#" + p + "\t"; } output.collect(new Text(newLine.substring(0, newLine.length() - 1)), new IntWritable(Integer.parseInt(line[linesize - 1]))); } // map() } // class map @SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { JobConf conf = HadoopUtil.generateJobConf(args); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(IntSumReducer.class); conf.setReducerClass(IntSumReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); /* number of milliseconds before killing a not responding task */ conf.set("mapred.task.timeout", "600000"); /* change to 128mb */ conf.set("dfs.block.size", "134217728"); /* set the maximum 
number of task per node */ int maptasks = 100; /* * Number of map tasks to deploy on each machine. 0.5 to 2 * * (cores/node) */ conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks); conf.set("mapred.tasktracker.map", "" + maptasks); /* * The default number of map tasks per job. Typically set to a prime * several times greater than number of available hosts. */ conf.set("mapred.map.tasks", "" + maptasks); int reducetasks = 120; conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks); conf.set("mapred.tasktracker.reduce", "" + reducetasks); conf.set("mapred.reduce.tasks", "" + reducetasks); conf.set("mapred.job.map.memory.mb", "3000"); conf.set("mapred.job.reduce.memory.mb", "3000"); JobClient.runJob(conf); } // main } // class SimCounts