Java tutorial: MapReduce join with chombo's Joiner
/*
 * chombo: Hadoop Map Reduce utility
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.chombo.mr;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.chombo.util.SecondarySort;
import org.chombo.util.TextInt;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;

/**
 * Joins two sets of records on one or more key fields. The join is an inner join
 * by default; configuring a default value for either set turns it into a left or
 * right outer join.
 * @author pranab
 */
public class Joiner extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "Joiner MR";
        job.setJobName(jobName);

        job.setJarByClass(Joiner.class);

        //args[0]: comma separated input paths, args[1]: output path
        FileInputFormat.addInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        Utility.setConfiguration(job.getConfiguration());
        job.setMapperClass(Joiner.JoinerMapper.class);
        job.setReducerClass(Joiner.JoinerReducer.class);

        job.setMapOutputKeyClass(TextInt.class);
        job.setMapOutputValueClass(Tuple.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        //partition and group on the text part of the composite key only, so that
        //records from both sets with the same join key reach the same reduce call
        job.setGroupingComparatorClass(SecondarySort.TextIntIdPairGroupComprator.class);
        job.setPartitionerClass(SecondarySort.TextIntIdPairTuplePartitioner.class);

        int numReducer = job.getConfiguration().getInt("joi.num.reducer", -1);
        numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
        job.setNumReduceTasks(numReducer);

        int status = job.waitForCompletion(true) ? 0 : 1;
        return status;
    }
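    // How the join works (editor's sketch): the mapper tags every record with 0
    // (first set) or 1 (second set) in the integer part of the TextInt composite
    // key. The partitioner and grouping comparator configured above consider only
    // the text part, so both tags reach the same reduce call, with tag 0 sorted
    // ahead of tag 1. For illustration, assuming comma delimited input and
    // key.field.first=0, key.field.second=0 (values assumed, not from the source):
    //
    //   first-users.txt   : u1,alice      -> key (u1, 0), value (0, alice)
    //   second-orders.txt : u1,book,9.99  -> key (u1, 1), value (1, book, 9.99)
    //
    //   joined output     : u1,alice,book,9.99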
    /**
     * Emits (join key, set tag) as the composite key and the remaining fields,
     * prefixed with the tag, as the value.
     * @author pranab
     */
    public static class JoinerMapper extends Mapper<LongWritable, Text, TextInt, Tuple> {
        private TextInt outKey = new TextInt();
        private Tuple outVal = new Tuple();
        private int[] keyFieldFirst;
        private int[] keyFieldSecond;
        private String fieldDelimRegex;
        private String fieldDelimOut;
        private boolean isFirstTypeSplit;
        private boolean sortKeyFields;

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            fieldDelimRegex = context.getConfiguration().get("field.delim.regex", ",");
            fieldDelimOut = context.getConfiguration().get("field.delim", ",");

            //a record belongs to the first set if its file name starts with the configured prefix
            String firstTypePrefix = context.getConfiguration().get("first.type.prefix", "first");
            isFirstTypeSplit = ((FileSplit) context.getInputSplit()).getPath().getName()
                    .startsWith(firstTypePrefix);

            keyFieldFirst = Utility.intArrayFromString(context.getConfiguration().get("key.field.first"),
                    fieldDelimRegex);
            keyFieldSecond = Utility.intArrayFromString(context.getConfiguration().get("key.field.second"),
                    fieldDelimRegex);
            if (keyFieldFirst.length != keyFieldSecond.length) {
                throw new IllegalStateException("composite key sizes are not equal");
            }
            sortKeyFields = context.getConfiguration().getBoolean("sort.key.fields", false);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] items = value.toString().split(fieldDelimRegex);
            //key fields as key and remaining fields as value
            if (isFirstTypeSplit) {
                outKey.set(Utility.extractFields(items, keyFieldFirst, fieldDelimOut, sortKeyFields), 0);
                Utility.createTuple(items, keyFieldFirst, outVal);
                outVal.prepend("0");
                //context.getCounter("Join stats", "left set count").increment(1);
            } else {
                outKey.set(Utility.extractFields(items, keyFieldSecond, fieldDelimOut, sortKeyFields), 1);
                Utility.createTuple(items, keyFieldSecond, outVal);
                outVal.prepend("1");
                //context.getCounter("Join stats", "right set count").increment(1);
            }
            context.write(outKey, outVal);
        }
    }
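    // Reduce-side mechanics (editor's sketch): for each join key the reducer below
    // first buffers all tag-0 tuples in a list, then streams each tag-1 tuple
    // against that list, writing one output row per pair (a nested loop join per
    // key). If first.set.default.value or second.set.default.value is configured,
    // the default stands in for the missing side, turning the inner join into a
    // right or left outer join respectively.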
    /**
     * Buffers the first set for a join key and joins each second set record
     * against it.
     * @author pranab
     */
    public static class JoinerReducer extends Reducer<TextInt, Tuple, NullWritable, Text> {
        private Text outVal = new Text();
        private List<Tuple> firstTypeList = new ArrayList<Tuple>();
        private int[] keyFieldFirst;
        private int[] keyFieldSecond;
        private String fieldDelimRegex;
        private String fieldDelimOut;
        private Tuple secondType;
        private StringBuilder stBld = new StringBuilder();
        private boolean outputKeyAtBeg;
        private boolean outputFirstType;
        private boolean outputSecondType;
        private int secondSetCount;
        private Tuple firstTypeDefaultValue;
        private Tuple secondTypeDefaultValue;

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration config = context.getConfiguration();
            fieldDelimRegex = config.get("field.delim.regex", ",");
            fieldDelimOut = config.get("field.delim", ",");
            keyFieldFirst = Utility.intArrayFromString(config.get("key.field.first"), fieldDelimRegex);
            keyFieldSecond = Utility.intArrayFromString(config.get("key.field.second"), fieldDelimRegex);
            outputKeyAtBeg = config.getBoolean("output.key.at.begin", true);
            outputFirstType = config.getBoolean("output.first.type", true);
            outputSecondType = config.getBoolean("output.second.type", true);

            //optional default for the first set enables a right outer join
            String firstTypeDefaultValueSt = config.get("first.set.default.value");
            if (!StringUtils.isBlank(firstTypeDefaultValueSt)) {
                firstTypeDefaultValue = new Tuple();
                Utility.createTuple(firstTypeDefaultValueSt, firstTypeDefaultValue);
                firstTypeDefaultValue.prepend("0");
            }

            //optional default for the second set enables a left outer join
            String secondTypeDefaultValueSt = config.get("second.set.default.value");
            if (!StringUtils.isBlank(secondTypeDefaultValueSt)) {
                secondTypeDefaultValue = new Tuple();
                Utility.createTuple(secondTypeDefaultValueSt, secondTypeDefaultValue);
                secondTypeDefaultValue.prepend("1");
            }
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void reduce(TextInt key, Iterable<Tuple> values, Context context)
                throws IOException, InterruptedException {
            firstTypeList.clear();
            secondSetCount = 0;
            for (Tuple value : values) {
                if (value.startsWith("0")) {
                    //clone before buffering: Hadoop reuses the value instance across iterations
                    firstTypeList.add(value.createClone());
                } else {
                    secondType = value;
                    ++secondSetCount;
                    for (Tuple firstType : firstTypeList) {
                        setOutValue(key, firstType);
                        context.write(NullWritable.get(), outVal);
                    }
                    //if first set is empty, use default value if provided, basically right outer join
                    if (firstTypeList.isEmpty() && null != firstTypeDefaultValue) {
                        setOutValue(key, firstTypeDefaultValue);
                        context.write(NullWritable.get(), outVal);
                        //context.getCounter("Join stats", "Right outer join").increment(1);
                    }
                }
            }

            //if second set is empty, use default value if provided, basically left outer join
            if (secondSetCount == 0 && null != secondTypeDefaultValue) {
                secondType = secondTypeDefaultValue;
                for (Tuple firstType : firstTypeList) {
                    setOutValue(key, firstType);
                    context.write(NullWritable.get(), outVal);
                    //context.getCounter("Join stats", "Left outer join").increment(1);
                }
            }
        }

        /**
         * Builds one output record from the join key and a pair of records.
         * @param key join key
         * @param firstType record from the first set
         */
        private void setOutValue(TextInt key, Tuple firstType) {
            stBld.delete(0, stBld.length());
            if (outputKeyAtBeg) {
                stBld.append(key.getFirst()).append(fieldDelimOut);
            }
            if (outputFirstType) {
                //toString(1) skips the leading set tag element
                stBld.append(firstType.toString(1)).append(fieldDelimOut);
            }
            if (outputSecondType) {
                stBld.append(secondType.toString(1));
            }
            if (!outputKeyAtBeg) {
                if (outputSecondType) {
                    stBld.append(fieldDelimOut);
                }
                stBld.append(key.getFirst());
            }
            outVal.set(stBld.toString());
        }
    }

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Joiner(), args);
        System.exit(exitCode);
    }
}
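For completeness, here is a minimal sketch of a driver that launches the Joiner programmatically. The JoinerDriver class name, the property values, and the input and output paths are illustrative assumptions, not part of the chombo source; in practice the properties are usually supplied through the Hadoop configuration, but setting them in code keeps the example self-contained.

//JoinerDriver.java: hypothetical launcher, shown for illustration only
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.chombo.mr.Joiner;

public class JoinerDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //0-based index of the join key field in each set (comma separated for composite keys)
        conf.set("key.field.first", "0");
        conf.set("key.field.second", "0");
        //first-set records are read from files whose names start with this prefix
        conf.set("first.type.prefix", "first");
        //uncomment to turn the inner join into a left outer join
        //conf.set("second.set.default.value", "none");

        //first argument: comma separated input paths; second: output path (paths assumed)
        int status = ToolRunner.run(conf, new Joiner(),
                new String[] {"/data/join/in", "/data/join/out"});
        System.exit(status);
    }
}

Note that both input files can live in the same directory: the first.type.prefix file-name convention, not the path, decides which set a record belongs to.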