AllLab_Skeleton.Lab6.ReduceSideJoin.java Source code

Introduction

Here is the source code for AllLab_Skeleton.Lab6.ReduceSideJoin.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package AllLab_Skeleton.Lab6;

import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 *
 * @author bhavesh
 */
public class ReduceSideJoin {

    final static String otherThanQuote = " [^\"] ";
    final static String quotedString = String.format(" \" %s* \" ", otherThanQuote);
    final static String regex = String.format("(?x) " + // enable comments, ignore white spaces
            ",                         " + // match a comma
            "(?=                       " + // start positive look ahead
            "  (                       " + // start group 1
            "    %s*                   " + // match 'otherThanQuote'
                                           // zero or more times
            "    %s                    " + // match 'quotedString'
            "  )*                      " + // end group 1 and repeat it
                                           // zero or more times
            "  %s*                     " + // match 'otherThanQuote'
            "  $                       " + // match the end of the
                                           // string
            ")                         ", // stop positive look ahead
            otherThanQuote, quotedString, otherThanQuote);

    public static class UserJoinMapper extends Mapper<Object, Text, Text, Text> {

        private Text outkey = new Text();
        private Text outvalue = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Parse the input string into a nice map
            String[] separatedInput = value.toString().split(regex, -1);
            // String[] tagTokens = separatedInput[5].split(",");
            String userId = separatedInput[0];
            if (userId == null) {
                return;
            }
            // The foreign join key is the user ID
            outkey.set(userId);
            // Flag this record for the reducer and then output
            outvalue.set("A" + value.toString());
            context.write(outkey, outvalue);
        }
    }

    public static class CommentJoinMapper extends Mapper<Object, Text, Text, Text> {

        private Text outkey = new Text();
        private Text outvalue = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] separatedInput = value.toString().split(regex, -1);
            // String[] tagTokens = separatedInput[5].split(",");
            String userId = separatedInput[0];
            //System.out.println("B" + userId);
            if (userId == null) {
                return;
            }
            // The foreign join key is the user ID
            outkey.set(userId);
            // Flag this record for the reducer and then output
            outvalue.set("B" + value.toString());
            context.write(outkey, outvalue);
        }
    }

    public static class UserJoinReducer extends Reducer<Text, Text, Text, Text> {

        private static final Text EMPTY_TEXT = new Text("");
        private Text tmp = new Text();
        private ArrayList<Text> listA = new ArrayList<Text>();
        private ArrayList<Text> listB = new ArrayList<Text>();
        private String joinType = null;

        public void setup(Context context) {
            // Get the type of join from our configuration
            joinType = context.getConfiguration().get("join.type");
        }

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Clear our lists
            listA.clear();
            listB.clear();
            // iterate through all our values, binning each record based on what
            // it was tagged with. Make sure to remove the tag!
            while (values.iterator().hasNext()) {
                tmp = values.iterator().next();
                System.out.println(Character.toString((char) tmp.charAt(0)));
                if (Character.toString((char) tmp.charAt(0)).equals("A")) {

                    System.out.println("here4");
                    listA.add(new Text(tmp.toString().substring(1)));
                }
                if (Character.toString((char) tmp.charAt(0)).equals("B")) {
                    System.out.println("here5");
                    listB.add(new Text(tmp.toString().substring(1)));
                }
                System.out.println(tmp);
            }
            // Execute our join logic now that the lists are filled

            System.out.println(listB.size());
            executeJoinLogic(context);
        }

        private void executeJoinLogic(Context context) throws IOException, InterruptedException {

            if (joinType.equalsIgnoreCase("inner")) {
                // If both lists are not empty, join A with B
                //System.out.println("here3");
                if (!listA.isEmpty() && !listB.isEmpty()) {
                    System.out.println("here");
                    for (Text A : listA) {
                        //System.out.println("here1");
                        for (Text B : listB) {
                            //System.out.println("here2");
                            context.write(A, B);
                        }
                    }
                }
            } else if (joinType.equalsIgnoreCase("leftouter")) {
                // For each entry in A,
                for (Text A : listA) {
                    // If list B is not empty, join A and B
                    if (!listB.isEmpty()) {
                        for (Text B : listB) {
                            context.write(A, B);
                        }
                    } else {
                        // Else, output A by itself
                        context.write(A, EMPTY_TEXT);
                    }
                }
            } else if (joinType.equalsIgnoreCase("rightouter")) {
                // For each entry in B,
                for (Text B : listB) {
                    // If list A is not empty, join A and B
                    if (!listA.isEmpty()) {
                        for (Text A : listA) {
                            context.write(A, B);
                        }
                    } else {
                        // Else, output B by itself
                        context.write(EMPTY_TEXT, B);
                    }
                }
            } else if (joinType.equalsIgnoreCase("fullouter")) {
                // If list A is not empty
                if (!listA.isEmpty()) {
                    // For each entry in A
                    for (Text A : listA) {
                        // If list B is not empty, join A with B
                        if (!listB.isEmpty()) {
                            for (Text B : listB) {
                                context.write(A, B);
                            }
                        } else {
                            // Else, output A by itself
                            context.write(A, EMPTY_TEXT);
                        }
                    }
                } else {
                    // If list A is empty, just output B
                    for (Text B : listB) {
                        context.write(EMPTY_TEXT, B);
                    }
                }
            } else if (joinType.equalsIgnoreCase("anti")) {
                // If list A is empty and B is empty or vice versa
                if (listA.isEmpty() ^ listB.isEmpty()) {
                    // Iterate both A and B with null values
                    // The previous XOR check will make sure exactly one of
                    // these lists is empty and therefore the list will be
                    // skipped
                    for (Text A : listA) {
                        context.write(A, EMPTY_TEXT);
                    }
                    for (Text B : listB) {
                        context.write(EMPTY_TEXT, B);
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "ReduceSideJoin");
        job.setJarByClass(ReduceSideJoin.class);

        // Use MultipleInputs to set which input uses what mapper
        // This will keep parsing of each data set separate from a logical
        // standpoint
        // The first two elements of the args array are the two inputs
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, UserJoinMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, CommentJoinMapper.class);
        job.getConfiguration().set("join.type", "leftouter");
        //job.setNumReduceTasks(0);
        job.setReducerClass(UserJoinReducer.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(args[2]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.waitForCompletion(true);
    }

}