com.neu.cs6240.TopKExperts.JoinQA.java Source code

Java tutorial

Introduction

Here is the source code for com.neu.cs6240.TopKExperts.JoinQA.java

Source

/**
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.neu.cs6240.TopKExperts;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.TimeZone;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import au.com.bytecode.opencsv.CSVParser;

public class JoinQA {

    // Zero-based column positions inside one parsed CSV record, as indexed by
    // JoinQAMapper. NOTE(review): the meaning of each column is inferred from
    // the constant names and their use in this file - confirm against the
    // actual input schema.

    // Id of the post; used as the join key for answer records.
    private static final int POST_ID = 0;
    // Post type; the mapper treats "1" as a question and "2" as an answer and
    // rejects every other value.
    private static final int POST_TYPE_ID = 1;
    // For question records: id of the accepted answer; used as the join key.
    private static final int ACCEPTED_ANS_ID = 2;
    // Creation timestamp of the post, expected to follow DATE_PATTERN.
    private static final int CREATION_DATE = 3;
    // For answer records: id of the user who wrote the answer.
    private static final int OWNER_USER_ID = 6;
    // For question records: the tags attached to the question.
    private static final int TAGS = 10;
    // Timestamp layout of the CREATION_DATE column (no time-zone designator).
    private static final String DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ss.SSS";
    // Parser for DATE_PATTERN timestamps. No time zone is set, so parsing uses
    // the JVM default zone. NOTE(review): SimpleDateFormat instances are not
    // safe for concurrent use - confirm this is never shared across threads.
    private static final SimpleDateFormat formatter = new SimpleDateFormat(DATE_PATTERN);

    public static class JoinQAMapper extends Mapper<Object, Text, JoinQAKey, JoinQAValue> {
        // initialize CSVParser as comma separated values
        private CSVParser csvParser = new CSVParser(',', '"');

        public void map(Object offset, Text line, Context context) throws IOException, InterruptedException {

            System.out.println("Line " + line.toString());

            // Parse the input line
            String[] parsedData = null;
            try {
                parsedData = this.csvParser.parseLine(line.toString());
            } catch (Exception e) {
                // In case of bad data record ignore them
                return;
            }

            JoinQAValue value = null;
            JoinQAKey key = null;

            if (!isValid(parsedData)) {
                return;
            }

            if (parsedData[POST_TYPE_ID].equals("1")) {
                // Question
                key = new JoinQAKey(parsedData[ACCEPTED_ANS_ID], "Q");
                value = new JoinQAValue("", parsedData[TAGS], parsedData[POST_ID], parsedData[CREATION_DATE]);
            } else {
                // Answer
                // User Id must exists otherwise ignore
                key = new JoinQAKey(parsedData[POST_ID], "A");
                value = new JoinQAValue(parsedData[OWNER_USER_ID], "", "", parsedData[CREATION_DATE]);

            }
            context.write(key, value);
        }

        /**
         * remove invalid entries
         * 
         * @param parsedData
         * @return
         */
        private boolean isValid(String[] parsedData) {
            // We must have POST_ID & POST_TYPE_ID & CREATION_DATE
            if (parsedData[POST_ID].isEmpty() || parsedData[POST_TYPE_ID].isEmpty()
                    || parsedData[CREATION_DATE].isEmpty()) {
                return false;
            }

            // POST_TYPE_ID must be either 1 / 2
            if (!parsedData[POST_TYPE_ID].equals("1") && !parsedData[POST_TYPE_ID].equals("2")) {
                return false;
            }

            // POST_TYPE_ID = 1 => we must have ACCEPTED_ANS_ID & TAGS
            if (parsedData[POST_TYPE_ID].equals("1")
                    && (parsedData[ACCEPTED_ANS_ID].isEmpty() || parsedData[TAGS].isEmpty())) {
                return false;
            }

            // POST_TYPE_ID = 2 => we must have POST_ID & OWNER_USER_ID
            if (parsedData[POST_TYPE_ID].equals("2") && (parsedData[OWNER_USER_ID].isEmpty())) {
                return false;
            }

            return true;
        }
    }

    public static class JoinQAReducer extends Reducer<JoinQAKey, JoinQAValue, Text, NullWritable> {

        public void reduce(JoinQAKey key, Iterable<JoinQAValue> values, Context context)
                throws IOException, InterruptedException {

            ArrayList<JoinQAValue> questions = new ArrayList<JoinQAValue>();

            for (JoinQAValue value : values) {
                // Get all questions those will be secondary sorted followed by
                // answers
                if (key.getFlag().toString().equals("Q")) {
                    questions.add(new JoinQAValue(value.getUserId().toString(), value.getHashTags().toString(),
                            value.getQuestionId().toString(), value.getCreationDate().toString()));
                } else {
                    Iterator<JoinQAValue> questionIterator = questions.iterator();
                    while (questionIterator.hasNext()) {
                        JoinQAValue question = questionIterator.next();
                        StringBuilder output = new StringBuilder();

                        int timeToAnsInMinutes = getDiffMinutes(question.getCreationDate().toString(),
                                value.getCreationDate().toString());

                        output.append(value.getUserId().toString()).append(",")
                                .append(question.getHashTags().toString()).append(",")
                                .append("" + timeToAnsInMinutes).append(",")
                                .append(question.getQuestionId().toString());
                        context.write(new Text(output.toString()), NullWritable.get());
                    }
                }
            }
        }

        private int getDiffMinutes(String questionDate, String ansDate) {
            int minutes = -1;
            try {
                Date qDate = formatter.parse(questionDate);
                Date aDate = formatter.parse(ansDate);

                long milliSeconds = aDate.getTime() - qDate.getTime();
                minutes = (int) (milliSeconds / (60 * 1000));

            } catch (ParseException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            return minutes;
        }
    }

    /**
     * Routes every record with the same post id to the same reducer, ignoring
     * the question/answer flag of the key.
     */
    public static class JoinQAPartitioner extends Partitioner<JoinQAKey, JoinQAValue> {
        /**
         * Based on the configured number of reducer, this will partition the
         * data approximately evenly based on number of unique post Ids.
         *
         * @param key           join key; only its post id is used
         * @param value         not used
         * @param numPartitions number of reduce partitions
         * @return a partition index in the range [0, numPartitions)
         */
        @Override
        public int getPartition(JoinQAKey key, JoinQAValue value, int numPartitions) {
            // multiply by 127 to perform some mixing
            int mixedHash = key.getPostId().hashCode() * 127;
            // Take the remainder BEFORE the absolute value: Math.abs of
            // Integer.MIN_VALUE is still negative, which would yield an
            // illegal negative partition. The remainder always has a magnitude
            // below numPartitions, so its absolute value is safe; for every
            // other hash the result equals Math.abs(mixedHash) % numPartitions.
            return Math.abs(mixedHash % numPartitions);
        }
    }

    /**
     * Grouping comparator for the reduce phase: two keys belong to the same
     * group when their post ids compare equal, whatever their flags are.
     */
    public static class JoinQAGroupComparator extends WritableComparator {
        protected JoinQAGroupComparator() {
            // Second argument asks the base class to create key instances
            super(JoinQAKey.class, true);
        }

        /**
         * Orders keys by post id only, so that there is a single reduce call
         * per post id.
         *
         * @param w1 first key, expected to be a JoinQAKey
         * @param w2 second key, expected to be a JoinQAKey
         * @return result of comparing the two post ids
         */
        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            final JoinQAKey left = (JoinQAKey) w1;
            final JoinQAKey right = (JoinQAKey) w2;
            final int byPostId = left.getPostId().compareTo(right.getPostId());
            return byPostId;
        }
    }

    /**
     * Runs the three chained MapReduce jobs: JoinQA, then
     * UserAnswerCountPerHashTag on its output, then TopKPerHashTag on that
     * output. Each later job starts only when the previous one succeeded.
     *
     * Output directories: {@code <out>} for JoinQA, {@code <out>MR2} for the
     * second job and {@code <out>MR2MR3} for the third.
     *
     * Exit codes: 0 when all jobs succeed, 1 when any job fails, 2 on wrong
     * usage.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>}
     * @throws Exception if job setup or execution raises an error
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: JoinQA <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf, "JoinQA");
        job.setJarByClass(JoinQA.class);
        job.setMapperClass(JoinQAMapper.class);
        job.setReducerClass(JoinQAReducer.class);
        // The mapper emits (JoinQAKey, JoinQAValue) while the reducer emits
        // (Text, NullWritable); declare both pairs so the job configuration
        // matches what is actually written.
        job.setMapOutputKeyClass(JoinQAKey.class);
        job.setMapOutputValueClass(JoinQAValue.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setPartitionerClass(JoinQAPartitioner.class);
        job.setGroupingComparatorClass(JoinQAGroupComparator.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        if (!job.waitForCompletion(true)) {
            // JoinQA failed: do not start the follow-up jobs
            System.out.println("MR - JoinQA failed.");
            System.exit(1);
        }

        // On successful completion of JoinQA start UserAnswerCountPerHashTag
        System.out.println("MR - JoinQA complete. Starting UserAnswerCountPerHashTag...");
        String[] argsForMR2 = new String[2];
        argsForMR2[0] = otherArgs[1];
        argsForMR2[1] = otherArgs[1] + "MR2";
        if (!UserAnswerCountPerHashTag.initUserAnswerCountPerHashTag(argsForMR2)) {
            // UserAnswerCountPerHashTag failed: do not start TopKPerHashTag
            System.out.println("MR - UserAnswerCountPerHashTag failed.");
            System.exit(1);
        }

        // On successful completion of UserAnswerCountPerHashTag start TopKPerHashTag
        System.out.println("MR - UserAnswerCountPerHashTag complete. Starting TopKPerHashTag...");
        String[] argsForMR3 = new String[2];
        argsForMR3[0] = argsForMR2[1];
        argsForMR3[1] = argsForMR2[1] + "MR3";
        if (!TopKPerHashTag.initTopKPerHashTag(argsForMR3)) {
            // TopKPerHashTag failed
            System.out.println("MR - TopKPerHashTag failed.");
            System.exit(1);
        }

        // Successfully completed all three jobs
        System.out.println("All MR - Successful.");
        System.exit(0);
    }
}