ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java Source code

Introduction

Here is the source code for ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java. The driver chains two MapReduce jobs: Job-1 groups smart-meter readings by meter ID into one comma-separated series per meter, and Job-2 pairs every meter with every other via a Cartesian product, computes the cosine similarity of each pair, and emits the most similar pairs in descending order.

Source

package ca.uwaterloo.iss4e.hadoop.pointperrow;

import ca.uwaterloo.iss4e.algorithm.Cosine;
import ca.uwaterloo.iss4e.hadoop.io.CartesianInputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.*;

/**
 * Copyright (c) 2014 Xiufeng Liu ( xiufeng.liu@uwaterloo.ca )
 * <p/>
 * This file is free software: you may copy, redistribute and/or modify it
 * under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation.
 * <p/>
 * This file is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * <p/>
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses.
 */

public class CosineMain extends Configured implements Tool {
    public static final String DESCRIPTION = "Cosine similarity program";

    private static final Log LOG = LogFactory.getLog(CosineMain.class);

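    /**
     * Job-1 mapper: parses each CSV input line and emits
     * (meterID, reading), taken from fields 0 and 2 of the line.
     */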
    public static class AggregateReadingsMapper
            extends Mapper<Object, Text, LongWritable, DoubleWritable> {
        @Override
        protected void map(Object obj, Text line, final Context context) throws IOException, InterruptedException {
            String[] values = line.toString().split(",");
            context.write(new LongWritable(Long.parseLong(values[0])),
                    new DoubleWritable(Double.parseDouble(values[2])));
        }
    }

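    /**
     * Job-1 reducer: concatenates all readings of one meter into a single
     * comma-separated line keyed by the meter ID, producing one series per
     * meter for the second job.
     */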
    public static class AggregateReadingsReducer extends Reducer<LongWritable, DoubleWritable, LongWritable, Text> {

        @Override
        protected void reduce(LongWritable meterID, Iterable<DoubleWritable> values, Context context)
                throws IOException, InterruptedException {
            // The trailing comma is harmless: String.split() in Job-2 drops
            // trailing empty strings.
            StringBuilder buf = new StringBuilder();
            for (DoubleWritable value : values) {
                buf.append(value.get()).append(",");
            }
            context.write(meterID, new Text(buf.toString()));
        }
    }

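    /**
     * Inverts the natural ascending order of the DoubleWritable similarity
     * keys so that Job-2 reducers see the highest similarities first.
     */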
    public static class DescendingKeyComparator extends WritableComparator {

        public DescendingKeyComparator() {
            super(DoubleWritable.class, true);
        }

        @Override
        @SuppressWarnings("unchecked")
        public int compare(WritableComparable a, WritableComparable b) {
            return -1 * a.compareTo(b); // negate to sort keys in descending order
        }
    }

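    /**
     * Job-2 mapper: receives one record pair from CartesianInputFormat, each
     * record formatted as "meterID<TAB>comma-separated readings". It skips
     * self-pairs, computes the cosine similarity of the two series, and emits
     * (similarity, "smallerID,largerID") so that the symmetric pairs (A, B)
     * and (B, A) produce the same output value.
     */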
    public static class CartesianProductMapper
            extends Mapper<Text, Text, DoubleWritable, Text> {
        //DecimalFormat df4 = new DecimalFormat("###.######");

        @Override
        protected void map(Text left, Text right, final Context context) throws IOException, InterruptedException {
            String[] leftFieldValues = left.toString().split("\t");
            String[] rightFieldValues = right.toString().split("\t");
            int leftMeterID = Integer.parseInt(leftFieldValues[0]);
            int rightMeterID = Integer.parseInt(rightFieldValues[0]);
            if (leftMeterID != rightMeterID) {
                String[] leftReadings = leftFieldValues[1].split(",");
                String[] rightReadings = rightFieldValues[1].split(",");
                //String similarity = df4.format(Cosine.cosine_similarity(leftReadings, rightReadings));
                double similarity = Cosine.cosine_similarity(leftReadings, rightReadings);
                Text value = new Text();
                if (leftMeterID < rightMeterID) {
                    value.set(leftMeterID + "," + rightMeterID);
                } else {
                    value.set(rightMeterID + "," + leftMeterID);
                }
                context.write(new DoubleWritable(similarity), value);
            }
        }
    }

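    /**
     * Job-2 reducer: because the keys arrive in descending order, the first
     * ten distinct pairs seen are this reducer's top-10 most similar pairs;
     * they are written out in cleanup() once all input has been consumed.
     */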
    public static class CartesianProductReducer extends Reducer<DoubleWritable, Text, Text, DoubleWritable> {

        // LinkedHashMap preserves insertion order, so the output stays in
        // descending similarity order.
        private final Map<String, Double> results = new LinkedHashMap<String, Double>();

        @Override
        protected void reduce(DoubleWritable similarity, Iterable<Text> meterIDs, Context context)
                throws IOException, InterruptedException {
            // Each unordered pair arrives twice (A x B and B x A), hence the
            // containsKey() check; only the first ten distinct pairs are kept.
            for (Text meterIDPair : meterIDs) {
                String pair = meterIDPair.toString();
                if (results.size() < 10 && !results.containsKey(pair)) {
                    results.put(pair, similarity.get());
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            for (Map.Entry<String, Double> entry : results.entrySet()) {
                context.write(new Text(entry.getKey()), new DoubleWritable(entry.getValue()));
            }
        }
    }

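    /**
     * Chains the two jobs: Job-1 aggregates readings per meter into a
     * temporary directory, Job-2 runs the all-pairs similarity computation
     * over that output, and the temporary directory is deleted afterwards.
     */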
    @Override
    public int run(String[] args) throws IOException {
        Configuration conf = getConf();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ConsineMain <input> <output>");
            return 2;
        }
        Job job1 = Job.getInstance(conf, "CosineMain");
        job1.setJarByClass(CosineMain.class);

        job1.setMapperClass(AggregateReadingsMapper.class);
        job1.setMapOutputKeyClass(LongWritable.class);
        job1.setMapOutputValueClass(DoubleWritable.class);

        job1.setReducerClass(AggregateReadingsReducer.class);
        job1.setOutputKeyClass(LongWritable.class);
        job1.setOutputValueClass(Text.class);
        FileInputFormat.setInputDirRecursive(job1, true);
        FileInputFormat.setInputPaths(job1, new Path(otherArgs[0]));
        // Stage Job-1's output in a "temp" directory beside the input directory.
        int lastIdx = otherArgs[0].lastIndexOf("/");
        String tempOutput = otherArgs[0].substring(0, lastIdx) + "/temp";
        FileOutputFormat.setOutputPath(job1, new Path(tempOutput));

        System.out.println("\nStarting Job-1 ...");
        final long startTime = System.currentTimeMillis();
        try {
            final long startTimeJob1 = System.currentTimeMillis();
            if (!job1.waitForCompletion(true)) {
                System.out.println("Job-1 failed.");
            } else {
                System.out.println("Duration of Job1 " + ((System.currentTimeMillis() - startTimeJob1) / 1000.0)
                        + " seconds.");
                final Job job2 = Job.getInstance(conf, "CosineMain cartesian product");
                job2.setJarByClass(CosineMain.class);
                job2.setInputFormatClass(CartesianInputFormat.class);
                CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, tempOutput);
                CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, tempOutput);
                FileOutputFormat.setOutputPath(job2, new Path(otherArgs[1]));

                job2.setMapperClass(CartesianProductMapper.class);
                job2.setMapOutputKeyClass(DoubleWritable.class);
                job2.setMapOutputValueClass(Text.class);

                job2.setSortComparatorClass(DescendingKeyComparator.class);

                job2.setReducerClass(CartesianProductReducer.class);
                job2.setOutputKeyClass(Text.class);
                job2.setOutputValueClass(DoubleWritable.class);

                job2.setNumReduceTasks(10); // each of the 10 reducers keeps its own top-10
                final long startTimeJob2 = System.currentTimeMillis();
                System.out.println("\nStarting Job-2 ...");
                if (!job2.waitForCompletion(true)) {
                    System.out.println("Job-2 failed.");
                } else {
                    System.out.println("Duration of Job2: "
                            + ((System.currentTimeMillis() - startTimeJob2) / 1000.0) + " seconds.");
                }

            }
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(tempOutput), true);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
            System.out.println("Total Duration: " + duration + " seconds.");
        }
        return 0;
    }

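    /**
     * Alternative driver that runs only the Cartesian-product job, for input
     * that has already been aggregated to one meter per line.
     */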
    public int run1(String[] args) throws IOException {
        if (args.length != 3) {
            System.err.println("Usage: java " + getClass().getName() + " <inputDir> <outDir> <ntasks>");
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Configuration conf = getConf();
        final Job job2 = Job.getInstance(conf, "CosineMain cartesian product");
        job2.setJarByClass(CosineMain.class);

        job2.setInputFormatClass(CartesianInputFormat.class);
        CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, args[0]);
        CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, args[0]);
        FileOutputFormat.setOutputPath(job2, new Path(args[1]));

        job2.setMapperClass(CartesianProductMapper.class);
        job2.setMapOutputKeyClass(DoubleWritable.class);
        job2.setMapOutputValueClass(Text.class);

        job2.setSortComparatorClass(DescendingKeyComparator.class);

        job2.setReducerClass(CartesianProductReducer.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(DoubleWritable.class);
        job2.setNumReduceTasks(Integer.parseInt(args[2]));

        System.out.println("\nStarting Job-2 ...");
        final long startTime = System.currentTimeMillis();
        try {
            if (!job2.waitForCompletion(true)) {
                System.out.println("Job-2 failed.");
                System.exit(1);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
            System.out.println("Duration is " + duration + " seconds.");
        }
        return 0;
    }

    public static void main(String[] argv) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new CosineMain(), argv));
    }
}
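
The Cosine.cosine_similarity helper imported from ca.uwaterloo.iss4e.algorithm is not shown on this page. As a rough guide to what Job-2 computes, here is a minimal sketch of such a helper, assuming it takes the two comma-split reading arrays as strings and returns dot(a, b) / (|a| * |b|); the actual class in the iss4e repository may differ.

package ca.uwaterloo.iss4e.algorithm;

/** Hypothetical sketch of the Cosine helper used by CosineMain. */
public class Cosine {
    public static double cosine_similarity(String[] a, String[] b) {
        // If the two series differ in length, compare the overlapping prefix.
        int n = Math.min(a.length, b.length);
        double dot = 0.0, normA = 0.0, normB = 0.0;
        for (int i = 0; i < n; i++) {
            double x = Double.parseDouble(a[i]);
            double y = Double.parseDouble(b[i]);
            dot += x * y;
            normA += x * x;
            normB += y * y;
        }
        if (normA == 0.0 || normB == 0.0) {
            return 0.0; // convention: a zero vector is similar to nothing
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }
}

Because CosineMain implements Tool, it is typically launched through ToolRunner, for example with hadoop jar <jarfile> ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain <input> <output>, where <input> and <output> are HDFS directories.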