edu.umn.cs.sthadoop.operations.STJoin.java Source code

Java tutorial

Introduction

Here is the source code for edu.umn.cs.sthadoop.operations.STJoin.java

Source

/***********************************************************************
* Copyright (c) 2015 by Regents of the University of Minnesota.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Apache License, Version 2.0 which 
* accompanies this distribution and is available at
* http://www.opensource.org/licenses/apache2.0.php.
*
*************************************************************************/
package edu.umn.cs.sthadoop.operations;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.sthadoop.core.STPoint;

/**
 * Implementation of the spatio-temporal join: takes two datasets and joins
 * them based on spatial and temporal predicates. For example, join birds and
 * humans in area A during time interval T.
 * 
 * @author Louai Alarabi
 *
 */
public class STJoin {

    /** Class logger */
    private static final Log LOG = LogFactory.getLog(STJoin.class);

    /**
     * Refinement step of the spatio-temporal join, run as a mapper.
     *
     * <p>Each input line is split on tabs. The first token is parsed as a
     * numeric id that becomes the output key, and every token that
     * {@link STPoint} can parse is collected as a point. Pairs of points from the
     * same line that satisfy both the spatial and the temporal threshold are
     * emitted as {@code id -> point1 <TAB> point2}.
     *
     * <p>Thresholds come from the job configuration: {@code timedistance}
     * (formatted {@code <amount>,<unit>} with unit {@code day}, {@code hour},
     * {@code minute} or {@code second}) and {@code spacedistance} (an integer).
     */
    static class STJoinMap extends MapReduceBase implements Mapper<LongWritable, Text, LongWritable, Text> {
        /** Timestamp layout used when parsing the {@code time} field of a point. */
        private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

        // NOTE(review): presumably these convert the configured distance (called
        // "miledistance" below) into the units returned by STPoint.distanceTo -
        // confirm where the two constants come from.
        private static final double DISTANCE_FACTOR = 0.01167734911823545;
        private static final double DISTANCE_DIVISOR = 0.81;

        /** Output key, reused across records; holds the id of the current line. */
        LongWritable id = new LongWritable();
        /** Spatial threshold compared against {@code STPoint.distanceTo}. */
        double distance = 0.0;
        /** Unit of the temporal threshold: day, hour, minute or second. */
        String timeresolution = "";
        /** Amount of the temporal threshold, expressed in {@link #timeresolution} units. */
        int interval = 0;

        /**
         * Reads the temporal and spatial thresholds from the job configuration.
         *
         * @param job job configuration carrying {@code timedistance} and
         *            {@code spacedistance}
         * @throws IllegalArgumentException if either key is missing or
         *             {@code timedistance} is not {@code <amount>,<unit>}
         * @throws NumberFormatException if an amount is not an integer
         */
        @Override
        public void configure(JobConf job) {
            super.configure(job);
            String timeDistance = job.get("timedistance");
            String spaceDistance = job.get("spacedistance");
            if (timeDistance == null || spaceDistance == null) {
                throw new IllegalArgumentException(
                        "STJoin requires both 'timedistance' and 'spacedistance' in the job configuration");
            }
            String[] parts = timeDistance.split(",");
            if (parts.length < 2) {
                throw new IllegalArgumentException(
                        "'timedistance' must look like <amount>,<unit> but was: " + timeDistance);
            }
            this.interval = Integer.parseInt(parts[0].trim());
            this.timeresolution = parts[1].trim();
            int miledistance = Integer.parseInt(spaceDistance.trim());
            this.distance = (DISTANCE_FACTOR * miledistance) / DISTANCE_DIVISOR;
        }

        /**
         * Emits every pair of points on the line that is within both thresholds.
         *
         * @param key      input key supplied by the input format (not used)
         * @param value    one tab-separated line: an id followed by points
         * @param output   receives {@code id -> point1 <TAB> point2} records
         * @param reporter progress reporter (not used)
         * @throws IOException if the collector fails to write a record
         * @throws NumberFormatException if the first token is not a long
         */
        @Override
        public void map(LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter)
                throws IOException {
            if (value == null) {
                return;
            }
            String[] points = value.toString().split("\t");
            id.set(Long.parseLong(points[0]));

            ArrayList<STPoint> list = new ArrayList<STPoint>();
            for (String p : points) {
                try {
                    list.add(new STPoint(p));
                } catch (Exception ignored) {
                    // Deliberate best effort: every token is offered to STPoint,
                    // including the leading id token, and tokens it rejects are
                    // simply not part of the join.
                }
            }
            Collections.sort(list);

            // NOTE(review): the inner scan stops at the first later point that is
            // farther away than the spatial threshold. That is only complete if
            // STPoint's natural ordering guarantees all following points are at
            // least as far - confirm against STPoint.compareTo.
            Text joined = new Text();
            for (int i = 0; i < list.size(); i++) {
                STPoint left = list.get(i);
                for (int j = i + 1; j < list.size() && left.distanceTo(list.get(j)) <= distance; j++) {
                    STPoint right = list.get(j);
                    if (getTimeDistance(left.time, right.time, timeresolution, interval)) {
                        joined.set(left.toText(new Text()).toString() + "\t"
                                + right.toText(new Text()).toString());
                        output.collect(id, joined);
                    }
                }
            }
        }

        /**
         * Tells whether two timestamps are at most {@code interval} units apart.
         *
         * @param time1    first timestamp, formatted {@code yyyy-MM-dd HH:mm:ss}
         * @param time2    second timestamp, same format
         * @param flag     unit of the threshold: day, hour, minute or second
         * @param interval threshold amount in {@code flag} units
         * @return {@code true} if the absolute difference between the timestamps
         *         does not exceed the threshold; {@code false} otherwise, and also
         *         for an unknown unit or an unparseable timestamp
         */
        private boolean getTimeDistance(String time1, String time2, String flag, int interval) {
            final long unitMillis;
            if ("day".equals(flag)) {
                unitMillis = 24L * 60 * 60 * 1000;
            } else if ("hour".equals(flag)) {
                unitMillis = 60L * 60 * 1000;
            } else if ("minute".equals(flag)) {
                unitMillis = 60L * 1000;
            } else if ("second".equals(flag)) {
                unitMillis = 1000L;
            } else {
                return false;
            }
            if (time1 == null || time2 == null) {
                return false;
            }

            // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
            SimpleDateFormat format = new SimpleDateFormat(TIMESTAMP_PATTERN);
            try {
                // Use the full elapsed time (not the hour-of-day or minute-of-hour
                // remainder) and its absolute value, so the result does not depend
                // on which of the two points comes first.
                long diff = Math.abs(format.parse(time2).getTime() - format.parse(time1).getTime());
                return diff <= interval * unitMillis;
            } catch (ParseException e) {
                // A point without a usable timestamp cannot satisfy the temporal predicate.
                System.err.println("STJoin: skipping pair with unparseable timestamp: " + e.getMessage());
                return false;
            }
        }

    }

    //   static class STJoinReduce extends MapReduceBase implements 
    //   Reducer<LongWritable, Text, LongWritable, Text> {      
    //      
    //
    //      @Override
    //      public void reduce(final LongWritable cellId, Iterator<Text> values, 
    //            final OutputCollector<LongWritable,Text> output,Reporter reporter) throws IOException {
    // 
    //         while(values.hasNext()){
    //            output.collect(cellId, values.next());
    //         }
    //         
    //         
    //      }
    //      
    //      
    //   }

    /**
     * Runs the join refinement as a map-only job using {@link STJoinMap}.
     *
     * <p>Any existing {@code outputPath} is deleted before the job starts, and
     * {@code inputPath} is deleted once the job has finished, so the input is
     * treated as a temporary intermediate result.
     *
     * @param inputPath  directory holding the lines to refine; read recursively
     *                   and removed after the job completes
     * @param outputPath directory that receives the joined pairs
     * @param params     operation parameters; {@code timedistance} and
     *                   {@code spacedistance} are forwarded to the mappers
     * @return always {@code 0}
     * @throws IOException if a filesystem operation or the job itself fails
     * @throws Exception declared for backward compatibility with existing callers
     * @throws InterruptedException declared for backward compatibility with existing callers
     */
    private static long stJoin(Path inputPath, Path outputPath, OperationsParams params)
            throws IOException, Exception, InterruptedException {

        JobConf conf = new JobConf(new Configuration(), STJoin.class);
        conf.setJobName("STJoin");

        // Forward the join thresholds to the mappers.
        conf.set("timedistance", params.get("timedistance"));
        conf.set("spacedistance", params.get("spacedistance"));

        // Map-only job: the mapper output is the final output.
        conf.setMapperClass(STJoinMap.class);
        conf.setNumReduceTasks(0);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, outputPath);

        // Clear stale output so the job does not fail on an existing directory.
        FileSystem outputFs = outputPath.getFileSystem(conf);
        outputFs.delete(outputPath, true);

        // runJob blocks until the job has finished and throws IOException if it
        // failed, so no additional wait is needed.
        JobClient.runJob(conf);

        // The input is an intermediate result; remove it once it has been consumed.
        FileSystem inputFs = inputPath.getFileSystem(conf);
        inputFs.delete(inputPath, true);
        return 0;
    }

    /**
     * Prints the command-line usage of the spatio-temporal join to standard
     * output, followed by Hadoop's generic option help.
     */
    private static void printUsage() {
        System.out.println("Runs a spatio-temporal join on indexed data");
        System.out.println("Parameters: (* marks required parameters)");
        System.out.println("<input files> - (*) Paths to the input files (at least two)");
        System.out.println("<output file> - (*) Path to output file");
        System.out.println("shape:<STPoint> - (*) Type of shapes stored in input file");
        System.out.println("rect:<x1,y1,x2,y2> - Spatial query range");
        System.out.println("interval:<date1,date2> - (*) Temporal query range. " + "Format of each date is yyyy-MM-dd");
        System.out.println("timeDistance:[1,day - 1,hour - 30,minute - 120,second] - (*) Temporal distance threshold");
        System.out.println("spaceDistance:integer - (*) Spatial distance threshold");
        System.out.println("-overwrite - Overwrite output file without notice");
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    /**
     * Shifts a calendar date by a number of days.
     *
     * @param date     date formatted as {@code yyyy-MM-dd}
     * @param interval number of days to add; a negative value moves backwards
     * @return the shifted date, formatted as {@code yyyy-MM-dd}
     * @throws ParseException if {@code date} cannot be parsed
     */
    private static String addtimeSpaceToInterval(String date, int interval) throws ParseException {
        final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        final Date parsed = dayFormat.parse(date);

        final Calendar shifted = Calendar.getInstance();
        shifted.setTime(parsed);
        shifted.add(Calendar.DATE, interval);

        return dayFormat.format(shifted.getTime());
    }

    /**
     * Command-line entry point of the spatio-temporal join.
     *
     * <p>The join runs in three steps: a spatio-temporal range query per input
     * ({@code STRangeQuery}), a hashing step over the resulting candidates
     * ({@code STHash}), and the refinement job. Intermediate results are written
     * to {@code candidatebuckets} and {@code hashedbucket} directories next to
     * the output path.
     *
     * <p>Example arguments:
     * <pre>
     * /home/louai/nyc-taxi/yellowIndex /home/louai/nyc-taxi/humanIndex
     * /home/louai/nyc-taxi/resultSTJoin shape:edu.umn.cs.sthadoop.core.STPoint
     * rect:-74.98451232910156,35.04014587402344,-73.97936248779295,41.49399566650391
     * interval:2015-01-01,2015-01-02 timeDistance:1,day spaceDistance:2
     * -overwrite -no-local
     * </pre>
     *
     * @param args input paths, output path and operation parameters; see
     *             {@link #printUsage()}
     * @throws Exception if parsing the parameters or any of the steps fails
     */
    public static void main(String[] args) throws Exception {
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args));
        Path[] allFiles = params.getPaths();
        if (allFiles.length < 2) {
            exitWithUsage("This operation requires at least two input files");
        }
        if (allFiles.length == 2) {
            // Two paths are both inputs, which leaves nowhere to write results.
            exitWithUsage("An output path is required in addition to the input files");
        }
        if (!params.checkInputOutput()) {
            exitWithUsage(null);
        }
        if (params.get("timedistance") == null) {
            exitWithUsage("time distance is missing");
        }
        if (params.get("spacedistance") == null) {
            exitWithUsage("space distance is missing");
        }
        if (params.get("interval") == null) {
            exitWithUsage("interval is missing");
        }

        Path[] inputPaths = params.getInputPaths();
        Path outputPath = params.getOutputPath();
        if (outputPath == null) {
            exitWithUsage("output path is missing");
        }
        Path workDir = outputPath.getParent();
        if (workDir == null) {
            exitWithUsage("output path must have a parent directory");
        }

        // Widen the temporal query range so that points just outside it can
        // still be matched with points inside it.
        // NOTE(review): only the numeric part of timedistance is used and it is
        // always applied as whole days, whatever the unit - confirm that this
        // over-approximation is intended.
        String[] timeDistance = params.get("timedistance").split(",");
        String[] date = params.get("interval").split(",");
        if (date.length < 2) {
            exitWithUsage("interval must contain two dates separated by a comma");
        }
        int interval = Integer.parseInt(timeDistance[0]);
        String start = addtimeSpaceToInterval(date[0], -interval);
        String end = addtimeSpaceToInterval(date[1], interval);
        params.set("interval", start + "," + end);

        // Intermediate directories live next to the output path.
        Path candidateBuckets = new Path(workDir, "candidatebuckets");
        Path hashedbucket = new Path(workDir, "hashedbucket");

        // Step 1: collect the candidates of every input with a range query.
        // Inputs are told apart by their last path component only.
        for (Path input : inputPaths) {
            Path candidate = new Path(candidateBuckets, input.getName());
            STRangeQuery.main(buildStepArgs(input, candidate, params));
            System.out.println("done with the STQuery from: " + input.toString() + "\n" + "candidate:" + candidate);
        }

        long t1 = System.currentTimeMillis();
        // Step 2: hash the candidates.
        STHash.main(buildStepArgs(candidateBuckets, hashedbucket, params));
        // Step 3: refine the hashed candidates into the final join result.
        stJoin(hashedbucket, outputPath, params);
        long t2 = System.currentTimeMillis();
        System.out.println("Total join time: " + (t2 - t1) + " millis");
    }

    /** Prints an optional error message and the usage text, then exits with status 1. */
    private static void exitWithUsage(String message) {
        if (message != null) {
            System.err.println(message);
        }
        printUsage();
        System.exit(1);
    }

    /** Builds, and echoes to standard output, the argument list shared by the query and hash steps. */
    private static String[] buildStepArgs(Path input, Path output, OperationsParams params) {
        String[] stepArgs = new String[] {
            input.toString(),
            output.toString(),
            "shape:" + params.get("shape"),
            "rect:" + params.get("rect"),
            "interval:" + params.get("interval"),
            "-overwrite",
            "-no-local"
        };
        for (String arg : stepArgs) {
            System.out.println(arg);
        }
        return stepArgs;
    }

}