edu.umn.cs.spatialHadoop.operations.Skyline.java Source code

Java tutorial

Introduction

Here is the source code for edu.umn.cs.spatialHadoop.operations.Skyline.java

Source

/***********************************************************************
* Copyright (c) 2015 by Regents of the University of Minnesota.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Apache License, Version 2.0 which 
* accompanies this distribution and is available at
* http://www.opensource.org/licenses/apache2.0.php.
*
*************************************************************************/
package edu.umn.cs.spatialHadoop.operations;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.util.GenericOptionsParser;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.OperationsParams.Direction;
import edu.umn.cs.spatialHadoop.core.GridRecordWriter;
import edu.umn.cs.spatialHadoop.core.Point;
import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.ResultCollector;
import edu.umn.cs.spatialHadoop.core.Shape;
import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.indexing.GlobalIndex;
import edu.umn.cs.spatialHadoop.indexing.Partition;
import edu.umn.cs.spatialHadoop.mapred.BlockFilter;
import edu.umn.cs.spatialHadoop.mapred.DefaultBlockFilter;
import edu.umn.cs.spatialHadoop.mapred.ShapeInputFormat;
import edu.umn.cs.spatialHadoop.mapred.ShapeIterInputFormat;
import edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.ShapeIterator;
import edu.umn.cs.spatialHadoop.mapred.TextOutputFormat;
import edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3;
import edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3;
import edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3;
import edu.umn.cs.spatialHadoop.nasa.HDFRecordReader;
import edu.umn.cs.spatialHadoop.util.MemoryReporter;
import edu.umn.cs.spatialHadoop.util.Parallel;
import edu.umn.cs.spatialHadoop.util.Parallel.RunnableRange;

/**
 * Computes the skyline of a set of points
 * @author Ahmed Eldawy
 *
 */
public class Skyline {

    private static final Log LOG = LogFactory.getLog(Skyline.class);

    /**
     * Computes the skyline of a set of points using a divided and conquer
     * in-memory algorithm. The algorithm recursively splits the points into
     * half, computes the skyline of each half, and finally combines the two
     * skylines.
     * @param points
     * @param dir
     * @return
     */
    public static Point[] skylineInMemory(Point[] points, Direction dir) {
        Arrays.sort(points);
        return skylineRecursive(points, 0, points.length, dir);
    }

    /**
     * The recursive method of skyline
     * @param points
     * @param start
     * @param end
     * @param dir
     * @return
     */
    private static Point[] skylineRecursive(Point[] points, int start, int end, Direction dir) {
        if (end - start == 1) {
            // Return the one input point as the skyline
            return new Point[] { points[start] };
        }
        int mid = (start + end) / 2;
        // Find the skyline of each half
        Point[] skyline1 = skylineRecursive(points, start, mid, dir);
        Point[] skyline2 = skylineRecursive(points, mid, end, dir);
        // Merge the two skylines
        int cut_point = 0;
        while (cut_point < skyline1.length && !skylineDominate(skyline2[0], skyline1[cut_point], dir))
            cut_point++;
        Point[] result = new Point[cut_point + skyline2.length];
        System.arraycopy(skyline1, 0, result, 0, cut_point);
        System.arraycopy(skyline2, 0, result, cut_point, skyline2.length);
        return result;
    }

    /**
     * Returns true if p1 dominates p2 according to the given direction.
     * @param p1
     * @param p2
     * @param dir
     * @return
     */
    private static boolean skylineDominate(Point p1, Point p2, Direction dir) {
        switch (dir) {
        case MaxMax:
            return p1.x >= p2.x && p1.y >= p2.y;
        case MaxMin:
            return p1.x >= p2.x && p1.y <= p2.y;
        case MinMax:
            return p1.x <= p2.x && p1.y >= p2.y;
        case MinMin:
            return p1.x <= p2.x && p1.y <= p2.y;
        default:
            throw new RuntimeException("Unknown direction: " + dir);
        }
    }

    /**
     * Returns true if r1 dominates r2 in the given direction. r1 dominates r2 if
     * one point in r1 dominates all points in r2. There are two rules for
     * rectangle domination. We will describe them here in terms of MaxMax skyline
     * but they can be expanded to cover all other skylines.
     * 1- If the lowest corner of r1 dominates the highest corner of r2.
     * 2- If the semi-lowest corner of r1 dominates the highest corner of r2.
     * The lowest (highest) corner of a rectangle is the point inside this
     * rectangle with the minimum (maximum) coordinates in all dimensions.
     * The semi-lowest corner of a rectangle is a point inside the rectangle with
     * the minimum coordinates in all dimensions except for one dimension which
     * is the maximum coordinate.
     * 
     * Rule 2 is applied only if rectangles are compact which means they are
     * minimal bounding rectangles around contained points.
     * 
     * @param r1
     * @param r2
     * @param dir
     * @param compact - whether the input rectangles are compact or not
     * @return
     */
    public static boolean skylineDominate(Rectangle r1, Rectangle r2, Direction dir, boolean compact) {
        switch (dir) {
        case MaxMax:
            return compact ? (r1.x2 >= r2.x2 && r1.y1 >= r2.y2) || (r1.x1 >= r2.x2 && r1.y2 >= r2.y2)
                    : (r1.x1 >= r2.x2 && r1.y1 >= r2.y2);
        case MaxMin:
            return compact ? (r1.x2 >= r2.x2 && r1.y2 <= r2.y1) || (r1.x1 >= r2.x2 && r1.y1 <= r2.y1)
                    : (r1.x1 >= r2.x2 && r1.y2 <= r2.y1);
        case MinMax:
            return compact ? (r1.x2 <= r2.x1 && r1.y2 >= r2.y2) || (r1.x1 <= r2.x1 && r1.y2 >= r2.y2)
                    : (r1.x2 <= r2.x1 && r1.y1 >= r2.y2);
        case MinMin:
            return compact ? (r1.x2 <= r2.x1 && r1.y1 <= r2.y1) || (r1.x1 <= r2.x1 && r1.y2 <= r2.y1)
                    : (r1.x2 <= r2.x1 && r1.y2 <= r2.y1);
        default:
            throw new RuntimeException("Unknown direction: " + dir);
        }
    }

    /**
     * Computes the skyline of an input file using a single machine algorithm.
     * The output is written to the output file. If output file is null, the
     * output is just thrown away.
     * @param inFile
     * @param outFile
     * @param params
     * @throws IOException
     * @throws InterruptedException
     */
    public static void skylineLocal(Path inFile, Path outFile, final OperationsParams params)
            throws IOException, InterruptedException {
        if (params.getBoolean("mem", false))
            MemoryReporter.startReporting();
        // 1- Split the input path/file to get splits that can be processed
        // independently
        final SpatialInputFormat3<Rectangle, Point> inputFormat = new SpatialInputFormat3<Rectangle, Point>();
        Job job = Job.getInstance(params);
        SpatialInputFormat3.setInputPaths(job, inFile);
        final List<InputSplit> splits = inputFormat.getSplits(job);
        final Direction dir = params.getDirection("dir", Direction.MaxMax);

        // 2- Read all input points in memory
        LOG.info("Reading points from " + splits.size() + " splits");
        List<Point[]> allLists = Parallel.forEach(splits.size(), new RunnableRange<Point[]>() {
            @Override
            public Point[] run(int i1, int i2) {
                try {
                    List<Point> finalPoints = new ArrayList<Point>();
                    final int MaxSize = 100000;
                    Point[] points = new Point[MaxSize];
                    int size = 0;
                    for (int i = i1; i < i2; i++) {
                        org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits
                                .get(i);
                        final RecordReader<Rectangle, Iterable<Point>> reader = inputFormat
                                .createRecordReader(fsplit, null);
                        if (reader instanceof SpatialRecordReader3) {
                            ((SpatialRecordReader3) reader).initialize(fsplit, params);
                        } else if (reader instanceof RTreeRecordReader3) {
                            ((RTreeRecordReader3) reader).initialize(fsplit, params);
                        } else if (reader instanceof HDFRecordReader) {
                            ((HDFRecordReader) reader).initialize(fsplit, params);
                        } else {
                            throw new RuntimeException("Unknown record reader");
                        }
                        while (reader.nextKeyValue()) {
                            Iterable<Point> pts = reader.getCurrentValue();
                            for (Point p : pts) {
                                points[size++] = p.clone();
                                if (size >= points.length) {
                                    // Perform Skyline and write the result to finalPoints
                                    Point[] skylinePoints = skylineInMemory(points, dir);
                                    for (Point skylinePoint : skylinePoints)
                                        finalPoints.add(skylinePoint);
                                    size = 0; // reset
                                }
                            }
                        }
                        reader.close();
                    }
                    while (size-- > 0)
                        finalPoints.add(points[size]);
                    return finalPoints.toArray(new Point[finalPoints.size()]);
                } catch (IOException e) {
                    e.printStackTrace();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
                return null;
            }
        }, params.getInt("parallel", Runtime.getRuntime().availableProcessors()));

        int totalNumPoints = 0;
        for (Point[] list : allLists)
            totalNumPoints += list.length;

        LOG.info("Read " + totalNumPoints + " points and merging into one list");
        Point[] allPoints = new Point[totalNumPoints];
        int pointer = 0;

        for (Point[] list : allLists) {
            System.arraycopy(list, 0, allPoints, pointer, list.length);
            pointer += list.length;
        }
        allLists.clear(); // To the let the GC collect it

        Point[] skyline = skylineInMemory(allPoints, dir);

        if (outFile != null) {
            if (params.getBoolean("overwrite", false)) {
                FileSystem outFs = outFile.getFileSystem(new Configuration());
                outFs.delete(outFile, true);
            }
            GridRecordWriter<Point> out = new GridRecordWriter<Point>(outFile, null, null, null);
            for (Point pt : skyline) {
                out.write(NullWritable.get(), pt);
            }
            out.close(null);
        }
    }

    public static class SkylineFilter extends DefaultBlockFilter {

        private Direction dir;

        @Override
        public void configure(Configuration conf) {
            super.configure(conf);
            dir = OperationsParams.getDirection(conf, "dir", Direction.MaxMax);
        }

        @Override
        public void selectCells(GlobalIndex<Partition> gIndex, ResultCollector<Partition> output) {
            Vector<Partition> non_dominated_partitions = new Vector<Partition>();

            for (Partition p : gIndex) {
                boolean dominated = false;
                int i = 0;
                while (!dominated && i < non_dominated_partitions.size()) {
                    Partition p2 = non_dominated_partitions.get(i);
                    dominated = skylineDominate(p2, p, dir, gIndex.isCompact());

                    // Check if the new partition dominates the previously selected one
                    if (skylineDominate(p, p2, dir, gIndex.isCompact())) {
                        // p2 is no longer non-dominated
                        non_dominated_partitions.remove(i);
                    } else {
                        // Skip to next non-dominated partition
                        i++;
                    }
                }
                if (!dominated) {
                    non_dominated_partitions.add(p);
                }
            }

            LOG.info("Processing " + non_dominated_partitions.size() + " out of " + gIndex.size() + " partition");
            System.out.println(
                    "Processing " + non_dominated_partitions.size() + " out of " + gIndex.size() + " partition");
            // Output all non-dominated partitions
            for (Partition p : non_dominated_partitions) {
                output.collect(p);
            }
        }
    }

    /**
     * An identity map function that returns values as-is with a null key. This
     * ensures that all values are reduced in one reducer.
     * @author Ahmed Eldawy
     */
    public static class IdentityMapper extends MapReduceBase
            implements Mapper<Rectangle, ShapeIterator, NullWritable, Point> {
        @Override
        public void map(Rectangle dummy, ShapeIterator points, OutputCollector<NullWritable, Point> output,
                Reporter reporter) throws IOException {
            for (Shape point : points) {
                output.collect(NullWritable.get(), (Point) point);
            }
        }
    }

    public static class SkylineReducer extends MapReduceBase
            implements Reducer<NullWritable, Point, NullWritable, Point> {

        private Direction dir;

        @Override
        public void configure(JobConf job) {
            super.configure(job);
            dir = OperationsParams.getDirection(job, "dir", Direction.MaxMax);
        }

        @Override
        public void reduce(NullWritable dummy, Iterator<Point> points, OutputCollector<NullWritable, Point> output,
                Reporter reporter) throws IOException {
            Vector<Point> vpoints = new Vector<Point>();
            while (points.hasNext()) {
                vpoints.add(points.next().clone());
            }
            Point[] skyline = skylineInMemory(vpoints.toArray(new Point[vpoints.size()]), dir);
            for (Point pt : skyline) {
                output.collect(dummy, pt);
            }
        }
    }

    private static void skylineMapReduce(Path inFile, Path userOutPath, OperationsParams params)
            throws IOException {
        JobConf job = new JobConf(params, Skyline.class);
        Path outPath = userOutPath;
        FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
        Shape shape = params.getShape("shape");

        if (outPath == null) {
            do {
                outPath = new Path(inFile.toUri().getPath() + ".skyline_" + (int) (Math.random() * 1000000));
            } while (outFs.exists(outPath));
        }

        job.setJobName("Skyline");
        job.setClass(SpatialSite.FilterClass, SkylineFilter.class, BlockFilter.class);
        job.setMapperClass(IdentityMapper.class);
        job.setCombinerClass(SkylineReducer.class);
        job.setReducerClass(SkylineReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(shape.getClass());
        job.setInputFormat(ShapeIterInputFormat.class);
        ShapeInputFormat.addInputPath(job, inFile);
        job.setOutputFormat(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outPath);

        JobClient.runJob(job);

        // If outputPath not set by user, automatically delete it
        if (userOutPath == null)
            outFs.delete(outPath, true);
    }

    public static void skyline(Path inFile, Path outFile, OperationsParams params)
            throws IOException, InterruptedException {
        if (OperationsParams.isLocal(params, inFile)) {
            // Process without MapReduce
            skylineLocal(inFile, outFile, params);
        } else {
            // Process with MapReduce
            skylineMapReduce(inFile, outFile, params);
        }
    }

    private static void printUsage() {
        System.err.println("Computes the skyline of an input file of points");
        System.err.println("Parameters: (* marks required parameters)");
        System.err.println("<input file>: (*) Path to input file");
        System.err.println("<output file>: Path to output file");
        System.err.println(
                "<direction (max-max|max-min|min-max|min-min)>: Direction of skyline (default is max-max)");
        System.err.println("-overwrite: Overwrite output file without notice");
        GenericOptionsParser.printGenericCommandUsage(System.err);
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args));
        Path[] paths = params.getPaths();
        if (paths.length <= 1 && !params.checkInput()) {
            printUsage();
            System.exit(1);
        }
        if (paths.length >= 2 && !params.checkInputOutput()) {
            printUsage();
            System.exit(1);
        }
        Path inFile = params.getInputPath();
        Path outFile = params.getOutputPath();

        long t1 = System.currentTimeMillis();
        skyline(inFile, outFile, params);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time: " + (t2 - t1) + " millis");
    }

}