edu.umn.cs.spatialHadoop.operations.Disjoint.java Source code

Java tutorial

Introduction

Here is the source code for edu.umn.cs.spatialHadoop.operations.Disjoint.java, a SpatialHadoop MapReduce operation that reports every pair of shapes from two input files whose geometries are disjoint.

Source

/***********************************************************************
* Copyright (c) 2015 by Regents of the University of Minnesota.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Apache License, Version 2.0 which 
* accompanies this distribution and is available at
* http://www.opensource.org/licenses/apache2.0.php.
*
*************************************************************************/
package edu.umn.cs.spatialHadoop.operations;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.Task;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.core.CellInfo;
import edu.umn.cs.spatialHadoop.core.GridInfo;
import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.ResultCollector2;
import edu.umn.cs.spatialHadoop.core.Shape;
import edu.umn.cs.spatialHadoop.core.SpatialAlgorithms;
import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.mapred.ShapeLineInputFormat;
import edu.umn.cs.spatialHadoop.mapred.TextOutputFormat;
import edu.umn.cs.spatialHadoop.util.FileUtil;
import edu.umn.cs.spatialHadoop.util.Progressable;
import edu.umn.cs.spatialHadoop.TigerShape;
import edu.umn.cs.spatialHadoop.core.OGCJTSShape;

/**
 * An implementation of Spatial Join MapReduce as described in
 * S. Zhang, J. Han, Z. Liu, K. Wang, and Z. Xu. SJMR:
 * Parallelizing spatial join with MapReduce on clusters. In
 * CLUSTER, pages 1-8, New Orleans, LA, Aug. 2009.
 * In this variant, the map function applies the reference-point
 * technique for duplicate avoidance and sends all candidate records
 * to a single reduce task, which runs a nested-loop disjoint test
 * between the two input datasets.
 * @author eldawy
 *
 */
public class Disjoint {

    /**Class logger*/
    private static final Log LOG = LogFactory.getLog(Disjoint.class);
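
    // Configuration keys that carry the partition grid, the inactive and
    // filter-only flags, and the per-batch joining threshold to the tasks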
    private static final String PartitionGrid = "SJMR.PartitionGrid";
    public static final String PartitioningFactor = "partition-grid-factor";
    private static final String InactiveMode = "SJMR.InactiveMode";
    private static final String isFilterOnlyMode = "DJ.FilterOnlyMode";
    private static final String JoiningThresholdPerOnce = "DJ.JoiningThresholdPerOnce";
    public static boolean isReduceInactive = false;
    public static boolean isSpatialJoinOutputRequired = true;
    public static boolean isFilterOnly = false;
    public static int joiningThresholdPerOnce = 50000;

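    /**
     * A Writable that tags a record's text with the index of the input file
     * it came from, so the reducer can separate the two datasets.
     */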
    public static class IndexedText implements Writable {
        public byte index;
        public Text text;

        IndexedText() {
            text = new Text();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeByte(index);
            text.write(out);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            index = in.readByte();
            text.readFields(in);
        }
    }

    /**
     * The map class maps each object to all cells it overlaps with.
     * @author Ahmed Eldawy
     *
     */
    public static class DisjointMap extends MapReduceBase
            implements Mapper<Rectangle, Text, IntWritable, IndexedText> {
        private Shape shape;
        private IndexedText outputValue = new IndexedText();
        private GridInfo gridInfo;
        private IntWritable cellId = new IntWritable();
        private Path[] inputFiles;
        private InputSplit currentSplit;

        @Override
        public void configure(JobConf job) {
            super.configure(job);
            // Retrieve grid to use for partitioning
            gridInfo = (GridInfo) OperationsParams.getShape(job, PartitionGrid);
            // Create a stock shape for deserializing lines
            shape = SpatialSite.createStockShape(job);
            // Get input paths to determine file index for every record
            inputFiles = FileInputFormat.getInputPaths(job);
        }

        @Override
        public void map(Rectangle cellMbr, Text value, OutputCollector<IntWritable, IndexedText> output,
                Reporter reporter) throws IOException {
            if (reporter.getInputSplit() != currentSplit) {
                FileSplit fsplit = (FileSplit) reporter.getInputSplit();
                for (int i = 0; i < inputFiles.length; i++) {
                    if (fsplit.getPath().toString().startsWith(inputFiles[i].toString())) {
                        outputValue.index = (byte) i;
                    }
                }
                currentSplit = reporter.getInputSplit();
            }

            // Parse the shape from a copy of the line; the original text is kept
            // as the output value
            Text tempText = new Text(value);
            outputValue.text = value;
            shape.fromText(tempText);
            Rectangle shape_mbr = shape.getMBR();
            // Check the MBR before using it to avoid a NullPointerException
            if (shape_mbr == null)
                return;

            // Reference point technique to avoid processing the same record twice
            if (!cellMbr.isValid() || cellMbr.contains(shape_mbr.x1, shape_mbr.y1)) {
                // Map all records to the same reduce task, which performs an
                // O(N^2) disjoint test
                output.collect(new IntWritable(1), outputValue);
            }
        }
    }

    public static class DisjointReduce<S extends Shape> extends MapReduceBase
            implements Reducer<IntWritable, IndexedText, S, S> {
        /**Class logger*/
        private static final Log reduceLog = LogFactory.getLog(DisjointReduce.class);

        /**Number of files in the input*/
        private int inputFileCount;

        /**List of cells used by the reducer*/
        private GridInfo grid;
        private boolean inactiveMode;
        private boolean isFilterOnly;
        private int shapesThresholdPerOnce;

        private S shape;

        @Override
        public void configure(JobConf job) {
            super.configure(job);
            grid = (GridInfo) OperationsParams.getShape(job, PartitionGrid);
            shape = (S) SpatialSite.createStockShape(job);
            inputFileCount = FileInputFormat.getInputPaths(job).length;
            inactiveMode = OperationsParams.getInactiveModeFlag(job, InactiveMode);
            isFilterOnly = OperationsParams.getFilterOnlyModeFlag(job, isFilterOnlyMode);
            shapesThresholdPerOnce = OperationsParams.getJoiningThresholdPerOnce(job, JoiningThresholdPerOnce);
            reduceLog.info("configured the reduce task");
        }

        @Override
        public void reduce(IntWritable cellId, Iterator<IndexedText> values, final OutputCollector<S, S> output,
                Reporter reporter) throws IOException {
            if (!inactiveMode) {
                LOG.info("Start reduce() logic now !!!");
                long t1 = System.currentTimeMillis();

                // The cell MBR is available for duplicate-avoidance checks, but this
                // nested-loop variant does not use it further
                final CellInfo cellInfo = grid.getCell(cellId.get());

                // Partition retrieved shapes (values) into lists for each file
                List<S>[] shapeLists = new List[inputFileCount];
                for (int i = 0; i < shapeLists.length; i++) {
                    shapeLists[i] = new Vector<S>();
                }

                while (values.hasNext()) {
                    // Read shapes in batches; a batch ends when the second
                    // dataset's list reaches the joining threshold
                    do {
                        IndexedText t = values.next();
                        S s = (S) shape.clone();
                        s.fromText(t.text);
                        shapeLists[t.index].add(s);
                    } while (values.hasNext() && shapeLists[1].size() < shapesThresholdPerOnce);

                    // Perform a nested-loop spatial join between the two lists
                    reduceLog.info(
                            "Starting Reduce: (" + shapeLists[0].size() + " X " + shapeLists[1].size() + ")...");
                    for (int i = 0; i < shapeLists[0].size(); i++) {
                        TigerShape x = (TigerShape) shapeLists[0].get(i);
                        for (int j = 0; j < shapeLists[1].size(); j++) {
                            // Pair each shape from the first file with each shape
                            // from the second file
                            TigerShape y = (TigerShape) shapeLists[1].get(j);
                            if (x.geom.disjoint(y.geom))
                                output.collect((S) x, (S) y);
                        }
                    }
                }

                long t2 = System.currentTimeMillis();
                LOG.info("Reducer finished in: " + (t2 - t1) + " millis");

            } else {
                LOG.info("Nothing to do !!!");
            }
        }
    }

    public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params)
            throws IOException, InterruptedException {
        JobConf job = new JobConf(params, Disjoint.class);

        LOG.info("Touches journey starts ....");
        FileSystem inFs = inFiles[0].getFileSystem(job);
        Path outputPath = userOutputPath;
        if (outputPath == null) {
            FileSystem outFs = FileSystem.get(job);
            do {
                outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
            } while (outFs.exists(outputPath));
        }
        FileSystem outFs = outputPath.getFileSystem(job);

        ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
        job.setJobName("Disjoint");
        job.setMapperClass(DisjointMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IndexedText.class);
        job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
        job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
                inFs.getFileStatus(inFiles[1]).getBlockSize()));

        job.setReducerClass(DisjointReduce.class);
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

        job.setInputFormat(ShapeLineInputFormat.class);
        if (job.getBoolean("output", true))
            job.setOutputFormat(TextOutputFormat.class);
        else
            job.setOutputFormat(NullOutputFormat.class);
        ShapeLineInputFormat.setInputPaths(job, inFiles);

        // Calculate and set the dimensions of the grid to use in the map phase
        long total_size = 0;
        Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
        for (Path file : inFiles) {
            FileSystem fs = file.getFileSystem(params);
            Rectangle file_mbr = FileMBR.fileMBR(file, params);
            mbr.expand(file_mbr);
            total_size += FileUtil.getPathSize(fs, file);
        }
        // Add the expected indexing overhead to the estimated total size
        total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
        int sjmrPartitioningGridFactor = params.getInt(PartitioningFactor, 20);
        int num_cells = (int) Math.max(1,
                total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
        LOG.info("Number of cells is configured to be " + num_cells);

        OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
        OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
        OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

        GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
        gridInfo.calculateCellDimensions(num_cells);
        OperationsParams.setShape(job, PartitionGrid, gridInfo);

        TextOutputFormat.setOutputPath(job, outputPath);

        if (OperationsParams.isLocal(job, inFiles)) {
            // Enforce local execution if explicitly set by user or for small files
            job.set("mapred.job.tracker", "local");
        }

        // Start the job
        RunningJob runningJob = JobClient.runJob(job);
        Counters counters = runningJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
        final long resultCount = outputRecordCounter.getValue();

        return resultCount;
    }

    private static void printUsage() {
        System.out.println("Performs Touches operation on two WKT files.");
        System.out.println("Parameters: (* marks the required parameters)");
        System.out.println("<input file 1> - (*) Path to the first input file");
        System.out.println("<input file 2> - (*) Path to the second input file");
        System.out.println("<output file> - Path to output file");
        System.out.println("partition-grid-factor:<value> - Patitioning grid factor (its default value is 20)");
        System.out.println("-overwrite - Overwrite output file without notice");
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    /**
     * @param args
     * @throws IOException 
     * @throws InterruptedException 
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args));
        Path[] allFiles = params.getPaths();
        if (allFiles.length < 2) {
            System.err.println("This operation requires at least two input files");
            printUsage();
            System.exit(1);
        }
        if (allFiles.length == 2 && !params.checkInput()) {
            // One of the input files does not exist
            printUsage();
            System.exit(1);
        }
        if (allFiles.length > 2 && !params.checkInputOutput()) {
            printUsage();
            System.exit(1);
        }

        Path[] inputPaths = allFiles.length == 2 ? allFiles : params.getInputPaths();
        Path outputPath = allFiles.length == 2 ? null : params.getOutputPath();

        if (params.get("repartition-only", "no").equals("yes")) {
            isReduceInactive = true;
        }

        if (params.get("joining-per-once") != null) {
            System.out.println("joining-per-once is set to: " + params.get("joining-per-once"));
            joiningThresholdPerOnce = Integer.parseInt(params.get("joining-per-once"));
        }

        if (params.get("filter-only") != null) {
            System.out.println("filer-only mode is set to: " + params.get("filter-only"));
            if (params.get("filter-only").equals("yes")) {
                isFilterOnly = true;
            } else {
                isFilterOnly = false;
            }
        }

        if (params.get("no-output") != null) {
            System.out.println("no-output mode is set to: " + params.get("no-output"));
            if (params.get("no-output").equals("yes")) {
                isSpatialJoinOutputRequired = false;
            } else {
                isSpatialJoinOutputRequired = true;
            }
        }

        long t1 = System.currentTimeMillis();
        long resultSize = disjoint(inputPaths, outputPath, params);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time: " + (t2 - t1) + " millis");
        System.out.println("Result size: " + resultSize);
    }

}
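
Example

The operation is normally launched through the main() method above, but the static disjoint() method can also be called directly from another program. The following is a minimal sketch of such a driver, assuming the SpatialHadoop jar is on the classpath; the class name DisjointDriver, the example arguments, and the shape:tiger parameter are illustrative and not part of the original code.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.GenericOptionsParser;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.operations.Disjoint;

public class DisjointDriver {
    public static void main(String[] args) throws Exception {
        // Hypothetical arguments, mirroring what the main() above expects, e.g.
        // args = { "roads.wkt", "parks.wkt", "out", "shape:tiger" }
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args));
        Path[] inputs = params.getInputPaths();  // the two input files
        Path output = params.getOutputPath();    // the output directory
        long pairs = Disjoint.disjoint(inputs, output, params);
        System.out.println("Disjoint pairs: " + pairs);
    }
}

The map function above relies on the reference-point technique for duplicate avoidance: a record that overlaps several partitions is reported only by the partition whose MBR contains the top-left corner of the record's MBR. Below is a standalone sketch of that test, using the Rectangle class from the listing; isReportingPartition is a hypothetical helper name.

import edu.umn.cs.spatialHadoop.core.Rectangle;

public class ReferencePoint {
    // Report a record only from the partition that contains the top-left
    // corner (x1, y1) of its MBR; an invalid partition MBR means the input
    // is not partitioned, so the record is always reported.
    static boolean isReportingPartition(Rectangle partitionMbr, Rectangle shapeMbr) {
        return !partitionMbr.isValid() || partitionMbr.contains(shapeMbr.x1, shapeMbr.y1);
    }
}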