edu.umn.cs.spatialHadoop.operations.Indexer.java Source code

Introduction

Here is the source code for edu.umn.cs.spatialHadoop.operations.Indexer.java
Source

/***********************************************************************
* Copyright (c) 2015 by Regents of the University of Minnesota.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Apache License, Version 2.0 which 
* accompanies this distribution and is available at
* http://www.opensource.org/licenses/apache2.0.php.
*
*************************************************************************/
package edu.umn.cs.spatialHadoop.operations;

import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.LocalJobRunner;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.LineReader;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.core.GridPartitioner;
import edu.umn.cs.spatialHadoop.core.HilbertCurvePartitioner;
import edu.umn.cs.spatialHadoop.core.KdTreePartitioner;
import edu.umn.cs.spatialHadoop.core.Partition;
import edu.umn.cs.spatialHadoop.core.Partitioner;
import edu.umn.cs.spatialHadoop.core.Point;
import edu.umn.cs.spatialHadoop.core.QuadTreePartitioner;
import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.ResultCollector;
import edu.umn.cs.spatialHadoop.core.STRPartitioner;
import edu.umn.cs.spatialHadoop.core.Shape;
import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.core.ZCurvePartitioner;
import edu.umn.cs.spatialHadoop.io.Text2;
import edu.umn.cs.spatialHadoop.mapred.GridOutputFormat;
import edu.umn.cs.spatialHadoop.mapred.IndexOutputFormat;
import edu.umn.cs.spatialHadoop.mapred.IndexOutputFormat.IndexRecordWriter;
import edu.umn.cs.spatialHadoop.mapred.ShapeIterInputFormat;
import edu.umn.cs.spatialHadoop.util.FileUtil;

/**
 * @author Ahmed Eldawy
 *
 */
public class Indexer {
    private static final Log LOG = LogFactory.getLog(Indexer.class);

    private static final Map<String, Class<? extends Partitioner>> PartitionerClasses;
    private static final Map<String, Boolean> PartitionerReplicate;

    static {
        PartitionerClasses = new HashMap<String, Class<? extends Partitioner>>();
        PartitionerClasses.put("grid", GridPartitioner.class);
        PartitionerClasses.put("str", STRPartitioner.class);
        PartitionerClasses.put("str+", STRPartitioner.class);
        PartitionerClasses.put("quadtree", QuadTreePartitioner.class);
        PartitionerClasses.put("zcurve", ZCurvePartitioner.class);
        PartitionerClasses.put("hilbert", HilbertCurvePartitioner.class);
        PartitionerClasses.put("kdtree", KdTreePartitioner.class);

        PartitionerReplicate = new HashMap<String, Boolean>();
        PartitionerReplicate.put("grid", true);
        PartitionerReplicate.put("str", false);
        PartitionerReplicate.put("str+", true);
        PartitionerReplicate.put("quadtree", true);
        PartitionerReplicate.put("zcurve", false);
        PartitionerReplicate.put("hilbert", false);
        PartitionerReplicate.put("kdtree", true);
    }

    /**
     * The map and reduce functions for the repartition
     * @author Ahmed Eldawy
     *
     */
    public static class IndexMethods extends MapReduceBase
            implements Mapper<Rectangle, Iterable<? extends Shape>, IntWritable, Shape>,
            Reducer<IntWritable, Shape, IntWritable, Shape> {

        /**The partitioner used to partitioner the data across reducers*/
        private Partitioner partitioner;
        /**
         * Whether to replicate a record to all overlapping partitions or to assign
         * it to only one partition
         */
        private boolean replicate;

        @Override
        public void configure(JobConf job) {
            super.configure(job);
            this.partitioner = Partitioner.getPartitioner(job);
            this.replicate = job.getBoolean("replicate", false);
        }

        @Override
        public void map(Rectangle dummy, Iterable<? extends Shape> shapes,
                final OutputCollector<IntWritable, Shape> output, Reporter reporter) throws IOException {
            final IntWritable partitionID = new IntWritable();
            int i = 0;
            for (final Shape shape : shapes) {
                if (replicate) {
                    partitioner.overlapPartitions(shape, new ResultCollector<Integer>() {
                        @Override
                        public void collect(Integer r) {
                            partitionID.set(r);
                            try {
                                output.collect(partitionID, shape);
                            } catch (IOException e) {
                                LOG.warn("Error checking overlapping partitions", e);
                            }
                        }
                    });
                } else {
                    partitionID.set(partitioner.overlapPartition(shape));
                    if (partitionID.get() >= 0)
                        output.collect(partitionID, shape);
                }
                if (((++i) & 0xff) == 0) {
                    reporter.progress();
                }
            }
        }

        @Override
        public void reduce(IntWritable partitionID, Iterator<Shape> shapes,
                OutputCollector<IntWritable, Shape> output, Reporter reporter) throws IOException {
            while (shapes.hasNext()) {
                output.collect(partitionID, shapes.next());
            }
            // Indicate end of partition to close the file
            partitionID.set(-(partitionID.get() + 1));
            output.collect(partitionID, null);
        }
    }

    /**
     * Output committer that concatenates all master files into one master file.
     * @author Ahmed Eldawy
     *
     */
    public static class IndexerOutputCommitter extends FileOutputCommitter {
        @Override
        public void commitJob(JobContext context) throws IOException {
            super.commitJob(context);

            JobConf job = context.getJobConf();
            Path outPath = GridOutputFormat.getOutputPath(job);
            FileSystem outFs = outPath.getFileSystem(job);

            // Concatenate all master files into one file
            FileStatus[] resultFiles = outFs.listStatus(outPath, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path.getName().contains("_master");
                }
            });

            if (resultFiles.length == 0) {
                LOG.warn("No _master files were written by reducers");
            } else {
                String sindex = job.get("sindex");
                Path masterPath = new Path(outPath, "_master." + sindex);
                OutputStream destOut = outFs.create(masterPath);
                Path wktPath = new Path(outPath, "_" + sindex + ".wkt");
                PrintStream wktOut = new PrintStream(outFs.create(wktPath));
                wktOut.println("ID\tBoundaries\tRecord Count\tSize\tFile name");
                Text tempLine = new Text2();
                Partition tempPartition = new Partition();
                final byte[] NewLine = new byte[] { '\n' };
                for (FileStatus f : resultFiles) {
                    LineReader in = new LineReader(outFs.open(f.getPath()));
                    while (in.readLine(tempLine) > 0) {
                        destOut.write(tempLine.getBytes(), 0, tempLine.getLength());
                        destOut.write(NewLine);
                        tempPartition.fromText(tempLine);
                        wktOut.println(tempPartition.toWKT());
                    }
                    in.close();
                    outFs.delete(f.getPath(), false); // Delete the copied file
                }
                destOut.close();
            }
        }
    }

    private static RunningJob indexMapReduce(Path inPath, Path outPath, OperationsParams params)
            throws IOException, InterruptedException {
        JobConf job = new JobConf(params, Indexer.class);
        job.setJobName("Indexer");

        // Set input file MBR if not already set
        Rectangle inputMBR = (Rectangle) params.getShape("mbr");
        if (inputMBR == null)
            inputMBR = FileMBR.fileMBR(inPath, params);
        OperationsParams.setShape(job, "mbr", inputMBR);

        // Set input and output
        job.setInputFormat(ShapeIterInputFormat.class);
        ShapeIterInputFormat.setInputPaths(job, inPath);
        job.setOutputFormat(IndexOutputFormat.class);
        GridOutputFormat.setOutputPath(job, outPath);

        // Set the correct partitioner according to index type
        String index = job.get("sindex");
        if (index == null)
            throw new RuntimeException("Index type is not set");
        long t1 = System.currentTimeMillis();
        Partitioner partitioner = createPartitioner(inPath, outPath, job, index);
        Partitioner.setPartitioner(job, partitioner);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time for space subdivision in millis: " + (t2 - t1));

        // Set mapper and reducer
        Shape shape = params.getShape("shape");
        job.setMapperClass(IndexMethods.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setReducerClass(IndexMethods.class);
        job.setOutputCommitter(IndexerOutputCommitter.class);
        ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
        job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

        // Use multithreading in case the job is running locally
        job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

        // Start the job
        if (params.getBoolean("background", false)) {
            // Run in background
            JobClient jc = new JobClient(job);
            return jc.submitJob(job);
        } else {
            // Run and block until it is finished
            return JobClient.runJob(job);
        }
    }

    public static Partitioner createPartitioner(Path in, Path out, Configuration job, String partitionerName)
            throws IOException {
        return createPartitioner(new Path[] { in }, out, job, partitionerName);
    }

    /***
     * Create a partitioner for a particular job
     * @param in
     * @param out
     * @param job
     * @param partitionerName
     * @return
     * @throws IOException
     */
    public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
            throws IOException {
        try {
            Partitioner partitioner = null;
            Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(partitionerName.toLowerCase());
            if (partitionerClass == null) {
                throw new RuntimeException("Unknown index type '" + partitionerName + "'");
            }
            boolean replicate = PartitionerReplicate.get(partitionerName.toLowerCase());
            job.setBoolean("replicate", replicate);
            partitioner = partitionerClass.newInstance();

            long t1 = System.currentTimeMillis();
            final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");
            // Determine number of partitions
            long inSize = 0;
            for (Path in : ins) {
                inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
            }
            long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
            FileSystem outFS = out.getFileSystem(job);
            long outBlockSize = outFS.getDefaultBlockSize(out);
            int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
            LOG.info("Partitioning the space into " + numPartitions + " partitions");

            final Vector<Point> sample = new Vector<Point>();
            float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
            long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

            LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
            ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
                @Override
                public void collect(Point p) {
                    sample.add(p.clone());
                }
            };
            OperationsParams params2 = new OperationsParams(job);
            params2.setFloat("ratio", sample_ratio);
            params2.setLong("size", sample_size);
            params2.setClass("outshape", Point.class, Shape.class);
            Sampler.sample(ins, resultCollector, params2);
            long t2 = System.currentTimeMillis();
            System.out.println("Total time for sampling in millis: " + (t2 - t1));
            LOG.info("Finished reading a sample of " + sample.size() + " records");

            partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), numPartitions);

            return partitioner;
        } catch (InstantiationException e) {
            e.printStackTrace();
            return null;
        } catch (IllegalAccessException e) {
            e.printStackTrace();
            return null;
        }
    }

    private static void indexLocal(Path inPath, Path outPath, OperationsParams params) throws IOException {
        JobConf job = new JobConf(params);
        String sindex = params.get("sindex");
        Partitioner partitioner = createPartitioner(inPath, outPath, job, sindex);

        // Start reading input file
        Vector<InputSplit> splits = new Vector<InputSplit>();
        final ShapeIterInputFormat inputFormat = new ShapeIterInputFormat();
        FileSystem inFs = inPath.getFileSystem(params);
        FileStatus inFStatus = inFs.getFileStatus(inPath);
        if (inFStatus != null && !inFStatus.isDir()) {
            // One file, retrieve it immediately.
            // This is useful if the input is a hidden file which is automatically
            // skipped by FileInputFormat. We need to plot a hidden file for the case
            // of plotting partition boundaries of a spatial index
            splits.add(new FileSplit(inPath, 0, inFStatus.getLen(), new String[0]));
        } else {
            ShapeIterInputFormat.addInputPath(job, inPath);
            for (InputSplit s : inputFormat.getSplits(job, 1))
                splits.add(s);
        }

        // Copy splits to a final array to be used in parallel
        final FileSplit[] fsplits = splits.toArray(new FileSplit[splits.size()]);
        boolean replicate = job.getBoolean("replicate", false);

        final IndexRecordWriter<Shape> recordWriter = new IndexRecordWriter<Shape>(partitioner, replicate, sindex,
                outPath, params);

        for (FileSplit fsplit : fsplits) {
            RecordReader<Rectangle, Iterable<? extends Shape>> reader = inputFormat.getRecordReader(fsplit, job,
                    null);
            Rectangle partitionMBR = reader.createKey();
            Iterable<? extends Shape> shapes = reader.createValue();

            final IntWritable partitionID = new IntWritable();

            while (reader.next(partitionMBR, shapes)) {
                if (replicate) {
                    // Replicate each shape to all overlapping partitions
                    for (final Shape s : shapes) {
                        partitioner.overlapPartitions(s, new ResultCollector<Integer>() {
                            @Override
                            public void collect(Integer id) {
                                partitionID.set(id);
                                try {
                                    recordWriter.write(partitionID, s);
                                } catch (IOException e) {
                                    throw new RuntimeException(e);
                                }
                            }
                        });
                    }
                } else {
                    for (Shape s : shapes) {
                        partitionID.set(partitioner.overlapPartition(s));
                        recordWriter.write(partitionID, s);
                    }
                }
            }
            reader.close();
        }

        recordWriter.close(null);
    }

    public static RunningJob index(Path inPath, Path outPath, OperationsParams params)
            throws IOException, InterruptedException {
        if (OperationsParams.isLocal(new JobConf(params), inPath)) {
            indexLocal(inPath, outPath, params);
            return null;
        } else {
            return indexMapReduce(inPath, outPath, params);
        }
    }

    protected static void printUsage() {
        System.out.println("Builds a spatial index on an input file");
        System.out.println("Parameters (* marks required parameters):");
        System.out.println("<input file> - (*) Path to input file");
        System.out.println("<output file> - (*) Path to output file");
        System.out.println("shape:<point|rectangle|polygon> - (*) Type of shapes stored in input file");
        System.out.println("sindex:<index> - (*) Type of spatial index (grid|str|str+|quadtree|zcurve|kdtree)");
        System.out.println("-overwrite - Overwrite output file without noitce");
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    /**
     * Entry point to the indexing operation.
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args));

        if (!params.checkInputOutput(true)) {
            printUsage();
            return;
        }
        if (params.get("sindex") == null) {
            System.err.println("Please specify type of index to build (grid, rtree, r+tree, str, str+)");
            printUsage();
            return;
        }
        Path inputPath = params.getInputPath();
        Path outputPath = params.getOutputPath();

        // The spatial index to use
        long t1 = System.currentTimeMillis();
        index(inputPath, outputPath, params);
        long t2 = System.currentTimeMillis();
        System.out.println("Total indexing time in millis " + (t2 - t1));
    }

}