edu.umn.cs.spatialHadoop.indexing.Indexer.java — source code listing

Java tutorial

Introduction

Here is the source code for edu.umn.cs.spatialHadoop.indexing.Indexer.java.

Source

/***********************************************************************
* Copyright (c) 2015 by Regents of the University of Minnesota.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Apache License, Version 2.0 which 
* accompanies this distribution and is available at
* http://www.opensource.org/licenses/apache2.0.php.
*
*************************************************************************/
package edu.umn.cs.spatialHadoop.indexing;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LocalJobRunner;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.LineReader;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.core.Point;
import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.ResultCollector;
import edu.umn.cs.spatialHadoop.core.Shape;
import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.indexing.IndexOutputFormat.IndexRecordWriter;
import edu.umn.cs.spatialHadoop.io.Text2;
import edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3;
import edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3;
import edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3;
import edu.umn.cs.spatialHadoop.nasa.HDFRecordReader;
import edu.umn.cs.spatialHadoop.operations.FileMBR;
import edu.umn.cs.spatialHadoop.operations.Sampler;
import edu.umn.cs.spatialHadoop.util.FileUtil;

/**
 * @author Ahmed Eldawy
 *
 */
/**
 * Builds a spatial index (grid, STR, R-tree, R+-tree, Quad tree, Z-curve,
 * Hilbert curve, or K-d tree) over an input file. Small inputs are indexed
 * locally in a single process; larger inputs are indexed with a MapReduce job.
 *
 * @author Ahmed Eldawy
 */
public class Indexer {
    private static final Log LOG = LogFactory.getLog(Indexer.class);

    /**Maps each supported index name to the partitioner that subdivides the space*/
    private static final Map<String, Class<? extends Partitioner>> PartitionerClasses;
    /**Maps index names to an optional local (per-partition) indexer*/
    private static final Map<String, Class<? extends LocalIndexer>> LocalIndexes;
    /**Whether each index type replicates a record to all overlapping partitions
     * (true) or assigns it to exactly one partition (false)*/
    private static final Map<String, Boolean> PartitionerReplicate;

    static {
        PartitionerClasses = new HashMap<String, Class<? extends Partitioner>>();
        PartitionerClasses.put("grid", GridPartitioner.class);
        PartitionerClasses.put("str", STRPartitioner.class);
        PartitionerClasses.put("str+", STRPartitioner.class);
        // R-tree variants are bulk-loaded using STR packing
        PartitionerClasses.put("rtree", STRPartitioner.class);
        PartitionerClasses.put("r+tree", STRPartitioner.class);
        PartitionerClasses.put("quadtree", QuadTreePartitioner.class);
        PartitionerClasses.put("zcurve", ZCurvePartitioner.class);
        PartitionerClasses.put("hilbert", HilbertCurvePartitioner.class);
        PartitionerClasses.put("kdtree", KdTreePartitioner.class);

        PartitionerReplicate = new HashMap<String, Boolean>();
        PartitionerReplicate.put("grid", true);
        PartitionerReplicate.put("str", false);
        PartitionerReplicate.put("str+", true);
        PartitionerReplicate.put("rtree", false);
        PartitionerReplicate.put("r+tree", true);
        PartitionerReplicate.put("quadtree", true);
        PartitionerReplicate.put("zcurve", false);
        PartitionerReplicate.put("hilbert", false);
        PartitionerReplicate.put("kdtree", true);

        LocalIndexes = new HashMap<String, Class<? extends LocalIndexer>>();
        LocalIndexes.put("rtree", RTreeLocalIndexer.class);
        LocalIndexes.put("r+tree", RTreeLocalIndexer.class);
    }

    /**
     * The map function that partitions the data using the configured partitioner.
     * Emits each shape keyed by the ID of the partition(s) it is assigned to.
     * @author Eldawy
     *
     */
    public static class PartitionerMap extends Mapper<Rectangle, Iterable<? extends Shape>, IntWritable, Shape> {

        /**The partitioner used to partition the data across reducers*/
        private Partitioner partitioner;
        /**
         * Whether to replicate a record to all overlapping partitions or to assign
         * it to only one partition
         */
        private boolean replicate;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            this.partitioner = Partitioner.getPartitioner(context.getConfiguration());
            this.replicate = context.getConfiguration().getBoolean("replicate", false);
        }

        @Override
        protected void map(Rectangle key, Iterable<? extends Shape> shapes, final Context context)
                throws IOException, InterruptedException {
            final IntWritable partitionID = new IntWritable();
            for (final Shape shape : shapes) {
                if (replicate) {
                    // Emit the shape once per overlapping partition
                    partitioner.overlapPartitions(shape, new ResultCollector<Integer>() {
                        @Override
                        public void collect(Integer r) {
                            partitionID.set(r);
                            try {
                                context.write(partitionID, shape);
                            } catch (IOException e) {
                                LOG.warn("Error checking overlapping partitions", e);
                            } catch (InterruptedException e) {
                                // Restore the interrupt status so the framework can observe it
                                Thread.currentThread().interrupt();
                                LOG.warn("Error checking overlapping partitions", e);
                            }
                        }
                    });
                } else {
                    // Assign the shape to exactly one partition; a negative ID means
                    // the shape falls outside all partitions and is dropped
                    partitionID.set(partitioner.overlapPartition(shape));
                    if (partitionID.get() >= 0)
                        context.write(partitionID, shape);
                }
                context.progress();
            }
        }
    }

    /**
     * The reduce function writes all shapes of one partition, then emits a
     * negative sentinel key to tell the output format to close the partition file.
     */
    public static class PartitionerReduce<S extends Shape> extends Reducer<IntWritable, Shape, IntWritable, Shape> {

        @Override
        protected void reduce(IntWritable partitionID, Iterable<Shape> shapes, Context context)
                throws IOException, InterruptedException {
            LOG.info("Working on partition #" + partitionID);
            for (Shape shape : shapes) {
                context.write(partitionID, shape);
                context.progress();
            }
            // Indicate end of partition to close the file.
            // The sentinel key -id-1 is always negative and uniquely identifies the partition.
            context.write(new IntWritable(-partitionID.get() - 1), null);
            LOG.info("Done with partition #" + partitionID);
        }
    }

    /**
     * Builds the index using a MapReduce job.
     * @param inPath path of the input file
     * @param outPath path of the output index directory
     * @param params user parameters; must contain "sindex" and "shape"
     * @return the submitted (or completed) job
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    private static Job indexMapReduce(Path inPath, Path outPath, OperationsParams params)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Job.getInstance replaces the deprecated Job(Configuration) constructor
        Job job = Job.getInstance(params, "Indexer");
        Configuration conf = job.getConfiguration();
        job.setJarByClass(Indexer.class);

        // Set input file MBR if not already set
        Rectangle inputMBR = (Rectangle) OperationsParams.getShape(conf, "mbr");
        if (inputMBR == null) {
            inputMBR = FileMBR.fileMBR(inPath, new OperationsParams(conf));
            OperationsParams.setShape(conf, "mbr", inputMBR);
        }

        // Set input and output
        job.setInputFormatClass(SpatialInputFormat3.class);
        SpatialInputFormat3.setInputPaths(job, inPath);
        job.setOutputFormatClass(IndexOutputFormat.class);
        IndexOutputFormat.setOutputPath(job, outPath);

        // Set the correct partitioner according to index type
        String index = conf.get("sindex");
        if (index == null)
            throw new RuntimeException("Index type is not set");
        long t1 = System.currentTimeMillis();
        setLocalIndexer(conf, index);
        Partitioner partitioner = createPartitioner(inPath, outPath, conf, index);
        Partitioner.setPartitioner(conf, partitioner);

        long t2 = System.currentTimeMillis();
        System.out.println("Total time for space subdivision in millis: " + (t2 - t1));

        // Set mapper and reducer
        Shape shape = OperationsParams.getShape(conf, "shape");
        if (shape == null)
            throw new RuntimeException("Shape of input is not set; use the 'shape' parameter");
        job.setMapperClass(PartitionerMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setReducerClass(PartitionerReduce.class);
        // Set number of reduce tasks according to cluster status:
        // at most 90% of the cluster capacity, at most one per partition, at least one
        ClusterStatus clusterStatus = new JobClient(new JobConf()).getClusterStatus();
        job.setNumReduceTasks(Math.max(1,
                Math.min(partitioner.getPartitionCount(), (clusterStatus.getMaxReduceTasks() * 9) / 10)));

        // Use multithreading in case the job is running locally
        conf.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

        // Start the job
        if (conf.getBoolean("background", false)) {
            // Run in background
            job.submit();
        } else {
            job.waitForCompletion(conf.getBoolean("verbose", false));
        }
        return job;
    }

    /**
     * Sets the local indexer class in the given configuration, if the index type
     * has one (only the R-tree variants do).
     * @param conf job configuration to update
     * @param sindex name of the spatial index
     */
    private static void setLocalIndexer(Configuration conf, String sindex) {
        Class<? extends LocalIndexer> localIndexerClass = LocalIndexes.get(sindex);
        if (localIndexerClass != null)
            conf.setClass(LocalIndexer.LocalIndexerClass, localIndexerClass, LocalIndexer.class);
    }

    public static Partitioner createPartitioner(Path in, Path out, Configuration job, String partitionerName)
            throws IOException {
        return createPartitioner(new Path[] { in }, out, job, partitionerName);
    }

    /**
     * Creates a partitioner for a particular job by sampling the input and
     * subdividing the space so each partition roughly fills one output block.
     * @param ins input paths
     * @param out output path, used to determine the output block size
     * @param job job configuration; updated with the "replicate" flag
     * @param partitionerName a key in {@link #PartitionerClasses} or a fully
     *        qualified class name of a {@link Partitioner} subclass
     * @return the initialized partitioner
     * @throws IOException
     * @throws RuntimeException if the partitioner class is unknown or cannot be instantiated
     */
    public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
            throws IOException {
        try {
            Partitioner partitioner;
            // Look up the short name first; fall back to treating it as a class name
            String indexName = partitionerName.toLowerCase();
            Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(indexName);
            if (partitionerClass == null) {
                // Try to parse the name as a class name
                try {
                    partitionerClass = Class.forName(partitionerName).asSubclass(Partitioner.class);
                } catch (ClassNotFoundException e) {
                    // Preserve the cause for easier debugging
                    throw new RuntimeException("Unknown index type '" + partitionerName + "'", e);
                }
            }

            if (PartitionerReplicate.containsKey(indexName)) {
                boolean replicate = PartitionerReplicate.get(indexName);
                job.setBoolean("replicate", replicate);
            }
            partitioner = partitionerClass.newInstance();

            long t1 = System.currentTimeMillis();
            final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");
            // Determine number of partitions from the estimated output size
            long inSize = 0;
            for (Path in : ins) {
                inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
            }
            long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
            FileSystem outFS = out.getFileSystem(job);
            long outBlockSize = outFS.getDefaultBlockSize(out);

            final List<Point> sample = new ArrayList<Point>();
            float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
            long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

            LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
            ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
                @Override
                public void collect(Point p) {
                    // Clone because the sampler reuses the same Point instance
                    sample.add(p.clone());
                }
            };
            OperationsParams params2 = new OperationsParams();
            params2.setFloat("ratio", sample_ratio);
            params2.setLong("size", sample_size);
            if (job.get("shape") != null)
                params2.set("shape", job.get("shape"));
            if (job.get("local") != null)
                params2.set("local", job.get("local"));
            params2.setClass("outshape", Point.class, Shape.class);
            Sampler.sample(ins, resultCollector, params2);
            long t2 = System.currentTimeMillis();
            System.out.println("Total time for sampling in millis: " + (t2 - t1));
            LOG.info("Finished reading a sample of " + sample.size() + " records");

            // Capacity = number of sample points that map to one output block
            int partitionCapacity = (int) Math.max(1,
                    Math.floor((double) sample.size() * outBlockSize / estimatedOutSize));
            int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
            LOG.info("Partitioning the space into " + numPartitions + " partitions with capacity of "
                    + partitionCapacity);

            partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), partitionCapacity);

            return partitioner;
        } catch (InstantiationException e) {
            // Fail fast with the cause instead of returning null, which would
            // surface later as an obscure NullPointerException in the caller
            throw new RuntimeException("Error instantiating partitioner '" + partitionerName + "'", e);
        } catch (IllegalAccessException e) {
            throw new RuntimeException("Error instantiating partitioner '" + partitionerName + "'", e);
        }
    }

    /**
     * Builds the index locally in this process without starting a MapReduce job.
     * Also writes a WKT version of the master file for easy visualization.
     * @param inPath path of the input file
     * @param outPath path of the output index directory
     * @param params user parameters; must contain "sindex"
     * @throws IOException
     * @throws InterruptedException
     */
    private static void indexLocal(Path inPath, final Path outPath, OperationsParams params)
            throws IOException, InterruptedException {
        Job job = Job.getInstance(params);
        final Configuration conf = job.getConfiguration();

        final String sindex = conf.get("sindex");

        // Start reading input file
        List<InputSplit> splits = new ArrayList<InputSplit>();
        final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
        FileSystem inFs = inPath.getFileSystem(conf);
        FileStatus inFStatus = inFs.getFileStatus(inPath);
        if (inFStatus != null && !inFStatus.isDir()) {
            // One file, retrieve it immediately.
            // This is useful if the input is a hidden file which is automatically
            // skipped by FileInputFormat. We need to plot a hidden file for the case
            // of plotting partition boundaries of a spatial index
            splits.add(new FileSplit(inPath, 0, inFStatus.getLen(), new String[0]));
        } else {
            SpatialInputFormat3.setInputPaths(job, inPath);
            for (InputSplit s : inputFormat.getSplits(job))
                splits.add(s);
        }

        // Copy splits to a final array to be used in parallel
        final FileSplit[] fsplits = splits.toArray(new FileSplit[splits.size()]);
        // Guard against auto-unboxing NPE when sindex is a custom class name
        // that is not registered in PartitionerReplicate; default to no replication
        Boolean replicateFlag = PartitionerReplicate.get(sindex);
        boolean replicate = replicateFlag != null && replicateFlag.booleanValue();

        // Set input file MBR if not already set
        Rectangle inputMBR = (Rectangle) OperationsParams.getShape(conf, "mbr");
        if (inputMBR == null) {
            inputMBR = FileMBR.fileMBR(inPath, new OperationsParams(conf));
            OperationsParams.setShape(conf, "mbr", inputMBR);
        }

        setLocalIndexer(conf, sindex);
        final Partitioner partitioner = createPartitioner(inPath, outPath, conf, sindex);

        final IndexRecordWriter<Shape> recordWriter = new IndexRecordWriter<Shape>(partitioner, replicate, sindex,
                outPath, conf);
        try {
            for (FileSplit fsplit : fsplits) {
                RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
                if (reader instanceof SpatialRecordReader3) {
                    ((SpatialRecordReader3) reader).initialize(fsplit, conf);
                } else if (reader instanceof RTreeRecordReader3) {
                    ((RTreeRecordReader3) reader).initialize(fsplit, conf);
                } else if (reader instanceof HDFRecordReader) {
                    ((HDFRecordReader) reader).initialize(fsplit, conf);
                } else {
                    throw new RuntimeException("Unknown record reader");
                }

                final IntWritable partitionID = new IntWritable();

                try {
                    while (reader.nextKeyValue()) {
                        Iterable<Shape> shapes = reader.getCurrentValue();
                        if (replicate) {
                            // Write the shape to every overlapping partition
                            for (final Shape s : shapes) {
                                partitioner.overlapPartitions(s, new ResultCollector<Integer>() {
                                    @Override
                                    public void collect(Integer id) {
                                        partitionID.set(id);
                                        try {
                                            recordWriter.write(partitionID, s);
                                        } catch (IOException e) {
                                            throw new RuntimeException(e);
                                        }
                                    }
                                });
                            }
                        } else {
                            // Write the shape to its single assigned partition, if any
                            for (final Shape s : shapes) {
                                int pid = partitioner.overlapPartition(s);
                                if (pid != -1) {
                                    partitionID.set(pid);
                                    recordWriter.write(partitionID, s);
                                }
                            }
                        }
                    }
                } finally {
                    // Close the reader even if writing a record fails
                    reader.close();
                }
            }
        } finally {
            recordWriter.close(null);
        }

        // Write the WKT formatted master file
        Path masterPath = new Path(outPath, "_master." + sindex);
        FileSystem outFs = outPath.getFileSystem(params);
        Path wktPath = new Path(outPath, "_" + sindex + ".wkt");
        PrintStream wktOut = new PrintStream(outFs.create(wktPath));
        try {
            wktOut.println("ID\tBoundaries\tRecord Count\tSize\tFile name");
            Text tempLine = new Text2();
            Partition tempPartition = new Partition();
            LineReader in = new LineReader(outFs.open(masterPath));
            try {
                while (in.readLine(tempLine) > 0) {
                    tempPartition.fromText(tempLine);
                    wktOut.println(tempPartition.toWKT());
                }
            } finally {
                in.close();
            }
        } finally {
            // Ensure both streams are closed even if reading the master file fails
            wktOut.close();
        }
    }

    /**
     * Builds the index, choosing local or MapReduce mode based on input size.
     * @param inPath path of the input file
     * @param outPath path of the output index directory
     * @param params user parameters
     * @return the MapReduce job, or {@code null} if the index was built locally
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    public static Job index(Path inPath, Path outPath, OperationsParams params)
            throws IOException, InterruptedException, ClassNotFoundException {
        if (OperationsParams.isLocal(new JobConf(params), inPath)) {
            indexLocal(inPath, outPath, params);
            return null;
        } else {
            return indexMapReduce(inPath, outPath, params);
        }
    }

    protected static void printUsage() {
        System.out.println("Builds a spatial index on an input file");
        System.out.println("Parameters (* marks required parameters):");
        System.out.println("<input file> - (*) Path to input file");
        System.out.println("<output file> - (*) Path to output file");
        System.out.println("shape:<point|rectangle|polygon> - (*) Type of shapes stored in input file");
        System.out.println("sindex:<index> - (*) Type of spatial index (grid|str|str+|quadtree|zcurve|kdtree)");
        System.out.println("-overwrite - Overwrite output file without notice");
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    /**
     * Entry point to the indexing operation.
     * @param args command-line arguments; see {@link #printUsage()}
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args));

        if (!params.checkInputOutput(true)) {
            printUsage();
            return;
        }
        if (params.get("sindex") == null) {
            System.err.println("Please specify type of index to build (grid, rtree, r+tree, str, str+)");
            printUsage();
            return;
        }
        Path inputPath = params.getInputPath();
        Path outputPath = params.getOutputPath();

        // The spatial index to use
        long t1 = System.currentTimeMillis();
        index(inputPath, outputPath, params);
        long t2 = System.currentTimeMillis();
        System.out.println("Total indexing time in millis " + (t2 - t1));
    }

}