edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java Source code

Java tutorial

Introduction

Here is the source code for edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package edu.umn.cs.spatialHadoop.temporal;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.GenericOptionsParser;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.core.CellInfo;
import edu.umn.cs.spatialHadoop.core.GridInfo;
import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.Shape;
import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.mapred.CombinedSpatialInputFormat;
import edu.umn.cs.spatialHadoop.mapred.GridOutputFormat;
import edu.umn.cs.spatialHadoop.mapred.RTreeGridOutputFormat;
import edu.umn.cs.spatialHadoop.operations.FileMBR;
import edu.umn.cs.spatialHadoop.operations.Repartition;
import edu.umn.cs.spatialHadoop.util.NASADatasetUtil;
import edu.umn.cs.spatialHadoop.util.TemporalIndexManager;

/**
 * Supports indexing for spatial shapes on higher temporal levels (e.g. weeks,
 * months and years) by aggregating indexes on lower levels in one higher level.
 * 
 * @author ibrahimsabek
 */
public class RepartitionTemporal extends Repartition {

    static final Log LOG = LogFactory.getLog(RepartitionTemporal.class);

    public RepartitionTemporal() {
        super();
    }

    public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, OperationsParams params)
            throws IOException, InterruptedException {
        String sindex = params.get("sindex");
        boolean overwrite = params.getBoolean("overwrite", false);
        Shape stockShape = params.getShape("shape");

        FileSystem outFs = outputPath.getFileSystem(params);

        @SuppressWarnings("deprecation")
        final long blockSize = outFs.getDefaultBlockSize();

        // Calculate the dimensions of each partition based on gindex type
        CellInfo[] cellInfos;
        if (sindex.equals("grid")) {
            Rectangle inputMBR = FileMBR.fileMBR(inputPaths[0], params);
            long inputFileSize = FileMBR.sizeOfLastProcessedFile;
            for (int i = 1; i < inputPaths.length; i++) {
                Rectangle currentInputMBR = FileMBR.fileMBR(inputPaths[i], params);
                inputMBR.expand(currentInputMBR);
                inputFileSize = inputFileSize + FileMBR.sizeOfLastProcessedFile;
            }

            int num_partitions = calculateNumberOfPartitions(new Configuration(), inputFileSize, outFs, outputPath,
                    blockSize);

            GridInfo gridInfo = new GridInfo(inputMBR.x1, inputMBR.y1, inputMBR.x2, inputMBR.y2);
            gridInfo.calculateCellDimensions(num_partitions);
            cellInfos = gridInfo.getAllCells();
        } else if (sindex.equals("rtree") || sindex.equals("r+tree") || sindex.equals("str")
                || sindex.equals("str+")) {
            // Pack in rectangles using an RTree
            cellInfos = packInRectangles(inputPaths, outputPath, params, null);
        } else {
            throw new RuntimeException("Unsupported spatial index: " + sindex);
        }

        JobConf job = new JobConf(params, RepartitionTemporal.class);
        job.setJobName("RepartitionTemporal");

        // Overwrite output file
        if (outFs.exists(outputPath)) {
            if (overwrite)
                outFs.delete(outputPath, true);
            else
                throw new RuntimeException(
                        "Output file '" + outputPath + "' already exists and overwrite flag is not set");
        }

        // Decide which map function to use depending on the type of global
        // index
        if (sindex.equals("rtree") || sindex.equals("str")) {
            // Repartition without replication
            job.setMapperClass(RepartitionMapNoReplication.class);
        } else {
            // Repartition with replication (grid, str+, and r+tree)
            job.setMapperClass(RepartitionMap.class);
        }
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(stockShape.getClass());
        CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
        job.setInputFormat(CombinedSpatialInputFormat.class);

        ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
        job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

        FileOutputFormat.setOutputPath(job, outputPath);
        if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
            job.setOutputFormat(GridOutputFormat.class);
        } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
            // For now, the two types of local index are the same
            job.setOutputFormat(RTreeGridOutputFormat.class);
        } else {
            throw new RuntimeException("Unsupported spatial index: " + sindex);
        }

        SpatialSite.setCells(job, cellInfos);
        job.setBoolean(SpatialSite.OVERWRITE, overwrite);

        // Set reduce function
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

        // Set output committer that combines output files together
        job.setOutputCommitter(RepartitionOutputCommitter.class);

        JobClient.runJob(job);

    }

    public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, Shape stockShape, long blockSize,
            CellInfo[] cellInfos, String sindex, boolean overwrite) throws IOException {

        JobConf job = new JobConf(Repartition.class);

        job.setJobName("RepartitionTemporal");
        FileSystem outFs = outputPath.getFileSystem(job);

        // Overwrite output file
        if (outFs.exists(outputPath)) {
            if (overwrite)
                outFs.delete(outputPath, true);
            else
                throw new RuntimeException(
                        "Output file '" + outputPath + "' already exists and overwrite flag is not set");
        }

        // Decide which map function to use depending on the type of global
        // index
        if (sindex.equals("rtree") || sindex.equals("str")) {
            // Repartition without replication
            job.setMapperClass(RepartitionMapNoReplication.class);
        } else {
            // Repartition with replication (grid and r+tree)
            job.setMapperClass(RepartitionMap.class);
        }
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(stockShape.getClass());
        CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
        job.setInputFormat(CombinedSpatialInputFormat.class);

        ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
        job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

        FileOutputFormat.setOutputPath(job, outputPath);
        if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
            job.setOutputFormat(GridOutputFormat.class);
        } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
            // For now, the two types of local index are the same
            job.setOutputFormat(RTreeGridOutputFormat.class);
        } else {
            throw new RuntimeException("Unsupported spatial index: " + sindex);
        }

        SpatialSite.setCells(job, cellInfos);
        job.setBoolean(SpatialSite.OVERWRITE, overwrite);

        // Set reduce function
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

        // Set output committer that combines output files together
        job.setOutputCommitter(RepartitionOutputCommitter.class);

        if (blockSize != 0) {
            job.setLong("dfs.block.size", blockSize);
            job.setLong("fs.local.block.size", blockSize);
        }

        JobClient.runJob(job);
    }

    private static Path generateIndexPath(Path inputPath, Path outputPath) {
        String inputPathStr = inputPath.toString();
        String inputPathFileName = inputPathStr.substring(inputPathStr.lastIndexOf("/") + 1, inputPathStr.length());
        Path indexPath = new Path(outputPath.toString() + "/" + inputPathFileName);
        return indexPath;
    }

    private static void bulkLoadSpatioTemporalIndexesLevel(Path indexLevelHomePath, Path[] inputPathDirs,
            String indexLevel, OperationsParams params) throws IOException, InterruptedException {
        LOG.info("Needs to index/re-index " + inputPathDirs.length + " " + indexLevel);

        for (Path inputPathDir : inputPathDirs) {
            FileSystem currFileSystem = inputPathDir.getFileSystem(params);

            if (currFileSystem.exists(inputPathDir)) {
                currFileSystem.delete(inputPathDir, true);
            }
            currFileSystem.mkdirs(inputPathDir);

            ArrayList<Path[]> pathsArrList = NASADatasetUtil.getSortedTuplesInPath(indexLevelHomePath,
                    NASADatasetUtil.extractDateStringFromPath(inputPathDir));

            Path indexPath = generateIndexPath(pathsArrList.get(0)[0], inputPathDir);

            for (Path[] currInputFiles : pathsArrList) {
                repartitionMapReduce(currInputFiles, indexPath, params);
            }

        }

    }

    private static void bulkLoadSpatioTemporalIndexes(TemporalIndexManager temporalIndexManager,
            OperationsParams params) throws IOException, InterruptedException {

        bulkLoadSpatioTemporalIndexesLevel(params.getPaths()[0], temporalIndexManager.getNeededDailyIndexes(),
                "daily", params);
        bulkLoadSpatioTemporalIndexesLevel(temporalIndexManager.getDailyIndexesHomePath(),
                temporalIndexManager.getNeededMonthlyIndexes(), "monthly", params);
        bulkLoadSpatioTemporalIndexesLevel(temporalIndexManager.getMonthlyIndexesHomePath(),
                temporalIndexManager.getNeededYearlyIndexes(), "yearly", params);
    }

    protected static void printUsage() {
        System.out.println("Builds a spatio-temporal index for input file");
        System.out.println("Parameters (* marks required parameters):");
        System.out.println("<dataset path> - (*) Path to input dataset");
        System.out.println("<index path> - (*) Path to index output");
        System.out.println("time:yyyy.mm.dd..yyyy.mm.dd - (*) Time range");
        System.out.println("shape:<point|rectangle|polygon> - (*) Type of stored shapes");
        System.out.println("sindex:<index> - (*) Type of spatial index (grid|rtree|r+tree|str|str+)");
        System.out.println("-overwrite - Overwrite output file without notice");
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    public static void main(String[] args) throws IOException, ParseException, InterruptedException {
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args), false);

        final Path[] paths = params.getPaths();
        if (paths.length <= 1 && !params.checkInput()) {
            printUsage();
            System.exit(1);
        }
        if (paths.length >= 2 && paths[1] == null) {
            printUsage();
            System.exit(1);
        }
        if (params.get("time") == null) {
            System.err.println("You must provide a time range");
            printUsage();
            System.exit(1);
        }
        if (params.get("sindex") == null) {
            System.err.println("Please specify type of index to build (grid, rtree, r+tree, str, str+)");
            printUsage();
            return;
        }
        Path datasetPath = paths[0]; // dataset path
        Path indexesPath = paths[1]; // index path
        String timeRange = params.get("time"); // time range

        TemporalIndexManager temporalIndexManager = new TemporalIndexManager(datasetPath, indexesPath);
        temporalIndexManager.prepareNeededIndexes(timeRange);

        // The spatio-temporal index to use
        long t1 = System.currentTimeMillis();
        bulkLoadSpatioTemporalIndexes(temporalIndexManager, params);
        long t2 = System.currentTimeMillis();
        System.out.println("Total indexing time in millis " + (t2 - t1));

    }

}