org.mrgeo.mapreduce.ingestvector.IngestVectorDriver.java Source code

Introduction

Here is the source code for org.mrgeo.mapreduce.ingestvector.IngestVectorDriver.java, the MrGeo driver that runs the MapReduce jobs for ingesting vector data into a tiled MrsVector pyramid.
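
Before the listing, here is a minimal usage sketch for context. The input path, output name, and the empty protection level and provider properties are hypothetical placeholders; only the method signature itself is taken from the ingest overload defined in the class.

    // Hypothetical invocation of the vector ingest driver (paths and names are examples only).
    Configuration conf = new Configuration();
    String[] inputs = { "hdfs:///data/example/roads.shp" };
    String output = "example-roads-vector";
    // ingest(...) throws Exception, so handle or declare it in real code.
    boolean ok = IngestVectorDriver.ingest(inputs, output, conf,
            "" /* protection level placeholder */, new Properties());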

Source

/*
 * Copyright 2009-2015 DigitalGlobe, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package org.mrgeo.mapreduce.ingestvector;

import org.apache.commons.lang.NotImplementedException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.mrgeo.core.MrGeoConstants;
import org.mrgeo.core.MrGeoProperties;
import org.mrgeo.data.DataProviderFactory;
import org.mrgeo.data.DataProviderFactory.AccessMode;
import org.mrgeo.data.ProtectionLevelUtils;
import org.mrgeo.data.image.MrsImageDataProvider;
import org.mrgeo.data.image.MrsImageOutputFormatProvider;
import org.mrgeo.data.tile.TileIdWritable;
import org.mrgeo.hdfs.partitioners.ImageSplitGenerator;
import org.mrgeo.hdfs.partitioners.TileIdPartitioner;
import org.mrgeo.hdfs.utils.HadoopFileUtils;
import org.mrgeo.mapreduce.GeometryWritable;
import org.mrgeo.mapreduce.MapGeometryToTiles;
import org.mrgeo.mapreduce.MapReduceUtils;
import org.mrgeo.tile.TileIdZoomWritable;
import org.mrgeo.utils.*;
import org.mrgeo.utils.geotools.GeotoolsVectorUtils;
import org.mrgeo.vector.mrsvector.MrsVectorPyramid;
import org.mrgeo.vector.mrsvector.VectorTileWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Properties;
import java.util.UUID;

public class IngestVectorDriver {
    private static final Logger log = LoggerFactory.getLogger(IngestVectorDriver.class);

    static final private String base = IngestVectorDriver.class.getSimpleName();
    static final public String MAX_ZOOM = base + ".max.zoom";
    static final public String TILESIZE = base + ".tilesize";
    static final public String BOUNDS = base + ".bounds";

    static final int zoomMax = 20;
    static final int pointsPerTile = 100000;

    private IngestVectorDriver() {
    }

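    /**
     * Entry points for ingesting the given vector inputs into the named output.
     * When no positive zoom level is supplied, a suitable level is computed first
     * (see calculateZoomlevel). The requested protection level is validated
     * against the output data provider before the MapReduce job is run.
     */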
    public static boolean ingest(final String[] inputs, final String output, final Configuration config,
            final String protectionLevel, final Properties providerProperties) throws Exception {
        return ingest(inputs, output, config, 0, protectionLevel, providerProperties);
    }

    public static boolean ingest(final String[] inputs, final String output, final Configuration config,
            int zoomlevel, String protectionLevel, Properties providerProperties) throws Exception {
        if (zoomlevel <= 0) {
            zoomlevel = calculateZoomlevel(inputs, config);
        }

        MrsImageDataProvider dp = DataProviderFactory.getMrsImageDataProvider(output, AccessMode.OVERWRITE, config);
        String useProtectionLevel = ProtectionLevelUtils.getAndValidateProtectionLevel(dp, protectionLevel);
        return runJob(inputs, output, config, zoomlevel, useProtectionLevel, providerProperties);
    }

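    /**
     * Builds and runs the ingest MapReduce job: IngestGeometryMapper emits
     * (TileIdWritable, GeometryWritable) pairs at the configured zoom level and
     * tile size, and IngestGeometryReducer writes VectorTileWritable tiles
     * through the MrsPyramid output format. On success the output format
     * provider is torn down and MrsVectorPyramid metadata is calculated;
     * otherwise false is returned.
     */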
    private static boolean runJob(String[] inputs, String output, Configuration config, int zoomlevel,
            String protectionLevel, Properties providerProperties) throws IOException {

        Bounds bounds = GeotoolsVectorUtils.calculateBounds(inputs, config);

        final Job job = new Job(config);

        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());

        final String jobName = "IngestVector_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);

        final Configuration conf = job.getConfiguration();

        int tilesize = Integer.parseInt(MrGeoProperties.getInstance().getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE,
                MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT));

        conf.setInt(MapGeometryToTiles.ZOOMLEVEL, zoomlevel);
        conf.setInt(MapGeometryToTiles.TILESIZE, tilesize);

        job.setInputFormatClass(IngestVectorGeometryInputFormat.class);

        job.setMapperClass(IngestGeometryMapper.class);
        job.setMapOutputKeyClass(TileIdWritable.class);
        job.setMapOutputValueClass(GeometryWritable.class);

        HadoopUtils.setJar(job, IngestVectorDriver.class);
        for (final String input : inputs) {
            // Source file can be on the local file system (instead of hdfs), and
            // we call the following to circumvent a bug in Hadoop 20.2
            // FileInputFormat.addInputPath.
            HadoopVectorUtils.addInputPath(job, new Path(input));
        }
        HadoopFileUtils.delete(output);

        MrsImageOutputFormatProvider ofProvider = MrsImageDataProvider.setupMrsPyramidOutputFormat(job, output,
                bounds, zoomlevel, tilesize, protectionLevel, providerProperties);

        job.setReducerClass(IngestGeometryReducer.class);

        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(VectorTileWritable.class);

        try {
            job.submit();
            final boolean success = job.waitForCompletion(true);

            if (success) {
                ofProvider.teardown(job);
                MrsVectorPyramid.calculateMetadata(output, zoomlevel, tilesize, bounds, protectionLevel);
                return true;
            }

        } catch (final InterruptedException | ClassNotFoundException e) {
            // Log the failure and fall through so the caller sees the job did not succeed.
            log.error("Vector ingest job failed", e);
        }

        return false;
    }

    public static boolean localIngest(final String[] inputs, final String output, final Configuration config,
            final int zoomlevel, final String protectionLevel) throws Exception {
        throw new NotImplementedException("localIngest not implemented!");
    }

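    /**
     * Reads the "part-*" files written by the zoom-calculation job. Each line is
     * expected to be a tab-separated "zoom<TAB>count" pair; the counts are
     * accumulated per zoom level, and the lowest zoom whose total falls below
     * pointsPerTile is returned (or the maximum zoom if none does). Returns -1
     * if the output cannot be read.
     */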
    public static int readZoomlevel(final String output, final Configuration conf) {
        try {
            final int[] counts = new int[zoomMax + 1];
            Arrays.fill(counts, 0);

            final Path path = new Path(output);
            final FileSystem fs = path.getFileSystem(conf);

            final FileStatus[] outputFiles = fs.listStatus(path);

            for (final FileStatus fileStatus : outputFiles) {
                if (!fileStatus.isDirectory()) {
                    final Path p = fileStatus.getPath();
                    final String name = p.getName();
                    if (name.startsWith("part-")) {
                        BufferedReader r = null;
                        InputStream fdis = null;
                        try {
                            fdis = HadoopFileUtils.open(conf, p); // fs.open(p);
                            r = new BufferedReader(new InputStreamReader(fdis));
                            String line = r.readLine();
                            while (line != null) {
                                final String[] fields = line.split("\t");

                                final int zoom = Integer.parseInt(fields[0]);
                                counts[zoom] += Integer.parseInt(fields[1]);

                                line = r.readLine();
                            }
                        } finally {
                            // Closing the reader also closes the wrapped input stream.
                            if (r != null) {
                                r.close();
                            } else if (fdis != null) {
                                fdis.close();
                            }
                        }
                    }
                }
            }

            for (int i = 1; i < counts.length; i++) {
                if (counts[i] < pointsPerTile) {
                    return i;
                }
            }
            return counts.length - 1;
        } catch (final IOException e) {
            e.printStackTrace();
        }

        return -1;
    }

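    /**
     * Runs a preliminary MapReduce job (CalculateZoomMapper/CalculateZoomReducer)
     * over the inputs to count features per tile for zoom levels up to zoomMax,
     * writes the counts to a temporary directory, and derives the ingest zoom
     * level from that output via readZoomlevel. The temporary output is deleted
     * when the method returns; 0 is returned if the job does not succeed.
     */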
    private static int calculateZoomlevel(final String[] inputs, final Configuration config) throws IOException {
        log.info("Calculating zoom level");

        final Job job = new Job(config);
        HadoopUtils.setJar(job, IngestVectorDriver.class);

        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());

        final String jobName = "CalculateVectorZoom_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);

        final Configuration conf = job.getConfiguration();

        conf.setInt(MAX_ZOOM, zoomMax);
        conf.setInt(TILESIZE, Integer.parseInt(MrGeoProperties.getInstance()
                .getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT)));

        job.setInputFormatClass(IngestVectorGeometryInputFormat.class);
        job.setMapperClass(CalculateZoomMapper.class);
        job.setMapOutputKeyClass(TileIdZoomWritable.class);
        job.setMapOutputValueClass(LongWritable.class);

        for (final String input : inputs) {
            // Source file can be on the local file system (instead of hdfs), and
            // we call the following to circumvent a bug in Hadoop 20.2
            // FileInputFormat.addInputPath.
            HadoopVectorUtils.addInputPath(job, new Path(input));
        }

        job.setReducerClass(CalculateZoomReducer.class);
        String output = new Path(HadoopFileUtils.getTempDir(), HadoopUtils.createRandomString(40)).toString();
        try {
            FileOutputFormat.setOutputPath(job, new Path(output));

            job.setOutputKeyClass(IntWritable.class);
            job.setOutputValueClass(LongWritable.class);

            try {
                job.submit();
                final boolean success = job.waitForCompletion(true);

                if (success) {
                    return readZoomlevel(output, conf);
                }

            } catch (final InterruptedException | ClassNotFoundException e) {
                // Log the failure; the default zoom level of 0 is returned below.
                log.error("Zoom-level calculation job failed", e);
            }
        } finally {
            HadoopFileUtils.delete(output);
        }

        return 0;
    }

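    /**
     * Configures the TileIdPartitioner for a tiled job: converts the world
     * bounds to tile bounds at the given zoom level and builds an
     * ImageSplitGenerator over that range, using the increment from
     * TileIdPartitioner.INCREMENT_KEY (defaulting to 1). Note that this private
     * helper is not invoked elsewhere in this listing.
     */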
    private static Path setupPartitioner(Job job, Bounds bounds, int zoomlevel, int tilesize) throws IOException {
        MapReduceUtils.setupTiledJob(job);

        Configuration conf = job.getConfiguration();

        // Set up partitioner
        TMSUtils.TileBounds tb = TMSUtils.boundsToTileExact(bounds.getTMSBounds(), zoomlevel, tilesize);

        final LongRectangle tileBounds = new LongRectangle(tb.w, tb.s, tb.e, tb.n);

        //    final int tileSizeBytes = tilesize * tilesize *
        //        metadata.getBands() * RasterUtils.getElementSize(metadata.getTileType());

        int increment = conf.getInt(TileIdPartitioner.INCREMENT_KEY, -1);
        Path splitFileTmp;
        if (increment == -1) {
            increment = 1;
        }

        // if increment is provided, use it to setup the partitioner
        splitFileTmp = TileIdPartitioner.setup(job, new ImageSplitGenerator(tileBounds.getMinX(),
                tileBounds.getMinY(), tileBounds.getMaxX(), tileBounds.getMaxY(), zoomlevel, increment));

        //    } 
        //    else 
        //    {
        //      FileSystem fs = HadoopFileUtils.getFileSystem(conf);
        //      
        //      // if increment is not provided, set up the partitioner using max partitions 
        //      String strMaxPartitions = conf.get(TileIdPartitioner.MAX_PARTITIONS_KEY);
        //      if (strMaxPartitions != null)
        //      {
        //        // We know the max partitions conf setting exists, let's go read it. The
        //        // 1000 hard-coded default value is never used.
        //        int maxPartitions = conf.getInt(TileIdPartitioner.MAX_PARTITIONS_KEY, 1000);
        //        splitFileTmp = TileIdPartitioner.setup(job,
        //          new ImageSplitGenerator(tileBounds, zoomlevel,
        //            tileSizeBytes, fs.getDefaultBlockSize(), maxPartitions));
        //      }
        //      else
        //      {
        //        splitFileTmp = TileIdPartitioner.setup(job,
        //          new ImageSplitGenerator(tileBounds, zoomlevel,
        //            tileSizeBytes, fs.getDefaultBlockSize()));
        //      }
        //    }

        return splitFileTmp;
    }
}