Java tutorial: SpatialSite, the configuration class of SpatialHadoop
/***********************************************************************
 * Copyright (c) 2015 by Regents of the University of Minnesota.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Apache License, Version 2.0 which
 * accompanies this distribution and is available at
 * http://www.opensource.org/licenses/apache2.0.php.
 *************************************************************************/
package edu.umn.cs.spatialHadoop.core;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.indexing.GlobalIndex;
import edu.umn.cs.spatialHadoop.indexing.Partition;
import edu.umn.cs.spatialHadoop.indexing.RTree;
import edu.umn.cs.spatialHadoop.mapred.RandomShapeGenerator.DistributionType;
import edu.umn.cs.spatialHadoop.mapred.ShapeIterRecordReader;
import edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.ShapeIterator;
import edu.umn.cs.spatialHadoop.util.FileUtil;

/**
 * Combines all the configuration needed for SpatialHadoop.
 *
 * @author Ahmed Eldawy
 */
public class SpatialSite {

  private static final Log LOG = LogFactory.getLog(SpatialSite.class);
  /**
   * A filter that selects visible files and filters out hidden files.
   * Hidden files are the ones with names starting with '.' or '_'
   */
  public static final PathFilter NonHiddenFileFilter = new PathFilter() {
    public boolean accept(Path p) {
      String name = p.getName();
      return !name.startsWith("_") && !name.startsWith(".");
    }
  };

  /**Configuration line to store column boundaries on which intermediate data is split*/
  public static final String ColumnBoundaries = "SpatialSite.ReduceSpaceBoundaries";

  /**Enforce static-only calls*/
  private SpatialSite() { }

  /**The class used to filter blocks before starting map tasks*/
  public static final String FilterClass = "spatialHadoop.mapreduce.filter";

  /**Whether to build the RTree in fast mode or slow (memory-saving) mode*/
  public static final String RTREE_BUILD_MODE = "spatialHadoop.storage.RTreeBuildMode";

  /**Configuration line name for replication overhead*/
  public static final String INDEXING_OVERHEAD = "spatialHadoop.storage.IndexingOverhead";

  /**Ratio of the sample to read from files to build a global R-tree*/
  public static final String SAMPLE_RATIO = "spatialHadoop.storage.SampleRatio";

  /**Size of the sample to read from files to build a global R-tree*/
  public static final String SAMPLE_SIZE = "spatialHadoop.storage.SampleSize";

  /**
   * A marker put in the beginning of each block to indicate that this block
   * is stored as an RTree. It might be better to store this in the BlockInfo
   * in a field (e.g. localIndexType).
   */
  public static final long RTreeFileMarker = -0x00012345678910L;

  public static final String OUTPUT_CELLS = "edu.umn.cs.spatial.mapReduce.GridOutputFormat.CellsInfo";
  public static final String OVERWRITE = "edu.umn.cs.spatial.mapReduce.GridOutputFormat.Overwrite";

  private static final CompressionCodecFactory compressionCodecs =
      new CompressionCodecFactory(new Configuration());

  /**
   * Maximum number of shapes to read in one read operation and return when
   * reading a file as an array
   */
  public static final String MaxShapesInOneRead = "spatialHadoop.mapred.MaxShapesPerRead";

  /**
   * Maximum size in bytes that can be read in one read operation
   */
  public static final String MaxBytesInOneRead = "spatialHadoop.mapred.MaxBytesPerRead";

  public static byte[] RTreeFileMarkerB;

  static {
    // Load configuration from files
    Configuration.addDefaultResource("spatial-default.xml");
    Configuration.addDefaultResource("spatial-site.xml");

    // Serialize the R-tree marker into a byte array for quick comparison
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    DataOutputStream dout = new DataOutputStream(bout);
    try {
      dout.writeLong(RTreeFileMarker);
      dout.close();
      bout.close();
      RTreeFileMarkerB = bout.toByteArray();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
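Before moving on, here is a minimal sketch of how NonHiddenFileFilter can be used with Hadoop's FileSystem API to list only the data files of a directory, skipping files such as '_master' that start with '_' or '.'. The directory name "parts" is a hypothetical example.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import edu.umn.cs.spatialHadoop.core.SpatialSite;

public class ListVisibleFiles {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // listStatus accepts a PathFilter; hidden files are filtered out
    FileStatus[] visible = fs.listStatus(new Path("parts"),
        SpatialSite.NonHiddenFileFilter);
    for (FileStatus f : visible)
      System.out.println(f.getPath().getName());
  }
}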
  /**
   * Sets the given class in the configuration and, in addition, adds the jar
   * of that class to the class path of this job, which allows it to run
   * correctly in distributed mode.
   * @param conf - Configuration to set the key
   * @param key - the key to set
   * @param klass - the class to use as a value
   * @param xface - the interface that the provided class should implement
   */
  public static void setClass(Configuration conf, String key, Class<?> klass,
      Class<?> xface) {
    conf.setClass(key, klass, xface);
    addClassToPath(conf, klass);
  }

  private static String findContainingJar(Class my_class) {
    ClassLoader loader = my_class.getClassLoader();
    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
    try {
      for (Enumeration<URL> itr = loader.getResources(class_file);
           itr.hasMoreElements();) {
        URL url = (URL) itr.nextElement();
        if ("jar".equals(url.getProtocol())) {
          String toReturn = url.getPath();
          if (toReturn.startsWith("file:")) {
            toReturn = toReturn.substring("file:".length());
          }
          toReturn = URLDecoder.decode(toReturn, "UTF-8");
          return toReturn.replaceAll("!.*$", "");
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return null;
  }

  /**
   * Ensures that the given class is in the class path of running jobs.
   * If the jar is not already in the class path, it is added to the
   * DistributedCache of the given job to ensure the associated job will run
   * correctly.
   * @param conf
   * @param klass
   */
  public static void addClassToPath(Configuration conf, Class<?> klass) {
    // Check if we need to add the containing jar to the class path
    String klassJar = findContainingJar(klass);
    String shadoopJar = findContainingJar(SpatialSite.class);
    if (klassJar == null || (shadoopJar != null && klassJar.equals(shadoopJar)))
      return;
    Path containingJar = new Path(klassJar);
    Path[] existingClassPaths = DistributedCache.getArchiveClassPaths(conf);
    if (existingClassPaths != null) {
      for (Path existingClassPath : existingClassPaths) {
        if (containingJar.getName().equals(existingClassPath.getName()))
          return;
      }
    }
    // The containing jar is a new one and needs to be copied to the class path
    try {
      LOG.info("Adding JAR '" + containingJar.getName() + "' to job class path");
      FileSystem defaultFS = FileSystem.get(conf);
      Path libFolder;
      if (existingClassPaths != null && existingClassPaths.length > 0) {
        libFolder = existingClassPaths[0].getParent();
      } else {
        // First jar to be added like this. Create a new lib folder
        do {
          libFolder = new Path("lib_" + (int) (Math.random() * 100000));
        } while (defaultFS.exists(libFolder));
        defaultFS.mkdirs(libFolder);
        defaultFS.deleteOnExit(libFolder);
      }
      defaultFS.copyFromLocalFile(containingJar, libFolder);
      Path jarFullPath =
          new Path(libFolder, containingJar.getName()).makeQualified(defaultFS);
      DistributedCache.addArchiveToClassPath(jarFullPath, conf);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Creates a stock shape according to the given configuration. It is a
   * shortcut to {@link OperationsParams#getShape(Configuration, String)}
   * called with this configuration and the parameter name "shape".
   * @param job
   * @return
   */
  public static Shape createStockShape(Configuration job) {
    return OperationsParams.getShape(job, "shape");
  }
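The following is a minimal sketch of registering a user-supplied class with setClass so that its containing jar ships with the job. The BlockFilterExample interface and KeepAllFilter class are hypothetical placeholders invented for illustration; they are not part of SpatialHadoop.

import org.apache.hadoop.conf.Configuration;

import edu.umn.cs.spatialHadoop.core.SpatialSite;

// Hypothetical filter interface and implementation, for illustration only
interface BlockFilterExample { boolean keep(String blockName); }

class KeepAllFilter implements BlockFilterExample {
  public boolean keep(String blockName) { return true; }
}

public class RegisterFilter {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Store the class under the FilterClass key; its jar (if any) is added
    // to the job class path by addClassToPath
    SpatialSite.setClass(conf, SpatialSite.FilterClass,
        KeepAllFilter.class, BlockFilterExample.class);
    System.out.println(conf.get(SpatialSite.FilterClass));
  }
}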
  /**
   * Sets the specified configuration parameter to the current value of the
   * shape. Both the class name and the shape value are encoded in one string
   * and set as the value of the configuration parameter. The shape can be
   * retrieved later using {@link OperationsParams#getShape(Configuration, String)}.
   * @param conf
   * @param param
   * @param shape
   * @deprecated Use {@link OperationsParams#setShape(Configuration,String,Shape)} instead
   */
  @Deprecated
  public static void setShape(Configuration conf, String param, Shape shape) {
    OperationsParams.setShape(conf, param, shape);
  }

  /**
   * Retrieves the value of a shape set earlier using
   * {@link OperationsParams#setShape(Configuration, String, Shape)}.
   * It reads the corresponding parameter and parses it to find the class name
   * and the shape value. First, a default object is created using
   * {@link Class#newInstance()}, then the value is parsed using
   * {@link Shape#fromText(Text)}.
   * @param conf
   * @param param
   * @return
   * @deprecated Use {@link OperationsParams#getShape(Configuration, String)} instead
   */
  @Deprecated
  public static Shape getShape(Configuration conf, String param) {
    return OperationsParams.getShape(conf, param);
  }

  /**
   * Returns the global index (partitions) of a file that is indexed using
   * the index command. If the file is not indexed, it returns null.
   * The return value is of type {@link GlobalIndex} where the generic
   * parameter is specified as {@link Partition}.
   * @param fs
   * @param dir
   * @return
   */
  public static GlobalIndex<Partition> getGlobalIndex(FileSystem fs, Path dir) {
    try {
      FileStatus[] allFiles;
      if (OperationsParams.isWildcard(dir)) {
        allFiles = fs.globStatus(dir);
      } else {
        allFiles = fs.listStatus(dir);
      }

      FileStatus masterFile = null;
      int nasaFiles = 0;
      for (FileStatus fileStatus : allFiles) {
        if (fileStatus.getPath().getName().startsWith("_master")) {
          if (masterFile != null)
            throw new RuntimeException("Found more than one master file in " + dir);
          masterFile = fileStatus;
        } else if (fileStatus.getPath().getName().toLowerCase()
            .matches(".*h\\d\\dv\\d\\d.*\\.(hdf|jpg|xml)")) {
          // Handle on-the-fly global indexes imposed from the file naming of NASA data
          nasaFiles++;
        }
      }

      if (masterFile != null) {
        // Read the master file and bulk load a global index from its partitions
        ShapeIterRecordReader reader = new ShapeIterRecordReader(
            fs.open(masterFile.getPath()), 0, masterFile.getLen());
        Rectangle dummy = reader.createKey();
        reader.setShape(new Partition());
        ShapeIterator values = reader.createValue();
        ArrayList<Partition> partitions = new ArrayList<Partition>();
        while (reader.next(dummy, values)) {
          for (Shape value : values) {
            partitions.add((Partition) value.clone());
          }
        }
        GlobalIndex<Partition> globalIndex = new GlobalIndex<Partition>();
        globalIndex.bulkLoad(partitions.toArray(new Partition[partitions.size()]));
        String extension = masterFile.getPath().getName();
        extension = extension.substring(extension.lastIndexOf('.') + 1);
        globalIndex.setCompact(GridRecordWriter.PackedIndexes.contains(extension));
        globalIndex.setReplicated(GridRecordWriter.ReplicatedIndexes.contains(extension));
        return globalIndex;
      } else if (nasaFiles > allFiles.length / 2) {
        // A folder that contains HDF files.
        // Create a global index on the fly for these files based on their names
        Partition[] partitions = new Partition[allFiles.length];
        for (int i = 0; i < allFiles.length; i++) {
          final Pattern cellRegex = Pattern.compile(".*(h\\d\\dv\\d\\d).*");
          String filename = allFiles[i].getPath().getName();
          Matcher matcher = cellRegex.matcher(filename);
          Partition partition = new Partition();
          partition.filename = filename;
          if (matcher.matches()) {
            String cellname = matcher.group(1);
            int h = Integer.parseInt(cellname.substring(1, 3));
            int v = Integer.parseInt(cellname.substring(4, 6));
            partition.cellId = v * 36 + h;
            // Calculate coordinates on the MODIS Sinusoidal grid
            partition.x1 = h * 10 - 180;
            partition.y2 = (18 - v) * 10 - 90;
            partition.x2 = partition.x1 + 10;
            partition.y1 = partition.y2 - 10;
            // Convert to latitude/longitude
            double lon1 = partition.x1 / Math.cos(partition.y1 * Math.PI / 180);
            double lon2 = partition.x1 / Math.cos(partition.y2 * Math.PI / 180);
            partition.x1 = Math.min(lon1, lon2);
            lon1 = partition.x2 / Math.cos(partition.y1 * Math.PI / 180);
            lon2 = partition.x2 / Math.cos(partition.y2 * Math.PI / 180);
            partition.x2 = Math.max(lon1, lon2);
          } else {
            partition.set(-180, -90, 180, 90);
            partition.cellId = allFiles.length + i;
          }
          partitions[i] = partition;
        }
        GlobalIndex<Partition> gindex = new GlobalIndex<Partition>();
        gindex.bulkLoad(partitions);
        return gindex;
      } else {
        return null;
      }
    } catch (IOException e) {
      LOG.info("Error retrieving the global index of '" + dir + "'");
      LOG.info(e);
      return null;
    }
  }
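Here is a minimal sketch of loading the global index of an indexed dataset and iterating over its partitions. The input path "indexed_data" is a hypothetical example; GlobalIndex is iterable over its partitions, as the method above itself relies on.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.indexing.GlobalIndex;
import edu.umn.cs.spatialHadoop.indexing.Partition;

public class PrintPartitions {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path input = new Path("indexed_data");
    FileSystem fs = input.getFileSystem(conf);
    GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, input);
    if (gIndex == null) {
      System.out.println("Not an indexed file");
    } else {
      // Print each partition's file name and minimum bounding rectangle
      for (Partition p : gIndex)
        System.out.println(p.filename + ": " + p.getMBR());
    }
  }
}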
  /**
   * Checks whether a file is indexed using an R-tree or not. This allows
   * an operation to use the R-tree to speed up processing if it exists.
   * This function opens the specified file and reads the first eight bytes,
   * which include the R-tree signature. If the signature matches the
   * R-tree signature, true is returned. Otherwise, false is returned.
   * If the parameter is a path to a directory, only the first data file in
   * that directory is tested.
   * @param fs
   * @param path
   * @return
   * @throws IOException
   */
  public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
      return true;

    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
      // Check any cell (e.g., the first cell)
      GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
      if (gIndex == null)
        return false;
      fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
      fileToCheck = file.getPath();
    }

    InputStream fileIn = fs.open(fileToCheck);

    // Check if the file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
      synchronized (compressionCodecs) {
        // CodecPool is not thread-safe
        decompressor = CodecPool.getDecompressor(codec);
      }
      fileIn = codec.createInputStream(fileIn, decompressor);
    }
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature);
    fileIn.close();
    if (decompressor != null) {
      CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
  }

  /**
   * Returns the cells (partitions) of a file. This functionality can be useful
   * to repartition another file using the same partitioning or to draw
   * these partitions as a high-level index. This function reads the master
   * file and returns all rectangles in it.
   * @param fs
   * @param path
   * @return
   * @throws IOException
   */
  public static CellInfo[] cellsOf(FileSystem fs, Path path) throws IOException {
    GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
    if (gIndex == null)
      return null;
    return cellsOf(gIndex);
  }
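A minimal sketch of using isRTree to choose a processing strategy follows. The path "indexed_data" is again a hypothetical example; an operation would typically branch to an R-tree-aware query plan when the signature is found.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import edu.umn.cs.spatialHadoop.core.SpatialSite;

public class CheckLocalIndex {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path input = new Path("indexed_data");
    FileSystem fs = input.getFileSystem(conf);
    if (SpatialSite.isRTree(fs, input)) {
      System.out.println("R-tree local index found; queries can use it");
    } else {
      System.out.println("No R-tree signature; falling back to a full scan");
    }
  }
}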
  /**
   * Sets an array of cells in the job configuration. As the array might be
   * too large to store as one value, an alternative approach is used.
   * The cells are all written to a temporary file, and that file is added
   * to the DistributedCache of the job. Later on, a call to
   * {@link #getCells(Configuration)} will open the corresponding file from
   * the DistributedCache and parse cells from that file.
   * @param conf
   * @param cellsInfo
   * @throws IOException
   */
  public static void setCells(Configuration conf, CellInfo[] cellsInfo)
      throws IOException {
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
      tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".cells");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    out.writeInt(cellsInfo.length);
    for (CellInfo cell : cellsInfo) {
      cell.write(out);
    }
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(OUTPUT_CELLS, tempFile.getName());
    LOG.info("Partitioning file into " + cellsInfo.length + " cells");
  }

  /**
   * Retrieves cells that were stored earlier using
   * {@link #setCells(Configuration, CellInfo[])}.
   * This function opens the corresponding file from the DistributedCache and
   * parses the cells from it.
   * @param conf
   * @return
   * @throws IOException
   */
  public static CellInfo[] getCells(Configuration conf) throws IOException {
    CellInfo[] cells = null;
    String cells_file = conf.get(OUTPUT_CELLS);
    if (cells_file != null) {
      Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
      for (Path cacheFile : cacheFiles) {
        if (cacheFile.getName().contains(cells_file)) {
          FSDataInputStream in = FileSystem.getLocal(conf).open(cacheFile);
          int cellCount = in.readInt();
          cells = new CellInfo[cellCount];
          for (int i = 0; i < cellCount; i++) {
            cells[i] = new CellInfo();
            cells[i].readFields(in);
          }
          in.close();
        }
      }
    }
    return cells;
  }

  /**
   * Sets a rectangle in a job configuration. The rectangle is serialized to
   * text using {@link Rectangle#toText(Text)}.
   * @param conf
   * @param name
   * @param rect
   */
  public static void setRectangle(Configuration conf, String name, Rectangle rect) {
    conf.set(name, rect.getMBR().toText(new Text()).toString());
  }

  /**
   * Retrieves a rectangle from a configuration parameter. The value is assumed
   * to be in a text format that can be parsed using {@link Rectangle#fromText(Text)}.
   * @param conf
   * @param name
   * @return
   */
  public static Rectangle getRectangle(Configuration conf, String name) {
    Rectangle rect = null;
    String rectStr = conf.get(name);
    if (rectStr != null) {
      rect = new Rectangle();
      rect.fromText(new Text(rectStr));
    }
    return rect;
  }

  /**
   * Retrieves the distribution type used for generating synthetic data.
   * @param job
   * @param key
   * @param defaultValue
   * @return
   */
  public static DistributionType getDistributionType(Configuration job,
      String key, DistributionType defaultValue) {
    DistributionType type = defaultValue;
    String strType = job.get(key);
    if (strType != null) {
      strType = strType.toLowerCase();
      if (strType.startsWith("uni"))
        type = DistributionType.UNIFORM;
      else if (strType.startsWith("gaus"))
        type = DistributionType.GAUSSIAN;
      else if (strType.startsWith("cor"))
        type = DistributionType.CORRELATED;
      else if (strType.startsWith("anti"))
        type = DistributionType.ANTI_CORRELATED;
      else if (strType.startsWith("circle"))
        type = DistributionType.CIRCLE;
      else
        type = null;
    }
    return type;
  }

  /**
   * Finds the partitioning info used in the given global index. If each cell
   * is represented as one partition, the MBRs of these partitions are returned.
   * If one cell is stored in multiple partitions (i.e., multiple files),
   * their MBRs are combined to produce one MBR for this cell.
   * @param gIndex
   * @return
   */
  public static CellInfo[] cellsOf(GlobalIndex<Partition> gIndex) {
    // Find all partitions of the given file. If two partitions have the same
    // cell ID, they are two blocks of the same cell. This means they
    // represent one partition and should be merged together. They might have
    // different MBRs as each block has its own MBR according to the data
    // stored in it.
    Map<Integer, CellInfo> cells = new HashMap<Integer, CellInfo>();
    for (Partition p : gIndex) {
      CellInfo cell = cells.get(p.cellId);
      if (cell == null) {
        cells.put(p.cellId, cell = new CellInfo(p));
      } else {
        cell.expand(p);
      }
    }
    return cells.values().toArray(new CellInfo[cells.size()]);
  }
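A minimal sketch of the setRectangle/getRectangle round trip follows, assuming Rectangle exposes an (x1, y1, x2, y2) constructor. The configuration key "query.mbr" is a hypothetical name chosen for illustration; in a real job the rectangle would be read back inside a map or reduce task.

import org.apache.hadoop.conf.Configuration;

import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.SpatialSite;

public class RectangleRoundTrip {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Serialize a query rectangle into the configuration as text
    SpatialSite.setRectangle(conf, "query.mbr", new Rectangle(0, 0, 10, 10));
    // Later, possibly on another machine, parse the same rectangle back
    Rectangle query = SpatialSite.getRectangle(conf, "query.mbr");
    System.out.println(query);
  }
}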
  public static <S extends Shape> RTree<S> loadRTree(FileSystem fs, Path file,
      S shape) throws IOException {
    RTree<S> rtree = new RTree<S>();
    rtree.setStockObject(shape);
    FSDataInputStream input = fs.open(file);
    input.skip(8); // Skip the 8 bytes that contain the signature
    rtree.readFields(input);
    return rtree;
  }

  public static CellInfo getCellInfo(GlobalIndex<Partition> gIndex, int cellID) {
    Map<Integer, CellInfo> cells = new HashMap<Integer, CellInfo>();
    for (Partition p : gIndex) {
      CellInfo cell = cells.get(p.cellId);
      if (cell == null) {
        cells.put(p.cellId, cell = new CellInfo(p));
      } else {
        cell.expand(p);
      }
    }
    return cells.get(cellID);
  }

  /**
   * Splits the reduce space vertically (into columns) among reducers.
   * @param job
   * @param inPaths
   * @param params
   * @throws IOException
   */
  public static void splitReduceSpace(Job job, Path[] inPaths,
      OperationsParams params) throws IOException {
    FileSystem inFs = inPaths[0].getFileSystem(params);
    GlobalIndex<Partition> gIndex = getGlobalIndex(inFs, inPaths[0]);
    if (gIndex == null)
      return; // No global index to split the space against

    // Merge partitions with overlapping x-ranges into columns
    List<Rectangle> columns = new ArrayList<Rectangle>();
    for (Partition p : gIndex) {
      double x1 = p.x1, x2 = p.x2;
      boolean matched = false;
      for (int iColumn = 0; iColumn < columns.size() && !matched; iColumn++) {
        Rectangle cmbr = columns.get(iColumn);
        double cx1 = cmbr.x1;
        double cx2 = cmbr.x2;
        if (x2 > cx1 && cx2 > x1) {
          matched = true;
          cmbr.expand(p);
        }
      }
      if (!matched) {
        // Create a new column
        columns.add(new Rectangle(p));
      }
    }

    ClusterStatus clusterStatus = new JobClient(new JobConf()).getClusterStatus();
    int numReducers = Math.min(columns.size(),
        Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));
    String columnBoundaries = "";
    for (int iReducer = 0; iReducer < numReducers; iReducer++) {
      if (iReducer > 0)
        columnBoundaries += ',';
      int col = (iReducer + 1) * columns.size() / numReducers - 1;
      columnBoundaries += columns.get(col).x2;
    }
    job.getConfiguration().set(ColumnBoundaries, columnBoundaries);
    job.setNumReduceTasks(numReducers);
  }

  public static double[] getReduceSpace(Configuration conf) {
    if (conf.get(ColumnBoundaries) == null)
      return null;
    String[] strBoundaries = conf.get(ColumnBoundaries).split(",");
    double[] columnBoundaries = new double[strBoundaries.length];
    for (int iCol = 0; iCol < strBoundaries.length; iCol++)
      columnBoundaries[iCol] = Double.parseDouble(strBoundaries[iCol]);
    return columnBoundaries;
  }
}
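To close, here is a sketch of how the boundaries written by splitReduceSpace might drive a partitioner. The boundaries are the x2 values of selected columns, ordered left to right, so a record can be routed to the first reducer whose boundary is at or beyond the record's x-coordinate. This routing logic is an assumption for illustration only, not code from SpatialSite itself.

public class ColumnPartitionerSketch {
  // Returns the index of the reducer responsible for x-coordinate x,
  // given the column boundaries parsed by SpatialSite.getReduceSpace
  static int reducerFor(double x, double[] boundaries) {
    for (int i = 0; i < boundaries.length; i++) {
      if (x <= boundaries[i])
        return i;
    }
    return boundaries.length - 1; // Points beyond the last column
  }

  public static void main(String[] args) {
    double[] boundaries = {-60.0, 0.0, 60.0, 180.0};
    System.out.println(reducerFor(-100.0, boundaries)); // 0
    System.out.println(reducerFor(30.0, boundaries));   // 2
  }
}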