edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.java Source code

Introduction

Here is the source code for edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.java
Source

/***********************************************************************
* Copyright (c) 2015 by Regents of the University of Minnesota.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Apache License, Version 2.0 which 
* accompanies this distribution and is available at
* http://www.opensource.org/licenses/apache2.0.php.
*
*************************************************************************/
package edu.umn.cs.spatialHadoop.mapred;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.apache.hadoop.util.LineReader;

import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.Shape;
import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.indexing.GlobalIndex;
import edu.umn.cs.spatialHadoop.indexing.Partition;
import edu.umn.cs.spatialHadoop.indexing.RTree;

/**
 * A base class to read shapes from files. It reads either single shapes,
 * list of shapes, or R-trees. It automatically detects the format of the
 * underlying block and parses it accordingly.
 * 
 * This class implements the RecordReader interface, allowing it to be used in
 * MapReduce programs with an appropriate InputFormat. The key is always
 * a {@link Rectangle} that indicates the MBR of the corresponding partition.
 * In case of a non-indexed file, the key is an invalid rectangle. See
 * {@link Rectangle#isValid()}.
 * 
 * @see ShapeLineRecordReader
 * @see ShapeRecordReader
 * @see ShapeArrayRecordReader
 * @see RTreeRecordReader
 * @author Ahmed Eldawy
 *
 */
public abstract class SpatialRecordReader<K, V> implements RecordReader<K, V> {
    /** Logger used by this class. */
    private static final Log LOG = LogFactory.getLog(SpatialRecordReader.class);

    /**
     * Maximum number of shapes to read in one operation to return as an array.
     * A value of zero or less disables this limit.
     */
    private int maxShapesInOneRead;
    /**
     * Maximum size in bytes that can be read in one read.
     * A value of zero or less disables this limit.
     */
    private int maxBytesInOneRead;

    /**
     * Layout of the block being parsed. A block that starts with the R-tree
     * file marker is {@code RTREE}; every other block is {@code HEAP} and is
     * read line by line.
     */
    enum BlockType {
        HEAP, RTREE
    };

    /** First offset that is read from the input */
    protected long start;
    /** Last offset to stop at */
    protected long end;
    /** Position of the next byte to read/parse from the input */
    protected long pos;
    /**
     * Input stream that records are parsed from. For compressed files this is
     * the decompressing stream; otherwise it is the raw file stream.
     */
    private InputStream in;

    /**
     * Source of the position reported for compressed input. It is left
     * {@code null} when the reader is constructed directly from a stream.
     */
    private Seekable filePosition;
    /** Compression codec of the file; {@code null} means the input is not compressed */
    private CompressionCodec codec;
    /** Decompressor borrowed from {@link CodecPool}; it is returned when the reader is closed */
    private Decompressor decompressor;

    /** Reads lines from text files */
    protected LineReader lineReader;
    /** A temporary text to read lines from lineReader */
    protected Text tempLine = new Text();
    /** Some bytes that were read from the stream but not parsed yet */
    protected byte[] buffer;

    /** File system of the file being parsed */
    private FileSystem fs;

    /** The path of the parsed file */
    private Path path;

    /** Block size for the read file. Used with RTrees */
    protected long blockSize;

    /** The boundary of the partition currently being read */
    protected Rectangle cellMbr;

    /**The type of the currently parsed block*/
    protected BlockType blockType;

    /**
     * The input stream that reads directly from the input file.
     * If the file is not compressed, this stream is the same as the in.
     * Otherwise, this is the raw (compressed) input stream. This stream is used
     * to calculate the progress of the input file.
     */
    private FSDataInputStream directIn;

    /**
     * Initializes the reader from one file of a combined split. The start
     * offset, length and path stored in the split at position {@code index}
     * are forwarded to the path-based constructor.
     *
     * @param split the combined split that contains the file to read
     * @param conf the job configuration, forwarded to the path-based constructor
     * @param reporter not used by this constructor
     * @param index the position, within {@code split}, of the file to read
     * @throws IOException if the path-based constructor fails
     */
    public SpatialRecordReader(CombineFileSplit split, Configuration conf, Reporter reporter, Integer index)
            throws IOException {
        // A constructor delegation must be the first statement, so the split
        // is unpacked inline.
        this(conf, split.getStartOffsets()[index], split.getLength(index), split.getPath(index));
    }

    /**
     * Initializes the reader from a {@link FileSplit} by forwarding the
     * split's start offset, length and path to the path-based constructor.
     *
     * @param job the job configuration, forwarded to the path-based constructor
     * @param split the file split to read
     * @throws IOException if the path-based constructor fails
     */
    public SpatialRecordReader(Configuration job, FileSplit split) throws IOException {
        this(job, split.getStart(), split.getLength(), split.getPath());
    }

    /**
     * Opens a file and prepares to read the byte range {@code [s, s + l)}.
     * The compression codec is looked up from the file path. Depending on the
     * result, the reader works on the raw stream (after seeking to the start
     * offset), on a splittable compressed stream (the range is replaced by the
     * adjusted start and end reported by that stream), or on a non-splittable
     * compressed stream. Finally, the read limits are loaded from the
     * configuration and {@link #initializeReader()} is called.
     *
     * @param job the configuration used to resolve the file system, the codec
     *            and the read limits
     * @param s the offset of the first byte of the range
     * @param l the length of the range in bytes
     * @param p the path of the file to read
     * @throws IOException if the file cannot be opened or initialized
     */
    public SpatialRecordReader(Configuration job, long s, long l, Path p) throws IOException {
        final long rangeEnd = s + l;
        this.path = p;
        this.start = s;
        this.end = rangeEnd;
        LOG.info("Open a SpatialRecordReader to file: " + p + "[" + s + "," + rangeEnd + ")");

        this.fs = this.path.getFileSystem(job);
        this.directIn = this.fs.open(this.path);
        this.blockSize = this.fs.getFileStatus(this.path).getBlockSize();
        this.cellMbr = new Rectangle();
        this.codec = new CompressionCodecFactory(job).getCodec(this.path);

        if (!isCompressedInput()) {
            // Plain file: read straight from the file stream
            this.directIn.seek(this.start);
            this.in = this.directIn;
            this.filePosition = this.directIn;
        } else {
            this.decompressor = CodecPool.getDecompressor(this.codec);
            if (!(this.codec instanceof SplittableCompressionCodec)) {
                // Non-splittable codec: decompress the stream as a whole
                this.in = this.codec.createInputStream(this.directIn, this.decompressor);
                this.filePosition = this.directIn;
            } else {
                final SplittableCompressionCodec splittable = (SplittableCompressionCodec) this.codec;
                final SplitCompressionInputStream cIn = splittable.createInputStream(
                        this.directIn, this.decompressor, this.start, this.end,
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                this.in = cIn;
                // The codec may move the range boundaries
                this.start = cIn.getAdjustedStart();
                this.end = cIn.getAdjustedEnd();
                // Take the position from the compressed stream
                this.filePosition = cIn;
            }
        }

        this.pos = this.start;
        this.maxShapesInOneRead = job.getInt(SpatialSite.MaxShapesInOneRead, 1000000);
        this.maxBytesInOneRead = job.getInt(SpatialSite.MaxBytesInOneRead, 32 * 1024 * 1024);

        initializeReader();
    }

    /**
     * Builds a reader on top of a stream that is already positioned at the
     * first byte to read. No path or file system is associated with the
     * reader in this case.
     *
     * @param in the stream to read from
     * @param offset the offset that corresponds to the current stream position
     * @param endOffset the offset at which reading should stop
     * @throws IOException if {@link #initializeReader()} fails
     */
    public SpatialRecordReader(InputStream in, long offset, long endOffset) throws IOException {
        this.cellMbr = new Rectangle();
        this.in = in;
        this.end = endOffset;
        // Reading starts at the given offset, so both markers begin there
        this.start = offset;
        this.pos = offset;
        initializeReader();
    }

    /**
     * Returns the position used to decide whether the end of the range has
     * been reached. For compressed input with a known position source, this
     * is the position reported by that source; otherwise it is the parse
     * position {@link #pos}.
     *
     * @return the current position as described above
     * @throws IOException if the position source fails to report its position
     */
    private long getFilePosition() throws IOException {
        final boolean useStreamPosition = isCompressedInput() && filePosition != null;
        return useStreamPosition ? filePosition.getPos() : pos;
    }

    /**
     * Tells whether the file being parsed is compressed, i.e., whether a
     * compression codec has been set for it.
     *
     * @return {@code true} if a codec is set, {@code false} otherwise
     */
    private boolean isCompressedInput() {
        if (codec == null) {
            return false;
        }
        return true;
    }

    /**
     * Returns the parse position, i.e., the value of {@link #pos}. For
     * compressed files this is a position in the decompressed data and may
     * differ from the value returned by {@link #getFilePosition()}.
     *
     * @return the current parse position
     * @throws IOException declared by the interface; not thrown here
     */
    @Override
    public long getPos() throws IOException {
        return this.pos;
    }

    /**
     * Closes the underlying input and returns the decompressor, if any, to
     * the {@link CodecPool}. When a line reader exists it is closed instead
     * of the raw stream. The method is safe to call more than once: the
     * decompressor reference is cleared after it is returned so that the same
     * instance is never handed back to the pool twice.
     *
     * @throws IOException if closing the line reader or the stream fails
     */
    @Override
    public void close() throws IOException {
        try {
            if (lineReader != null) {
                lineReader.close();
            } else if (in != null) {
                in.close();
            }
            lineReader = null;
            in = null;
        } finally {
            if (decompressor != null) {
                CodecPool.returnDecompressor(decompressor);
                // Forget the instance; returning it again on a second close()
                // would let the pool hand it to two users at once.
                decompressor = null;
            }
        }
    }

    /**
     * Reports the fraction of the range that has been consumed, capped at
     * {@code 1.0}. An empty range reports {@code 0.0}.
     *
     * The position is taken from the raw file stream when there is one. A
     * reader constructed directly from an input stream has no raw file
     * stream, so the value of {@link #getFilePosition()} is used instead.
     *
     * @return the progress as a value that does not exceed {@code 1.0}
     * @throws IOException if the current position cannot be obtained
     */
    @Override
    public float getProgress() throws IOException {
        if (start == end) {
            return 0.0f;
        }
        // directIn is only assigned by the path-based constructor
        long consumedUpTo = directIn != null ? directIn.getPos() : getFilePosition();
        return Math.min(1.0f, (consumedUpTo - start) / (float) (end - start));
    }

    /**
     * Initializes the reader to read from the input stream or file.
     * First, the partition MBR is invalidated and, if a path is known and a
     * global index exists for its parent directory, set from the partition
     * whose file name matches. Then up to eight bytes are read from the input
     * to detect the block type:
     * <ul>
     * <li>If they equal the R-tree file marker, the block is an R-tree block
     * and the marker is consumed.</li>
     * <li>Otherwise the block is a heap (text) block. The bytes already read
     * are kept in {@link #buffer} and, unless the position is zero, the first
     * (possibly partial) line is skipped.</li>
     * </ul>
     *
     * @return always {@code true}
     * @throws IOException if reading from the input or loading the global
     *                     index fails
     */
    protected boolean initializeReader() throws IOException {
        // Get the cell info for the current block
        cellMbr.invalidate(); // Initialize to invalid rectangle
        if (path != null) {
            GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(fs, path.getParent());
            if (globalIndex != null) {
                for (Partition partition : globalIndex) {
                    if (partition.filename.equals(path.getName())) {
                        cellMbr.set(partition);
                    }
                }
            }
        }

        // Read the first part of the block to determine its type.
        // A single read() call is allowed to return fewer bytes than requested
        // even before the end of the stream, so keep reading until the
        // signature is complete or the stream ends.
        byte[] signature = new byte[8];
        int bufferLength = 0;
        while (bufferLength < signature.length) {
            int n = in.read(signature, bufferLength, signature.length - bufferLength);
            if (n <= 0) {
                // End of stream (or a stream that makes no progress)
                break;
            }
            bufferLength += n;
        }
        if (bufferLength <= 0) {
            buffer = null;
        } else if (bufferLength < signature.length) {
            // Keep only the bytes that were actually read
            buffer = Arrays.copyOf(signature, bufferLength);
        } else {
            buffer = signature;
        }

        if (buffer != null && Arrays.equals(buffer, SpatialSite.RTreeFileMarkerB)) {
            blockType = BlockType.RTREE;
            pos += 8;
            // Ignore the signature
            buffer = null;
        } else {
            blockType = BlockType.HEAP;
            // The read buffer might contain some data that must be read
            // File is text file
            lineReader = new LineReader(in);

            // Skip the first line unless we are reading the first block in file
            // For globally indexed blocks, never skip the first line in the block
            boolean skipFirstLine = getPos() != 0;
            if (buffer != null && skipFirstLine) {
                // Search for the first occurrence of a new line
                int eol = RTree.skipToEOL(buffer, 0);
                // If we found an end of line in the buffer, we do not need to skip
                // a line from the open stream. A line must be skipped from the
                // stream only if the EOL returned is at or beyond the end of
                // the buffer and the buffer is not a complete line by itself
                boolean skip_another_line_from_stream = eol >= buffer.length && buffer[buffer.length - 1] != '\n';
                if (eol < buffer.length) {
                    // Found an EOL in the buffer and there are some remaining bytes
                    byte[] tmp = new byte[buffer.length - eol];
                    System.arraycopy(buffer, eol, tmp, 0, tmp.length);
                    buffer = tmp;
                    // Advance current position to skip the first partial line
                    this.pos += eol;
                } else {
                    // Did not find an EOL in the buffer or found it at the very end
                    pos += buffer.length;
                    // Buffer does not contain any useful data
                    buffer = null;
                }

                if (skip_another_line_from_stream) {
                    // Didn't find an EOL in the buffer, need to skip it from the stream.
                    // Clamp the limit so that ranges longer than Integer.MAX_VALUE
                    // bytes do not overflow when narrowed to int.
                    int maxBytesToConsume = (int) Math.min((long) Integer.MAX_VALUE, end - pos);
                    pos += lineReader.readLine(tempLine, Integer.MAX_VALUE, maxBytesToConsume);
                    if (pos >= end) {
                        // Special case when the whole split is in the middle of a line
                        // Skip the split
                        // Increase position beyond end to ensure the next call to
                        // nextLine would return false
                        pos++;
                    }
                }
            }
        }

        return true;
    }

    /**
     * Reads the next line from input and returns true if a line was read.
     * If no more lines are available in this split, false is returned.
     *
     * Bytes left in {@link #buffer} by the initialization step are consumed
     * first. If they already form a complete line, that line is returned as
     * is, including its terminating end-of-line bytes. Otherwise, the rest of
     * the line is read from the stream and appended. Lines of at most one
     * byte are skipped. In every case {@link #pos} is advanced by the number
     * of bytes consumed.
     *
     * For an R-tree block whose header has not been skipped yet, the header
     * is skipped first and a new line reader is created on the stream.
     *
     * @param value the text that receives the line; it is cleared first
     * @return {@code true} if a line was read, {@code false} at the end of the
     *         split or stream
     * @throws IOException if reading from the input fails
     */
    protected boolean nextLine(Text value) throws IOException {
        if (blockType == BlockType.RTREE && pos == 8) {
            // File is positioned at the RTree header
            // Skip the header and go to first data object in file
            pos += RTree.skipHeader(in);
            LOG.info("Skipped R-tree to position: " + pos);
            // Reinitialize record reader at the new position
            lineReader = new LineReader(in);
        }
        while (getFilePosition() <= end) {
            value.clear();
            int b = 0;
            if (buffer != null) {
                // Read the first line encountered in buffer
                // NOTE(review): skipToEOL is assumed to return the offset just
                // past the first end-of-line sequence - confirm against RTree.
                int eol = RTree.skipToEOL(buffer, 0);
                b += eol;
                value.append(buffer, 0, eol);
                if (eol < buffer.length) {
                    // There are still some bytes remaining in buffer
                    byte[] tmp = new byte[buffer.length - eol];
                    System.arraycopy(buffer, eol, tmp, 0, tmp.length);
                    buffer = tmp;
                } else {
                    buffer = null;
                }
                // Check if a complete line has been read from the buffer
                byte last_byte = value.getBytes()[value.getLength() - 1];
                if (last_byte == '\n' || last_byte == '\r') {
                    // Count the bytes taken from the buffer; otherwise the
                    // position lags behind and the end-of-split check above
                    // lets the reader run past the end of its range.
                    pos += b;
                    return true;
                }
            }

            // Read the first line from stream
            Text temp = new Text();
            b += lineReader.readLine(temp);
            if (b == 0) {
                // Indicates an end of stream
                return false;
            }
            pos += b;

            // Append the part read from stream to the part extracted from buffer
            value.append(temp.getBytes(), 0, temp.getLength());

            if (value.getLength() > 1) {
                // Read a non-empty line; shorter lines are skipped
                return true;
            }
        }
        // Reached end of file
        return false;
    }

    /**
     * Reads the next shape from the input. A line is fetched into
     * {@link #tempLine} by calling {@link #nextLine(Text)} and, if one was
     * read, it is parsed into the given shape by calling
     * {@link Shape#fromText(Text)}.
     *
     * @param s the shape object to fill from the next line
     * @return {@code true} if a shape was read, {@code false} if no more lines
     *         are left in the split
     * @throws IOException if reading the line fails
     */
    protected boolean nextShape(Shape s) throws IOException {
        boolean lineAvailable = nextLine(tempLine);
        if (lineAvailable) {
            s.fromText(tempLine);
        }
        return lineAvailable;
    }

    /**
     * Reads a batch of shapes from the current block in one shot. A stock
     * object of the array's value class is created, then shapes are read by
     * calling {@link #nextShape(Shape)} and cloned into the result until one
     * of the following conditions happens.
     * 1. No more records are left to read.
     * 2. The number of parsed records reaches the maximum number of shapes
     *    per read. A limit of zero or less disables this check.
     * 3. The number of bytes parsed in this call reaches the maximum number
     *    of bytes per read. A limit of zero or less disables this check.
     *
     * If the stock object cannot be instantiated, the error is logged and
     * {@code false} is returned.
     *
     * @param shapes the array that receives the shapes read; its value class
     *               determines the type of shape to create
     * @return {@code true} if at least one shape was read, {@code false}
     *         otherwise
     * @throws IOException if reading from the input fails
     */
    protected boolean nextShapes(ArrayWritable shapes) throws IOException {
        // Prepare a vector that will hold all objects in this batch
        Vector<Shape> vshapes = new Vector<Shape>();
        try {
            Shape stockObject = (Shape) shapes.getValueClass().newInstance();
            // Reached the end of this split
            if (getFilePosition() >= end)
                return false;

            long initialReadPos = getPos();
            long readBytes = 0;

            // Read all shapes in this block
            while ((maxShapesInOneRead <= 0 || vshapes.size() < maxShapesInOneRead)
                    && (maxBytesInOneRead <= 0 || readBytes < maxBytesInOneRead) && nextShape(stockObject)) {
                // The stock object is reused for parsing, so store a copy
                vshapes.add(stockObject.clone());
                readBytes = getPos() - initialReadPos;
            }

            // Store them in the return value
            shapes.set(vshapes.toArray(new Shape[vshapes.size()]));

            return !vshapes.isEmpty();
        } catch (InstantiationException e1) {
            // Report through the class logger so the failure shows up in the task log
            LOG.error("Cannot instantiate shape class " + shapes.getValueClass(), e1);
        } catch (IllegalAccessException e1) {
            LOG.error("Cannot access constructor of shape class " + shapes.getValueClass(), e1);
        } catch (OutOfMemoryError e) {
            LOG.error("Error reading shapes. Stopped with " + vshapes.size() + " shapes");
            throw e;
        }
        return false;
    }

    /**
     * Attaches this reader to the given iterator and tells whether the
     * iterator has at least one shape to return.
     *
     * @param iter the iterator to attach this reader to
     * @return the value of {@code iter.hasNext()} after the reader is attached
     * @throws IOException declared for subclasses; reading errors inside the
     *                     iterator are not propagated by it
     */
    protected boolean nextShapeIter(ShapeIterator iter) throws IOException {
        SpatialRecordReader<?, ? extends Shape> self = (SpatialRecordReader<?, ? extends Shape>) this;
        iter.setSpatialRecordReader(self);
        return iter.hasNext();
    }

    /**
     * An iterator that iterates over all shapes in the input file
     * @author Eldawy
     */
    public static class ShapeIterator implements Iterator<Shape>, Iterable<Shape> {
        protected Shape shape;
        protected Shape nextShape;
        private SpatialRecordReader<?, ? extends Shape> srr;

        public ShapeIterator() {
        }

        public void setSpatialRecordReader(SpatialRecordReader<?, ? extends Shape> srr) {
            this.srr = srr;
            try {
                if (shape != null)
                    nextShape = shape.clone();
                if (nextShape != null && !srr.nextShape(nextShape))
                    nextShape = null;
            } catch (IOException e) {
            }
        }

        public void setShape(Shape shape) {
            this.shape = shape;
            this.nextShape = shape.clone();
            try {
                if (srr != null && !srr.nextShape(nextShape))
                    nextShape = null;
            } catch (IOException e) {
            }
        }

        public boolean hasNext() {
            if (nextShape == null)
                return false;
            return nextShape != null;
        }

        @Override
        public Shape next() {
            try {
                if (nextShape == null)
                    return null;
                // Swap Shape and nextShape and read next
                Shape temp = shape;
                shape = nextShape;
                nextShape = temp;

                if (!srr.nextShape(nextShape))
                    nextShape = null;
                return shape;
            } catch (IOException e) {
                return null;
            }
        }

        @Override
        public Iterator<Shape> iterator() {
            return this;
        }

        @Override
        public void remove() {
            throw new RuntimeException("Unsupported method ShapeIterator#remove");
        }

    }

    /**
     * Reads the R-tree stored in the current block. The block must be an
     * R-tree block; otherwise, a runtime exception is thrown. The tree can be
     * read only while the position is right after the eight-byte signature;
     * at any other position nothing is read and false is returned. The tree
     * is parsed by calling {@link RTree#readFields(DataInput)} on the input
     * stream, and the position is then incremented so that a subsequent call
     * returns false.
     *
     * @param rtree the R-tree object to fill from the input
     * @return {@code true} if the R-tree was read, {@code false} if the
     *         position is not right after the signature
     * @throws IOException if reading the R-tree fails
     */
    protected boolean nextRTree(RTree<? extends Shape> rtree) throws IOException {
        if (blockType != BlockType.RTREE) {
            throw new RuntimeException("Not implemented");
        }
        if (getPos() != 8) {
            return false;
        }
        // Signature was already read in initialization.
        buffer = null;
        final DataInput dataIn;
        if (in instanceof DataInput) {
            dataIn = (DataInput) in;
        } else {
            dataIn = new DataInputStream(in);
        }
        rtree.readFields(dataIn);
        // Move away from the signature position to mark the tree as consumed
        pos++;
        return true;
    }
}