ch.epfl.data.squall.utilities.SerializableHDFSFileInputStream.java Source code

Java tutorial

Introduction

Here is the source code for ch.epfl.data.squall.utilities.SerializableHDFSFileInputStream.java

Source

/*
 * Copyright (c) 2011-2015 EPFL DATA Laboratory
 * Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE)
 *
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ch.epfl.data.squall.utilities;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

public class SerializableHDFSFileInputStream extends InputStream implements CustomReader {
    // self-test
    public static void main(String args[]) throws IOException {
        final SerializableHDFSFileInputStream reader = new SerializableHDFSFileInputStream(
                "hdfs://localhost:9000/0.3/0.3/customer.tbl");
        while (true) {
            final String l = reader.readLine();
            if (l == null)
                break;
            LOG.info(l);
        }
    }

    /**
     *
     */
    private static final long serialVersionUID = 1L;

    private static Logger LOG = Logger.getLogger(SerializableHDFSFileInputStream.class);

    // protected final File _file; // The _file to read from
    protected byte[] _buffer; // The _file _buffer
    protected long _filePtr = 0; // How many bytes already read
    // from the _file
    FileSystem _file;
    FSDataInputStream _fis;

    // _filePtr might also represent the logical start of file read
    protected int _bufferPtr = 0; // How many bytes into the
    // _buffer the user has read
    protected int _bufferSize = 0; // How many bytes of the _buffer

    // are being used
    protected boolean _eofReached = false;
    // Modified by Aleksandar
    // All the sections (except the first one) should omit firstLine characters.
    // The responsability of the previous section is to finish the previous
    // line.
    private boolean _omitFirstLine;
    // Logical end of the file based on (section, parts)
    private long _fileEndPtr;

    private URI _uri;

    // Accurate position into the file (updatet after each readLine method)
    private long _filePosition;
    protected static int DEFAULT_BUFFER_SIZE = 1 * 1024 * 1024;

    protected static int LOW_WATER_MARK = 1024;

    public SerializableHDFSFileInputStream(String URIstring, int bufferSize) throws IOException {
        this(URIstring, bufferSize, 0, 1);
    }

    public SerializableHDFSFileInputStream(String URIstring, int bufferSize, int section, int parts)
            throws IOException {

        _uri = URI.create(URIstring);
        Configuration conf = new Configuration();
        _file = FileSystem.get(_uri, conf);
        _fis = _file.open(new Path(_uri));

        _buffer = new byte[bufferSize];
        setParameters(section, parts);
        fillBuffer();
    }

    public SerializableHDFSFileInputStream(String URIstring) throws IOException {
        this(URIstring, DEFAULT_BUFFER_SIZE, 0, 1);
    }

    @Override
    public void close() {
        // ignore
    }

    public boolean eof() {

        return (_eofReached && (_bufferPtr >= _bufferSize))
                // Modified by Aleksandar
                // if _fileEnd points to a first character after \n,
                // we still want the previous section to read it
                || (_filePosition > _fileEndPtr);
    }

    protected void fillBuffer() throws IOException {
        if (_eofReached)
            return;

        if (_bufferPtr < 0)
            throw new IOException("Invalid Buffer Pointer: " + _bufferPtr);

        if (_bufferPtr > 0) {
            // Do some housekeeping on our _buffer
            if (_bufferPtr < _bufferSize) {
                // Move any existing data to the front of the _buffer...
                // A circular _buffer would be faster, but we're really only
                // talking
                // about a few dozen bytes at a time.

                // Note that System.arraycopy is explicitly safe for
                // self-to-self copies.
                System.arraycopy(_buffer, _bufferPtr, _buffer, 0, _bufferSize - _bufferPtr);
                _bufferSize = _bufferSize - _bufferPtr;
            } else
                // If we've precisely exhausted our _buffer (_bufferPtr should
                // never be >
                // _bufferSize), then don't bother copying;
                _bufferSize = 0;
            _bufferPtr = 0;
        }

        _fis = _file.open(new Path(_uri));
        int bytesRead = -100;
        try {
            if (_filePtr > 0)
                for (long i = 0; i < _filePtr; i += _fis.skip(_filePtr - i)) {
                }

            bytesRead = _fis.read(_buffer, _bufferSize, _buffer.length - _bufferSize);

            if (bytesRead < 0)
                _eofReached = true;
            else {
                _bufferSize += bytesRead;
                _filePtr += bytesRead;
            }
        } finally {
            _fis.close();
        }
    }

    protected void fillBufferIfNeeded(int bytesRequested) throws IOException {
        final int currentBufferBytes = _bufferSize - _bufferPtr;
        if ((currentBufferBytes < LOW_WATER_MARK) && (bytesRequested > currentBufferBytes))
            fillBuffer();
    }

    protected int numberOfBytesToEOL(boolean canFail) throws IOException {
        int i;
        for (i = 0; i < _bufferSize - _bufferPtr; i++) {
            if (_buffer[i + _bufferPtr] == '\n')
                break;
            if (_buffer[i + _bufferPtr] == '\r') {
                if (i + 1 < _bufferSize)
                    fillBuffer();
                if ((i + 1 < _bufferSize - _bufferPtr) && (_buffer[i + 1 + _bufferPtr] == '\n'))
                    i += 1;
                break;
            }
        }
        if (i >= _bufferSize - _bufferPtr)
            if (canFail)
                return -1;
            else {
                fillBuffer();
                return numberOfBytesToEOL(true);
            }
        return i + 1;
    }

    @Override
    public int read() throws IOException {
        if (eof())
            return -1;
        fillBufferIfNeeded(1);

        final int ret = _buffer[_bufferPtr];
        _bufferPtr++;

        return ret;
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (eof())
            return -1;
        fillBufferIfNeeded(len);

        if (len > _buffer.length - _bufferSize)
            len = _buffer.length - _bufferSize;
        System.arraycopy(b, off, _buffer, _bufferPtr, len);
        _bufferPtr += len;

        return len;
    }

    @Override
    public String readLine() throws IOException {
        if (eof())
            return null;

        int i = numberOfBytesToEOL(false);
        String ret = null;

        if ((i < 0) && (_bufferPtr < _bufferSize))
            i = _bufferSize - _bufferPtr;
        if (i > 0) {
            int newlineChomp = 1;
            if (_buffer[_bufferPtr + i - 1] == '\n')
                if ((i > 1) && (_buffer[_bufferPtr + i - 2] == '\r'))
                    newlineChomp++;

            // Modified by Aleksandar
            // For the first line in a section:
            // we neglect the line we are currently in (it might be only a part
            // of a line),
            // and send the next line.
            if (!_omitFirstLine) {
                ret = new String(_buffer, _bufferPtr, i - newlineChomp);
                _bufferPtr += i;
                _filePosition += i;
            } else {
                _bufferPtr += i;
                _filePosition += i;
                _omitFirstLine = false;
                return readLine();
            }
        }

        return ret;
    }

    // Modified by Aleksandar
    private void setParameters(int section, int parts) throws IOException {
        if (section >= parts)
            throw new RuntimeException("The section can take value from 0 to " + (parts - 1));

        final long fileSize = _file.getLength(new Path(_uri));

        final long sectionSize = fileSize / parts;
        _filePtr = section * sectionSize;
        _filePosition = _filePtr;

        // for all the sections except the last one, the end is sectionSize far
        // from the beginning
        if (section == parts - 1)
            _fileEndPtr = fileSize;
        else
            _fileEndPtr = _filePtr + sectionSize;

        // for all the sections except the first one, we discard the first read
        // line
        if (section == 0)
            _omitFirstLine = false;
        else
            _omitFirstLine = true;
    }

    protected String stats() {
        return "bP=" + _bufferPtr + "; bS=" + _bufferSize + "; fP=" + _filePtr;
    }
}