org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.queryserver.index;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.SequenceFileIndexWriter;
import org.commoncrawl.service.queryserver.ClientQueryInfo;
import org.commoncrawl.service.queryserver.query.QueryResult;
import org.commoncrawl.service.queryserver.query.QueryResultRecord;
import org.commoncrawl.util.CCStringUtils;

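/**
 * Reads a position-based index for a sequence file and uses it to seek a
 * {@link SequenceFile.Reader} to a given record number and to read
 * paginated results. The index itself is produced by the nested
 * {@link PositionBasedIndexWriter}.
 * 
 * @author rana
 *
 * @param <KeyType> key type of the indexed sequence file
 * @param <ValueType> value type of the indexed sequence file
 */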
public class PositionBasedSequenceFileIndex<KeyType extends WritableComparable, ValueType extends Writable> {

    // empty parameter-type list used to look up the no-arg constructors of the key/value classes
    private static final Class[] emptyArray = new Class[] {};

    // file system the index file is checked against and loaded from
    FileSystem _fileSystem;
    // path of the index file this instance was opened on
    Path _indexFileName;
    // header populated from the start of the index data in the constructor
    PositionBasedIndexWriter.IndexHeader _header = new PositionBasedIndexWriter.IndexHeader();
    // entire index file, loaded into memory by loadStreamIntoMemory
    ByteBuffer _indexData = null;
    // stream view over _indexData; reading from it advances the buffer's position
    DataInputStream _inputStream = null;
    // buffer position immediately after the header, i.e. where the first index record starts
    int _headerOffset = -1;
    // number of index records, derived from the bytes remaining after the header
    int _indexItemCount;
    // size in bytes of one index record (the lookup reads two longs per record)
    static final int INDEX_RECORD_SIZE = 16;

    // no-arg constructors used to instantiate keys and values while reading results
    Constructor<KeyType> keyConstructor = null;
    Constructor<ValueType> valConstructor = null;

    public static final Log LOG = LogFactory.getLog(PositionBasedSequenceFileIndex.class);

    /**
     * Derives the index file path for a data file: same parent directory,
     * same file name with ".index" appended.
     *
     * @param baseFileName path of the data file
     * @return path of the corresponding index file
     */
    public static Path getIndexNameFromBaseName(Path baseFileName) {
        final Path parentDirectory = baseFileName.getParent();
        final String indexFileName = baseFileName.getName() + ".index";
        return new Path(parentDirectory, indexFileName);
    }

    /**
     * Derives the data file path from an index file path by dropping the
     * trailing ".index" from the file name.
     * <p>
     * The suffix is not verified: the last {@code ".index".length()}
     * characters of the name are removed unconditionally.
     *
     * @param indexName path of the index file
     * @return path of the corresponding data file, in the same directory
     */
    public static Path getBaseNameFromIndexName(Path indexName) {
        final String indexFileName = indexName.getName();
        final int baseNameLength = indexFileName.length() - ".index".length();
        final String baseName = indexFileName.substring(0, baseNameLength);
        return new Path(indexName.getParent(), baseName);
    }

    public PositionBasedSequenceFileIndex(FileSystem fileSystem, Path indexFilePath, Class<KeyType> keyClass,
            Class<ValueType> valueClass) throws IOException {

        _fileSystem = fileSystem;
        _indexFileName = indexFilePath;

        if (!_fileSystem.exists(_indexFileName) || _fileSystem.getFileStatus(_indexFileName).isDir()) {
            throw new IOException("Index Path:" + indexFilePath + " Points to Invalid File");
        } else {

            try {
                this.keyConstructor = keyClass.getDeclaredConstructor(emptyArray);
                this.keyConstructor.setAccessible(true);
                this.valConstructor = valueClass.getDeclaredConstructor(emptyArray);
                this.valConstructor.setAccessible(true);
            } catch (SecurityException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                throw new RuntimeException(e);
            } catch (NoSuchMethodException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                throw new RuntimeException(e);
            }

            _indexData = loadStreamIntoMemory(indexFilePath);
            _inputStream = new DataInputStream(newInputStream(_indexData));
            _header.readHeader(_inputStream);
            _headerOffset = _indexData.position();
            // calculate index item count based on file size 
            _indexItemCount = (int) (_indexData.remaining() / INDEX_RECORD_SIZE);
        }
    }

    /**
     * Returns the total record count stored in the index header.
     *
     * @return the header's {@code _totalRecordCount} value
     */
    public long getRecordCount() {
        return _header._totalRecordCount;
    }

    /**
     * Wraps a {@link ByteBuffer} in an {@link InputStream}. Reads consume the
     * buffer from its current position, so repositioning the buffer changes
     * what the stream returns next.
     *
     * @param buf buffer to read from
     * @return stream view over {@code buf}
     */
    private static InputStream newInputStream(final ByteBuffer buf) {
        return new InputStream() {
            @Override
            public synchronized int read() throws IOException {
                if (!buf.hasRemaining()) {
                    LOG.error("EOF REACHED in Wrapper Stream!");
                    return -1;
                }
                return buf.get() & 0xff;
            }

            @Override
            public synchronized int read(byte[] bytes, int off, int len) throws IOException {
                if (len == 0) {
                    return 0;
                }
                // The InputStream contract requires -1 at end of stream. Returning 0
                // here would make DataInputStream.readFully (and thus readLong) loop
                // forever on an exhausted buffer.
                if (!buf.hasRemaining()) {
                    return -1;
                }
                // Read only what's left
                int count = Math.min(len, buf.remaining());
                buf.get(bytes, off, count);
                return count;
            }
        };
    }

    /**
     * One index record: a record index value paired with the offset value
     * stored next to it in the index file.
     */
    private static class IndexItem {

        final long _indexValue;
        final long _offsetValue;

        public IndexItem(long indexValue, long offsetValue) {
            this._indexValue = indexValue;
            this._offsetValue = offsetValue;
        }
    }

    /**
     * Binary-searches the in-memory index records for the given record index.
     * <p>
     * Assumes the index records are sorted ascending by index value
     * (the writer appends them with a non-decreasing record count).
     *
     * @param targetItemIndexValue record index to look up
     * @return the index record whose index value equals the target, or, when
     *         there is no exact match, the record at the position the search
     *         ended on (the closest record preceding the target);
     *         {@code null} if no record precedes the target
     * @throws IOException if reading from the index data fails
     */
    private IndexItem findIndexDataPosForItemIndex(long targetItemIndexValue) throws IOException {

        int low = 0;
        int high = _indexItemCount - 1;
        while (low <= high) {
            int mid = low + ((high - low) / 2);
            _indexData.position(_headerOffset + (mid * INDEX_RECORD_SIZE));
            long indexValue = _inputStream.readLong();
            // Compare the longs directly: narrowing (indexValue - target) to int can
            // flip the sign or yield a false zero when the difference exceeds 32 bits.
            if (indexValue > targetItemIndexValue) {
                high = mid - 1;
            } else if (indexValue < targetItemIndexValue) {
                low = mid + 1;
            } else {
                // found: the offset is the second long of the same record
                return new IndexItem(indexValue, _inputStream.readLong());
            }
        }
        if (high == -1) {
            // every record is greater than the target (or the index is empty)
            return null;
        }
        // not found: fall back to the record just below the target
        _indexData.position(_headerOffset + (high * INDEX_RECORD_SIZE));
        long floorIndexValue = _inputStream.readLong();
        long floorOffsetValue = _inputStream.readLong();
        return new IndexItem(floorIndexValue, floorOffsetValue);
    }

    /**
     * Debug aid: performs an index lookup for every 100th record index from 0
     * up to the header's total record count. The lookup results are discarded;
     * only a failing lookup is observable.
     *
     * @throws IOException if an index lookup fails
     */
    public void dump() throws IOException {
        long sampleIndex = 0;
        while (sampleIndex < _header._totalRecordCount) {
            findIndexDataPosForItemIndex(sampleIndex);
            sampleIndex += 100;
        }
    }

    /**
     * Positions {@code reader} so that its next record is the one at
     * {@code desiredIndexPos}.
     * <p>
     * Looks up the closest index record at or before the desired position,
     * seeks the reader to that record's offset, then skips raw key/value
     * pairs until the desired position is reached.
     *
     * @param reader          reader over the indexed sequence file
     * @param desiredIndexPos zero-based record index to position the reader at
     * @throws IOException if no index record covers the position, if the file
     *                     ends before the position is reached, or on read failure
     */
    public void seekReaderToItemAtIndex(SequenceFile.Reader reader, long desiredIndexPos) throws IOException {
        IndexItem indexItem = findIndexDataPosForItemIndex(desiredIndexPos);
        if (indexItem == null) {
            throw new IOException("Invalid Index Position:" + desiredIndexPos);
        }

        reader.seek(indexItem._offsetValue);

        // key sink that discards the key bytes instead of buffering them
        DataOutputBuffer skipBuffer = new DataOutputBuffer() {
            @Override
            public void write(DataInput in, int length) throws IOException {
                int remaining = length;
                while (remaining > 0) {
                    // skipBytes may skip fewer bytes than requested and never signals EOF
                    int skipped = in.skipBytes(remaining);
                    if (skipped > 0) {
                        remaining -= skipped;
                    } else {
                        // no progress: consume one byte so a truncated stream
                        // surfaces as EOFException rather than a misaligned reader
                        in.readByte();
                        --remaining;
                    }
                }
            }
        };

        ValueBytes skipValue = reader.createValueBytes();

        for (long currentIndexPos = indexItem._indexValue; currentIndexPos < desiredIndexPos; ++currentIndexPos) {
            // nextRawKey reports end of file with a negative length
            if (reader.nextRawKey(skipBuffer) < 0) {
                throw new IOException("Reached end of file at record:" + currentIndexPos
                        + " while seeking to Index Position:" + desiredIndexPos);
            }
            reader.nextRawValue(skipValue);
        }
    }

    /**
     * Opens a reader on the data file that belongs to this index (the index
     * path with its ".index" suffix removed), reads one page of results
     * through it and closes the reader again.
     *
     * @param fileSystem file system holding the data file
     * @param conf       configuration passed to the reader
     * @param sortOrder  one of the {@code ClientQueryInfo.SortOrder} constants
     * @param pageNumber zero-based page number
     * @param pageSize   number of records per page
     * @param resultOut  result object that receives the page
     * @throws IOException if the data file cannot be opened or read
     */
    public void readPaginatedResults(FileSystem fileSystem, Configuration conf, int sortOrder, int pageNumber,
            int pageSize, QueryResult<KeyType, ValueType> resultOut) throws IOException {
        final Path dataFilePath = getBaseNameFromIndexName(_indexFileName);
        final SequenceFile.Reader dataReader = new SequenceFile.Reader(fileSystem, dataFilePath, conf);
        try {
            readPaginatedResults(dataReader, sortOrder, pageNumber, pageSize, resultOut);
        } finally {
            dataReader.close();
        }
    }

    /**
     * Reads one page of records from {@code reader} into {@code resultOut}.
     * <p>
     * For ascending order the page covers records
     * {@code [pageNumber * pageSize, min(start + pageSize, total))}. For any
     * other sort order the page is counted from the end of the file and the
     * records are added in reverse, so the last record of the file comes first
     * on page 0. The result's page number and total record count are always
     * set, and its result list is cleared first.
     *
     * @param reader     reader over the indexed sequence file
     * @param sortOrder  one of the {@code ClientQueryInfo.SortOrder} constants
     * @param pageNumber zero-based page number
     * @param pageSize   number of records per page
     * @param resultOut  result object that receives the page
     * @throws IOException      if seeking or reading fails
     * @throws RuntimeException if a key or value instance cannot be created
     */
    public void readPaginatedResults(SequenceFile.Reader reader, int sortOrder, int pageNumber, int pageSize,
            QueryResult<KeyType, ValueType> resultOut) throws IOException {
        long offset = 0;
        long startPos = 0;
        long endPos = 0;

        resultOut.getResults().clear();
        resultOut.setPageNumber(pageNumber);
        resultOut.setTotalRecordCount(_header._totalRecordCount);

        // Widen before multiplying: pageNumber * pageSize in int arithmetic
        // overflows for large page numbers and would yield a bogus position.
        final long pageStart = (long) pageNumber * pageSize;

        if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
            startPos = pageStart;
            endPos = Math.min(startPos + pageSize, _header._totalRecordCount);
            offset = pageStart;
        } else {
            // descending: count the page back from the end of the file;
            // pageStart + pageSize == (pageNumber + 1) * pageSize
            offset = _header._totalRecordCount - (pageStart + pageSize);
            endPos = offset + pageSize;
            // the last page may be partial, so clamp its start to the first record
            startPos = Math.max(0L, offset);
        }
        LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber
                + " pageSize:" + pageSize + " offset is:" + offset);
        if (startPos < _header._totalRecordCount) {

            seekReaderToItemAtIndex(reader, startPos);
            // read [startPos, endPos)
            for (long i = startPos; i < endPos; ++i) {
                KeyType key = null;
                ValueType value = null;
                try {
                    key = keyConstructor.newInstance();
                    value = valConstructor.newInstance();
                } catch (Exception e) {
                    LOG.error("Failed to create key or value type with Exception:"
                            + CCStringUtils.stringifyException(e));
                    throw new RuntimeException(e);
                }

                if (!reader.next(key, value)) {
                    // reader ran out of records before the page was full
                    break;
                }
                if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                    // prepend so that later records come first
                    resultOut.getResults().add(0, new QueryResultRecord<KeyType, ValueType>(key, value));
                } else {
                    resultOut.getResults().add(new QueryResultRecord<KeyType, ValueType>(key, value));
                }
            }
        }
    }

    /**
     * Loads the complete contents of a file into a heap {@link ByteBuffer}.
     * The returned buffer's position is 0 and its capacity equals the file
     * length.
     *
     * @param streamPath path of the file to load
     * @return buffer holding the file's bytes
     * @throws IOException if the path does not exist, is a directory, is too
     *                     large for a single buffer, or cannot be read in full
     */
    private ByteBuffer loadStreamIntoMemory(Path streamPath) throws IOException {
        if (!_fileSystem.exists(streamPath) || _fileSystem.getFileStatus(streamPath).isDir()) {
            throw new IOException("Stream Path:" + streamPath + " Points to Invalid File");
        }

        long streamLength = _fileSystem.getFileStatus(streamPath).getLen();
        // a ByteBuffer is int-indexed; fail loudly rather than truncate the length
        if (streamLength > Integer.MAX_VALUE) {
            throw new IOException("Stream Path:" + streamPath + " Is Too Large To Load Into Memory. Length:"
                    + streamLength);
        }

        ByteBuffer bufferOut = ByteBuffer.allocate((int) streamLength);
        DataInputStream inputStream = _fileSystem.open(streamPath);
        try {
            // readFully keeps reading until the array is full (or throws EOFException);
            // a plain read() may return fewer bytes than requested and leave gaps.
            // Writing through array() leaves the buffer position at 0.
            inputStream.readFully(bufferOut.array(), 0, bufferOut.capacity());
        } finally {
            inputStream.close();
        }

        return bufferOut;
    }

    /**
     * Writes a position-based index: a header followed by fixed-size records
     * of two longs each (record start index, file length at that point).
     * <p>
     * Records are accumulated in a local temporary file. On {@link #close()}
     * the header is rewritten with the final record count, the file is copied
     * to the target path on the supplied file system, and the temporary file
     * is deleted.
     */
    @SuppressWarnings("unchecked")
    public static class PositionBasedIndexWriter implements SequenceFileIndexWriter {

        public static final Log LOG = LogFactory.getLog(PositionBasedIndexWriter.class);

        // destination file system and path of the finished index
        private FileSystem _fileSystem;
        private Path _indexFileName;
        // local scratch file the index is built in; null once closed
        private RandomAccessFile _indexFile = null;
        private File _tempFileName;
        private IndexHeader _header = null;
        // record count at the time the most recent index record was written
        public long _lastKnownStartIndex = -1;
        // file length reported with the most recent index record
        public long _lastKnownFileLength = -1;
        // number of index records written so far
        public int _level1IndexItemCount = 0;

        /**
         * Index file header: a version number followed by the total record count.
         */
        public static class IndexHeader {

            public short _version = 1;
            public long _totalRecordCount = 0;

            /**
             * Reads the version (short) and total record count (long) from the stream.
             *
             * @param stream source positioned at the start of the header
             * @throws IOException if reading fails
             */
            public void readHeader(DataInput stream) throws IOException {
                _version = stream.readShort();
                _totalRecordCount = stream.readLong();
            }

            /**
             * Writes the version (short) and total record count (long) to the stream.
             *
             * @param stream destination for the header
             * @throws IOException if writing fails
             */
            public void writeHeader(DataOutput stream) throws IOException {
                stream.writeShort(_version);
                stream.writeLong(_totalRecordCount);
            }

            /**
             * @return 14
             */
            public static int sizeOfHeader() {
                // NOTE(review): writeHeader emits a short plus a long (10 bytes), which does
                // not match this value. Left unchanged because callers are not visible from
                // this file - confirm before relying on it for offsets.
                return 2 + 4 + 8;
            }
        }

        /**
         * Deletes any existing index at the target path, creates the local
         * temporary file and writes a placeholder header into it.
         *
         * @param fileSystem    file system the finished index is copied to
         * @param indexFilePath target path of the finished index
         * @throws IOException if the existing index cannot be deleted or the
         *                     temporary file cannot be created or written
         */
        public PositionBasedIndexWriter(FileSystem fileSystem, Path indexFilePath) throws IOException {
            _fileSystem = fileSystem;
            _fileSystem.delete(indexFilePath, false);
            _indexFileName = indexFilePath;
            _tempFileName = File.createTempFile("indexTmp", Long.toString(System.currentTimeMillis()));
            _indexFile = new RandomAccessFile(_tempFileName, "rw");

            _header = new IndexHeader();

            // write empty header to disk; close() overwrites it with the final count
            _header.writeHeader(_indexFile);
        }

        /**
         * @return target path of the index file
         */
        public Path getPath() {
            return _indexFileName;
        }

        /**
         * Finalizes the index: rewrites the header with the final record
         * count, copies the temporary file to the target path and removes the
         * temporary file. Calling it again after the first call is a no-op.
         *
         * @throws IOException if the header rewrite or the copy fails
         */
        public void close() throws IOException {
            if (_indexFile != null) {
                // detach first so a failure below cannot leave a closed file behind
                // for a second close() call to trip over
                RandomAccessFile indexFile = _indexFile;
                _indexFile = null;
                try {
                    try {
                        // reseek to zero 
                        indexFile.seek(0);
                        // and rewrite header ...
                        _header.writeHeader(indexFile);
                    } finally {
                        indexFile.close();
                    }

                    // copy across to the remote file system.
                    _fileSystem.copyFromLocalFile(new Path(_tempFileName.getAbsolutePath()), _indexFileName);
                } finally {
                    // the scratch file is of no further use whether or not the copy succeeded
                    if (!_tempFileName.delete()) {
                        LOG.warn("Failed to delete temporary index file:" + _tempFileName.getAbsolutePath());
                    }
                }
            }
        }

        /**
         * Accounts for one record. When {@code currentFileLength} differs from
         * the value seen on the previous call, an index record is written that
         * pairs the current record count with that file length. The total
         * record count is incremented on every call.
         * <p>
         * The key and value arguments are not used by this implementation.
         *
         * @param currentFileLength file length reported for this record;
         *        presumably the data file position at which the record's block
         *        starts - confirm against the caller
         * @throws IOException if writing the index record fails
         */
        @Override
        public void indexItem(byte[] keyData, int keyOffset, int keyLength, byte[] valueData, int valueOffset,
                int valueLength, long currentFileLength) throws IOException {

            // check to see if block position changed ... 
            if (currentFileLength != _lastKnownFileLength) {
                // establish new start index
                _lastKnownStartIndex = _header._totalRecordCount;
                // and also update last known file position 
                _lastKnownFileLength = currentFileLength;
                // increment index item count
                ++_level1IndexItemCount;
                // time to write out an index record ... 
                _indexFile.writeLong(_lastKnownStartIndex);
                _indexFile.writeLong(_lastKnownFileLength);
            }
            // now update header count ...
            _header._totalRecordCount++;
        }

    }

}