com.facebook.infrastructure.io.SSTable.java Source code

Java tutorial

Introduction

Here is the source code for com.facebook.infrastructure.io.SSTable.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.facebook.infrastructure.io;

import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.utils.BasicUtilities;
import com.facebook.infrastructure.utils.BloomFilter;
import com.facebook.infrastructure.utils.LogUtil;
import com.facebook.infrastructure.db.RowMutation;
import org.apache.log4j.Logger;
import org.apache.commons.lang.ArrayUtils;

import java.io.File;
import java.io.IOException;
import java.util.*;

/**
 * This class is built on top of the SequenceFile. It stores
 * data on disk in sorted fashion. However, the sorting is up to
 * the application. This class expects keys to be handed to it
 * in sorted order. SSTable is broken up into blocks where each
 * block contains 128 keys. At the end of every block the block 
 * index is written which contains the offsets to the keys in the
 * block. SSTable also maintains an index file to which every 128th 
 * key is written with a pointer to the block index which is the block 
 * that actually contains the key. This index file is then read and 
 * maintained in memory. SSTable is append only and immutable. SSTable
 * on disk looks as follows:
 * 
 *                 -------------------------
 *                 |------------------------|<-------|
 *                 |                        |        |  BLOCK-INDEX PTR
 *                 |                        |        |
 *                 |------------------------|--------
 *                 |------------------------|<-------|
 *                 |                        |        |
 *                 |                        |        |  BLOCK-INDEX PTR 
 *                 |                        |        |
 *                 |------------------------|---------
 *                 |------------------------|<--------|
 *                 |                        |         |
 *                 |                        |         |
 *                 |                        |         | BLOCK-INDEX PTR
 *                 |                        |         |
 *                 |------------------------|         |
 *                 |------------------------|----------
 *                 |------------------------|-----------------> BLOOM-FILTER
 * version-info <--|----------|-------------|-------> relative offset to last block index.
 *                 
 * Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
 */
public class SSTable {
    private static Logger logger_ = Logger.getLogger(SSTable.class);
    /* Monitor held by maybeLoadIndexFile() while it checks for and loads an index. */
    private static Object indexLoadLock_ = new Object();
    /* Number of appended keys after which append() dumps a block index. */
    private static final int indexInterval_ = 128;
    /* Key under which every block index record is appended to the data file. */
    public static final String blockIndexKey_ = "BLOCK-INDEX";
    /*
     * Writer position right after the initial (empty) block index; append()
     * uses it as the position of the first key written.
     * NOTE(review): this field is static but is assigned by the write-mode
     * constructor of every instance, so all writers share one value -
     * confirm that this is intended.
     */
    private static long positionAfterFirstBlockIndex_ = 0L;
    /* Marker for temporary files; closeRename() strips "-" + this value from the file name. */
    public static final String temporaryFile_ = "tmp";
    /* 64 bit field written into the file trailer by close(); intended to carry settings bits. */
    private static final long version_ = 0L;
    /*
     * Maps a data file name to the BloomFilter stored for that file.
     * isKeyInFile() consults it to decide whether a key may be present
     * in the file, so that files without the key need not be scanned.
     */
    private static Map<String, BloomFilter> bfs_ = new Hashtable<String, BloomFilter>();
    /*
     * Cache of touched keys, sized by DatabaseDescriptor.getTouchKeyCacheSize().
     * touch() stores the position found in the block metadata for the key.
     */
    private static LinkedHashMap<String, Long> touchCache_ = new TouchedKeyCache(
            DatabaseDescriptor.getTouchKeyCacheSize());

    /**
     * Pairs the position of a key inside a block with the size of the
     * data stored for that key.
     */
    static class BlockMetadata {
        /** Sentinel instance whose position and size are both -1. */
        static final BlockMetadata NULL = new BlockMetadata(-1L, -1L);

        /** Position of the key. */
        long position_;
        /** Size of the data associated with the key. */
        long size_;

        BlockMetadata(long position, long size) {
            this.position_ = position;
            this.size_ = size;
        }
    }

    /*
     * Bounded cache with LRU semantics for the keys that are "touched".
     * It is an access-ordered LinkedHashMap that evicts its eldest entry
     * as soon as it holds more than the configured number of entries.
     * Currently the value is the offset of the key in a data file; it may
     * change to hold a reference to an IFileReader which memory maps the
     * key and its associated data on a touch.
     */
    private static class TouchedKeyCache extends LinkedHashMap<String, Long> {
        /* Maximum number of entries retained. */
        private final int capacity_;

        TouchedKeyCache(int capacity) {
            /* initial capacity of capacity + 1, load factor 1.1, access order enabled */
            super(capacity + 1, 1.1f, true);
            this.capacity_ = capacity;
        }

        @Override
        protected boolean removeEldestEntry(Map.Entry<String, Long> eldest) {
            return capacity_ < size();
        }
    }

    /**
     * Simple holder for an index key and its corresponding position in the
     * data file. Instances are ordered by key only, so that a sorted list of
     * them can be binary searched to look up keys within the SSTable data file.
     */
    public static class KeyPositionInfo implements Comparable<KeyPositionInfo> {
        public final String key;
        public final long position;

        /** Creates an entry for {@code key} at position 0; used as a search probe. */
        public KeyPositionInfo(String key) {
            this(key, 0);
        }

        public KeyPositionInfo(String key, long position) {
            this.key = key;
            this.position = position;
        }

        /** Compares the keys only; the position does not take part in the ordering. */
        public int compareTo(KeyPositionInfo other) {
            return this.key.compareTo(other.key);
        }

        @Override
        public String toString() {
            return key + ":" + position;
        }
    }

    /**
     * @return the number of keys written between two block indexes
     */
    public static int indexInterval() {
        return SSTable.indexInterval_;
    }

    /*
     * In-memory index per SSTable data file: maps the data file name to its
     * list of KeyPositionInfo objects. It is kept so that the index of a
     * file is not read into memory multiple times. Entries are added by
     * loadIndex() and dumpBlockIndex(), re-keyed by closeRename() and
     * removed by delete().
    */
    private static Map<String, List<KeyPositionInfo>> indexMetadataMap_ = new Hashtable<String, List<KeyPositionInfo>>();

    /**
     * Drops the cached in-memory index of the given data file and deletes
     * the data file from disk if it exists. The outcome of the deletion is
     * logged; a missing file is silently ignored.
     *
     * @param dataFile - data file associated with the SSTable
     */
    public static void delete(String dataFile) {
        /* forget the cached index for this file */
        indexMetadataMap_.remove(dataFile);

        File onDisk = new File(dataFile);
        if (!onDisk.exists()) {
            return;
        }

        if (onDisk.delete()) {
            logger_.info("** Deleted " + onDisk.getName() + " **");
        } else {
            logger_.error("Failed to delete " + onDisk.getName());
        }
    }

    /**
     * Estimates the number of keys held by the given data files from their
     * in-memory indexes. Each loaded index contributes its number of entries
     * plus one, and the total is multiplied by the index interval. Files
     * without a loaded index contribute nothing.
     *
     * @param dataFiles data files to consider
     * @return approximate number of keys
     */
    public static int getApproximateKeyCount(List<String> dataFiles) {
        int indexedBlocks = 0;

        for (String dataFile : dataFiles) {
            List<KeyPositionInfo> loadedIndex = indexMetadataMap_.get(dataFile);
            if (loadedIndex == null) {
                continue;
            }
            indexedBlocks += loadedIndex.size() + 1;
        }

        return indexedBlocks * indexInterval_;
    }

    /**
     * Collects the indexed keys of every loaded SSTable index.
     *
     * @return the indexed keys of all loaded files, in ascending order
     */
    public static List<String> getSortedKeys() {
        List<String> sortedKeys = new ArrayList<String>();

        for (String indexFile : indexMetadataMap_.keySet()) {
            for (KeyPositionInfo entry : indexMetadataMap_.get(indexFile)) {
                sortedKeys.add(entry.key);
            }
        }

        Collections.sort(sortedKeys);
        return sortedKeys;
    }

    /**
     * Loads the index of every given file unless it is already loaded.
     *
     * @param filenames data files whose indexes should be available in memory
     * @throws IOException if loading an index fails
     */
    public static void onStart(List<String> filenames) throws IOException {
        Iterator<String> files = filenames.iterator();
        while (files.hasNext()) {
            SSTable.maybeLoadIndexFile(files.next());
        }
    }

    /**
     * Registers the Bloom Filter that belongs to the given file, replacing
     * any filter previously stored for it.
     *
     * @param filename data file the filter belongs to
     * @param bf       filter to store
     */
    public static void storeBloomFilter(String filename, BloomFilter bf) {
        SSTable.bfs_.put(filename, bf);
    }

    /**
     * Forgets the bloom filter stored for the specified file, if any.
     *
     * @param filename data file whose filter should be dropped
     */
    public static void removeAssociatedBloomFilter(String filename) {
        SSTable.bfs_.remove(filename);
    }

    /**
     * Asks the Bloom Filter stored for the file whether the key may be
     * present in it. Callers skip processing the file on a negative answer.
     *
     * @param key      key to look for
     * @param filename data file to check
     * @return the filter's answer, or false when no filter is stored for the file
     */
    public static boolean isKeyInFile(String key, String filename) {
        BloomFilter filter = bfs_.get(filename);
        if (filter == null) {
            return false;
        }
        return filter.isPresent(key);
    }

    /**
     * Scans the given file record by record and returns the position at
     * which the record for {@code key} starts.
     *
     * @param key  key to look for
     * @param file data file to scan
     * @return position of the record whose key equals {@code key}, or -1 if
     *         the end of the file is reached without finding it
     * @throws IOException if the file cannot be read
     */
    public static long fetchOffset(String key, String file) throws IOException {
        DataOutputBuffer bufOut = new DataOutputBuffer();
        DataInputBuffer bufIn = new DataInputBuffer();
        IFileReader dataReader = SequenceFile.bufferedReader(file, 1024 * 1024);

        try {
            while (!dataReader.isEOF()) {
                bufOut.reset();
                /* Record where this record starts before consuming it. */
                long position = dataReader.getCurrentPosition();
                dataReader.next(bufOut);
                bufIn.reset(bufOut.getData(), bufOut.getLength());
                /* The key is the first field of the record. */
                String keyOnDisk = bufIn.readUTF();
                if (keyOnDisk.equals(key)) {
                    return position;
                }
            }
            /*
             * Not found. Report -1 rather than the position of the last
             * record that happened to be read.
             */
            return -1L;
        } finally {
            /* always release the reader, also when a read fails */
            dataReader.close();
        }
    }

    /* Full path of the data file backing this SSTable. */
    private String dataFile_;
    /* Writer for the data file; only assigned by the write-mode constructor. */
    private IFileWriter dataWriter_;
    /* Last key passed to append(); used to enforce ascending key order. */
    private String lastWrittenKey_;
    /* Position of the most recently dumped block index. */
    private long prevBlockPosition_ = 0L;
    /* Number of keys appended since the last block index was dumped. */
    private int indexKeysWritten_ = 0;
    /*
     * Holds the keys of the current block and their respective positions.
     * The map is in reverse key order, so the largest key of the block is
     * the first one written out by dumpBlockIndex().
     */
    private SortedMap<String, BlockMetadata> blockIndex_ = new TreeMap<String, BlockMetadata>(
            Collections.reverseOrder());

    /**
     * Opens an existing SSTable for reading. The index of the data file is
     * loaded into memory unless it is already there.
     *
     * @param dataFileName full path name of the data file of this SSTable
     * @throws IOException if the index cannot be loaded
     */
    public SSTable(String dataFileName) throws IOException {
        this.dataFile_ = dataFileName;
        maybeLoadIndexFile(this.dataFile_);
    }

    /**
     * Initializes the in-memory index of the given file, which also caches
     * the Bloom Filter associated with it. Nothing happens when an index
     * for the file is already registered.
     *
     * @param filename data file whose index should be loaded
     * @throws IOException if the index cannot be read
     */
    public static void maybeLoadIndexFile(String filename) throws IOException {
        // one thread at a time, so the same index is not loaded repeatedly
        synchronized (indexLoadLock_) {
            if (indexMetadataMap_.containsKey(filename)) {
                return;
            }
            final long startedAt = System.currentTimeMillis();
            loadIndex(filename);
            final long elapsed = System.currentTimeMillis() - startedAt;
            logger_.debug("INDEX LOAD TIME: " + elapsed + " ms.");
        }
    }

    /**
     * Reads the Bloom Filter stored near the end of the file and caches it
     * under the reader's file name, unless a filter is already cached for
     * that name.
     *
     * @param reader reader opened on the data file
     * @param size   length of the data file in bytes
     * @throws IOException if the file cannot be read
     */
    private static void loadBloomFilter(IFileReader reader, long size) throws IOException {
        /* the last 8 bytes hold the bloom filter position, relative to where they start */
        reader.seek(size - 8);
        long trailerPosition = reader.getCurrentPosition();
        byte[] rawOffset = new byte[8];
        reader.readDirect(rawOffset);
        long relativeOffset = BasicUtilities.byteArrayToLong(rawOffset);

        /* jump back to the bloom filter record and read it */
        reader.seek(trailerPosition - relativeOffset);
        DataOutputBuffer bufOut = new DataOutputBuffer();
        DataInputBuffer bufIn = new DataInputBuffer();
        reader.next(bufOut);
        bufIn.reset(bufOut.getData(), bufOut.getLength());

        String recordKey = bufIn.readUTF();
        if (!recordKey.equals(SequenceFile.marker_)) {
            /* not the marker record, so there is no filter to load here */
            return;
        }

        /*
         * What follows is the serialized Bloom Filter: first its length,
         * which is skipped, then the payload that is handed to the
         * BloomFilter serializer. A filter that is already cached is kept
         * and the payload is not deserialized again.
         */
        bufIn.readInt();
        String fileName = reader.getFileName();
        if (!bfs_.containsKey(fileName)) {
            bfs_.put(fileName, BloomFilter.serializer().deserialize(bufIn));
        }
    }

    /**
     * Loads the Bloom Filter and the block indexes of the given data file
     * and registers the resulting in-memory index for it.
     * <p>
     * The block indexes are chained backwards: the file trailer points to
     * the last block index and every block index ends with the relative
     * offset of its predecessor. For every non-empty block index the first
     * key listed (the largest key of the block) is recorded together with
     * the position of that block index.
     * <p>
     * A failure while walking the chain is logged and not propagated; the
     * entries collected so far stay registered, sorted by key.
     *
     * @param filename data file to load the index for
     * @throws IOException if the bloom filter or the file trailer cannot be
     *                     read, or if closing the reader fails
     */
    private static void loadIndex(String filename) throws IOException {
        IFileReader indexReader = SequenceFile.reader(filename);
        /* everything runs inside try/finally so the reader is closed on every path */
        try {
            long size = new File(filename).length();
            /* load the bloom filter into memory */
            loadBloomFilter(indexReader, size);
            /* seek to the trailer slot holding the relative position of the last block index */
            byte[] bytes = new byte[8];
            indexReader.seek(size - 16L);
            long currentPosition = indexReader.getCurrentPosition();
            indexReader.readDirect(bytes);
            long lastBlockIndexPosition = BasicUtilities.byteArrayToLong(bytes);

            List<KeyPositionInfo> keyPositionInfos = new ArrayList<KeyPositionInfo>();
            indexMetadataMap_.put(filename, keyPositionInfos);
            DataOutputBuffer bufOut = new DataOutputBuffer();
            DataInputBuffer bufIn = new DataInputBuffer();
            try {
                long nextPosition = currentPosition - lastBlockIndexPosition;
                /* walk the block indexes from the end of the file until the first one is hit */
                while (nextPosition > 0) {
                    indexReader.seek(nextPosition);
                    bufOut.reset();
                    /* position of the block index being processed */
                    currentPosition = indexReader.getCurrentPosition();
                    if (indexReader.next(bufOut) == -1) {
                        /* nothing could be read; retrying the same position would loop forever */
                        throw new IOException("Unexpected end of file while reading the block index at "
                                + currentPosition + " in " + filename);
                    }
                    bufIn.reset(bufOut.getData(), bufOut.getLength());
                    /* read the block key */
                    String blockIndexKey = bufIn.readUTF();
                    if (!blockIndexKey.equals(SSTable.blockIndexKey_))
                        throw new IOException("Unexpected position to be reading the block index from.");
                    /* skip the size of the block index */
                    bufIn.readInt();
                    /* number of keys in the block */
                    int keys = bufIn.readInt();
                    for (int i = 0; i < keys; ++i) {
                        String keyInBlock = bufIn.readUTF();
                        /* skip the relative offset of the key and the size of its data */
                        bufIn.readLong();
                        bufIn.readLong();
                        if (i == 0) {
                            /* the first key is the largest one; index it at the block index position */
                            keyPositionInfos.add(new KeyPositionInfo(keyInBlock, currentPosition));
                        }
                    }
                    lastBlockIndexPosition = bufIn.readLong();
                    if (lastBlockIndexPosition <= 0) {
                        /* a back pointer that does not move backwards would never terminate the walk */
                        throw new IOException("Invalid relative offset " + lastBlockIndexPosition
                                + " in the block index at " + currentPosition + " in " + filename);
                    }
                    nextPosition = currentPosition - lastBlockIndexPosition;
                }
            } catch (IOException ex) {
                logger_.error(LogUtil.throwableToString(ex));
            }
            /* sort even after a failure so that binary search on a partial index stays valid */
            Collections.sort(keyPositionInfos);
        } finally {
            indexReader.close();
        }
    }

    /**
     * Describes the section of a file that needs to be scanned, as a pair
     * of start and end positions.
     */
    public static class Range {
        /** Position at which the section starts. */
        public long start;
        /** Position at which the section ends. */
        public long end;

        Range(long from, long to) {
            start = from;
            end = to;
        }
    }

    //
    //
    // BEGIN ACTUAL SSTABLE CODE
    //
    //

    /**
     * Creates a new SSTable for writing. The data file is named
     * {@code <filename>-Data.db} inside the given directory, and an empty
     * block index is appended to it right away.
     *
     * @param directory directory in which the data file is created
     * @param filename  name of the SSTable, without the "-Data.db" suffix
     * @throws IOException if the data file cannot be created or written
     */
    public SSTable(String directory, String filename) throws IOException {
        String separator = System.getProperty("file.separator");
        this.dataFile_ = directory + separator + filename + "-Data.db";
        this.dataWriter_ = SequenceFile.bufferedWriter(this.dataFile_, 32 * 1024 * 1024);
        // dataWriter_ = SequenceFile.checksumWriter(dataFile_);
        /* The file starts with a block index, which is an empty one. */
        this.dataWriter_.append(SSTable.blockIndexKey_, ArrayUtils.EMPTY_BYTE_ARRAY);
        /* Remember where the first key will go; note that this field is static. */
        SSTable.positionAfterFirstBlockIndex_ = this.dataWriter_.getCurrentPosition();
    }

    /**
     * @return the absolute path of the data file of this SSTable
     * @throws IOException if the data file does not exist on disk
     */
    public String getDataFileLocation() throws IOException {
        File onDisk = new File(dataFile_);
        if (!onDisk.exists()) {
            throw new IOException("File " + dataFile_ + " was not found on disk.");
        }
        return onDisk.getAbsolutePath();
    }

    /**
     * Reports when the data of this SSTable was last modified.
     * <p>
     * Instances created with the read constructor have no writer; for those
     * the modification time of the data file itself is returned instead of
     * failing with a NullPointerException.
     * NOTE(review): this assumes the writer's lastModified() reports the
     * modification time of the same data file - confirm against IFileWriter.
     *
     * @return the last modification time as reported by the writer, or by
     *         the data file when there is no writer
     */
    public long lastModified() {
        if (dataWriter_ == null) {
            return new File(dataFile_).lastModified();
        }
        return dataWriter_.lastModified();
    }

    /**
     * Looks up the specified key on disk and remembers its position in the
     * touch cache; optionally also reads the data of the key so that it is
     * pulled into the buffer cache.
     * <p>
     * The cache entry is keyed by {@code dataFile_ + ":" + key}. The same
     * composite key is used for the initial cache check, so a key that was
     * already touched in this file returns immediately.
     *
     * @param key   key to touch
     * @param fData if true, the data associated with the key is read as well
     * @throws IOException if the data file cannot be read
     */
    public void touch(String key, boolean fData) throws IOException {
        final String cacheKey = dataFile_ + ":" + key;
        if (touchCache_.containsKey(cacheKey))
            return;

        IFileReader dataReader = SequenceFile.reader(dataFile_);
        try {
            Range fileCoordinate = getRange(key, dataReader);
            /* Get offset of key from block Index */
            dataReader.seek(fileCoordinate.end);
            BlockMetadata blockMetadata = dataReader.getBlockMetadata(key);
            if (blockMetadata.position_ == -1L) {
                /* the key was not found in the block index; nothing to cache or read */
                return;
            }
            touchCache_.put(cacheKey, blockMetadata.position_);

            if (fData) {
                /* Read the data associated with this key and pull it into the Buffer Cache */
                dataReader.seek(blockMetadata.position_);
                DataOutputBuffer bufOut = new DataOutputBuffer();
                dataReader.next(bufOut);
                logger_.debug("Finished the touch of the key to pull it into buffer cache.");
            }
        } finally {
            dataReader.close();
        }
    }

    /**
     * Writes the block index for the keys collected since the previous dump
     * and records the block in the in-memory index of this data file.
     * <p>
     * The block index lists the number of keys, then for every key (largest
     * first) its offset relative to the block index and the size of its
     * data, and finally the relative offset to the previous block index.
     * <p>
     * The block index is always written, even when it holds no keys, because
     * the file trailer written by close() points at the last one. The
     * in-memory index however only gets an entry for a block that contains
     * keys. This matches what loadIndex() reconstructs from disk and avoids
     * entries with a null or duplicated key.
     *
     * @throws IOException if writing to the data file fails
     */
    private void dumpBlockIndex() throws IOException {
        DataOutputBuffer bufOut = new DataOutputBuffer();
        /*
         * Record the position where we start writing the block index. This
         * is used as the position of the lastWrittenKey in the in-memory index.
         */
        long position = dataWriter_.getCurrentPosition();
        boolean blockHasKeys = !blockIndex_.isEmpty();
        /* Number of keys in this block */
        bufOut.writeInt(blockIndex_.size());
        for (Map.Entry<String, BlockMetadata> entry : blockIndex_.entrySet()) {
            bufOut.writeUTF(entry.getKey());
            BlockMetadata blockMetadata = entry.getValue();
            /* position of the key as a relative offset */
            bufOut.writeLong(position - blockMetadata.position_);
            bufOut.writeLong(blockMetadata.size_);
        }

        /* Write the relative offset to the previous block index. */
        bufOut.writeLong(position - prevBlockPosition_);
        prevBlockPosition_ = position;
        /* Write out the block index. */
        dataWriter_.append(SSTable.blockIndexKey_, bufOut);
        blockIndex_.clear();

        /* Make sure an in-memory index exists for this file, even if it stays empty. */
        List<KeyPositionInfo> keyPositionInfos = SSTable.indexMetadataMap_.get(dataFile_);
        if (keyPositionInfos == null) {
            keyPositionInfos = new ArrayList<KeyPositionInfo>();
            SSTable.indexMetadataMap_.put(dataFile_, keyPositionInfos);
        }
        if (blockHasKeys) {
            /* lastWrittenKey_ is the largest key of the block that was just dumped */
            keyPositionInfos.add(new KeyPositionInfo(lastWrittenKey_, position));
        }
    }

    /**
     * Appends the content of the buffer as the value of the given key.
     * <p>
     * Only the first {@code buffer.getLength()} bytes are written. Other
     * call sites in this file always pair getData() with getLength(), so
     * getData() is treated as a backing array that may be longer than the
     * valid content; passing it on unchanged would append stale trailing
     * bytes and record a wrong data size in the block index.
     *
     * @param key    key to append; must be greater than the last written key
     * @param buffer buffer holding the value
     * @throws IOException if the key is out of order or the write fails
     */
    public void append(String key, DataOutputBuffer buffer) throws IOException {
        int length = buffer.getLength();
        byte[] value = new byte[length];
        System.arraycopy(buffer.getData(), 0, value, 0, length);
        append(key, value);
    }

    /**
     * Appends a key and its value to the data file and tracks the key in
     * the current block index. Once indexInterval_ keys have been appended
     * since the last dump, the block index is written out.
     *
     * @param key   key to append; must be strictly greater than the last written key
     * @param value data associated with the key
     * @throws IOException if the key is not greater than the last written
     *                     key, or if writing fails
     */
    public void append(String key, byte[] value) throws IOException {
        assert key != null;
        final boolean firstKey = (lastWrittenKey_ == null);
        if (!firstKey && key.compareTo(lastWrittenKey_) <= 0) {
            logger_.info("Last written key : " + lastWrittenKey_);
            logger_.info("Current key : " + key);
            logger_.info("Writing into file " + dataFile_);
            throw new IOException("Keys must be written in ascending order.");
        }

        /* the first key goes right after the initial block index */
        final long keyPosition;
        if (firstKey) {
            keyPosition = SSTable.positionAfterFirstBlockIndex_;
        } else {
            keyPosition = dataWriter_.getCurrentPosition();
        }

        dataWriter_.append(key, value);
        indexKeysWritten_++;
        lastWrittenKey_ = key;
        blockIndex_.put(key, new BlockMetadata(keyPosition, (long) value.length));

        if (indexKeysWritten_ != indexInterval_) {
            return;
        }
        /* the block is full: write its index and start counting anew */
        dumpBlockIndex();
        indexKeysWritten_ = 0;
    }

    /**
     * Determines which section of the data file has to be looked at for
     * the given key, based on the in-memory index of the reader's file.
     * <ul>
     * <li>No index entries: the whole file, from 0 to EOF.</li>
     * <li>Key is an indexed key: start and end both equal its indexed position.</li>
     * <li>Otherwise: start is the position of the closest smaller indexed
     * key (0 if there is none) and end is the position of the closest
     * larger indexed key (equal to start if there is none).</li>
     * </ul>
     *
     * @param key        key to locate
     * @param dataReader reader opened on the data file
     * @return the section of the file to scan
     * @throws IOException if the reader fails to report the end of file
     */
    public static Range getRange(String key, IFileReader dataReader) throws IOException {
        List<KeyPositionInfo> indexInfo = indexMetadataMap_.get(dataReader.getFileName());
        int size = (indexInfo == null) ? 0 : indexInfo.size();
        long start = 0L;
        long end = dataReader.getEOF();

        if (size == 0) {
            /*
             * Less than one block worth of keys is indexed, hence the only
             * recourse is a linear scan from start to finish.
             */
            end = dataReader.getEOF();
            return new Range(start, end);
        }

        final int index = Collections.binarySearch(indexInfo, new KeyPositionInfo(key));
        if (index >= 0) {
            /*
             * The key itself is indexed and can be retrieved w/o a scan.
             * TODO we would like to have a retrieve(key, fromPosition) but
             * for now a range with start == end is used - a hack.
             */
            start = indexInfo.get(index).position;
            end = start;
            return new Range(start, end);
        }

        // key is not indexed; derive the scan range from its insertion point
        int insertIndex = -(index + 1);
        if (insertIndex != 0) {
            start = indexInfo.get(insertIndex - 1).position;
        }
        if (insertIndex < size) {
            end = indexInfo.get(insertIndex).position;
        } else {
            /* Beyond the last indexed key: this is the Block Index in the file. */
            end = start;
        }
        return new Range(start, end);
    }

    /**
     * Reads the requested column family, and optionally only the named
     * columns, of the given key from this SSTable.
     * <p>
     * An IOException raised by the read itself is logged as a bloom filter
     * false positive and not propagated; the returned buffer is then left
     * as created.
     *
     * @param key              key to read
     * @param columnFamilyName name of the column family; must not contain ':'
     * @param cNames           names of the columns to read, passed on to the reader
     * @return buffer positioned after the key and the int that follows it,
     *         or a fresh buffer if nothing was read
     * @throws IOException if the data file cannot be opened or the range
     *                     cannot be determined
     */
    public DataInputBuffer next(String key, String columnFamilyName, List<String> cNames) throws IOException {
        assert columnFamilyName.split(":").length == 1;
        IFileReader reader = SequenceFile.reader(dataFile_);
        try {
            /* the range tells the reader where in the file to look for the key */
            Range scanRange = getRange(key, reader);
            DataOutputBuffer rawRow = new DataOutputBuffer();
            DataInputBuffer result = new DataInputBuffer();

            try {
                reader.next(key, rawRow, columnFamilyName, cNames, scanRange);
                int length = rawRow.getLength();
                if (length > 0) {
                    result.reset(rawRow.getData(), length);
                    /* consume the key even though it is not used */
                    result.readUTF();
                    result.readInt();
                }
            } catch (IOException ex) {
                logger_.info("Bloom filter false positive", ex);
            }
            return result;
        } finally {
            reader.close();
        }
    }

    /**
     * Reads a column family, or a single column of it, for the given key.
     * The {@code cf} argument is split by
     * RowMutation.getColumnAndColumnFamily(): the first part is the column
     * family name, and a second part, if present, names the only column to
     * read.
     *
     * @param key key to read
     * @param cf  column family specification
     * @return buffer holding the data that was read
     * @throws IOException if the data file cannot be read
     */
    public DataInputBuffer next(String key, String cf) throws IOException {
        String[] parts = RowMutation.getColumnAndColumnFamily(cf);
        List<String> columnNames = null;
        if (parts.length != 1) {
            columnNames = Arrays.asList(new String[] { parts[1] });
        }
        return next(key, parts[0], columnNames);
    }

    /**
     * Finishes writing this SSTable: dumps the pending block index, writes
     * the serialized Bloom Filter and then a trailer of three longs, in
     * this order:
     * the version, the relative position of the last block index and the
     * relative position of the bloom filter. loadIndex() and
     * loadBloomFilter() read the last two back from the end of the file.
     *
     * @param bf Bloom Filter to store with this SSTable
     * @throws IOException if writing to the data file fails
     */
    public void close(BloomFilter bf) throws IOException {
        /* Any remnants in the blockIndex should be dumped */
        dumpBlockIndex();
        /* serialize the Bloom Filter into a fresh buffer and copy out exactly the bytes written */
        DataOutputBuffer bufOut = new DataOutputBuffer();
        BloomFilter.serializer().serialize(bf, bufOut);
        byte[] bytes = new byte[bufOut.getLength()];
        System.arraycopy(bufOut.getData(), 0, bytes, 0, bytes.length);
        /*
         * Remember where the bloom filter starts, then hand it to the writer.
         * NOTE(review): presumably close(bytes, length) appends the filter
         * as a record without closing the file, since writeDirect() is
         * still called afterwards - confirm against IFileWriter.
         */
        long bloomFilterPosition = dataWriter_.getCurrentPosition();
        dataWriter_.close(bytes, bytes.length);
        /* write the version field into the SSTable */
        dataWriter_.writeDirect(BasicUtilities.longToByteArray(version_));
        /* write the position of the last block index, relative to the current position */
        long blockPosition = dataWriter_.getCurrentPosition() - prevBlockPosition_;
        dataWriter_.writeDirect(BasicUtilities.longToByteArray(blockPosition));
        /* write the position of the bloom filter, relative to the current position */
        long bloomFilterRelativePosition = dataWriter_.getCurrentPosition() - bloomFilterPosition;
        dataWriter_.writeDirect(BasicUtilities.longToByteArray(bloomFilterRelativePosition));
        dataWriter_.close();
        bufOut.close();
    }

    /**
     * Closes this SSTable and renames its temporary data file to the final
     * name, which is the current name with every occurrence of
     * {@code "-" + temporaryFile_} removed. The in-memory index is moved
     * from the old name to the new one.
     * <p>
     * If the rename fails an IOException is thrown and this object as well
     * as the in-memory index keep referring to the temporary file, so that
     * they stay consistent with what is on disk.
     *
     * @param bf Bloom Filter to store with this SSTable
     * @throws IOException if closing fails or the file cannot be renamed
     */
    public void closeRename(BloomFilter bf) throws IOException {
        close(bf);
        String tmpDataFile = dataFile_;
        String dataFileName = dataFile_.replace("-" + temporaryFile_, "");
        if (tmpDataFile.equals(dataFileName)) {
            /* the file does not carry the temporary marker; nothing to rename or repair */
            return;
        }
        if (!new File(tmpDataFile).renameTo(new File(dataFileName))) {
            throw new IOException("Failed to rename " + tmpDataFile + " to " + dataFileName);
        }
        dataFile_ = dataFileName;
        /* Now repair the in memory index associated with the old name */
        List<KeyPositionInfo> keyPositionInfos = SSTable.indexMetadataMap_.remove(tmpDataFile);
        if (keyPositionInfos != null) {
            /* the map is a Hashtable and does not accept null values */
            SSTable.indexMetadataMap_.put(dataFile_, keyPositionInfos);
        }
    }

    /**
     * Closes and renames this SSTable like {@link #closeRename(BloomFilter)}
     * and afterwards adds the final data file name to the given list.
     *
     * @param bf    Bloom Filter to store with this SSTable
     * @param files list that receives the name of the data file
     * @throws IOException if closing or renaming fails
     */
    public void closeRename(BloomFilter bf, List<String> files) throws IOException {
        this.closeRename(bf);
        files.add(this.dataFile_);
    }

}