// com.bigdata.dastor.io.SSTableReader — source listing

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.bigdata.dastor.io;

import java.io.*;
import java.util.*;
import java.lang.ref.ReferenceQueue;
import java.lang.ref.Reference;
import java.nio.channels.FileChannel;
import java.nio.MappedByteBuffer;

import org.apache.log4j.Logger;

import org.apache.commons.lang.StringUtils;

import org.cliffc.high_scale_lib.NonBlockingHashMap;

import com.bigdata.dastor.cache.InstrumentedCache;
import com.bigdata.dastor.config.DatabaseDescriptor;
import com.bigdata.dastor.db.*;
import com.bigdata.dastor.db.marshal.AbstractType;
import com.bigdata.dastor.dht.IPartitioner;
import com.bigdata.dastor.io.util.BufferedRandomAccessFile;
import com.bigdata.dastor.io.util.FileDataInput;
import com.bigdata.dastor.io.util.MappedFileDataInput;
import com.bigdata.dastor.service.StorageService;
import com.bigdata.dastor.utils.BloomFilter;
import com.bigdata.dastor.utils.FBUtilities;
import com.bigdata.dastor.utils.Pair;

/**
 * SSTableReaders are open()ed by Table.onStart; after that they are created by SSTableWriter.renameAndOpen.
 * Do not re-call open() on existing SSTable files; use the references kept by ColumnFamilyStore post-start instead.
 */
/**
 * SSTableReaders are open()ed by Table.onStart; after that they are created by SSTableWriter.renameAndOpen.
 * Do not re-call open() on existing SSTable files; use the references kept by ColumnFamilyStore post-start instead.
 *
 * A reader provides key lookups against one immutable on-disk sstable (data + index + bloom filter),
 * using either buffered or mmap'd i/o depending on DatabaseDescriptor's access-mode settings.
 */
public class SSTableReader extends SSTable implements Comparable<SSTableReader> {
    private static final Logger logger = Logger.getLogger(SSTableReader.class);

    // `finalizers` is required to keep the PhantomReferences alive after the enclosing SSTR is itself
    // unreferenced; otherwise they would never get enqueued.
    // It is mutated concurrently: setTrackedBy() adds from caller threads while the SSTABLE-DELETER
    // thread below removes as references are drained, so the set must be synchronized -- a bare
    // HashSet can be corrupted by concurrent add/remove.
    private static final Set<Reference<SSTableReader>> finalizers =
            Collections.synchronizedSet(new HashSet<Reference<SSTableReader>>());
    private static final ReferenceQueue<SSTableReader> finalizerQueue = new ReferenceQueue<SSTableReader>() {
        {
            // Instance initializer: start the deleter thread as soon as the queue object is created.
            // The thread blocks in remove() until GC enqueues a phantom reference, then performs the
            // deferred file cleanup for the collected reader.
            Runnable runnable = new Runnable() {
                public void run() {
                    while (true) {
                        SSTableDeletingReference r = null;
                        try {
                            r = (SSTableDeletingReference) finalizerQueue.remove();
                            finalizers.remove(r);
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                        try {
                            r.cleanup();
                        } catch (IOException e) {
                            logger.error("Error deleting " + r.path, e);
                        }
                    }
                }
            };
            new Thread(runnable, "SSTABLE-DELETER").start();
        }
    };
    // in a perfect world, BUFFER_SIZE would be final, but we need to test with a smaller size to stay sane.
    // the jvm can only map up to 2GB at a time, so index/data files are split into segments of this size.
    static long BUFFER_SIZE = Integer.MAX_VALUE;

    /**
     * Estimates the total number of keys across the given sstables from their sampled index sizes.
     * The arithmetic is done in long to avoid int overflow when
     * (sample size + 1) * index interval exceeds Integer.MAX_VALUE.
     */
    public static long getApproximateKeyCount(Iterable<SSTableReader> sstables) {
        long count = 0;

        for (SSTableReader sstable : sstables) {
            int indexKeyCount = sstable.getIndexPositions().size();
            // each sampled entry stands in for up to indexInterval real entries; +1 covers the tail
            count = count + (indexKeyCount + 1L) * DatabaseDescriptor.getIndexInterval();
            if (logger.isDebugEnabled())
                logger.debug(
                        "index size for bloom filter calc for file  : " + sstable.getFilename() + "   : " + count);
        }

        return count;
    }

    /** Opens a reader for the given data file using the service-wide partitioner and no saved key cache. */
    public static SSTableReader open(String dataFileName) throws IOException {
        return open(dataFileName, StorageService.getPartitioner());
    }

    /** public, but only for tests */
    public static SSTableReader open(String dataFileName, IPartitioner partitioner) throws IOException {
        return open(dataFileName, partitioner, Collections.<String>emptySet(), null);
    }

    /** Opens a reader using the service-wide partitioner, pre-warming the key cache with the given keys. */
    public static SSTableReader open(String dataFileName, Collection<String> savedKeyCacheKeys,
            SSTableTracker tracker) throws IOException {
        return open(dataFileName, StorageService.getPartitioner(), savedKeyCacheKeys, tracker);
    }

    /**
     * Opens a reader: constructs it, registers it with the tracker (which also wires up the
     * phantom-reference cleanup and key cache), then loads the index sample and bloom filter.
     *
     * @param dataFileName      path to the sstable data file
     * @param partitioner       partitioner used to decode keys from disk format; must not be null
     * @param savedKeyCacheKeys keys whose positions should be re-cached while sampling the index
     * @param tracker           tracker to register with; may be null (tests)
     */
    public static SSTableReader open(String dataFileName, IPartitioner partitioner,
            Collection<String> savedKeyCacheKeys, SSTableTracker tracker) throws IOException {
        assert partitioner != null;

        long start = System.currentTimeMillis();
        SSTableReader sstable = new SSTableReader(dataFileName, partitioner);
        sstable.setTrackedBy(tracker);
        logger.info("Sampling index and loading saved keyCache for " + dataFileName + " ("
                + savedKeyCacheKeys.size() + " saved keys)");
        sstable.loadIndexAndCache(savedKeyCacheKeys);
        sstable.loadBloomFilter();

        if (logger.isDebugEnabled())
            logger.debug(
                    "INDEX LOAD TIME for " + dataFileName + ": " + (System.currentTimeMillis() - start) + " ms.");

        return sstable;
    }

    // set by setTrackedBy(); null when no tracker was supplied (tests)
    private volatile SSTableDeletingReference phantomReference;
    // jvm can only map up to 2GB at a time, so we split index/data into segments of that size when using mmap i/o;
    // null when the corresponding access mode is `standard` (buffered i/o)
    private final MappedByteBuffer[] indexBuffers;
    private final MappedByteBuffer[] buffers;

    // maps (sstable path, key) -> data file position+size; may be null during tests (see setTrackedBy)
    private InstrumentedCache<Pair<String, DecoratedKey>, PositionSize> keyCache;

    private BloomFilterTracker bloomFilterTracker = new BloomFilterTracker();

    SSTableReader(String filename, IPartitioner partitioner, IndexSummary indexSummary, BloomFilter bloomFilter)
            throws IOException {
        super(filename, partitioner);

        // mmap the index file in <=BUFFER_SIZE segments, or leave indexBuffers null for buffered i/o
        if (DatabaseDescriptor.getIndexAccessMode() == DatabaseDescriptor.DiskAccessMode.mmap) {
            long indexLength = new File(indexFilename()).length();
            int bufferCount = 1 + (int) (indexLength / BUFFER_SIZE);
            indexBuffers = new MappedByteBuffer[bufferCount];
            long remaining = indexLength;
            for (int i = 0; i < bufferCount; i++) {
                indexBuffers[i] = mmap(indexFilename(), i * BUFFER_SIZE, (int) Math.min(remaining, BUFFER_SIZE));
                remaining -= BUFFER_SIZE;
            }
        } else {
            assert DatabaseDescriptor.getIndexAccessMode() == DatabaseDescriptor.DiskAccessMode.standard;
            indexBuffers = null;
        }

        // same treatment for the data file
        if (DatabaseDescriptor.getDiskAccessMode() == DatabaseDescriptor.DiskAccessMode.mmap) {
            int bufferCount = 1 + (int) (new File(path).length() / BUFFER_SIZE);
            buffers = new MappedByteBuffer[bufferCount];
            long remaining = length();
            for (int i = 0; i < bufferCount; i++) {
                buffers[i] = mmap(path, i * BUFFER_SIZE, (int) Math.min(remaining, BUFFER_SIZE));
                remaining -= BUFFER_SIZE;
            }
        } else {
            assert DatabaseDescriptor.getDiskAccessMode() == DatabaseDescriptor.DiskAccessMode.standard;
            buffers = null;
        }

        this.indexSummary = indexSummary;
        this.bf = bloomFilter;
    }

    /** Registers this reader for deferred file deletion and adopts the tracker's key cache. */
    protected void setTrackedBy(SSTableTracker tracker) {
        if (tracker != null) {
            phantomReference = new SSTableDeletingReference(tracker, this, finalizerQueue);
            finalizers.add(phantomReference);
            // TODO keyCache should never be null in live Dastor, but only setting it here
            // means it can be during tests, so we have to do otherwise-unnecessary != null checks
            keyCache = tracker.getKeyCache();
        }
    }

    /** Maps [start, start+size) of the given file read-only; the file handle is closed after mapping. */
    private static MappedByteBuffer mmap(String filename, long start, int size) throws IOException {
        RandomAccessFile raf;
        try {
            raf = new RandomAccessFile(filename, "r");
        } catch (FileNotFoundException e) {
            throw new IOError(e);
        }

        try {
            return raf.getChannel().map(FileChannel.MapMode.READ_ONLY, start, size);
        } finally {
            // the mapping remains valid after the channel is closed
            raf.close();
        }
    }

    private SSTableReader(String filename, IPartitioner partitioner) throws IOException {
        this(filename, partitioner, null, null);
    }

    public List<IndexSummary.KeyPosition> getIndexPositions() {
        return indexSummary.getIndexPositions();
    }

    /**
     * Estimated key count for this sstable: one sampled index entry per indexInterval keys.
     * Widened to long before multiplying to avoid int overflow on very large sstables.
     */
    public long estimatedKeys() {
        return (long) indexSummary.getIndexPositions().size() * DatabaseDescriptor.getIndexInterval();
    }

    /** Replaces the in-memory bloom filter with the one serialized in the filter file. */
    void loadBloomFilter() throws IOException {
        DataInputStream stream = new DataInputStream(new FileInputStream(filterFilename()));
        try {
            bf = BloomFilter.serializer().deserialize(stream);
        } finally {
            stream.close();
        }
    }

    /**
     * Builds the in-memory index sample and optionally repopulates the key cache.
     *
     * @param keysToLoadInCache keys whose data positions should be put back into the key cache
     */
    void loadIndexAndCache(Collection<String> keysToLoadInCache) throws IOException {
        // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
        // any entries that do, we force into the in-memory sample so key lookup can always bsearch within
        // a single mmapped segment.
        indexSummary = new IndexSummary();
        BufferedRandomAccessFile input = new BufferedRandomAccessFile(indexFilename(), "r");
        try {
            // grow the cache up front if the saved keys wouldn't otherwise fit
            if (keyCache != null && keyCache.getCapacity() - keyCache.getSize() < keysToLoadInCache.size())
                keyCache.updateCapacity(keyCache.getSize() + keysToLoadInCache.size());

            long indexSize = input.length();
            // we need to know both the current index entry and its data position, as well as the
            // next such pair, in order to compute the mmap-spanning entries.  since seeking
            // backwards in a 0.6 BRAF is expensive, we make one pass through by reading the "next"
            // entry in each loop through, then summarizing the previous one.
            IndexSummary.KeyPosition thisEntry = null, nextEntry = null;
            long thisDataPos = -1, nextDataPos = -1;
            while (true) {
                long indexPosition = input.getFilePointer();
                if (indexPosition == indexSize)
                    break;

                DecoratedKey key = partitioner.convertFromDiskFormat(input.readUTF());
                long dataPosition = input.readLong();
                if (thisEntry == null) {
                    // first entry: nothing to summarize yet
                    thisEntry = new IndexSummary.KeyPosition(key, indexPosition);
                    thisDataPos = dataPosition;
                    continue;
                }

                nextEntry = new IndexSummary.KeyPosition(key, indexPosition);
                nextDataPos = dataPosition;
                // row size = next row's start - this row's start
                SSTable.PositionSize posSize = new PositionSize(thisDataPos, nextDataPos - thisDataPos);
                if (keyCache != null && keysToLoadInCache.contains(thisEntry.key.key))
                    keyCache.put(new Pair<String, DecoratedKey>(path, thisEntry.key), posSize);

                indexSummary.maybeAddEntry(thisEntry.key, posSize.position, posSize.size, thisEntry.indexPosition,
                        nextEntry.indexPosition);

                thisEntry = nextEntry;
                thisDataPos = nextDataPos;
            }
            assert thisEntry != null; // should not have any zero-row sstables
            // last entry: its row extends to the end of the data file
            indexSummary.maybeAddEntry(thisEntry.key, thisDataPos, length() - thisDataPos, thisEntry.indexPosition,
                    input.length());
            indexSummary.complete();
        } finally {
            input.close();
        }
    }

    /** get the position in the index file to start scanning to find the given key (at most indexInterval keys away) */
    private IndexSummary.KeyPosition getIndexScanPosition(DecoratedKey decoratedKey) {
        assert indexSummary.getIndexPositions() != null && indexSummary.getIndexPositions().size() > 0;
        int index = Collections.binarySearch(indexSummary.getIndexPositions(),
                new IndexSummary.KeyPosition(decoratedKey, -1));
        if (index < 0) {
            // binary search gives us the first index _greater_ than the key searched for,
            // i.e., its insertion position
            int greaterThan = (index + 1) * -1;
            if (greaterThan == 0)
                return null; // key sorts before the first sampled key: cannot be present
            return indexSummary.getIndexPositions().get(greaterThan - 1);
        } else {
            return indexSummary.getIndexPositions().get(index);
        }
    }

    /**
     * returns the position in the data file to find the given key, or null if the key is not present.
     * Lookup order: bloom filter, key cache, sampled index, then an on-disk index scan of at most
     * indexInterval entries starting at the nearest sampled position.
     */
    public PositionSize getPosition(DecoratedKey decoratedKey) throws IOException {
        // first, check bloom filter
        if (!bf.isPresent(partitioner.convertToDiskFormat(decoratedKey)))
            return null;

        // next, the key cache
        Pair<String, DecoratedKey> unifiedKey = new Pair<String, DecoratedKey>(path, decoratedKey);
        if (keyCache != null && keyCache.getCapacity() > 0) {
            PositionSize cachedPosition = keyCache.get(unifiedKey);
            if (cachedPosition != null) {
                return cachedPosition;
            }
        }

        // next, see if the sampled index says it's impossible for the key to be present
        IndexSummary.KeyPosition sampledPosition = getIndexScanPosition(decoratedKey);
        if (sampledPosition == null) {
            bloomFilterTracker.addFalsePositive();
            return null;
        }

        // get either a buffered or a mmap'd input for the on-disk index
        long p = sampledPosition.indexPosition;
        FileDataInput input;
        if (indexBuffers == null) {
            input = new BufferedRandomAccessFile(indexFilename(), "r");
            ((BufferedRandomAccessFile) input).seek(p);
        } else {
            input = indexInputAt(p);
        }

        // scan the on-disk index, starting at the nearest sampled position
        try {
            int interval = DatabaseDescriptor.getIndexInterval();
            int i = 0;
            do {
                // handle exact sampled index hit (entry was forced into the summary because it
                // spans an mmap boundary)
                IndexSummary.KeyPosition kp = indexSummary.getSpannedIndexPosition(input.getAbsolutePosition());
                if (kp != null && kp.key.equals(decoratedKey)) {
                    bloomFilterTracker.addTruePositive();
                    return indexSummary.getSpannedDataPosition(kp);
                }
                // if using mmapped i/o, skip to the next mmap buffer if necessary
                if (input.isEOF() || kp != null) {
                    if (indexBuffers == null) // not mmap-ing, just one index input
                        break;

                    FileDataInput oldInput = input;
                    if (kp == null) {
                        input = indexInputAt(input.getAbsolutePosition());
                    } else {
                        // skip over the spanned entry: 2-byte UTF length prefix + key bytes + 8-byte position
                        long nextUnspannedPosition = input.getAbsolutePosition() + 2 + FBUtilities
                                .encodedUTF8Length(StorageService.getPartitioner().convertToDiskFormat(kp.key)) + 8;
                        input = indexInputAt(nextUnspannedPosition);
                    }
                    oldInput.close();
                    if (input == null)
                        break;

                    continue;
                }

                // read key & data position from index entry
                DecoratedKey indexDecoratedKey = partitioner.convertFromDiskFormat(input.readUTF());
                long dataPosition = input.readLong();

                int v = indexDecoratedKey.compareTo(decoratedKey);
                if (v == 0) {
                    PositionSize info = getDataPositionSize(input, dataPosition);
                    if (keyCache != null && keyCache.getCapacity() > 0)
                        keyCache.put(unifiedKey, info);
                    bloomFilterTracker.addTruePositive();
                    return info;
                }
                if (v > 0) {
                    // the index is sorted; passing the key means it is not present
                    bloomFilterTracker.addFalsePositive();
                    return null;
                }
            } while (++i < interval);
        } finally {
            if (input != null)
                input.close();
        }
        bloomFilterTracker.addFalsePositive();
        return null;
    }

    /** Returns an mmap-backed index input positioned at indexPosition, or null if past the last entry. */
    private FileDataInput indexInputAt(long indexPosition) {
        if (indexPosition > indexSummary.getLastIndexPosition())
            return null;
        int bufferIndex = bufferIndex(indexPosition);
        return new MappedFileDataInput(indexBuffers[bufferIndex], indexFilename(), BUFFER_SIZE * bufferIndex,
                (int) (indexPosition % BUFFER_SIZE));
    }

    /** Computes a row's (position, size) by peeking at the next index entry to find where the row ends. */
    private PositionSize getDataPositionSize(FileDataInput input, long dataPosition) throws IOException {
        // if we've reached the end of the index, then the row size is "the rest of the data file"
        if (input.isEOF())
            return new PositionSize(dataPosition, length() - dataPosition);

        // otherwise, row size is the start of the next row (in next index entry), minus the start of this one.
        long nextIndexPosition = input.getAbsolutePosition();
        // if next index entry would span mmap boundary, get the next row position from the summary instead
        PositionSize nextPositionSize = indexSummary.getSpannedDataPosition(nextIndexPosition);
        if (nextPositionSize != null)
            return new PositionSize(dataPosition, nextPositionSize.position - dataPosition);

        // read next entry directly: skip the key (2-byte length + bytes), then read its data position
        int utflen = input.readUnsignedShort();
        if (utflen != input.skipBytes(utflen))
            throw new EOFException();
        return new PositionSize(dataPosition, input.readLong() - dataPosition);
    }

    /** like getPosition, but if key is not found will return the location of the first key _greater_ than the desired one, or -1 if no such key exists. */
    public long getNearestPosition(DecoratedKey decoratedKey) throws IOException {
        IndexSummary.KeyPosition sampledPosition = getIndexScanPosition(decoratedKey);
        if (sampledPosition == null) {
            // key sorts before everything in this sstable: start of file
            return 0;
        }

        // can't use a MappedFileDataInput here, since we might cross a segment boundary while scanning
        BufferedRandomAccessFile input = new BufferedRandomAccessFile(indexFilename(path), "r");
        input.seek(sampledPosition.indexPosition);
        try {
            while (true) {
                DecoratedKey indexDecoratedKey;
                try {
                    indexDecoratedKey = partitioner.convertFromDiskFormat(input.readUTF());
                } catch (EOFException e) {
                    return -1; // ran off the end: every key in the file sorts before the target
                }
                long position = input.readLong();
                int v = indexDecoratedKey.compareTo(decoratedKey);
                if (v >= 0)
                    return position;
            }
        } finally {
            input.close();
        }
    }

    /** Current length of the data file in bytes. */
    public long length() {
        return new File(path).length();
    }

    /**
     * Orders readers by sstable generation number.
     * Compares instead of subtracting so extreme generation values cannot overflow int.
     */
    public int compareTo(SSTableReader o) {
        int mine = ColumnFamilyStore.getGenerationFromFileName(path);
        int theirs = ColumnFamilyStore.getGenerationFromFileName(o.path);
        return mine < theirs ? -1 : (mine == theirs ? 0 : 1);
    }

    /**
     * Marks this sstable compacted on disk; actual file deletion happens when the reader is GC'd
     * (see the phantom-reference machinery above).
     * NOTE(review): assumes setTrackedBy was called with a non-null tracker -- phantomReference is
     * null otherwise and this would NPE; confirm callers always go through a tracker.
     */
    public void markCompacted() throws IOException {
        if (logger.isDebugEnabled())
            logger.debug("Marking " + path + " compacted");
        if (!new File(compactedFilename()).createNewFile()) {
            throw new IOException("Unable to create compaction marker");
        }
        phantomReference.deleteOnCleanup();
    }

    /** obviously only for testing */
    public void forceBloomFilterFailures() {
        bf = BloomFilter.alwaysMatchingBloomFilter();
    }

    public BloomFilter getBloomFilter() {
        return bf;
    }

    public IPartitioner getPartitioner() {
        return partitioner;
    }

    public SSTableScanner getScanner(int bufferSize) throws IOException {
        return new SSTableScanner(this, bufferSize);
    }

    /**
     * Returns an input positioned at the start of the given key's row, or null if the key is absent.
     * Falls back to buffered i/o when not mmap-ing or when the row spans an mmap segment boundary.
     */
    public FileDataInput getFileDataInput(DecoratedKey decoratedKey, int bufferSize) throws IOException {
        PositionSize info = getPosition(decoratedKey);
        if (info == null)
            return null;

        if (buffers == null || (bufferIndex(info.position) != bufferIndex(info.position + info.size))) {
            BufferedRandomAccessFile file = new BufferedRandomAccessFile(path, "r", bufferSize);
            file.seek(info.position);
            return file;
        }
        return new MappedFileDataInput(buffers[bufferIndex(info.position)], path,
                BUFFER_SIZE * (info.position / BUFFER_SIZE), (int) (info.position % BUFFER_SIZE));
    }

    /** Index of the mmap segment containing the given file position. */
    static int bufferIndex(long position) {
        return (int) (position / BUFFER_SIZE);
    }

    public AbstractType getColumnComparator() {
        return DatabaseDescriptor.getComparator(getTableName(), getColumnFamilyName());
    }

    public ColumnFamily makeColumnFamily() {
        return ColumnFamily.create(getTableName(), getColumnFamilyName());
    }

    public ICompactSerializer2<IColumn> getColumnSerializer() {
        return DatabaseDescriptor.getColumnFamilyType(getTableName(), getColumnFamilyName()).equals("Standard")
                ? Column.serializer()
                : SuperColumn.serializer(getColumnComparator());
    }

    public long getBloomFilterFalsePositiveCount() {
        return bloomFilterTracker.getFalsePositiveCount();
    }

    public long getRecentBloomFilterFalsePositiveCount() {
        return bloomFilterTracker.getRecentFalsePositiveCount();
    }

    public long getBloomFilterTruePositiveCount() {
        return bloomFilterTracker.getTruePositiveCount();
    }

    public long getRecentBloomFilterTruePositiveCount() {
        return bloomFilterTracker.getRecentTruePositiveCount();
    }
}