org.apache.bookkeeper.bookie.InterleavedLedgerStorage.java Source code

Introduction

Here is the source code for org.apache.bookkeeper.bookie.InterleavedLedgerStorage.java, the Apache BookKeeper ledger storage implementation that interleaves entries from all ledgers into shared entry log files and keeps a per-ledger index of entry locations.

Source

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */

package org.apache.bookkeeper.bookie;

import static com.google.common.base.Preconditions.checkNotNull;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.BOOKIE_READ_ENTRY;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.BOOKIE_SCOPE;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.CATEGORY_SERVER;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.ENTRYLOGGER_SCOPE;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.STORAGE_GET_ENTRY;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.STORAGE_GET_OFFSET;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.STORAGE_SCRUB_PAGES_SCANNED;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.STORAGE_SCRUB_PAGE_RETRIES;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.RateLimiter;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.ByteBufAllocator;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Optional;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import lombok.Cleanup;
import org.apache.bookkeeper.bookie.Bookie.NoLedgerException;
import org.apache.bookkeeper.bookie.CheckpointSource.Checkpoint;
import org.apache.bookkeeper.bookie.EntryLogger.EntryLogListener;
import org.apache.bookkeeper.bookie.LedgerDirsManager.LedgerDirsListener;
import org.apache.bookkeeper.common.util.Watcher;
import org.apache.bookkeeper.conf.ServerConfiguration;
import org.apache.bookkeeper.meta.LedgerManager;
import org.apache.bookkeeper.proto.BookieProtocol;
import org.apache.bookkeeper.stats.Counter;
import org.apache.bookkeeper.stats.OpStatsLogger;
import org.apache.bookkeeper.stats.StatsLogger;
import org.apache.bookkeeper.stats.annotations.StatsDoc;
import org.apache.bookkeeper.util.MathUtils;
import org.apache.bookkeeper.util.SnapshotMap;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.lang.mutable.MutableLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Interleaved ledger storage.
 *
 * <p>This ledger storage implementation stores all entries in a single
 * file and maintains an index file for each ledger.
 */
@StatsDoc(name = BOOKIE_SCOPE, category = CATEGORY_SERVER, help = "Bookie related stats")
public class InterleavedLedgerStorage implements CompactableLedgerStorage, EntryLogListener {
    private static final Logger LOG = LoggerFactory.getLogger(InterleavedLedgerStorage.class);

    EntryLogger entryLogger;
    LedgerCache ledgerCache;
    protected CheckpointSource checkpointSource;
    protected Checkpointer checkpointer;
    private final CopyOnWriteArrayList<LedgerDeletionListener> ledgerDeletionListeners = Lists
            .newCopyOnWriteArrayList();

    // A sorted map to store all active ledger ids
    protected final SnapshotMap<Long, Boolean> activeLedgers;

    // This is the thread that garbage collects entry logs that no longer
    // contain any active ledgers; it also compacts entry logs with a low
    // remaining-usage percentage, to reclaim disk space.
    GarbageCollectorThread gcThread;

    // this indicates that a write has happened since the last flush
    private final AtomicBoolean somethingWritten = new AtomicBoolean(false);

    // Expose Stats
    @StatsDoc(
        name = STORAGE_GET_OFFSET,
        help = "Operation stats of getting offset from ledger cache",
        parent = BOOKIE_READ_ENTRY
    )
    private OpStatsLogger getOffsetStats;
    @StatsDoc(
        name = STORAGE_GET_ENTRY,
        help = "Operation stats of getting entry from entry logger",
        parent = BOOKIE_READ_ENTRY,
        happensAfter = STORAGE_GET_OFFSET
    )
    private OpStatsLogger getEntryStats;
    private OpStatsLogger pageScanStats;
    private Counter retryCounter;

    @VisibleForTesting
    public InterleavedLedgerStorage() {
        activeLedgers = new SnapshotMap<Long, Boolean>();
    }

    @Override
    public void initialize(ServerConfiguration conf, LedgerManager ledgerManager,
            LedgerDirsManager ledgerDirsManager, LedgerDirsManager indexDirsManager, StateManager stateManager,
            CheckpointSource checkpointSource, Checkpointer checkpointer, StatsLogger statsLogger,
            ByteBufAllocator allocator) throws IOException {
        initializeWithEntryLogListener(conf, ledgerManager, ledgerDirsManager, indexDirsManager, stateManager,
                checkpointSource, checkpointer, this, statsLogger, allocator);
    }

    void initializeWithEntryLogListener(ServerConfiguration conf, LedgerManager ledgerManager,
            LedgerDirsManager ledgerDirsManager, LedgerDirsManager indexDirsManager, StateManager stateManager,
            CheckpointSource checkpointSource, Checkpointer checkpointer, EntryLogListener entryLogListener,
            StatsLogger statsLogger, ByteBufAllocator allocator) throws IOException {
        initializeWithEntryLogger(conf, ledgerManager, ledgerDirsManager, indexDirsManager, stateManager,
                checkpointSource, checkpointer, new EntryLogger(conf, ledgerDirsManager, entryLogListener,
                        statsLogger.scope(ENTRYLOGGER_SCOPE), allocator),
                statsLogger);
    }

    @VisibleForTesting
    public void initializeWithEntryLogger(ServerConfiguration conf, LedgerManager ledgerManager,
            LedgerDirsManager ledgerDirsManager, LedgerDirsManager indexDirsManager, StateManager stateManager,
            CheckpointSource checkpointSource, Checkpointer checkpointer, EntryLogger entryLogger,
            StatsLogger statsLogger) throws IOException {
        checkNotNull(checkpointSource, "invalid null checkpoint source");
        checkNotNull(checkpointer, "invalid null checkpointer");
        this.entryLogger = entryLogger;
        this.entryLogger.addListener(this);
        this.checkpointSource = checkpointSource;
        this.checkpointer = checkpointer;
        ledgerCache = new LedgerCacheImpl(conf, activeLedgers,
                null == indexDirsManager ? ledgerDirsManager : indexDirsManager, statsLogger);
        gcThread = new GarbageCollectorThread(conf, ledgerManager, this, statsLogger.scope("gc"));
        ledgerDirsManager.addLedgerDirsListener(getLedgerDirsListener());
        // Expose Stats
        getOffsetStats = statsLogger.getOpStatsLogger(STORAGE_GET_OFFSET);
        getEntryStats = statsLogger.getOpStatsLogger(STORAGE_GET_ENTRY);
        pageScanStats = statsLogger.getOpStatsLogger(STORAGE_SCRUB_PAGES_SCANNED);
        retryCounter = statsLogger.getCounter(STORAGE_SCRUB_PAGE_RETRIES);
    }

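    // Wire disk-space events to the GC thread: when space runs low, either force
    // GC (if forcing is allowed when out of space) or suspend compaction so it
    // stops generating data; reverse the decision once a disk is writable again.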
    private LedgerDirsListener getLedgerDirsListener() {
        return new LedgerDirsListener() {

            @Override
            public void diskAlmostFull(File disk) {
                if (gcThread.isForceGCAllowWhenNoSpace) {
                    gcThread.enableForceGC();
                } else {
                    gcThread.suspendMajorGC();
                }
            }

            @Override
            public void diskFull(File disk) {
                if (gcThread.isForceGCAllowWhenNoSpace) {
                    gcThread.enableForceGC();
                } else {
                    gcThread.suspendMajorGC();
                    gcThread.suspendMinorGC();
                }
            }

            @Override
            public void allDisksFull(boolean highPriorityWritesAllowed) {
                if (gcThread.isForceGCAllowWhenNoSpace) {
                    gcThread.enableForceGC();
                } else {
                    gcThread.suspendMajorGC();
                    gcThread.suspendMinorGC();
                }
            }

            @Override
            public void diskWritable(File disk) {
                // we have enough space now
                if (gcThread.isForceGCAllowWhenNoSpace) {
                    // disable force gc.
                    gcThread.disableForceGC();
                } else {
                    // resume compaction to normal.
                    gcThread.resumeMajorGC();
                    gcThread.resumeMinorGC();
                }
            }

            @Override
            public void diskJustWritable(File disk) {
                if (gcThread.isForceGCAllowWhenNoSpace) {
                    // if a disk is just writable, we still need force gc.
                    gcThread.enableForceGC();
                } else {
                    // still under warn threshold, only resume minor compaction.
                    gcThread.resumeMinorGC();
                }
            }
        };
    }

    @Override
    public void forceGC() {
        gcThread.enableForceGC();
    }

    @Override
    public boolean isInForceGC() {
        return gcThread.isInForceGC();
    }

    @Override
    public void start() {
        gcThread.start();
    }

    @Override
    public void shutdown() throws InterruptedException {
        // Shut down the GC thread first: it depends on the zookeeper client,
        // and compaction writes entries back into the entry log.
        LOG.info("Shutting down InterleavedLedgerStorage");
        LOG.info("Shutting down GC thread");
        gcThread.shutdown();
        LOG.info("Shutting down entry logger");
        entryLogger.shutdown();
        try {
            ledgerCache.close();
        } catch (IOException e) {
            LOG.error("Error while closing the ledger cache", e);
        }
        LOG.info("Finished shutting down InterleavedLedgerStorage");
    }

    @Override
    public boolean setFenced(long ledgerId) throws IOException {
        return ledgerCache.setFenced(ledgerId);
    }

    @Override
    public boolean isFenced(long ledgerId) throws IOException {
        return ledgerCache.isFenced(ledgerId);
    }

    @Override
    public void setExplicitlac(long ledgerId, ByteBuf lac) throws IOException {
        ledgerCache.setExplicitLac(ledgerId, lac);
    }

    @Override
    public ByteBuf getExplicitLac(long ledgerId) {
        return ledgerCache.getExplicitLac(ledgerId);
    }

    @Override
    public void setMasterKey(long ledgerId, byte[] masterKey) throws IOException {
        ledgerCache.setMasterKey(ledgerId, masterKey);
    }

    @Override
    public byte[] readMasterKey(long ledgerId) throws IOException, BookieException {
        return ledgerCache.readMasterKey(ledgerId);
    }

    @Override
    public boolean ledgerExists(long ledgerId) throws IOException {
        return ledgerCache.ledgerExists(ledgerId);
    }

    @Override
    public long getLastAddConfirmed(long ledgerId) throws IOException {
        Long lac = ledgerCache.getLastAddConfirmed(ledgerId);
        if (lac == null) {
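            // Cache miss: read the last written entry back from the entry log
            // and parse the LAC field it carries.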
            ByteBuf bb = getEntry(ledgerId, BookieProtocol.LAST_ADD_CONFIRMED);
            if (null == bb) {
                return BookieProtocol.INVALID_ENTRY_ID;
            } else {
                try {
                    bb.skipBytes(2 * Long.BYTES); // skip ledger & entry id
                    lac = bb.readLong();
                    lac = ledgerCache.updateLastAddConfirmed(ledgerId, lac);
                } finally {
                    bb.release();
                }
            }
        }
        return lac;
    }

    @Override
    public boolean waitForLastAddConfirmedUpdate(long ledgerId, long previousLAC,
            Watcher<LastAddConfirmedUpdateNotification> watcher) throws IOException {
        return ledgerCache.waitForLastAddConfirmedUpdate(ledgerId, previousLAC, watcher);
    }

    @Override
    public long addEntry(ByteBuf entry) throws IOException {
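        // The first 24 bytes of the entry buffer are three longs: ledger id,
        // entry id, and the last-add-confirmed (LAC) piggybacked on the entry.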
        long ledgerId = entry.getLong(entry.readerIndex() + 0);
        long entryId = entry.getLong(entry.readerIndex() + 8);
        long lac = entry.getLong(entry.readerIndex() + 16);

        processEntry(ledgerId, entryId, entry);

        ledgerCache.updateLastAddConfirmed(ledgerId, lac);
        return entryId;
    }

    @Override
    public ByteBuf getEntry(long ledgerId, long entryId) throws IOException {
        long offset;
        /*
         * If entryId is BookieProtocol.LAST_ADD_CONFIRMED, look up and return the last written entry.
         */
        if (entryId == BookieProtocol.LAST_ADD_CONFIRMED) {
            entryId = ledgerCache.getLastEntry(ledgerId);
        }

        // Get Offset
        long startTimeNanos = MathUtils.nowInNano();
        boolean success = false;
        try {
            offset = ledgerCache.getEntryOffset(ledgerId, entryId);
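            // A zero offset means the index has no location recorded for this entry.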
            if (offset == 0) {
                throw new Bookie.NoEntryException(ledgerId, entryId);
            }
            success = true;
        } finally {
            if (success) {
                getOffsetStats.registerSuccessfulEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            } else {
                getOffsetStats.registerFailedEvent(MathUtils.elapsedNanos(startTimeNanos), TimeUnit.NANOSECONDS);
            }
        }
        // Get Entry
        startTimeNanos = MathUtils.nowInNano();
        success = false;
        try {
            ByteBuf retBytes = entryLogger.readEntry(ledgerId, entryId, offset);
            success = true;
            return retBytes;
        } finally {
            if (success) {
                getEntryStats.registerSuccessfulEvent(MathUtils.elapsedNanos(startTimeNanos), TimeUnit.NANOSECONDS);
            } else {
                getEntryStats.registerFailedEvent(MathUtils.elapsedNanos(startTimeNanos), TimeUnit.NANOSECONDS);
            }
        }
    }

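    // Flush the ledger index and the entry log. NoWritableLedgerDirException is
    // propagated immediately; other IOExceptions are logged and surfaced at the
    // end, so both flush attempts get a chance to run.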
    private void flushOrCheckpoint(boolean isCheckpointFlush) throws IOException {

        boolean flushFailed = false;
        try {
            ledgerCache.flushLedger(true);
        } catch (LedgerDirsManager.NoWritableLedgerDirException e) {
            throw e;
        } catch (IOException ioe) {
            LOG.error("Exception flushing Ledger cache", ioe);
            flushFailed = true;
        }

        try {
            // If this is a checkpoint flush, we only flush the rotated entry
            // log files in the entry logger.
            if (isCheckpointFlush) {
                entryLogger.checkpoint();
            } else {
                entryLogger.flush();
            }
        } catch (LedgerDirsManager.NoWritableLedgerDirException e) {
            throw e;
        } catch (IOException ioe) {
            LOG.error("Exception flushing Ledger", ioe);
            flushFailed = true;
        }
        if (flushFailed) {
            throw new IOException("Flushing to storage failed, check logs");
        }
    }

    @Override
    public void checkpoint(Checkpoint checkpoint) throws IOException {
        // We don't need to check somethingWritten here, since a checkpoint is
        // scheduled whenever an entry log file is rotated. We also must not
        // reset somethingWritten to false after the checkpoint, because the
        // current entry log file hasn't been flushed yet.
        flushOrCheckpoint(true);
    }

    @Override
    public synchronized void flush() throws IOException {
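        // Skip the flush entirely if nothing was written since the last one.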
        if (!somethingWritten.compareAndSet(true, false)) {
            return;
        }
        flushOrCheckpoint(false);
    }

    @Override
    public void deleteLedger(long ledgerId) throws IOException {
        activeLedgers.remove(ledgerId);
        ledgerCache.deleteLedger(ledgerId);

        for (LedgerDeletionListener listener : ledgerDeletionListeners) {
            listener.ledgerDeleted(ledgerId);
        }
    }

    @Override
    public Iterable<Long> getActiveLedgersInRange(long firstLedgerId, long lastLedgerId) {
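        // Work from a snapshot of the active ledger map; the returned range is
        // [firstLedgerId, lastLedgerId): lower bound inclusive, upper exclusive.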
        NavigableMap<Long, Boolean> bkActiveLedgersSnapshot = activeLedgers.snapshot();
        Map<Long, Boolean> subBkActiveLedgers = bkActiveLedgersSnapshot.subMap(firstLedgerId, true, lastLedgerId,
                false);

        return subBkActiveLedgers.keySet();
    }

    @Override
    public void updateEntriesLocations(Iterable<EntryLocation> locations) throws IOException {
        for (EntryLocation l : locations) {
            try {
                ledgerCache.putEntryOffset(l.ledger, l.entry, l.location);
            } catch (NoLedgerException e) {
                // The ledger was already deleted, so we can skip this entry during compaction
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Compaction failed for deleted ledger: {} entry: {}", l.ledger, l.entry);
                }
            }
        }
    }

    @Override
    public void flushEntriesLocationsIndex() throws IOException {
        ledgerCache.flushLedger(true);
    }

    @Override
    public EntryLogger getEntryLogger() {
        return entryLogger;
    }

    @Override
    public void registerLedgerDeletionListener(LedgerDeletionListener listener) {
        ledgerDeletionListeners.add(listener);
    }

    protected void processEntry(long ledgerId, long entryId, ByteBuf entry) throws IOException {
        processEntry(ledgerId, entryId, entry, true);
    }

    protected void processEntry(long ledgerId, long entryId, ByteBuf entry, boolean rollLog) throws IOException {
        /*
         * Touch dirty flag
         */
        somethingWritten.set(true);

        /*
         * Log the entry
         */
        long pos = entryLogger.addEntry(ledgerId, entry, rollLog);

        /*
         * Set offset of entry id to be the current ledger position
         */
        ledgerCache.putEntryOffset(ledgerId, entryId, pos);
    }

    @Override
    public void onRotateEntryLog() {
        // For interleaved ledger storage, we request a checkpoint when rotating an
        // entry log file. The checkpoint represents the point up to which all added
        // entries are already in ledger storage and ready to be synced to disk.
        // TODO: we could consider removing checkpointSource and checkpointSource#newCheckpoint
        // later if we provide some kind of LSN (Log/Journal Sequence Number)
        // mechanism when adding entries. {@link https://github.com/apache/bookkeeper/issues/279}
        Checkpoint checkpoint = checkpointSource.newCheckpoint();
        checkpointer.startCheckpoint(checkpoint);
    }

    /**
     * Return an iterable over the index entries for a ledger.
     * @param ledgerId ledger to scan
     * @return iterable over the ledger's index pages
     * @throws IOException if the index cannot be read
     */
    public LedgerCache.PageEntriesIterable getIndexEntries(long ledgerId) throws IOException {
        return ledgerCache.listEntries(ledgerId);
    }

    /**
     * Read implementation metadata for a ledger's index file.
     * @param ledgerId ledger whose index metadata to read
     * @return implementation metadata
     * @throws IOException if the index file cannot be read
     */
    public LedgerCache.LedgerIndexMetadata readLedgerIndexMetadata(long ledgerId) throws IOException {
        return ledgerCache.readLedgerIndexMetadata(ledgerId);
    }

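    // Scan every index page of each active ledger and verify that every indexed
    // entry location still points at a readable entry in the entry log.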
    @Override
    public List<DetectedInconsistency> localConsistencyCheck(Optional<RateLimiter> rateLimiter) throws IOException {
        long checkStart = MathUtils.nowInNano();
        LOG.info("Starting localConsistencyCheck");
        long checkedLedgers = 0;
        long checkedPages = 0;
        final MutableLong checkedEntries = new MutableLong(0);
        final MutableLong pageRetries = new MutableLong(0);
        NavigableMap<Long, Boolean> bkActiveLedgersSnapshot = activeLedgers.snapshot();
        final List<DetectedInconsistency> errors = new ArrayList<>();
        for (Long ledger : bkActiveLedgersSnapshot.keySet()) {
            try (LedgerCache.PageEntriesIterable pages = ledgerCache.listEntries(ledger)) {
                for (LedgerCache.PageEntries page : pages) {
                    @Cleanup
                    LedgerEntryPage lep = page.getLEP();
                    MutableBoolean retry = new MutableBoolean(false);
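                    // Re-scan the page if it was concurrently modified while we
                    // were checking it (detected via the page version).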
                    do {
                        retry.setValue(false);
                        int version = lep.getVersion();

                        MutableBoolean success = new MutableBoolean(true);
                        long start = MathUtils.nowInNano();
                        lep.getEntries((entry, offset) -> {
                            rateLimiter.ifPresent(RateLimiter::acquire);

                            try {
                                entryLogger.checkEntry(ledger, entry, offset);
                                checkedEntries.increment();
                            } catch (EntryLogger.EntryLookupException e) {
                                if (version != lep.getVersion()) {
                                    pageRetries.increment();
                                    if (lep.isDeleted()) {
                                        LOG.debug("localConsistencyCheck: ledger {} deleted", ledger);
                                    } else {
                                        LOG.debug("localConsistencyCheck: concurrent modification, retrying");
                                        retry.setValue(true);
                                        retryCounter.inc();
                                    }
                                    return false;
                                } else {
                                    errors.add(new DetectedInconsistency(ledger, entry, e));
                                    LOG.error("Got error: ", e);
                                }
                                success.setValue(false);
                            }
                            return true;
                        });

                        if (success.booleanValue()) {
                            pageScanStats.registerSuccessfulEvent(MathUtils.elapsedNanos(start),
                                    TimeUnit.NANOSECONDS);
                        } else {
                            pageScanStats.registerFailedEvent(MathUtils.elapsedNanos(start), TimeUnit.NANOSECONDS);
                        }
                    } while (retry.booleanValue());
                    checkedPages++;
                }
            } catch (NoLedgerException | FileInfo.FileInfoDeletedException e) {
                if (activeLedgers.containsKey(ledger)) {
                    LOG.error("Cannot find ledger {}, which should exist", ledger, e);
                    errors.add(new DetectedInconsistency(ledger, -1, e));
                } else {
                    LOG.debug("ledger {} deleted since snapshot taken", ledger);
                }
            } catch (Exception e) {
                throw new IOException("Got other exception in localConsistencyCheck", e);
            }
            checkedLedgers++;
        }
        LOG.info(
                "Finished localConsistencyCheck, took {}s to scan {} ledgers, {} pages, "
                        + "{} entries with {} retries, {} errors",
                TimeUnit.NANOSECONDS.toSeconds(MathUtils.elapsedNanos(checkStart)), checkedLedgers, checkedPages,
                checkedEntries.longValue(), pageRetries.longValue(), errors.size());

        return errors;
    }

    @Override
    public List<GarbageCollectionStatus> getGarbageCollectionStatus() {
        return Collections.singletonList(gcThread.getGarbageCollectionStatus());
    }
}
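
Example

The first three longs of the buffer passed to addEntry() are the ledger id, the entry id, and the piggybacked last-add-confirmed (LAC), as the getLong calls at the top of that method show. Below is a minimal, hypothetical sketch (not part of the BookKeeper sources) that builds such a buffer with Netty; the EntryBufferSketch class, the buildEntry helper, and its payload parameter are illustrative names.

import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;

public class EntryBufferSketch {
    // Hypothetical helper: packs the 24-byte header that addEntry() reads
    // (ledger id, entry id, LAC), followed by the entry payload.
    static ByteBuf buildEntry(long ledgerId, long entryId, long lac, byte[] payload) {
        ByteBuf entry = Unpooled.buffer(3 * Long.BYTES + payload.length);
        entry.writeLong(ledgerId); // read back at readerIndex() + 0
        entry.writeLong(entryId);  // read back at readerIndex() + 8
        entry.writeLong(lac);      // read back at readerIndex() + 16
        entry.writeBytes(payload);
        return entry;
    }
}

A buffer built this way can be handed to addEntry(), which logs it through the EntryLogger and records the returned position in the LedgerCache index.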