/*
 * Copyright 2015-present Open Networking Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.atomix.protocols.raft.impl;

import com.google.common.collect.Maps;
import com.google.common.primitives.Longs;
import io.atomix.cluster.MemberId;
import io.atomix.primitive.PrimitiveId;
import io.atomix.primitive.PrimitiveType;
import io.atomix.primitive.service.PrimitiveService;
import io.atomix.primitive.service.ServiceConfig;
import io.atomix.primitive.session.SessionId;
import io.atomix.primitive.session.SessionMetadata;
import io.atomix.protocols.raft.RaftException;
import io.atomix.protocols.raft.RaftServer;
import io.atomix.protocols.raft.service.RaftServiceContext;
import io.atomix.protocols.raft.session.RaftSession;
import io.atomix.protocols.raft.storage.log.RaftLog;
import io.atomix.protocols.raft.storage.log.RaftLogReader;
import io.atomix.protocols.raft.storage.log.entry.CloseSessionEntry;
import io.atomix.protocols.raft.storage.log.entry.CommandEntry;
import io.atomix.protocols.raft.storage.log.entry.ConfigurationEntry;
import io.atomix.protocols.raft.storage.log.entry.InitializeEntry;
import io.atomix.protocols.raft.storage.log.entry.KeepAliveEntry;
import io.atomix.protocols.raft.storage.log.entry.MetadataEntry;
import io.atomix.protocols.raft.storage.log.entry.OpenSessionEntry;
import io.atomix.protocols.raft.storage.log.entry.QueryEntry;
import io.atomix.protocols.raft.storage.log.entry.RaftLogEntry;
import io.atomix.protocols.raft.storage.snapshot.Snapshot;
import io.atomix.protocols.raft.storage.snapshot.SnapshotReader;
import io.atomix.protocols.raft.storage.snapshot.SnapshotWriter;
import io.atomix.storage.StorageLevel;
import io.atomix.storage.journal.Indexed;
import io.atomix.utils.concurrent.ComposableFuture;
import io.atomix.utils.concurrent.Futures;
import io.atomix.utils.concurrent.OrderedFuture;
import io.atomix.utils.concurrent.ThreadContext;
import io.atomix.utils.concurrent.ThreadContextFactory;
import io.atomix.utils.config.ConfigurationException;
import io.atomix.utils.logging.ContextualLoggerFactory;
import io.atomix.utils.logging.LoggerContext;
import io.atomix.utils.serializer.Serializer;
import io.atomix.utils.time.WallClockTimestamp;
import org.slf4j.Logger;

import java.time.Duration;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;

import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Internal server state machine.
 * <p>
 * The internal state machine handles application of commands to the user provided {@link PrimitiveService} and keeps
 * track of internal state like sessions and the various indexes relevant to log compaction.
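 * <p>
 * Illustrative usage (hypothetical caller; not part of the original documentation):
 * <pre>{@code
 * // Apply all commits up to the commit index without awaiting results...
 * manager.applyAll(commitIndex);
 * // ...or apply a single index and await its result.
 * CompletableFuture<OperationResult> future = manager.apply(index);
 * }</pre>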
 */
public class RaftServiceManager implements AutoCloseable {
  private static final Duration SNAPSHOT_INTERVAL = Duration.ofSeconds(10);
  private static final Duration SNAPSHOT_COMPLETION_DELAY = Duration.ofSeconds(10);
  private static final Duration COMPACT_DELAY = Duration.ofSeconds(10);

  private static final int SEGMENT_BUFFER_FACTOR = 5;

  private final Logger logger;
  private final RaftContext raft;
  private final ThreadContext stateContext;
  private final ThreadContextFactory threadContextFactory;
  private final RaftLog log;
  private final RaftLogReader reader;
  private final Map<Long, CompletableFuture> futures = Maps.newHashMap();
  private volatile CompletableFuture<Void> compactFuture;
  private long lastEnqueued;
  private long lastCompacted;

  public RaftServiceManager(RaftContext raft, ThreadContext stateContext, ThreadContextFactory threadContextFactory) {
    this.raft = checkNotNull(raft, "raft cannot be null");
    this.log = raft.getLog();
    this.reader = log.openReader(1, RaftLogReader.Mode.COMMITS);
    this.stateContext = stateContext;
    this.threadContextFactory = threadContextFactory;
    this.logger = ContextualLoggerFactory.getLogger(getClass(), LoggerContext.builder(RaftServer.class)
        .addValue(raft.getName())
        .build());
    this.lastEnqueued = reader.getFirstIndex() - 1;
    scheduleSnapshots();
  }

  /**
   * Returns the service thread context.
   *
   * @return the service thread context
   */
  public ThreadContext executor() {
    return stateContext;
  }

  /**
   * Returns a boolean indicating whether the node is running out of disk space.
   */
  private boolean isRunningOutOfDiskSpace() {
    // If there's not enough space left to allocate a buffer of SEGMENT_BUFFER_FACTOR log segments
    return raft.getStorage().statistics().getUsableSpace() < raft.getStorage().maxLogSegmentSize() * SEGMENT_BUFFER_FACTOR
        // Or the free disk percentage has dropped below the free disk buffer percentage
        || raft.getStorage().statistics().getUsableSpace() / (double) raft.getStorage().statistics().getTotalSpace() < raft.getStorage().freeDiskBuffer();
  }

  /**
   * Returns a boolean indicating whether the node is running out of memory.
   */
  private boolean isRunningOutOfMemory() {
    StorageLevel level = raft.getStorage().storageLevel();
    if (level == StorageLevel.MEMORY || level == StorageLevel.MAPPED) {
      long freeMemory = raft.getStorage().statistics().getFreeMemory();
      long totalMemory = raft.getStorage().statistics().getTotalMemory();
      if (freeMemory > 0 && totalMemory > 0) {
        return freeMemory / (double) totalMemory < raft.getStorage().freeMemoryBuffer();
      }
    }
    return false;
  }

  /**
   * Schedules a snapshot iteration.
   */
  private void scheduleSnapshots() {
    raft.getThreadContext().schedule(SNAPSHOT_INTERVAL, () -> takeSnapshots(true, false));
  }

  /**
   * Compacts Raft logs.
   *
   * @return a future to be completed once logs have been compacted
   */
  public CompletableFuture<Void> compact() {
    return takeSnapshots(false, true);
  }

  /**
   * Takes a snapshot of all services and compacts logs if the server is not under high load or disk needs to be freed.
   */
  private CompletableFuture<Void> takeSnapshots(boolean rescheduleAfterCompletion, boolean force) {
    // If compaction is already in progress, return the existing future and reschedule if this is a scheduled compaction.
    if (compactFuture != null) {
      if (rescheduleAfterCompletion) {
        compactFuture.whenComplete((r, e) -> scheduleSnapshots());
      }
      return compactFuture;
    }

    long lastApplied = raft.getLastApplied();

    // Only take snapshots if segments can be removed from the log below the lastApplied index.
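    // Illustration (hypothetical values): if lastApplied is 5000 and a full segment ends at index
    // 4096, the log is compactable and getCompactableIndex(lastApplied) returns that segment
    // boundary, which must exceed lastCompacted for a new snapshot to be worthwhile.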
    if (raft.getLog().isCompactable(lastApplied) && raft.getLog().getCompactableIndex(lastApplied) > lastCompacted) {
      // Determine whether the node is running out of disk space.
      boolean runningOutOfDiskSpace = isRunningOutOfDiskSpace();

      // Determine whether the node is running out of memory.
      boolean runningOutOfMemory = isRunningOutOfMemory();

      // If compaction is not already being forced...
      if (!force
          // And the node isn't running out of memory (we need to free up memory if it is)...
          && !runningOutOfMemory
          // And dynamic compaction is enabled (we need to compact immediately if it's disabled)...
          && raft.getStorage().dynamicCompaction()
          // And the node isn't running out of disk space (we need to compact immediately if it is)...
          && !runningOutOfDiskSpace
          // And the server is under high load (we can skip compaction at this point)...
          && raft.getLoadMonitor().isUnderHighLoad()) {
        // We can skip taking a snapshot for now.
        logger.debug("Skipping compaction due to high load");
        if (rescheduleAfterCompletion) {
          scheduleSnapshots();
        }
        return CompletableFuture.completedFuture(null);
      }

      logger.debug("Snapshotting services");

      // Update the index at which the log was last compacted.
      this.lastCompacted = lastApplied;

      // We need to ensure that callbacks added to the compaction future are completed in the order in which they
      // were added in order to preserve the order of retries when appending to the log.
      compactFuture = new OrderedFuture<>();

      // Wait for snapshots in all state machines to be completed before compacting the log at the last applied index.
      takeSnapshots().whenComplete((snapshot, error) -> {
        if (error == null) {
          scheduleCompletion(snapshot.persist());
        }
      });

      // Reschedule snapshots after completion if necessary.
      if (rescheduleAfterCompletion) {
        compactFuture.whenComplete((r, e) -> scheduleSnapshots());
      }
      return compactFuture;
    }
    // Otherwise, if the log can't be compacted anyway, just reschedule snapshots.
    else {
      if (rescheduleAfterCompletion) {
        scheduleSnapshots();
      }
      return CompletableFuture.completedFuture(null);
    }
  }

  /**
   * Takes and persists snapshots of provided services.
   *
   * @return future to be completed once all snapshots have been completed
   */
  private CompletableFuture<Snapshot> takeSnapshots() {
    ComposableFuture<Snapshot> future = new ComposableFuture<>();
    stateContext.execute(() -> {
      try {
        future.complete(snapshot());
      } catch (Exception e) {
        future.completeExceptionally(e);
      }
    });
    return future;
  }

  /**
   * Schedules a completion check for the snapshot at the given index.
   *
   * @param snapshot the snapshot to complete
   */
  private void scheduleCompletion(Snapshot snapshot) {
    stateContext.schedule(SNAPSHOT_COMPLETION_DELAY, () -> {
      if (completeSnapshot(snapshot.index())) {
        logger.debug("Completing snapshot {}", snapshot.index());
        snapshot.complete();
        // If the server is not under high load, or disk or memory must be freed, compact the logs immediately.
        if (!raft.getLoadMonitor().isUnderHighLoad() || isRunningOutOfDiskSpace() || isRunningOutOfMemory()) {
          compactLogs(snapshot.index());
        } else {
          scheduleCompaction(snapshot.index());
        }
      } else {
        scheduleCompletion(snapshot);
      }
    });
  }

  /**
   * Schedules a log compaction.
   *
   * @param lastApplied the last applied index at the start of snapshotting. This represents the highest index
   *     before which segments can be safely removed from disk
   */
  private void scheduleCompaction(long lastApplied) {
    // Schedule compaction after a delay to discourage snapshots on multiple nodes at the same time.
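    // For orientation: the overall flow is snapshot() -> persist() -> scheduleCompletion()
    // -> snapshot.complete() -> compactLogs(), with this delay inserted before the final step.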
logger.trace("Scheduling compaction in {}", COMPACT_DELAY); stateContext.schedule(COMPACT_DELAY, () -> compactLogs(lastApplied)); } /** * Compacts logs up to the given index. * * @param compactIndex the index to which to compact logs */ private void compactLogs(long compactIndex) { raft.getThreadContext().execute(() -> { logger.debug("Compacting logs up to index {}", compactIndex); try { raft.getLog().compact(compactIndex); } catch (Exception e) { logger.error("An exception occurred during log compaction: {}", e); } finally { this.compactFuture.complete(null); this.compactFuture = null; // Immediately attempt to take new snapshots since compaction is already run after a time interval. takeSnapshots(false, false); } }); } /** * Applies all commits up to the given index. * <p> * Calls to this method are assumed not to expect a result. This allows some optimizations to be made internally since * linearizable events don't have to be waited to complete the command. * * @param index The index up to which to apply commits. */ public void applyAll(long index) { enqueueBatch(index); } /** * Applies the entry at the given index to the state machine. * <p> * Calls to this method are assumed to expect a result. This means linearizable session events triggered by the * application of the command at the given index will be awaited before completing the returned future. * * @param index The index to apply. * @return A completable future to be completed once the commit has been applied. */ @SuppressWarnings("unchecked") public <T> CompletableFuture<T> apply(long index) { CompletableFuture<T> future = futures.computeIfAbsent(index, i -> new CompletableFuture<T>()); enqueueBatch(index); return future; } /** * Applies all entries up to the given index. * * @param index the index up to which to apply entries */ private void enqueueBatch(long index) { while (lastEnqueued < index) { enqueueIndex(++lastEnqueued); } } /** * Enqueues an index to be applied to the state machine. * * @param index the index to be applied to the state machine */ private void enqueueIndex(long index) { raft.getThreadContext().execute(() -> applyIndex(index)); } /** * Applies the next entry in the log up to the given index. * * @param index the index up to which to apply the entry */ @SuppressWarnings("unchecked") private void applyIndex(long index) { // Apply entries prior to this entry. if (reader.hasNext() && reader.getNextIndex() == index) { // Read the entry from the log. If the entry is non-null then apply it, otherwise // simply update the last applied index and return a null result. Indexed<RaftLogEntry> entry = reader.next(); try { if (entry.index() != index) { throw new IllegalStateException("inconsistent index applying entry " + index + ": " + entry); } CompletableFuture future = futures.remove(index); apply(entry).whenComplete((r, e) -> { raft.setLastApplied(index); if (future != null) { if (e == null) { future.complete(r); } else { future.completeExceptionally(e); } } }); } catch (Exception e) { logger.error("Failed to apply {}: {}", entry, e); } } else { CompletableFuture future = futures.remove(index); if (future != null) { logger.error("Cannot apply index " + index); future.completeExceptionally(new IndexOutOfBoundsException("Cannot apply index " + index)); } } } /** * Applies an entry to the state machine. * <p> * Calls to this method are assumed to expect a result. This means linearizable session events triggered by the * application of the given entry will be awaited before completing the returned future. 
   *
   * @param entry The entry to apply.
   * @return A completable future to be completed with the result.
   */
  @SuppressWarnings("unchecked")
  public <T> CompletableFuture<T> apply(Indexed<? extends RaftLogEntry> entry) {
    CompletableFuture<T> future = new CompletableFuture<>();
    stateContext.execute(() -> {
      logger.trace("Applying {}", entry);
      try {
        if (entry.type() == QueryEntry.class) {
          applyQuery(entry.cast()).whenComplete((r, e) -> {
            if (e != null) {
              future.completeExceptionally(e);
            } else {
              future.complete((T) r);
            }
          });
        } else {
          // Get the current snapshot. If the snapshot is for a higher index then skip this operation.
          // If the snapshot is for the prior index, install it.
          Snapshot snapshot = raft.getSnapshotStore().getCurrentSnapshot();
          if (snapshot != null) {
            if (snapshot.index() >= entry.index()) {
              future.complete(null);
              return;
            } else if (snapshot.index() == entry.index() - 1) {
              install(snapshot);
            }
          }

          if (entry.type() == CommandEntry.class) {
            future.complete((T) applyCommand(entry.cast()));
          } else if (entry.type() == OpenSessionEntry.class) {
            future.complete((T) (Long) applyOpenSession(entry.cast()));
          } else if (entry.type() == KeepAliveEntry.class) {
            future.complete((T) applyKeepAlive(entry.cast()));
          } else if (entry.type() == CloseSessionEntry.class) {
            applyCloseSession(entry.cast());
            future.complete(null);
          } else if (entry.type() == MetadataEntry.class) {
            future.complete((T) applyMetadata(entry.cast()));
          } else if (entry.type() == InitializeEntry.class) {
            future.complete((T) applyInitialize(entry.cast()));
          } else if (entry.type() == ConfigurationEntry.class) {
            future.complete((T) applyConfiguration(entry.cast()));
          } else {
            future.completeExceptionally(new RaftException.ProtocolException("Unknown entry type"));
          }
        }
      } catch (Exception e) {
        future.completeExceptionally(e);
      }
    });
    return future;
  }

  /**
   * Takes a snapshot of all registered services at the last applied index.
   */
  Snapshot snapshot() {
    Snapshot snapshot = raft.getSnapshotStore().newTemporarySnapshot(raft.getLastApplied(), new WallClockTimestamp());
    try (SnapshotWriter writer = snapshot.openWriter()) {
      for (RaftServiceContext service : raft.getServices()) {
        writer.buffer().mark();
        SnapshotWriter serviceWriter = new SnapshotWriter(writer.buffer().writeInt(0).slice(), writer.snapshot());
        snapshotService(serviceWriter, service);
        int length = serviceWriter.buffer().position();
        writer.buffer().reset().writeInt(length).skip(length);
      }
    } catch (Exception e) {
      snapshot.close();
      logger.error("Failed to snapshot services", e);
      throw e;
    }
    return snapshot;
  }

  /**
   * Takes a snapshot of the given service.
   *
   * @param writer the snapshot writer
   * @param service the service to snapshot
   */
  private void snapshotService(SnapshotWriter writer, RaftServiceContext service) {
    writer.writeLong(service.serviceId().id());
    writer.writeString(service.serviceType().name());
    writer.writeString(service.serviceName());
    byte[] config = Serializer.using(service.serviceType().namespace()).encode(service.serviceConfig());
    writer.writeInt(config.length).writeBytes(config);
    try {
      service.takeSnapshot(writer);
    } catch (Exception e) {
      logger.error("Failed to take snapshot of service {}", service.serviceId(), e);
    }
  }

  /**
   * Installs the given snapshot, restoring the state of each service it contains.
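   * <p>
   * For reference, {@link #snapshot()} writes (and this method reads back) a sequence of
   * length-prefixed service frames:
   * <pre>
   * [int length][service frame][int length][service frame]...
   * </pre>
   * where each frame begins with the service id, type, name, and serialized configuration,
   * followed by the service's own snapshot data.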
   *
   * @param snapshot the snapshot to install
   */
  void install(Snapshot snapshot) {
    logger.debug("Installing snapshot {}", snapshot);
    try (SnapshotReader reader = snapshot.openReader()) {
      while (reader.hasRemaining()) {
        try {
          int length = reader.readInt();
          if (length > 0) {
            SnapshotReader serviceReader = new SnapshotReader(reader.buffer().slice(length), reader.snapshot());
            installService(serviceReader);
            reader.skip(length);
          }
        } catch (Exception e) {
          logger.error("Failed to read snapshot", e);
        }
      }
    }
  }

  /**
   * Restores the service associated with the given snapshot.
   *
   * @param reader the snapshot reader
   */
  private void installService(SnapshotReader reader) {
    PrimitiveId primitiveId = PrimitiveId.from(reader.readLong());
    try {
      PrimitiveType primitiveType = raft.getPrimitiveTypes().getPrimitiveType(reader.readString());
      String serviceName = reader.readString();
      byte[] serviceConfig = reader.readBytes(reader.readInt());

      // Get or create the service associated with the snapshot.
      logger.debug("Installing service {} {}", primitiveId, serviceName);
      RaftServiceContext service = initializeService(primitiveId, primitiveType, serviceName, serviceConfig);
      if (service != null) {
        try {
          service.installSnapshot(reader);
        } catch (Exception e) {
          logger.error("Failed to install snapshot for service {}", serviceName, e);
        }
      }
    } catch (ConfigurationException e) {
      logger.error(e.getMessage(), e);
    }
  }

  /**
   * Determines whether to complete the snapshot at the given index.
   *
   * @param index the index of the snapshot to complete
   * @return whether to complete the snapshot at the given index
   */
  private boolean completeSnapshot(long index) {
    // Compute the lowest completed index for all sessions that belong to this state machine.
    long lastCompleted = index;
    for (RaftSession session : raft.getSessions().getSessions()) {
      lastCompleted = Math.min(lastCompleted, session.getLastCompleted());
    }
    return lastCompleted >= index;
  }

  /**
   * Applies an initialize entry.
   * <p>
   * Initialize entries are used only at the beginning of a new leader's term to force the commitment of entries from
   * prior terms, therefore no logic needs to take place.
   */
  private CompletableFuture<Void> applyInitialize(Indexed<InitializeEntry> entry) {
    for (RaftServiceContext service : raft.getServices()) {
      service.keepAliveSessions(entry.index(), entry.entry().timestamp());
    }
    return CompletableFuture.completedFuture(null);
  }

  /**
   * Applies a configuration entry to the internal state machine.
   * <p>
   * Configuration entries are applied to internal server state when written to the log. Thus, no significant logic
   * needs to take place in the handling of configuration entries. We simply release the previous configuration entry
   * since it was overwritten by a more recent committed configuration entry.
   */
  private CompletableFuture<Void> applyConfiguration(Indexed<ConfigurationEntry> entry) {
    for (RaftServiceContext service : raft.getServices()) {
      service.keepAliveSessions(entry.index(), entry.entry().timestamp());
    }
    return CompletableFuture.completedFuture(null);
  }

  /**
   * Applies a session keep alive entry to the state machine.
   * <p>
   * Keep alive entries are applied to the internal state machine to reset the timeout for a specific session. If the
   * session indicated by the KeepAliveEntry is still held in memory, we mark the session as trusted, indicating that
   * the client has committed a keep alive within the required timeout. Additionally, we check all other sessions for
   * expiration based on the timestamp provided by this KeepAliveEntry.
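   * <p>
   * Illustrative example (hypothetical values): a keep-alive carrying {@code sessionIds=[5]},
   * {@code commandSequenceNumbers=[10]}, and {@code eventIndexes=[100]} marks session 5 as trusted,
   * clears cached command output through sequence number 10, and discards cached events through
   * index 100.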
   * <p>
   * Note that sessions are never completely expired via this method. Leaders must explicitly commit a
   * {@link CloseSessionEntry} to expire a session.
   * <p>
   * When a KeepAliveEntry is committed to the internal state machine, two specific fields provided in the entry are
   * used to update server-side session state. The {@code commandSequence} indicates the highest command for which the
   * session has received a successful response in the proper sequence. By applying the {@code commandSequence} to the
   * server session, we clear command output held in memory up to that point. The {@code eventVersion} indicates the
   * index up to which the client has received event messages in sequence for the session. Applying the
   * {@code eventVersion} to the server-side session results in events up to that index being removed from memory as
   * they were acknowledged by the client. It's essential that both of these fields be applied via entries committed to
   * the Raft log to ensure they're applied on all servers in sequential order.
   * <p>
   * Keep alive entries are retained in the log until the next time the client sends a keep alive entry or until the
   * client's session is expired. This ensures that, for sessions with long timeouts, keep alive entries cannot be
   * cleaned from the log before they're replicated to some servers.
   */
  private long[] applyKeepAlive(Indexed<KeepAliveEntry> entry) {
    // Store the session/command/event sequence and event index instead of acquiring a reference to the entry.
    long[] sessionIds = entry.entry().sessionIds();
    long[] commandSequences = entry.entry().commandSequenceNumbers();
    long[] eventIndexes = entry.entry().eventIndexes();

    // Iterate through session identifiers and keep sessions alive.
    List<Long> successfulSessionIds = new ArrayList<>(sessionIds.length);
    Set<RaftServiceContext> services = new HashSet<>();
    for (int i = 0; i < sessionIds.length; i++) {
      long sessionId = sessionIds[i];
      long commandSequence = commandSequences[i];
      long eventIndex = eventIndexes[i];

      RaftSession session = raft.getSessions().getSession(sessionId);
      if (session != null) {
        if (session.getService().keepAlive(entry.index(), entry.entry().timestamp(), session, commandSequence, eventIndex)) {
          successfulSessionIds.add(sessionId);
          services.add(session.getService());
        }
      }
    }

    // Iterate through services and complete keep-alives, causing sessions to be expired if necessary.
    for (RaftServiceContext service : services) {
      service.completeKeepAlive(entry.index(), entry.entry().timestamp());
    }

    expireOrphanSessions(entry.entry().timestamp());

    return Longs.toArray(successfulSessionIds);
  }

  /**
   * Expires sessions that have timed out.
   */
  private void expireOrphanSessions(long timestamp) {
    // Iterate through registered sessions.
    for (RaftSession session : raft.getSessions().getSessions()) {
      if (session.getService().deleted() && session.isTimedOut(timestamp)) {
        logger.debug("Orphaned session expired in {} milliseconds: {}", timestamp - session.getLastUpdated(), session);
        session = raft.getSessions().removeSession(session.sessionId());
        if (session != null) {
          session.expire();
        }
      }
    }
  }

  /**
   * Gets or initializes a service context.
   */
  private RaftServiceContext getOrInitializeService(PrimitiveId primitiveId, PrimitiveType primitiveType, String serviceName, byte[] config) {
    // Get the state machine executor or create one if it doesn't already exist.
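    // Note: initializeService (below) replaces any existing registration under the same name and
    // removes that registration's sessions.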
    RaftServiceContext service = raft.getServices().getService(serviceName);
    if (service == null) {
      service = initializeService(primitiveId, primitiveType, serviceName, config);
    }
    return service;
  }

  /**
   * Initializes a new service.
   */
  @SuppressWarnings("unchecked")
  private RaftServiceContext initializeService(PrimitiveId primitiveId, PrimitiveType primitiveType, String serviceName, byte[] config) {
    RaftServiceContext oldService = raft.getServices().getService(serviceName);
    ServiceConfig serviceConfig = config == null ? new ServiceConfig() : Serializer.using(primitiveType.namespace()).decode(config);
    RaftServiceContext service = new RaftServiceContext(
        primitiveId,
        serviceName,
        primitiveType,
        serviceConfig,
        primitiveType.newService(serviceConfig),
        raft,
        threadContextFactory);
    raft.getServices().registerService(service);

    // If a service with this name was already registered, remove all of its sessions.
    if (oldService != null) {
      raft.getSessions().removeSessions(oldService.serviceId());
    }
    return service;
  }

  /**
   * Applies an open session entry to the state machine.
   */
  private long applyOpenSession(Indexed<OpenSessionEntry> entry) {
    PrimitiveType primitiveType = raft.getPrimitiveTypes().getPrimitiveType(entry.entry().serviceType());

    // Get the state machine executor or create one if it doesn't already exist.
    RaftServiceContext service = getOrInitializeService(
        PrimitiveId.from(entry.index()),
        primitiveType,
        entry.entry().serviceName(),
        entry.entry().serviceConfig());

    if (service == null) {
      throw new RaftException.UnknownService("Unknown service type " + entry.entry().serviceType());
    }

    SessionId sessionId = SessionId.from(entry.index());
    RaftSession session = raft.getSessions().addSession(new RaftSession(
        sessionId,
        MemberId.from(entry.entry().memberId()),
        entry.entry().serviceName(),
        primitiveType,
        entry.entry().readConsistency(),
        entry.entry().minTimeout(),
        entry.entry().maxTimeout(),
        entry.entry().timestamp(),
        service.serializer(),
        service,
        raft,
        threadContextFactory));
    return service.openSession(entry.index(), entry.entry().timestamp(), session);
  }

  /**
   * Applies a close session entry to the state machine.
   */
  private void applyCloseSession(Indexed<CloseSessionEntry> entry) {
    RaftSession session = raft.getSessions().getSession(entry.entry().session());

    // If the server session is null, the session either never existed or already expired.
    if (session == null) {
      throw new RaftException.UnknownSession("Unknown session: " + entry.entry().session());
    }

    RaftServiceContext service = session.getService();
    service.closeSession(entry.index(), entry.entry().timestamp(), session, entry.entry().expired());

    // If this is a delete, unregister the service.
    if (entry.entry().delete()) {
      raft.getServices().unregisterService(service);
      service.close();
    }
  }

  /**
   * Applies a metadata entry to the state machine.
   */
  private MetadataResult applyMetadata(Indexed<MetadataEntry> entry) {
    // If the session ID is non-zero, read the metadata for the associated state machine.
    if (entry.entry().session() > 0) {
      RaftSession session = raft.getSessions().getSession(entry.entry().session());

      // If the session is null, return an UnknownSessionException.
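      // (Metadata for a non-zero session ID is scoped to sessions of the same primitive; see the
      // filter below.)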
      if (session == null) {
        logger.warn("Unknown session: {}", entry.entry().session());
        throw new RaftException.UnknownSession("Unknown session: " + entry.entry().session());
      }

      Set<SessionMetadata> sessions = new HashSet<>();
      for (RaftSession s : raft.getSessions().getSessions()) {
        if (s.primitiveName().equals(session.primitiveName())) {
          sessions.add(new SessionMetadata(s.sessionId().id(), s.primitiveName(), s.primitiveType().name()));
        }
      }
      return new MetadataResult(sessions);
    } else {
      Set<SessionMetadata> sessions = new HashSet<>();
      for (RaftSession session : raft.getSessions().getSessions()) {
        sessions.add(new SessionMetadata(session.sessionId().id(), session.primitiveName(), session.primitiveType().name()));
      }
      return new MetadataResult(sessions);
    }
  }

  /**
   * Applies a command entry to the state machine.
   * <p>
   * Command entries result in commands being executed on the user provided {@link PrimitiveService} and a response
   * being sent back to the client by completing the returned future. All command responses are cached in the command's
   * {@link RaftSession} for fault tolerance. In the event that the same command is applied to the state machine more
   * than once, the original response will be returned.
   * <p>
   * Command entries are written with a sequence number. The sequence number is used to ensure that commands are applied
   * to the state machine in sequential order. If a command entry has a sequence number that is less than the next
   * sequence number for the session, that indicates that it is a duplicate of a command that was already applied.
   * Otherwise, commands are assumed to have been received in sequential order, because leaders always sequence commands
   * as they're written to the log, so no sequence number will be skipped.
   */
  private OperationResult applyCommand(Indexed<CommandEntry> entry) {
    // First check to ensure that the session exists.
    RaftSession session = raft.getSessions().getSession(entry.entry().session());

    // If the session is null, return an UnknownSessionException. Commands applied to the state machine must
    // have a session. We ensure that session register/unregister entries are not compacted from the log
    // until all associated commands have been cleaned.
    // Note that it's possible for a session to be unknown if a later snapshot has been taken, so we don't want
    // to log warnings here.
    if (session == null) {
      logger.debug("Unknown session: {}", entry.entry().session());
      throw new RaftException.UnknownSession("unknown session: " + entry.entry().session());
    }

    // Increment the load counter to avoid snapshotting under high load.
    raft.getLoadMonitor().recordEvent();

    // Execute the command using the state machine associated with the session.
    return session.getService().executeCommand(
        entry.index(),
        entry.entry().sequenceNumber(),
        entry.entry().timestamp(),
        session,
        entry.entry().operation());
  }

  /**
   * Applies a query entry to the state machine.
   * <p>
   * Query entries are applied to the user {@link PrimitiveService} for read-only operations. Because queries are
   * read-only, they may only be applied on a single server in the cluster, and query entries do not go through the Raft
   * log. Thus, it is critical that measures be taken to ensure clients see a consistent view of the cluster even when
   * switching servers. To do so, clients provide a sequence and version number for each query. The sequence number is
   * the order in which the query was sent by the client. Sequence numbers are shared across both commands and queries.
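   * For example (illustrative), a client that has submitted commands with sequence numbers 1-3 and
   * then issues a query assigns the query sequence number 4.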
   * The version number indicates the last index for which the client saw a command or query response. In the event
   * that the lastApplied index of this state machine does not meet the provided version number, we wait for the state
   * machine to catch up before applying the query. This ensures clients see state progress monotonically even when
   * switching servers.
   * <p>
   * Because queries may only be applied on a single server in the cluster, they cannot result in the publishing of
   * session events. Events require commands to be written to the Raft log to ensure fault-tolerance and consistency
   * across the cluster.
   */
  private CompletableFuture<OperationResult> applyQuery(Indexed<QueryEntry> entry) {
    RaftSession session = raft.getSessions().getSession(entry.entry().session());

    // If the session is null then that indicates that the session already timed out or it never existed.
    // Return with an UnknownSessionException.
    if (session == null) {
      logger.warn("Unknown session: {}", entry.entry().session());
      return Futures.exceptionalFuture(new RaftException.UnknownSession("unknown session " + entry.entry().session()));
    }

    // Execute the query using the state machine associated with the session.
    return session.getService().executeQuery(
        entry.index(),
        entry.entry().sequenceNumber(),
        entry.entry().timestamp(),
        session,
        entry.entry().operation());
  }

  @Override
  public void close() {
    // Don't close the thread context here since state machines can be reused.
  }
}
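// Illustrative wiring sketch (hypothetical collaborators; not part of the original file):
//
//   RaftContext raft = ...;                        // server state and log access
//   ThreadContext stateContext = ...;              // single-threaded state machine context
//   ThreadContextFactory factory = ...;            // per-service thread contexts
//   RaftServiceManager manager = new RaftServiceManager(raft, stateContext, factory);
//
//   manager.applyAll(commitIndex);                 // apply committed entries without awaiting results
//   manager.<OperationResult>apply(index)          // or await the result of applying a specific index
//       .whenComplete((result, error) -> { /* ... */ });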