io.debezium.connector.mongodb.Replicator.java Source code

Introduction

Here is the source code for io.debezium.connector.mongodb.Replicator.java, the component that replicates the contents of a MongoDB replica set by performing an initial sync and then tailing the oplog for changes.

Source

/*
 * Copyright Debezium Authors.
 *
 * Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
 */
package io.debezium.connector.mongodb;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;

import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.source.SourceRecord;
import org.bson.BsonTimestamp;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mongodb.CursorType;
import com.mongodb.MongoClient;
import com.mongodb.ServerAddress;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;

import io.debezium.annotation.ThreadSafe;
import io.debezium.connector.mongodb.RecordMakers.RecordsForCollection;
import io.debezium.function.BlockingConsumer;
import io.debezium.function.BufferedBlockingConsumer;
import io.debezium.util.Clock;
import io.debezium.util.Strings;

/**
 * A component that replicates the content of a replica set, starting with an initial sync or continuing to read the oplog where
 * it last left off.
 * 
 * <h2>Initial Sync</h2>
 * If no offsets have been recorded for this replica set, replication begins with an
 * <a href="https://docs.mongodb.com/manual/core/replica-set-sync/#initial-sync">MongoDB initial sync</a> of the replica set.
 * <p>
 * The logic used in this component to perform an initial sync is similar to that of the
 * <a href="https://github.com/mongodb/mongo/blob/master/src/mongo/db/repl/rs_initialsync.cpp">official initial sync</a>
 * functionality used by secondary MongoDB nodes, although our logic can be simpler since we are not storing state nor building
 * indexes:
 * <ol>
 * <li>Read the primary node's current oplog time. This is our <em>start time</em>.</li>
 * <li>Clone all of the databases and collections from the primary node, using multiple threads. This step is completed only
 * after <em>all</em> collections are successfully copied.</li>
 * <li>Start reading the primary node's oplog from <em>start time</em> and applying the changes.</li>
 * </ol>
 * <p>
 * It is important to understand that step 2 is not performing a consistent snapshot. That means that once we start copying a
 * collection, clients can make changes and we may or may not see those changes in our copy. However, this is not a problem
 * because the MongoDB replication process -- and our logic -- relies upon the fact that every change recorded in the MongoDB
 * oplog is <a href="https://docs.mongodb.com/manual/core/replica-set-oplog/">idempotent</a>. So, as long as we read the oplog
 * from the same point in time as (or earlier than) when we <em>started</em> our copy operation, and apply <em>all</em> of the changes
 * <em>in the same order</em>, then the state of all documents described by this connector will be the same.
 * 
 * <h2>Restart</h2>
 * If prior runs of the replicator have recorded offsets in the {@link ReplicationContext#source() source information}, then
 * when the replicator starts it will simply start reading the primary's oplog starting at the same point it last left off.
 * 
 * <h2>Handling problems</h2>
 * <p>
 * This replicator does each of its tasks using a connection to the primary. If the replicator is not able to establish a
 * connection to the primary (e.g., there is no primary, or the replicator cannot communicate with the primary), the replicator
 * will continue to try to establish a connection, using an exponential back-off strategy to prevent saturating the system.
 * After a {@link ReplicationContext#maxConnectionAttemptsForPrimary() configurable} number of failed attempts, the replicator
 * will fail by throwing a {@link ConnectException}.
 * 
 * @author Randall Hauch
 */
@ThreadSafe
public class Replicator {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final ReplicationContext context;
    private final ExecutorService copyThreads;
    private final ReplicaSet replicaSet;
    private final String rsName;
    private final AtomicBoolean running = new AtomicBoolean();
    private final SourceInfo source;
    private final RecordMakers recordMakers;
    private final BufferableRecorder bufferedRecorder;
    private final Predicate<CollectionId> collectionFilter;
    private final Clock clock;
    private ReplicationContext.MongoPrimary primaryClient;

    /**
     * @param context the replication context; may not be null
     * @param replicaSet the replica set to be replicated; may not be null
     * @param recorder the recorder for source records produced by this replicator; may not be null
     */
    public Replicator(ReplicationContext context, ReplicaSet replicaSet, BlockingConsumer<SourceRecord> recorder) {
        assert context != null;
        assert replicaSet != null;
        assert recorder != null;
        this.context = context;
        this.source = context.source();
        this.replicaSet = replicaSet;
        this.rsName = replicaSet.replicaSetName();
        this.copyThreads = Executors.newFixedThreadPool(context.maxNumberOfCopyThreads());
        this.bufferedRecorder = new BufferableRecorder(recorder);
        this.recordMakers = new RecordMakers(this.source, context.topicSelector(), this.bufferedRecorder);
        this.collectionFilter = this.context.collectionFilter();
        this.clock = this.context.clock();
    }

    /**
     * Stop the replication from running.
     * <p>
     * This method does nothing if the snapshot is not running
     */
    public void stop() {
        this.copyThreads.shutdownNow();
    }

    /**
     * Perform the replication logic. Only one invocation runs at a time; if the replicator is already running, calling this
     * method has no effect.
     */
    public void run() {
        if (this.running.compareAndSet(false, true)) {
            try {
                if (establishConnectionToPrimary()) {
                    if (isInitialSyncExpected()) {
                        recordCurrentOplogPosition();
                        if (!performInitialSync()) {
                            return;
                        }
                    }
                    readOplog();
                }
            } finally {
                this.running.set(false);
            }
        }
    }

    /**
     * Establish a connection to the primary.
     * 
     * @return {@code true} if a connection was established, or {@code false} otherwise
     */
    protected boolean establishConnectionToPrimary() {
        logger.info("Connecting to '{}'", replicaSet);
        primaryClient = context.primaryFor(replicaSet, (desc, error) -> {
            logger.error("Error while attempting to {}: {}", desc, error.getMessage(), error);
        });
        return primaryClient != null;
    }

    /**
     * Obtain the current position of the oplog, and record it in the source.
     */
    protected void recordCurrentOplogPosition() {
        primaryClient.execute("get oplog position", primary -> {
            MongoCollection<Document> oplog = primary.getDatabase("local").getCollection("oplog.rs");
            Document last = oplog.find().sort(new Document("$natural", -1)).limit(1).first(); // may be null
            source.offsetStructForEvent(replicaSet.replicaSetName(), last);
        });
    }

    /**
     * Determine if an initial sync should be performed. An initial sync is expected if the {@link #source} has no previously
     * recorded offsets for this replica set, or if {@link ReplicationContext#performSnapshotEvenIfNotNeeded() a snapshot should
     * always be performed}.
     * 
     * @return {@code true} if the initial sync should be performed, or {@code false} otherwise
     */
    protected boolean isInitialSyncExpected() {
        boolean performSnapshot = true;
        if (source.hasOffset(rsName)) {
            logger.info("Found existing offset for replica set '{}' at {}", rsName, source.lastOffset(rsName));
            performSnapshot = false;
            if (context.performSnapshotEvenIfNotNeeded()) {
                logger.info("Configured to performing initial sync of replica set '{}'", rsName);
                performSnapshot = true;
            } else {
                if (source.isInitialSyncOngoing(rsName)) {
                    // The last snapshot was not completed, so do it again ...
                    logger.info(
                            "The previous initial sync was incomplete for '{}', so initiating another initial sync",
                            rsName);
                    performSnapshot = true;
                } else {
                    // There is no ongoing initial sync, so look to see if our last recorded offset still exists in the oplog.
                    BsonTimestamp lastRecordedTs = source.lastOffsetTimestamp(rsName);
                    AtomicReference<BsonTimestamp> firstExistingTs = new AtomicReference<>();
                    primaryClient.execute("get oplog position", primary -> {
                        MongoCollection<Document> oplog = primary.getDatabase("local").getCollection("oplog.rs");
                        Document firstEvent = oplog.find().sort(new Document("$natural", 1)).limit(1).first(); // may be null
                        firstExistingTs.set(SourceInfo.extractEventTimestamp(firstEvent));
                    });
                    BsonTimestamp firstAvailableTs = firstExistingTs.get();
                    if (firstAvailableTs == null) {
                        logger.info("The oplog contains no entries, so performing initial sync of replica set '{}'",
                                rsName);
                        performSnapshot = true;
                    } else if (lastRecordedTs.compareTo(firstAvailableTs) < 0) {
                        // The last recorded timestamp is *before* the first existing oplog event, which means there is
                        // almost certainly some history lost since we last processed the oplog ...
                        logger.info(
                                "Initial sync is required since the oplog for replica set '{}' starts at {}, which is later than the timestamp of the last offset {}",
                                rsName, firstAvailableTs, lastRecordedTs);
                        performSnapshot = true;
                    } else {
                        // Otherwise we'll not perform an initial sync
                        logger.info(
                                "The oplog contains the last entry previously read for '{}', so no initial sync will be performed",
                                rsName);
                    }
                }
            }
        } else {
            logger.info("No existing offset found for replica set '{}', starting initial sync", rsName);
            performSnapshot = true;
        }
        return performSnapshot;
    }

    /**
     * Perform the initial sync of the collections in the replica set.
     * 
     * @return {@code true} if the initial sync was completed, or {@code false} if it was stopped for any reason
     */
    protected boolean performInitialSync() {
        logger.info("Beginning initial sync of '{}' at {}", rsName, source.lastOffset(rsName));
        source.startInitialSync(replicaSet.replicaSetName());

        // Set up our recorder to buffer the last record ...
        try {
            bufferedRecorder.startBuffering();
        } catch (InterruptedException e) {
            // Do nothing so that this thread is terminated ...
            logger.info("Interrupted while waiting to flush the buffer before starting an initial sync of '{}'",
                    rsName);
            return false;
        }

        // Get the current timestamp of this processor ...
        final long syncStart = clock.currentTimeInMillis();

        // We need to copy each collection, so put the collection IDs into a queue ...
        final List<CollectionId> collections = new ArrayList<>();
        primaryClient.collections().forEach(id -> {
            if (collectionFilter.test(id))
                collections.add(id);
        });
        final Queue<CollectionId> collectionsToCopy = new ConcurrentLinkedQueue<>(collections);
        final int numThreads = Math.min(collections.size(), context.maxNumberOfCopyThreads());
        final CountDownLatch latch = new CountDownLatch(numThreads);
        final AtomicBoolean aborted = new AtomicBoolean(false);
        final AtomicInteger replicatorThreadCounter = new AtomicInteger(0);
        final AtomicInteger numCollectionsCopied = new AtomicInteger();
        final AtomicLong numDocumentsCopied = new AtomicLong();

        // And start threads to pull collection IDs from the queue and perform the copies ...
        logger.info("Preparing to use {} thread(s) to sync {} collection(s): {}", numThreads, collections.size(),
                Strings.join(", ", collections));
        for (int i = 0; i != numThreads; ++i) {
            copyThreads.submit(() -> {
                context.configureLoggingContext(
                        replicaSet.replicaSetName() + "-sync" + replicatorThreadCounter.incrementAndGet());
                // Continue to pull a collection ID and copy the collection ...
                try {
                    CollectionId id = null;
                    while (!aborted.get() && (id = collectionsToCopy.poll()) != null) {
                        long start = clock.currentTimeInMillis();
                        logger.info("Starting initial sync of '{}'", id);
                        long numDocs = copyCollection(id, syncStart);
                        numCollectionsCopied.incrementAndGet();
                        numDocumentsCopied.addAndGet(numDocs);
                        long duration = clock.currentTimeInMillis() - start;
                        logger.info("Completing initial sync of {} documents from '{}' in {}", numDocs, id,
                                Strings.duration(duration));
                    }
                } catch (InterruptedException e) {
                    // Do nothing so that this thread is terminated ...
                    aborted.set(true);
                } finally {
                    latch.countDown();
                }
            });
        }

        // Wait for all of the threads to complete ...
        try {
            latch.await();
        } catch (InterruptedException e) {
            Thread.interrupted();
            aborted.set(true);
        }

        // Stopping the replicator does not interrupt *our* thread but does interrupt the copy threads.
        // Therefore, check the aborted state here ...
        long syncDuration = clock.currentTimeInMillis() - syncStart;
        if (aborted.get()) {
            int remaining = collections.size() - numCollectionsCopied.get();
            logger.info("Initial sync aborted after {} with {} of {} collections incomplete",
                    Strings.duration(syncDuration), remaining, collections.size());
            return false;
        }

        // We completed the initial sync, so record this in the source ...
        source.stopInitialSync(replicaSet.replicaSetName());
        try {
            // And immediately flush the last buffered source record with the updated offset ...
            bufferedRecorder.stopBuffering(source.lastOffset(rsName));
        } catch (InterruptedException e) {
            logger.info(
                    "Interrupted while waiting for last initial sync record from replica set '{}' to be recorded",
                    rsName);
            return false;
        }

        logger.info("Initial sync of {} collections with a total of {} documents completed in {}",
                collections.size(), numDocumentsCopied.get(), Strings.duration(syncDuration));
        return true;
    }

    /**
     * Copy the collection, sending to the recorder a record for each document.
     * 
     * @param collectionId the identifier of the collection to be copied; may not be null
     * @param timestamp the timestamp in milliseconds at which the copy operation was started
     * @return number of documents that were copied
     * @throws InterruptedException if the thread was interrupted while the copy operation was running
     */
    protected long copyCollection(CollectionId collectionId, long timestamp) throws InterruptedException {
        AtomicLong docCount = new AtomicLong();
        primaryClient.executeBlocking("sync '" + collectionId + "'", primary -> {
            docCount.set(copyCollection(primary, collectionId, timestamp));
        });
        return docCount.get();
    }

    /**
     * Copy the collection, sending to the recorder a record for each document.
     * 
     * @param primary the connection to the replica set's primary node; may not be null
     * @param collectionId the identifier of the collection to be copied; may not be null
     * @param timestamp the timestamp in milliseconds at which the copy operation was started
     * @return number of documents that were copied
     * @throws InterruptedException if the thread was interrupted while the copy operation was running
     */
    protected long copyCollection(MongoClient primary, CollectionId collectionId, long timestamp)
            throws InterruptedException {
        RecordsForCollection factory = recordMakers.forCollection(collectionId);
        MongoDatabase db = primary.getDatabase(collectionId.dbName());
        MongoCollection<Document> docCollection = db.getCollection(collectionId.name());
        long counter = 0;
        try (MongoCursor<Document> cursor = docCollection.find().iterator()) {
            while (cursor.hasNext()) {
                Document doc = cursor.next();
                logger.trace("Found existing doc in {}: {}", collectionId, doc);
                counter += factory.recordObject(collectionId, doc, timestamp);
            }
        }
        return counter;
    }

    /**
     * Repeatedly obtain a connection to the replica set's current primary and use that primary to read the oplog.
     * This method will continue to run even if there are errors or problems. The method will return when a sufficient
     * number of errors occur or if the replicator should stop reading the oplog. The latter occurs when a new primary
     * is elected (as identified by an oplog event), or if the current thread doing the reading is interrupted.
     */
    protected void readOplog() {
        primaryClient.execute("read from oplog on '" + replicaSet + "'", this::readOplog);
    }

    /**
     * Use the given primary to read the oplog.
     * 
     * @param primary the connection to the replica set's primary node; may not be null
     */
    protected void readOplog(MongoClient primary) {
        BsonTimestamp oplogStart = source.lastOffsetTimestamp(replicaSet.replicaSetName());
        logger.info("Reading oplog for '{}' primary {} starting at {}", replicaSet, primary.getAddress(),
                oplogStart);

        // Include none of the cluster-internal operations and only those events since the previous timestamp ...
        MongoCollection<Document> oplog = primary.getDatabase("local").getCollection("oplog.rs");
        Bson filter = Filters.and(Filters.gt("ts", oplogStart), // start just after our last position
                Filters.exists("fromMigrate", false)); // skip internal movements across shards
        FindIterable<Document> results = oplog.find(filter).sort(new Document("$natural", 1)) // force forwards collection scan
                .oplogReplay(true) // tells Mongo to not rely on indexes
                .noCursorTimeout(true) // don't timeout waiting for events
                .cursorType(CursorType.TailableAwait); // tail and await new data
        // Read as much of the oplog as we can ...
        ServerAddress primaryAddress = primary.getAddress();
        try (MongoCursor<Document> cursor = results.iterator()) {
            while (running.get() && cursor.hasNext()) {
                if (!handleOplogEvent(primaryAddress, cursor.next())) {
                    // Something happened, and we're supposed to stop reading
                    return;
                }
            }
        }
    }

    /**
     * Handle a single oplog event.
     * 
     * @param primaryAddress the address of the primary server from which the event was obtained; may not be null
     * @param event the oplog event; may not be null
     * @return {@code true} if additional events should be processed, or {@code false} if the caller should stop
     *         processing events
     */
    protected boolean handleOplogEvent(ServerAddress primaryAddress, Document event) {
        logger.debug("Found event: {}", event);
        String ns = event.getString("ns");
        Document object = event.get("o", Document.class);
        if (object == null) {
            logger.warn("Missing 'o' field in event, so skipping {}", event.toJson());
            return true;
        }
        if (ns == null || ns.isEmpty()) {
            // These are replica set events ...
            String msg = object.getString("msg");
            if ("new primary".equals(msg)) {
                AtomicReference<ServerAddress> address = new AtomicReference<>();
                try {
                    primaryClient.executeBlocking("conn", mongoClient -> {
                        ServerAddress currentPrimary = mongoClient.getAddress();
                        address.set(currentPrimary);
                    });
                } catch (InterruptedException e) {
                    logger.error("Get current primary executeBlocking", e);
                }
                ServerAddress serverAddress = address.get();
                if (serverAddress != null && !serverAddress.equals(primaryAddress)) {
                    logger.info(
                            "Found new primary event in oplog, so stopping use of {} to continue with new primary",
                            primaryAddress);
                    // There is a new primary, so stop using this server and instead use the new primary ...
                    return false;
                } else {
                    logger.info("Found new primary event in oplog, current {} is new primary. "
                            + "Continue to process oplog event.", primaryAddress);
                }
            }
            // Otherwise, ignore this event ...
            logger.debug("Skipping event with no namespace: {}", event.toJson());
            return true;
        }
        int delimIndex = ns.indexOf('.');
        if (delimIndex > 0) {
            assert (delimIndex + 1) < ns.length();
            String dbName = ns.substring(0, delimIndex);
            String collectionName = ns.substring(delimIndex + 1);
            if ("$cmd".equals(collectionName)) {
                // This is a command on a database ...
                // TODO: Probably want to handle some of these when we track creation/removal of collections
                logger.debug("Skipping database command event: {}", event.toJson());
                return true;
            }
            // Otherwise, it is an event on a document in a collection ...
            CollectionId collectionId = new CollectionId(rsName, dbName, collectionName);
            if (collectionFilter.test(collectionId)) {
                RecordsForCollection factory = recordMakers.forCollection(collectionId);
                try {
                    factory.recordEvent(event, clock.currentTimeInMillis());
                } catch (InterruptedException e) {
                    Thread.interrupted();
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * A {@link BlockingConsumer BlockingConsumer<SourceRecord>} implementation that will buffer the last source record
     * only during an initial sync, so that when the initial sync is complete the last record's offset can be updated
     * to reflect the completion of the initial sync before it is flushed.
     */
    protected final class BufferableRecorder implements BlockingConsumer<SourceRecord> {
        private final BlockingConsumer<SourceRecord> actual;
        private final BufferedBlockingConsumer<SourceRecord> buffered;
        private volatile BlockingConsumer<SourceRecord> current;

        public BufferableRecorder(BlockingConsumer<SourceRecord> actual) {
            this.actual = actual;
            this.buffered = BufferedBlockingConsumer.bufferLast(actual);
            this.current = this.actual;
        }

        /**
         * Start buffering the most recent source record so that it can be updated before the {@link #stopBuffering(Map) initial
         * sync is completed}.
         * 
         * @throws InterruptedException if the thread is interrupted while waiting for any existing record to be flushed
         */
        protected synchronized void startBuffering() throws InterruptedException {
            this.buffered.flush();
            this.current = this.buffered;
        }

        /**
         * Stop buffering source records, and flush any buffered records by replacing their offset with the provided offset.
         * Note that this only replaces the record's {@link SourceRecord#sourceOffset() offset} and does not change the
         * value of the record, which may contain information about the snapshot.
         * 
         * @param newOffset the offset that reflects that the snapshot has been completed; may not be null
         * @throws InterruptedException if the thread is interrupted while waiting for the new record to be flushed
         */
        protected synchronized void stopBuffering(Map<String, ?> newOffset) throws InterruptedException {
            assert newOffset != null;
            this.buffered.flush(record -> {
                if (record == null)
                    return null;
                return new SourceRecord(record.sourcePartition(), newOffset, record.topic(),
                        record.kafkaPartition(), record.keySchema(), record.key(), record.valueSchema(),
                        record.value());
            });
            this.current = this.actual;
        }

        @Override
        public void accept(SourceRecord t) throws InterruptedException {
            this.current.accept(t);
        }
    }
}
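
Example usage

For orientation, here is a minimal, hypothetical sketch of how a Replicator could be wired up and driven. The ReplicationContext and ReplicaSet instances are assumed to be built elsewhere (in the real connector they come from the MongoDB connector task and its configuration), the blocking queue merely stands in for whatever consumes the produced SourceRecords, and the class and method names in the sketch are illustrative only.

package io.debezium.connector.mongodb;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.kafka.connect.source.SourceRecord;

public class ReplicatorUsageSketch {

    public static void runFor(ReplicationContext context, ReplicaSet replicaSet) {
        // Records produced by the replicator land on this queue; a Kafka Connect source task
        // would instead collect them and return them from its poll() method.
        BlockingQueue<SourceRecord> records = new LinkedBlockingQueue<>();
        Replicator replicator = new Replicator(context, replicaSet, records::put);

        // run() blocks until the oplog reader stops, so drive it from a separate thread ...
        ExecutorService executor = Executors.newSingleThreadExecutor();
        executor.execute(replicator::run);

        // ... consume records from the queue elsewhere, and eventually shut down:
        replicator.stop();
        executor.shutdown();
    }
}

Note that during an initial sync the replicator's internal BufferableRecorder holds back the most recent record so that its offset can be rewritten to mark the sync as complete before it is handed to the consumer.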
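
The readOplog(MongoClient) method above encapsulates the tailable-cursor query against the local.oplog.rs collection. The standalone sketch below exercises the same 3.x driver calls directly; the host, port, and starting BsonTimestamp are assumptions made purely for illustration, whereas the connector derives its starting timestamp from the last recorded offset.

import com.mongodb.CursorType;
import com.mongodb.MongoClient;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.model.Filters;

import org.bson.BsonTimestamp;
import org.bson.Document;

public class OplogTailSketch {

    public static void main(String[] args) {
        // Assumed connection details; the connector instead obtains a client for the replica set's primary.
        MongoClient primary = new MongoClient("localhost", 27017);
        try {
            // Assumed starting position; the connector reads this from its last recorded offset.
            BsonTimestamp lastSeen = new BsonTimestamp((int) (System.currentTimeMillis() / 1000), 0);
            FindIterable<Document> results = primary.getDatabase("local").getCollection("oplog.rs")
                    .find(Filters.and(Filters.gt("ts", lastSeen),             // resume just after the last position
                                      Filters.exists("fromMigrate", false)))  // skip internal shard migrations
                    .sort(new Document("$natural", 1))                        // forward natural order
                    .oplogReplay(true)                                        // let the server optimize the 'ts' scan
                    .noCursorTimeout(true)                                    // don't time out while waiting
                    .cursorType(CursorType.TailableAwait);                    // block awaiting new entries
            try (MongoCursor<Document> cursor = results.iterator()) {
                while (cursor.hasNext()) {
                    System.out.println(cursor.next().toJson());               // a real consumer would dispatch each event
                }
            }
        } finally {
            primary.close();
        }
    }
}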