com.cloudant.sync.replication.PullStrategy.java Source code

Introduction

Here is the source code for com.cloudant.sync.replication.PullStrategy.java
Source

/**
 * Copyright (c) 2013 Cloudant, Inc. All rights reserved.
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */

package com.cloudant.sync.replication;

import com.cloudant.http.HttpConnectionRequestInterceptor;
import com.cloudant.http.HttpConnectionResponseInterceptor;
import com.cloudant.mazha.ChangesResult;
import com.cloudant.mazha.CouchClient;
import com.cloudant.mazha.DocumentRevs;
import com.cloudant.sync.datastore.Attachment;
import com.cloudant.sync.datastore.Datastore;
import com.cloudant.sync.datastore.DatastoreException;
import com.cloudant.sync.datastore.DatastoreImpl;
import com.cloudant.sync.datastore.DocumentException;
import com.cloudant.sync.datastore.DocumentRevsList;
import com.cloudant.sync.datastore.PreparedAttachment;
import com.cloudant.sync.datastore.UnsavedStreamAttachment;
import com.cloudant.sync.event.EventBus;
import com.cloudant.sync.util.JSONUtils;
import com.cloudant.sync.util.Misc;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;

import org.apache.commons.codec.binary.Hex;

import java.io.ByteArrayInputStream;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * @api_private
 */
class PullStrategy implements ReplicationStrategy {

    // internal state which gets reset each time run() is called
    private static class State {
        // Flag to stop the replication thread.
        // Volatile as might be set from another thread.
        private volatile boolean cancel = false;

        /**
         * Flag is set when the replication process is complete. The thread
         * may live on because the listener's callback is executed on the thread.
         */
        private volatile boolean replicationTerminated = false;

        int documentCounter = 0;

        int batchCounter = 0;
    }

    private State state;

    private static final Logger logger = Logger.getLogger(PullStrategy.class.getCanonicalName());

    private static final String LOG_TAG = "PullStrategy";

    CouchDB sourceDb;

    PullFilter filter;

    DatastoreWrapper targetDb;

    private final String name;

    private final EventBus eventBus = new EventBus();

    // Is _bulk_get endpoint supported?
    private boolean useBulkGet = false;

    public int changeLimitPerBatch = 1000;

    public int batchLimitPerRun = 100;

    public int insertBatchSize = 100;

    public boolean pullAttachmentsInline = false;

    public PullStrategy(URI source, Datastore target, PullFilter filter,
            List<HttpConnectionRequestInterceptor> requestInterceptors,
            List<HttpConnectionResponseInterceptor> responseInterceptors) {
        this.filter = filter;
        this.sourceDb = new CouchClientWrapper(new CouchClient(source, requestInterceptors, responseInterceptors));
        this.targetDb = new DatastoreWrapper((DatastoreImpl) target);
        String replicatorName;
        if (filter == null) {
            replicatorName = String.format("%s <-- %s ", target.getDatastoreName(), source);
        } else {
            replicatorName = String.format("%s <-- %s (%s)", target.getDatastoreName(), source, filter.getName());
        }
        this.name = String.format("%s [%s]", LOG_TAG, replicatorName);
    }

    @Override
    public boolean isReplicationTerminated() {
        if (this.state != null) {
            return state.replicationTerminated;
        } else {
            return false;
        }
    }

    @Override
    public void setCancel() {
        // if we have been cancelled before run(), we have to create the internal state
        if (this.state == null) {
            this.state = new State();
        }
        this.state.cancel = true;
    }

    @Override
    public int getDocumentCounter() {
        if (this.state != null) {
            return this.state.documentCounter;
        } else {
            return 0;
        }
    }

    @Override
    public int getBatchCounter() {
        if (this.state != null) {
            return this.state.batchCounter;
        } else {
            return 0;
        }
    }

    /**
     * Handle exceptions in separate run() method to allow replicate() to
     * just return when cancel is set to true rather than having to keep
     * track of whether we terminated via an error in replicate().
     */
    @Override
    public void run() {

        if (this.state != null && this.state.cancel) {
            // we were already cancelled, don't run, but still post completion
            this.state.documentCounter = 0;
            this.state.batchCounter = 0;
            runComplete(null);
            return;
        }
        // reset internal state
        this.state = new State();

        ErrorInfo errorInfo = null;

        try {
            this.useBulkGet = sourceDb.isBulkSupported();
            replicate();

        } catch (ExecutionException ex) {
            logger.log(Level.SEVERE, String.format("Batch %s ended with error:", this.state.batchCounter), ex);
            errorInfo = new ErrorInfo(ex.getCause());
        } catch (Throwable e) {
            logger.log(Level.SEVERE, String.format("Batch %s ended with error:", this.state.batchCounter), e);
            errorInfo = new ErrorInfo(e);
        }

        runComplete(errorInfo);
    }

    private void runComplete(ErrorInfo errorInfo) {
        state.replicationTerminated = true;

        String msg = "Pull replication terminated via ";
        msg += this.state.cancel ? "cancel." : "completion.";

        // notify complete/errored on eventbus
        logger.info(msg + " Posting on EventBus.");
        if (errorInfo == null) { // successful replication
            eventBus.post(new ReplicationStrategyCompleted(this));
        } else {
            eventBus.post(new ReplicationStrategyErrored(this, errorInfo));
        }
    }

    private void replicate() throws DatabaseNotFoundException, ExecutionException, InterruptedException,
            DocumentException, DatastoreException {
        logger.info("Pull replication started");
        long startTime = System.currentTimeMillis();

        // We were cancelled before we started
        if (this.state.cancel) {
            return;
        }

        if (!this.sourceDb.exists()) {
            throw new DatabaseNotFoundException("Database not found " + this.sourceDb.getIdentifier());
        }

        this.state.documentCounter = 0;
        for (this.state.batchCounter = 1; this.state.batchCounter < this.batchLimitPerRun; this.state.batchCounter++) {

            if (this.state.cancel) {
                return;
            }

            String msg = String.format("Batch %s started (completed %s changes so far)", this.state.batchCounter,
                    this.state.documentCounter);
            logger.info(msg);
            long batchStartTime = System.currentTimeMillis();

            ChangesResultWrapper changeFeeds = this.nextBatch();
            int batchChangesProcessed = 0;

            // So we can check whether all changes were processed during
            // a log analysis.
            msg = String.format("Batch %s contains %s changes", this.state.batchCounter, changeFeeds.size());
            logger.info(msg);

            if (changeFeeds.size() > 0) {
                batchChangesProcessed = processOneChangesBatch(changeFeeds);
                state.documentCounter += batchChangesProcessed;
            }

            long batchEndTime = System.currentTimeMillis();
            msg = String.format("Batch %s completed in %sms (batch was %s changes)", this.state.batchCounter,
                    batchEndTime - batchStartTime, batchChangesProcessed);
            logger.info(msg);

            // This logic depends on the changes in the feed rather than the
            // changes we actually processed.
            if (changeFeeds.size() < this.changeLimitPerBatch) {
                break;
            }
        }

        long endTime = System.currentTimeMillis();
        long deltaTime = endTime - startTime;
        String msg = String.format("Pull completed in %sms (%s total changes processed)", deltaTime,
                this.state.documentCounter);
        logger.info(msg);
    }

    public static class BatchItem {

        public BatchItem(DocumentRevsList revsList, HashMap<String[], List<PreparedAttachment>> attachments) {
            this.revsList = revsList;
            this.attachments = attachments;
        }

        public HashMap<String[], List<PreparedAttachment>> attachments;
        public DocumentRevsList revsList;
    }

    private int processOneChangesBatch(ChangesResultWrapper changeFeeds)
            throws ExecutionException, InterruptedException, DocumentException {
        String feed = String.format("Change feed: { last_seq: %s, change size: %s}", changeFeeds.getLastSeq(),
                changeFeeds.getResults().size());
        logger.info(feed);

        Multimap<String, String> openRevs = changeFeeds.openRevisions(0, changeFeeds.size());
        Map<String, Collection<String>> missingRevisions = this.targetDb.getDbCore().revsDiff(openRevs);

        int changesProcessed = 0;

        // Process the changes in batches
        List<String> ids = Lists.newArrayList(missingRevisions.keySet());
        List<List<String>> batches = Lists.partition(ids, this.insertBatchSize);

        for (List<String> batch : batches) {

            List<BatchItem> batchesToInsert = new ArrayList<BatchItem>();

            if (this.state.cancel) {
                break;
            }

            try {
                Iterable<DocumentRevsList> result = createTask(batch, missingRevisions);

                for (DocumentRevsList revsList : result) {
                    // We promise not to insert documents after cancel is set
                    if (this.state.cancel) {
                        break;
                    }

                    // attachments, keyed by docId and revId, so that
                    // we can add the attachments to the correct leaf
                    // nodes
                    HashMap<String[], List<PreparedAttachment>> atts = new HashMap<String[], List<PreparedAttachment>>();

                    // now put together a list of attachments we need to download
                    if (!this.pullAttachmentsInline) {
                        try {
                            for (DocumentRevs documentRevs : revsList) {
                                Map<String, Object> attachments = documentRevs.getAttachments();
                                // keep track of attachments we are going to prepare
                                ArrayList<PreparedAttachment> preparedAtts = new ArrayList<PreparedAttachment>();
                                atts.put(new String[] { documentRevs.getId(), documentRevs.getRev() },
                                        preparedAtts);

                                for (Map.Entry<String, Object> entry : attachments.entrySet()) {
                                    Map attachmentMetadata = (Map) entry.getValue();
                                    int revpos = (Integer) attachmentMetadata.get("revpos");
                                    String contentType = (String) attachmentMetadata.get("content_type");
                                    String encoding = (String) attachmentMetadata.get("encoding");
                                    long length = (Integer) attachmentMetadata.get("length");
                                    long encodedLength = 0; // encodedLength can default to 0 if
                                    // it's not encoded
                                    if (Attachment.getEncodingFromString(encoding) != Attachment.Encoding.Plain) {
                                        encodedLength = (Integer) attachmentMetadata.get("encoded_length");
                                    }

                                    // do we already have the attachment @ this revpos?
                                    // look back up the tree for this document and see:
                                    // if we already have it, then we don't need to fetch it
                                    DocumentRevs.Revisions revs = documentRevs.getRevisions();
                                    int offset = revs.getStart() - revpos;
                                    if (offset >= 0 && offset < revs.getIds().size()) {
                                        String revId = String.valueOf(revpos) + "-" + revs.getIds().get(offset);

                                        Attachment a = this.targetDb.getDbCore().getAttachment(documentRevs.getId(),
                                                revId, entry.getKey());
                                        if (a != null) {
                                            // skip attachment, already got it
                                            continue;
                                        }

                                    }
                                    UnsavedStreamAttachment usa = this.sourceDb.getAttachmentStream(
                                            documentRevs.getId(), documentRevs.getRev(), entry.getKey(),
                                            contentType, encoding);

                                    // by preparing the attachment here, it is downloaded outside
                                    // of the database transaction
                                    preparedAtts.add(this.targetDb.prepareAttachment(usa, length, encodedLength));
                                }
                            }
                        } catch (Exception e) {
                            logger.log(Level.SEVERE, "There was a problem downloading an attachment to the"
                                    + " datastore, terminating replication", e);
                            this.state.cancel = true;
                        }
                    }

                    if (this.state.cancel) {
                        break;
                    }

                    batchesToInsert.add(new BatchItem(revsList, atts));
                    changesProcessed++;
                }
                this.targetDb.bulkInsert(batchesToInsert, this.pullAttachmentsInline);
            } catch (Exception e) {
                throw new ExecutionException(e);
            }
        }

        if (!this.state.cancel) {
            try {
                this.targetDb.putCheckpoint(this.getReplicationId(), changeFeeds.getLastSeq());
            } catch (DatastoreException e) {
                logger.log(Level.WARNING,
                        "Failed to put checkpoint doc, next replication will " + "start from previous checkpoint",
                        e);
            }
        }

        return changesProcessed;
    }

    public String getReplicationId() throws DatastoreException {
        HashMap<String, String> dict = new HashMap<String, String>();
        dict.put("source", this.sourceDb.getIdentifier());
        dict.put("target", this.targetDb.getIdentifier());
        if (filter != null) {
            dict.put("filter", this.filter.toQueryString());
        }
        // get raw SHA-1 of dictionary
        byte[] sha1Bytes = Misc.getSha1(new ByteArrayInputStream(JSONUtils.serializeAsBytes(dict)));
        // return SHA-1 as a hex string
        byte[] sha1Hex = new Hex().encode(sha1Bytes);
        return new String(sha1Hex, Charset.forName("UTF-8"));
    }

    private ChangesResultWrapper nextBatch() throws DatastoreException {
        final Object lastCheckpoint = this.targetDb.getCheckpoint(this.getReplicationId());
        logger.fine("last checkpoint " + lastCheckpoint);
        ChangesResult changeFeeds = this.sourceDb.changes(this.filter, lastCheckpoint, this.changeLimitPerBatch);
        logger.finer("changes feed: " + JSONUtils.toPrettyJson(changeFeeds));
        return new ChangesResultWrapper(changeFeeds);
    }

    public Iterable<DocumentRevsList> createTask(List<String> ids, Map<String, Collection<String>> revisions) {

        List<BulkGetRequest> requests = new ArrayList<BulkGetRequest>();

        for (String id : ids) {
            //skip any document with an empty id
            if (id.isEmpty()) {
                logger.info("Found document with empty ID in change feed, skipping");
                continue;
            }
            // get list for atts_since (these are possible ancestors we have, it's ok to be eager

            // and get all revision IDs higher up in the tree even if they're not our
            // ancestors and
            // belong to a different subtree)
            HashSet<String> possibleAncestors = new HashSet<String>();
            for (String revId : revisions.get(id)) {
                List<String> thesePossibleAncestors = targetDb.getDbCore().getPossibleAncestorRevisionIDs(id, revId,
                        50);
                if (thesePossibleAncestors != null) {
                    possibleAncestors.addAll(thesePossibleAncestors);
                }
            }
            requests.add(new BulkGetRequest(id, new ArrayList<String>(revisions.get(id)),
                    new ArrayList<String>(possibleAncestors)));
        }

        if (useBulkGet) {
            return new GetRevisionTaskBulk(this.sourceDb, requests, this.pullAttachmentsInline);
        } else {
            return new GetRevisionTaskThreaded(this.sourceDb, requests, this.pullAttachmentsInline);
        }
    }

    @Override
    public EventBus getEventBus() {
        return eventBus;
    }
}