org.apache.bookkeeper.client.PendingAddOp.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.bookkeeper.client.PendingAddOp.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.bookkeeper.client;

import static com.google.common.base.Preconditions.checkNotNull;
import static org.apache.bookkeeper.proto.BookieProtocol.FLAG_HIGH_PRIORITY;
import static org.apache.bookkeeper.proto.BookieProtocol.FLAG_NONE;
import static org.apache.bookkeeper.proto.BookieProtocol.FLAG_RECOVERY_ADD;

import com.google.common.collect.ImmutableMap;

import io.netty.buffer.ByteBuf;
import io.netty.util.Recycler;
import io.netty.util.Recycler.Handle;
import io.netty.util.ReferenceCountUtil;
import java.util.EnumSet;

import java.util.List;
import java.util.Map;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import org.apache.bookkeeper.client.AsyncCallback.AddCallbackWithLatency;
import org.apache.bookkeeper.client.api.WriteFlag;
import org.apache.bookkeeper.net.BookieSocketAddress;
import org.apache.bookkeeper.proto.BookkeeperInternalCallbacks.WriteCallback;
import org.apache.bookkeeper.util.ByteBufList;
import org.apache.bookkeeper.util.MathUtils;
import org.apache.bookkeeper.util.SafeRunnable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This represents a pending add operation. When it has got success from all
 * bookies, it sees if its at the head of the pending adds queue, and if yes,
 * sends ack back to the application. If a bookie fails, a replacement is made
 * and placed at the same position in the ensemble. The pending adds are then
 * rereplicated.
 *
 *
 */
class PendingAddOp extends SafeRunnable implements WriteCallback {
    private static final Logger LOG = LoggerFactory.getLogger(PendingAddOp.class);

    ByteBuf payload;
    ByteBufList toSend;
    AddCallbackWithLatency cb;
    Object ctx;
    long entryId;
    int entryLength;

    DistributionSchedule.AckSet ackSet;
    boolean completed = false;

    LedgerHandle lh;
    ClientContext clientCtx;
    boolean isRecoveryAdd = false;
    long requestTimeNanos;
    long qwcLatency; // Quorum Write Completion Latency after response from quorum bookies.

    long currentLedgerLength;
    int pendingWriteRequests;
    boolean callbackTriggered;
    boolean hasRun;
    EnumSet<WriteFlag> writeFlags;
    boolean allowFailFast = false;
    List<BookieSocketAddress> ensemble;

    static PendingAddOp create(LedgerHandle lh, ClientContext clientCtx, List<BookieSocketAddress> ensemble,
            ByteBuf payload, EnumSet<WriteFlag> writeFlags, AddCallbackWithLatency cb, Object ctx) {
        PendingAddOp op = RECYCLER.get();
        op.lh = lh;
        op.clientCtx = clientCtx;
        op.isRecoveryAdd = false;
        op.cb = cb;
        op.ctx = ctx;
        op.entryId = LedgerHandle.INVALID_ENTRY_ID;
        op.currentLedgerLength = -1;
        op.payload = payload;
        op.entryLength = payload.readableBytes();

        op.completed = false;
        op.ensemble = ensemble;
        op.ackSet = lh.getDistributionSchedule().getAckSet();
        op.pendingWriteRequests = 0;
        op.callbackTriggered = false;
        op.hasRun = false;
        op.requestTimeNanos = Long.MAX_VALUE;
        op.allowFailFast = false;
        op.qwcLatency = 0;
        op.writeFlags = writeFlags;

        return op;
    }

    /**
     * Enable the recovery add flag for this operation.
     * @see LedgerHandle#asyncRecoveryAddEntry
     */
    PendingAddOp enableRecoveryAdd() {
        isRecoveryAdd = true;
        return this;
    }

    PendingAddOp allowFailFastOnUnwritableChannel() {
        allowFailFast = true;
        return this;
    }

    void setEntryId(long entryId) {
        this.entryId = entryId;
    }

    void setLedgerLength(long ledgerLength) {
        this.currentLedgerLength = ledgerLength;
    }

    long getEntryId() {
        return this.entryId;
    }

    void sendWriteRequest(List<BookieSocketAddress> ensemble, int bookieIndex) {
        int flags = isRecoveryAdd ? FLAG_RECOVERY_ADD | FLAG_HIGH_PRIORITY : FLAG_NONE;

        clientCtx.getBookieClient().addEntry(ensemble.get(bookieIndex), lh.ledgerId, lh.ledgerKey, entryId, toSend,
                this, bookieIndex, flags, allowFailFast, lh.writeFlags);
        ++pendingWriteRequests;
    }

    boolean maybeTimeout() {
        if (MathUtils.elapsedNanos(requestTimeNanos) >= clientCtx.getConf().addEntryQuorumTimeoutNanos) {
            timeoutQuorumWait();
            return true;
        }
        return false;
    }

    void timeoutQuorumWait() {
        try {
            clientCtx.getMainWorkerPool().executeOrdered(lh.ledgerId, new SafeRunnable() {
                @Override
                public void safeRun() {
                    if (completed) {
                        return;
                    }
                    lh.handleUnrecoverableErrorDuringAdd(BKException.Code.AddEntryQuorumTimeoutException);
                }

                @Override
                public String toString() {
                    return String.format("AddEntryQuorumTimeout(lid=%d, eid=%d)", lh.ledgerId, entryId);
                }
            });
        } catch (RejectedExecutionException e) {
            LOG.warn("Timeout add entry quorum wait failed {} entry: {}", lh.ledgerId, entryId);
        }
    }

    void unsetSuccessAndSendWriteRequest(List<BookieSocketAddress> ensemble, int bookieIndex) {
        // update the ensemble
        this.ensemble = ensemble;

        if (toSend == null) {
            // this addOp hasn't yet had its mac computed. When the mac is
            // computed, its write requests will be sent, so no need to send it
            // now
            return;
        }
        // Suppose that unset doesn't happen on the write set of an entry. In this
        // case we don't need to resend the write request upon an ensemble change.
        // We do need to invoke #sendAddSuccessCallbacks() for such entries because
        // they may have already completed, but they are just waiting for the ensemble
        // to change.
        // E.g.
        // ensemble (A, B, C, D), entry k is written to (A, B, D). An ensemble change
        // happens to replace C with E. Entry k does not complete until C is
        // replaced with E successfully. When the ensemble change completes, it tries
        // to unset entry k. C however is not in k's write set, so no entry is written
        // again, and no one triggers #sendAddSuccessCallbacks. Consequently, k never
        // completes.
        //
        // We call sendAddSuccessCallback when unsetting t cover this case.
        DistributionSchedule.WriteSet writeSet = lh.distributionSchedule.getWriteSet(entryId);
        try {
            if (!writeSet.contains(bookieIndex)) {
                lh.sendAddSuccessCallbacks();
                return;
            }
        } finally {
            writeSet.recycle();
        }

        if (callbackTriggered) {
            return;
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Unsetting success for ledger: " + lh.ledgerId + " entry: " + entryId + " bookie index: "
                    + bookieIndex);
        }

        // if we had already heard a success from this array index, need to
        // increment our number of responses that are pending, since we are
        // going to unset this success
        if (!ackSet.removeBookieAndCheck(bookieIndex)) {
            // unset completed if this results in loss of ack quorum
            completed = false;
        }

        sendWriteRequest(ensemble, bookieIndex);
    }

    /**
     * Initiate the add operation.
     */
    public void safeRun() {
        hasRun = true;
        if (callbackTriggered) {
            // this should only be true if the request was failed due
            // to another request ahead in the pending queue,
            // so we can just ignore this request
            maybeRecycle();
            return;
        }

        this.requestTimeNanos = MathUtils.nowInNano();
        checkNotNull(lh);
        checkNotNull(lh.macManager);

        this.toSend = lh.macManager.computeDigestAndPackageForSending(entryId, lh.lastAddConfirmed,
                currentLedgerLength, payload);
        // ownership of RefCounted ByteBuf was passed to computeDigestAndPackageForSending
        payload = null;

        // We are about to send. Check if we need to make an ensemble change
        // becasue of delayed write errors
        lh.maybeHandleDelayedWriteBookieFailure();

        // Iterate over set and trigger the sendWriteRequests
        DistributionSchedule.WriteSet writeSet = lh.distributionSchedule.getWriteSet(entryId);

        try {
            for (int i = 0; i < writeSet.size(); i++) {
                sendWriteRequest(ensemble, writeSet.get(i));
            }
        } finally {
            writeSet.recycle();
        }
    }

    @Override
    public void writeComplete(int rc, long ledgerId, long entryId, BookieSocketAddress addr, Object ctx) {
        int bookieIndex = (Integer) ctx;
        --pendingWriteRequests;

        if (!ensemble.get(bookieIndex).equals(addr)) {
            // ensemble has already changed, failure of this addr is immaterial
            if (LOG.isDebugEnabled()) {
                LOG.debug(
                        "Write did not succeed: " + ledgerId + ", " + entryId + ". But we have already fixed it.");
            }
            return;
        }

        // must record all acks, even if complete (completion can be undone by an ensemble change)
        boolean ackQuorum = false;
        if (BKException.Code.OK == rc) {
            ackQuorum = ackSet.completeBookieAndCheck(bookieIndex);
        }

        if (completed) {
            if (rc != BKException.Code.OK) {
                // Got an error after satisfying AQ. This means we are under replicated at the create itself.
                // Update the stat to reflect it.
                clientCtx.getClientStats().getAddOpUrCounter().inc();
                if (!clientCtx.getConf().disableEnsembleChangeFeature.isAvailable()
                        && !clientCtx.getConf().delayEnsembleChange) {
                    lh.notifyWriteFailed(bookieIndex, addr);
                }
            }
            // even the add operation is completed, but because we don't reset completed flag back to false when
            // #unsetSuccessAndSendWriteRequest doesn't break ack quorum constraint. we still have current pending
            // add op is completed but never callback. so do a check here to complete again.
            //
            // E.g. entry x is going to complete.
            //
            // 1) entry x + k hits a failure. lh.handleBookieFailure increases blockAddCompletions to 1, for ensemble
            //    change
            // 2) entry x receives all responses, sets completed to true but fails to send success callback because
            //    blockAddCompletions is 1
            // 3) ensemble change completed. lh unset success starting from x to x+k, but since the unset doesn't break
            //    ackSet constraint. #removeBookieAndCheck doesn't set completed back to false.
            // 4) so when the retry request on new bookie completes, it finds the pending op is already completed.
            //    we have to trigger #sendAddSuccessCallbacks
            //
            sendAddSuccessCallbacks();
            // I am already finished, ignore incoming responses.
            // otherwise, we might hit the following error handling logic, which might cause bad things.
            maybeRecycle();
            return;
        }

        switch (rc) {
        case BKException.Code.OK:
            // continue
            break;
        case BKException.Code.ClientClosedException:
            // bookie client is closed.
            lh.errorOutPendingAdds(rc);
            return;
        case BKException.Code.IllegalOpException:
            // illegal operation requested, like using unsupported feature in v2 protocol
            lh.handleUnrecoverableErrorDuringAdd(rc);
            return;
        case BKException.Code.LedgerFencedException:
            LOG.warn("Fencing exception on write: L{} E{} on {}", ledgerId, entryId, addr);
            lh.handleUnrecoverableErrorDuringAdd(rc);
            return;
        case BKException.Code.UnauthorizedAccessException:
            LOG.warn("Unauthorized access exception on write: L{} E{} on {}", ledgerId, entryId, addr);
            lh.handleUnrecoverableErrorDuringAdd(rc);
            return;
        default:
            if (clientCtx.getConf().delayEnsembleChange) {
                if (ackSet.failBookieAndCheck(bookieIndex, addr)
                        || rc == BKException.Code.WriteOnReadOnlyBookieException) {
                    Map<Integer, BookieSocketAddress> failedBookies = ackSet.getFailedBookies();
                    LOG.warn("Failed to write entry ({}, {}) to bookies {}, handling failures.", ledgerId, entryId,
                            failedBookies);
                    // we can't meet ack quorum requirement, trigger ensemble change.
                    lh.handleBookieFailure(failedBookies);
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(
                                "Failed to write entry ({}, {}) to bookie ({}, {}),"
                                        + " but it didn't break ack quorum, delaying ensemble change : {}",
                                ledgerId, entryId, bookieIndex, addr, BKException.getMessage(rc));
                    }
                }
            } else {
                LOG.warn("Failed to write entry ({}, {}): {}", ledgerId, entryId, BKException.getMessage(rc));
                lh.handleBookieFailure(ImmutableMap.of(bookieIndex, addr));
            }
            return;
        }

        if (ackQuorum && !completed) {
            completed = true;
            this.qwcLatency = MathUtils.elapsedNanos(requestTimeNanos);

            sendAddSuccessCallbacks();
        }
    }

    void sendAddSuccessCallbacks() {
        lh.sendAddSuccessCallbacks();
    }

    void submitCallback(final int rc) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Submit callback (lid:{}, eid: {}). rc:{}", lh.getId(), entryId, rc);
        }

        long latencyNanos = MathUtils.elapsedNanos(requestTimeNanos);
        if (rc != BKException.Code.OK) {
            clientCtx.getClientStats().getAddOpLogger().registerFailedEvent(latencyNanos, TimeUnit.NANOSECONDS);
            LOG.error("Write of ledger entry to quorum failed: L{} E{}", lh.getId(), entryId);
        } else {
            clientCtx.getClientStats().getAddOpLogger().registerSuccessfulEvent(latencyNanos, TimeUnit.NANOSECONDS);
        }
        cb.addCompleteWithLatency(rc, lh, entryId, qwcLatency, ctx);
        callbackTriggered = true;

        maybeRecycle();
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("PendingAddOp(lid:").append(lh.ledgerId).append(", eid:").append(entryId).append(", completed:")
                .append(completed).append(")");
        return sb.toString();
    }

    @Override
    public int hashCode() {
        return (int) entryId;
    }

    @Override
    public boolean equals(Object o) {
        if (o instanceof PendingAddOp) {
            return (this.entryId == ((PendingAddOp) o).entryId);
        }
        return (this == o);
    }

    private final Handle<PendingAddOp> recyclerHandle;
    private static final Recycler<PendingAddOp> RECYCLER = new Recycler<PendingAddOp>() {
        protected PendingAddOp newObject(Recycler.Handle<PendingAddOp> handle) {
            return new PendingAddOp(handle);
        }
    };

    private PendingAddOp(Handle<PendingAddOp> recyclerHandle) {
        this.recyclerHandle = recyclerHandle;
    }

    private void maybeRecycle() {
        /**
         * We have opportunity to recycle two objects here.
         * PendingAddOp#toSend and LedgerHandle#pendingAddOp
         *
         * A. LedgerHandle#pendingAddOp: This can be released after all 3 conditions are met.
         *    - After the callback is run
         *    - After safeRun finished by the executor
         *    - Write responses are returned from all bookies
         *      as BookieClient Holds a reference from the point the addEntry requests are sent.
         *
         * B. ByteBuf can be released after 2 of the conditions are met.
         *    - After the callback is run as we will not retry the write after callback
         *    - After safeRun finished by the executor
         * BookieClient takes and releases on this buffer immediately after sending the data.
         *
         * The object can only be recycled after the above conditions are met
         * otherwise we could end up recycling twice and all
         * joy that goes along with that.
         */
        if (hasRun && callbackTriggered) {
            ReferenceCountUtil.release(toSend);
            toSend = null;
        }
        // only recycle a pending add op after it has been run.
        if (hasRun && toSend == null && pendingWriteRequests == 0) {
            recyclePendAddOpObject();
        }
    }

    private void recyclePendAddOpObject() {
        entryId = LedgerHandle.INVALID_ENTRY_ID;
        currentLedgerLength = -1;
        if (payload != null) {
            ReferenceCountUtil.release(payload);
            payload = null;
        }
        cb = null;
        ctx = null;
        ensemble = null;
        ackSet.recycle();
        ackSet = null;
        lh = null;
        clientCtx = null;
        isRecoveryAdd = false;
        completed = false;
        pendingWriteRequests = 0;
        callbackTriggered = false;
        hasRun = false;
        allowFailFast = false;
        writeFlags = null;

        recyclerHandle.recycle(this);
    }
}