org.apache.distributedlog.BKLogSegmentWriter.java Source code

Introduction

Here is the source code for org.apache.distributedlog.BKLogSegmentWriter.java, the BookKeeper-based log segment writer used by Apache DistributedLog.
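
A minimal, hypothetical sketch of the writer's lifecycle may help orient the reading. BKLogSegmentWriter is package-private and is constructed and driven by DistributedLog's own write handler rather than by application code, so the snippet assumes you already hold a writer instance (here called writer) from inside the org.apache.distributedlog package; it only illustrates the write / flush-and-commit / close flow implemented by the code below.

// Illustrative only: in practice the library creates and owns the writer.
LogRecord record = new LogRecord(1L, "hello".getBytes(UTF_8)); // txid 1 with a small payload
CompletableFuture<DLSN> writeFuture = writer.asyncWrite(record); // buffers the record into the current record set
writer.flushAndCommit() // transmit buffered data, then a control record to advance the last add confirmed
      .thenCompose(lastAckedTxId -> writer.asyncClose()); // close once the commit is acknowledged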

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.distributedlog;

import static com.google.common.base.Charsets.UTF_8;
import static org.apache.distributedlog.DistributedLogConstants.INVALID_TXID;
import static org.apache.distributedlog.LogRecord.MAX_LOGRECORDSET_SIZE;
import static org.apache.distributedlog.LogRecord.MAX_LOGRECORD_SIZE;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Stopwatch;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import io.netty.buffer.ByteBuf;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Function;
import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
import org.apache.bookkeeper.client.AsyncCallback.CloseCallback;
import org.apache.bookkeeper.client.BKException;
import org.apache.bookkeeper.client.LedgerHandle;
import org.apache.bookkeeper.common.concurrent.FutureEventListener;
import org.apache.bookkeeper.common.concurrent.FutureUtils;
import org.apache.bookkeeper.common.util.OrderedScheduler;
import org.apache.bookkeeper.feature.Feature;
import org.apache.bookkeeper.feature.FeatureProvider;
import org.apache.bookkeeper.stats.AlertStatsLogger;
import org.apache.bookkeeper.stats.Counter;
import org.apache.bookkeeper.stats.Gauge;
import org.apache.bookkeeper.stats.OpStatsLogger;
import org.apache.bookkeeper.stats.StatsLogger;
import org.apache.bookkeeper.util.MathUtils;
import org.apache.commons.lang3.mutable.MutableObject;
import org.apache.distributedlog.Entry.Writer;
import org.apache.distributedlog.common.stats.OpStatsListener;
import org.apache.distributedlog.common.util.PermitLimiter;
import org.apache.distributedlog.common.util.Sizable;
import org.apache.distributedlog.config.DynamicDistributedLogConfiguration;
import org.apache.distributedlog.exceptions.BKTransmitException;
import org.apache.distributedlog.exceptions.EndOfStreamException;
import org.apache.distributedlog.exceptions.FlushException;
import org.apache.distributedlog.exceptions.InvalidEnvelopedEntryException;
import org.apache.distributedlog.exceptions.LockingException;
import org.apache.distributedlog.exceptions.LogRecordTooLongException;
import org.apache.distributedlog.exceptions.TransactionIdOutOfOrderException;
import org.apache.distributedlog.exceptions.WriteCancelledException;
import org.apache.distributedlog.exceptions.WriteException;
import org.apache.distributedlog.feature.CoreFeatureKeys;
import org.apache.distributedlog.injector.FailureInjector;
import org.apache.distributedlog.injector.RandomDelayFailureInjector;
import org.apache.distributedlog.io.CompressionCodec;
import org.apache.distributedlog.io.CompressionUtils;
import org.apache.distributedlog.lock.DistributedLock;
import org.apache.distributedlog.logsegment.LogSegmentEntryWriter;
import org.apache.distributedlog.logsegment.LogSegmentWriter;
import org.apache.distributedlog.util.FailpointUtils;
import org.apache.distributedlog.util.SimplePermitLimiter;
import org.apache.distributedlog.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * BookKeeper Based Log Segment Writer.
 * Multiple log records are packed into a single bookkeeper
 * entry before sending it over the network. Because each log record is
 * contained completely within a single bookkeeper entry, every bookkeeper
 * entry can be read as a self-contained edit log. This is useful for reading, as we don't
 * need to read through the entire log segment to get the last written entry.
 *
 * <h3>Metrics</h3>
 *
 * <ul>
 * <li> flush/periodic/{success,miss}: counters for periodic flushes.
 * <li> data/{success,miss}: counters for data transmits.
 * <li> transmit/packetsize: opstats. characteristics of packet size for transmits.
 * <li> control/success: counter of successful transmits of control records.
 * <li> seg_writer/write: opstats. latency characteristics of write operations in the segment writer.
 * <li> seg_writer/add_complete/{callback,queued,deferred}: opstats. latency components of add completions.
 * <li> seg_writer/pending: counter. the number of records pending in the segment writer.
 * <li> transmit/outstanding/requests: per stream gauge. the number of outstanding transmits for each stream.
 * </ul>
 */
class BKLogSegmentWriter implements LogSegmentWriter, AddCallback, Runnable, Sizable {
    static final Logger LOG = LoggerFactory.getLogger(BKLogSegmentWriter.class);

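    // A sentinel Writer swapped in for recordSetWriter once the segment starts closing:
    // every subsequent writeRecord fails fast instead of buffering records into a
    // segment that will never transmit them.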
    final Writer REJECT_WRITES_WRITER = new Writer() {
        @Override
        public void writeRecord(LogRecord record, CompletableFuture<DLSN> transmitPromise)
                throws LogRecordTooLongException, WriteException {
            throw new WriteException(getFullyQualifiedLogSegment(), "Write record is cancelled.");
        }

        @Override
        public boolean hasUserRecords() {
            return false;
        }

        @Override
        public int getNumRecords() {
            return 0;
        }

        @Override
        public int getNumBytes() {
            return 0;
        }

        @Override
        public long getMaxTxId() {
            return INVALID_TXID;
        }

        @Override
        public ByteBuf getBuffer() throws InvalidEnvelopedEntryException, IOException {
            throw new IOException("GetBuffer is not supported.");
        }

        @Override
        public DLSN finalizeTransmit(long lssn, long entryId) {
            return new DLSN(lssn, entryId, -1L);
        }

        @Override
        public void completeTransmit(long lssn, long entryId) {
            // no-op
        }

        @Override
        public void abortTransmit(Throwable reason) {
            // no-op
        }
    };

    private final String fullyQualifiedLogSegment;
    private final String streamName;
    private final int logSegmentMetadataVersion;
    private BKTransmitPacket packetPrevious;
    private Entry.Writer recordSetWriter;
    private static final AtomicIntegerFieldUpdater<BKLogSegmentWriter> outstandingTransmitsUpdater = AtomicIntegerFieldUpdater
            .newUpdater(BKLogSegmentWriter.class, "outstandingTransmits");
    private volatile int outstandingTransmits = 0;
    private final int transmissionThreshold;
    protected final LogSegmentEntryWriter entryWriter;
    private final CompressionCodec.Type compressionType;
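    // Serializes transmit() calls; kept separate from the object monitor so that
    // addComplete processing is not blocked behind an entire in-flight transmit.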
    private final ReentrantLock transmitLock = new ReentrantLock();
    private static final AtomicIntegerFieldUpdater<BKLogSegmentWriter> transmitResultUpdater = AtomicIntegerFieldUpdater
            .newUpdater(BKLogSegmentWriter.class, "transmitResult");
    private volatile int transmitResult = BKException.Code.OK;
    private final DistributedLock lock;
    private final boolean isDurableWriteEnabled;
    private DLSN lastDLSN = DLSN.InvalidDLSN;
    private final long startTxId;
    private long lastTxId = INVALID_TXID;
    private long lastTxIdAcknowledged = INVALID_TXID;
    private long outstandingBytes = 0;
    private long numFlushesSinceRestart = 0;
    private long numBytes = 0;
    private long lastEntryId = Long.MIN_VALUE;
    private long lastTransmitNanos = Long.MIN_VALUE;
    private final int periodicKeepAliveMs;

    // Indicates whether there are successfully transmitted writes that still need
    // a control record to be transmitted to make them visible to the readers by
    // advancing the last add confirmed.
    private volatile boolean controlFlushNeeded = false;
    private boolean immediateFlushEnabled = false;
    private int minDelayBetweenImmediateFlushMs = 0;
    private Stopwatch lastTransmit;
    private boolean streamEnded = false;
    private final ScheduledFuture<?> periodicFlushSchedule;
    private final ScheduledFuture<?> periodicKeepAliveSchedule;
    private static final AtomicReferenceFieldUpdater<BKLogSegmentWriter, ScheduledFuture> transmitSchedFutureRefUpdater = AtomicReferenceFieldUpdater
            .newUpdater(BKLogSegmentWriter.class, ScheduledFuture.class, "transmitSchedFutureRef");
    private volatile ScheduledFuture transmitSchedFutureRef = null;
    private static final AtomicReferenceFieldUpdater<BKLogSegmentWriter, ScheduledFuture> immFlushSchedFutureRefUpdater = AtomicReferenceFieldUpdater
            .newUpdater(BKLogSegmentWriter.class, ScheduledFuture.class, "immFlushSchedFutureRef");
    private volatile ScheduledFuture immFlushSchedFutureRef = null;
    private static final AtomicReferenceFieldUpdater<BKLogSegmentWriter, Exception> scheduledFlushExceptionUpdater = AtomicReferenceFieldUpdater
            .newUpdater(BKLogSegmentWriter.class, Exception.class, "scheduledFlushException");
    private volatile Exception scheduledFlushException = null;
    private boolean enforceLock = true;
    private CompletableFuture<Void> closeFuture = null;
    private final boolean enableRecordCounts;
    private int positionWithinLogSegment = 0;
    private final long logSegmentSequenceNumber;
    // Used only for values that *could* change (e.g. buffer size etc.)
    private final DistributedLogConfiguration conf;
    private final OrderedScheduler scheduler;

    // stats
    private final StatsLogger transmitOutstandingLogger;
    private final Counter transmitDataSuccesses;
    private final Counter transmitDataMisses;
    private final Gauge<Number> transmitOutstandingGauge;
    private final OpStatsLogger transmitDataPacketSize;
    private final Counter transmitControlSuccesses;
    private final Counter pFlushSuccesses;
    private final Counter pFlushMisses;
    private final OpStatsLogger writeTime;
    private final OpStatsLogger addCompleteTime;
    private final OpStatsLogger addCompleteQueuedTime;
    private final OpStatsLogger addCompleteDeferredTime;
    private final Counter pendingWrites;

    // Functions
    private final Function<Integer, CompletableFuture<Long>> GET_LAST_TXID_ACKNOWLEDGED_AFTER_TRANSMIT_FUNC = transmitRc -> {
        if (BKException.Code.OK == transmitRc) {
            return FutureUtils.value(getLastTxIdAcknowledged());
        } else {
            return FutureUtils.exception(new BKTransmitException("Failed to transmit entry", transmitRc));
        }
    };
    final Function<Long, CompletableFuture<Long>> COMMIT_AFTER_FLUSH_FUNC = lastAckedTxId -> commit();

    private final AlertStatsLogger alertStatsLogger;
    private final WriteLimiter writeLimiter;
    private final FailureInjector writeDelayInjector;

    /**
     * Construct an edit log output stream which writes to a ledger.
     */
    protected BKLogSegmentWriter(String streamName, String logSegmentName, DistributedLogConfiguration conf,
            int logSegmentMetadataVersion, LogSegmentEntryWriter entryWriter, DistributedLock lock,
            /* the lock needs to be acquired */
            long startTxId, long logSegmentSequenceNumber, OrderedScheduler scheduler, StatsLogger statsLogger,
            StatsLogger perLogStatsLogger, AlertStatsLogger alertStatsLogger, PermitLimiter globalWriteLimiter,
            FeatureProvider featureProvider, DynamicDistributedLogConfiguration dynConf) throws IOException {
        super();

        // set up a write limiter
        PermitLimiter streamWriteLimiter = null;
        if (conf.getPerWriterOutstandingWriteLimit() < 0) {
            streamWriteLimiter = PermitLimiter.NULL_PERMIT_LIMITER;
        } else {
            Feature disableWriteLimitFeature = featureProvider
                    .getFeature(CoreFeatureKeys.DISABLE_WRITE_LIMIT.name().toLowerCase());
            streamWriteLimiter = new SimplePermitLimiter(conf.getOutstandingWriteLimitDarkmode(),
                    conf.getPerWriterOutstandingWriteLimit(), statsLogger.scope("streamWriteLimiter"), false,
                    disableWriteLimitFeature);
        }
        this.writeLimiter = new WriteLimiter(streamName, streamWriteLimiter, globalWriteLimiter);
        this.alertStatsLogger = alertStatsLogger;

        StatsLogger flushStatsLogger = statsLogger.scope("flush");
        StatsLogger pFlushStatsLogger = flushStatsLogger.scope("periodic");
        pFlushSuccesses = pFlushStatsLogger.getCounter("success");
        pFlushMisses = pFlushStatsLogger.getCounter("miss");

        // transmit
        StatsLogger transmitDataStatsLogger = statsLogger.scope("data");
        transmitDataSuccesses = transmitDataStatsLogger.getCounter("success");
        transmitDataMisses = transmitDataStatsLogger.getCounter("miss");
        StatsLogger transmitStatsLogger = statsLogger.scope("transmit");
        transmitDataPacketSize = transmitStatsLogger.getOpStatsLogger("packetsize");
        StatsLogger transmitControlStatsLogger = statsLogger.scope("control");
        transmitControlSuccesses = transmitControlStatsLogger.getCounter("success");
        StatsLogger segWriterStatsLogger = statsLogger.scope("seg_writer");
        writeTime = segWriterStatsLogger.getOpStatsLogger("write");
        addCompleteTime = segWriterStatsLogger.scope("add_complete").getOpStatsLogger("callback");
        addCompleteQueuedTime = segWriterStatsLogger.scope("add_complete").getOpStatsLogger("queued");
        addCompleteDeferredTime = segWriterStatsLogger.scope("add_complete").getOpStatsLogger("deferred");
        pendingWrites = segWriterStatsLogger.getCounter("pending");

        // outstanding transmit requests
        transmitOutstandingLogger = perLogStatsLogger.scope("transmit").scope("outstanding");
        transmitOutstandingGauge = new Gauge<Number>() {
            @Override
            public Number getDefaultValue() {
                return 0;
            }

            @Override
            public Number getSample() {
                return outstandingTransmitsUpdater.get(BKLogSegmentWriter.this);
            }
        };
        transmitOutstandingLogger.registerGauge("requests", transmitOutstandingGauge);

        this.fullyQualifiedLogSegment = streamName + ":" + logSegmentName;
        this.streamName = streamName;
        this.logSegmentMetadataVersion = logSegmentMetadataVersion;
        this.entryWriter = entryWriter;
        this.lock = lock;
        this.lock.checkOwnershipAndReacquire();

        final int configuredTransmissionThreshold = dynConf.getOutputBufferSize();
        if (configuredTransmissionThreshold > MAX_LOGRECORDSET_SIZE) {
            LOG.warn("Setting output buffer size {} greater than max transmission size {} for log segment {}",
                    new Object[] { configuredTransmissionThreshold, MAX_LOGRECORDSET_SIZE,
                            fullyQualifiedLogSegment });
            this.transmissionThreshold = MAX_LOGRECORDSET_SIZE;
        } else {
            this.transmissionThreshold = configuredTransmissionThreshold;
        }
        this.compressionType = CompressionUtils.stringToType(conf.getCompressionType());

        this.logSegmentSequenceNumber = logSegmentSequenceNumber;
        this.recordSetWriter = Entry.newEntry(streamName, Math.max(transmissionThreshold, 1024),
                envelopeBeforeTransmit(), compressionType);
        this.packetPrevious = null;
        this.startTxId = startTxId;
        this.lastTxId = startTxId;
        this.lastTxIdAcknowledged = startTxId;
        this.enableRecordCounts = conf.getEnableRecordCounts();
        this.immediateFlushEnabled = conf.getImmediateFlushEnabled();
        this.isDurableWriteEnabled = dynConf.isDurableWriteEnabled();
        this.scheduler = scheduler;

        // Failure injection
        if (conf.getEIInjectWriteDelay()) {
            this.writeDelayInjector = new RandomDelayFailureInjector(dynConf);
        } else {
            this.writeDelayInjector = FailureInjector.NULL;
        }

        // If we are transmitting immediately (threshold == 0) and if immediate
        // flush is enabled, we don't need the periodic flush task
        final int configuredPeriodicFlushFrequency = dynConf.getPeriodicFlushFrequencyMilliSeconds();
        if (!immediateFlushEnabled || (0 != this.transmissionThreshold)) {
            int periodicFlushFrequency = configuredPeriodicFlushFrequency;
            if (periodicFlushFrequency > 0 && scheduler != null) {
                periodicFlushSchedule = scheduler.scheduleAtFixedRate(this, periodicFlushFrequency / 2,
                        periodicFlushFrequency / 2, TimeUnit.MILLISECONDS);
            } else {
                periodicFlushSchedule = null;
            }
        } else {
            // Min delay heuristic applies only when immediate flush is enabled
            // and transmission threshold is zero
            minDelayBetweenImmediateFlushMs = conf.getMinDelayBetweenImmediateFlushMs();
            periodicFlushSchedule = null;
        }
        this.periodicKeepAliveMs = conf.getPeriodicKeepAliveMilliSeconds();
        if (periodicKeepAliveMs > 0 && scheduler != null) {
            periodicKeepAliveSchedule = scheduler.scheduleAtFixedRate(new Runnable() {
                @Override
                public void run() {
                    keepAlive();
                }
            }, periodicKeepAliveMs, periodicKeepAliveMs, TimeUnit.MILLISECONDS);
        } else {
            periodicKeepAliveSchedule = null;
        }

        this.conf = conf;
        assert (!this.immediateFlushEnabled || (null != this.scheduler));
        this.lastTransmit = Stopwatch.createStarted();
    }

    String getFullyQualifiedLogSegment() {
        return fullyQualifiedLogSegment;
    }

    @VisibleForTesting
    DistributedLock getLock() {
        return this.lock;
    }

    @VisibleForTesting
    ScheduledExecutorService getFuturePool() {
        return scheduler.chooseThread(streamName);
    }

    @VisibleForTesting
    void setTransmitResult(int rc) {
        transmitResultUpdater.set(this, rc);
    }

    @VisibleForTesting
    protected final LogSegmentEntryWriter getEntryWriter() {
        return this.entryWriter;
    }

    @Override
    public long getLogSegmentId() {
        return this.entryWriter.getLogSegmentId();
    }

    protected final long getLogSegmentSequenceNumber() {
        return logSegmentSequenceNumber;
    }

    /**
     * Get the start tx id of the log segment.
     *
     * @return start tx id of the log segment.
     */
    protected final long getStartTxId() {
        return startTxId;
    }

    /**
     * Get the last tx id that has been written to the log segment buffer but not committed yet.
     *
     * @return last tx id that has been written to the log segment buffer but not committed yet.
     * @see #getLastTxIdAcknowledged()
     */
    synchronized long getLastTxId() {
        return lastTxId;
    }

    /**
     * Get the last tx id that has been acknowledged.
     *
     * @return last tx id that has been acknowledged.
     * @see #getLastTxId()
     */
    synchronized long getLastTxIdAcknowledged() {
        return lastTxIdAcknowledged;
    }

    /**
     * Get the position-within-logsegment of the last written log record.
     *
     * @return position-within-logsegment of the last written log record.
     */
    synchronized int getPositionWithinLogSegment() {
        return positionWithinLogSegment;
    }

    @VisibleForTesting
    long getLastEntryId() {
        return lastEntryId;
    }

    /**
     * Get the last dlsn of the last acknowledged record.
     *
     * @return last dlsn of the last acknowledged record.
     */
    synchronized DLSN getLastDLSN() {
        return lastDLSN;
    }

    @Override
    public long size() {
        return entryWriter.size();
    }

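    // Average bytes per user-data transmit since this writer started; used to
    // pre-size newly created record set buffers.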
    private synchronized int getAverageTransmitSize() {
        if (numFlushesSinceRestart > 0) {
            long ret = numBytes / numFlushesSinceRestart;

            if (ret < Integer.MIN_VALUE || ret > Integer.MAX_VALUE) {
                throw new IllegalArgumentException(ret + " transmit size should never exceed max transmit size");
            }
            return (int) ret;
        }

        return 0;
    }

    private Entry.Writer newRecordSetWriter() {
        return Entry.newEntry(streamName, Math.max(transmissionThreshold, getAverageTransmitSize()),
                envelopeBeforeTransmit(), compressionType);
    }

    private boolean envelopeBeforeTransmit() {
        return LogSegmentMetadata.supportsEnvelopedEntries(logSegmentMetadataVersion);
    }

    @Override
    public CompletableFuture<Void> asyncClose() {
        return closeInternal(false);
    }

    @Override
    public CompletableFuture<Void> asyncAbort() {
        return closeInternal(true);
    }

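    // Abort all writes pending in the given packet, failing their promises with a
    // WriteCancelledException that wraps the current transmit result.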
    private synchronized void abortPacket(BKTransmitPacket packet) {
        long numRecords = 0;
        if (null != packet) {
            EntryBuffer recordSet = packet.getRecordSet();
            numRecords = recordSet.getNumRecords();
            int rc = transmitResultUpdater.get(this);
            if (BKException.Code.OK == rc) {
                rc = BKException.Code.InterruptedException;
            }
            Throwable reason = new WriteCancelledException(streamName, Utils.transmitException(rc));
            recordSet.abortTransmit(reason);
        }
        LOG.info("Stream {} aborted {} writes", fullyQualifiedLogSegment, numRecords);
    }

    private synchronized long getWritesPendingTransmit() {
        if (null != recordSetWriter) {
            return recordSetWriter.getNumRecords();
        } else {
            return 0;
        }
    }

    private CompletableFuture<Void> closeInternal(boolean abort) {
        CompletableFuture<Void> closePromise;
        synchronized (this) {
            if (null != closeFuture) {
                return closeFuture;
            }
            closePromise = closeFuture = new CompletableFuture<Void>();
        }

        MutableObject<Throwable> throwExc = new MutableObject<>(null);
        closeInternal(abort, throwExc, closePromise);
        return closePromise;
    }

    private void closeInternal(final boolean abort, final MutableObject<Throwable> throwExc,
            final CompletableFuture<Void> closePromise) {
        // clean stats resources
        this.transmitOutstandingLogger.unregisterGauge("requests", transmitOutstandingGauge);
        this.writeLimiter.close();

        // Cancel the periodic keep alive schedule first
        if (null != periodicKeepAliveSchedule) {
            if (!periodicKeepAliveSchedule.cancel(false)) {
                LOG.info("Periodic keepalive for log segment {} isn't cancelled.", getFullyQualifiedLogSegment());
            }
        }

        // Cancel the periodic flush schedule first
        // The task is allowed to exit gracefully
        if (null != periodicFlushSchedule) {
            // We don't need to care about the cancel result here. If the periodic flush task
            // couldn't be cancelled, it means it is in the middle of flushing, so subsequent
            // flushes will synchronize and wait until the background flush completes.
            if (!periodicFlushSchedule.cancel(false)) {
                LOG.info("Periodic flush for log segment {} isn't cancelled.", getFullyQualifiedLogSegment());
            }
        }

        // If it is a normal close and the stream isn't in an error state, we attempt to flush any buffered data
        if (!abort && !isLogSegmentInError()) {
            this.enforceLock = false;
            LOG.info("Flushing before closing log segment {}", getFullyQualifiedLogSegment());
            flushAndCommit().whenComplete(new FutureEventListener<Long>() {
                @Override
                public void onSuccess(Long value) {
                    abortTransmitPacketOnClose(abort, throwExc, closePromise);
                }

                @Override
                public void onFailure(Throwable cause) {
                    throwExc.setValue(cause);
                    abortTransmitPacketOnClose(abort, throwExc, closePromise);
                }
            });
        } else {
            abortTransmitPacketOnClose(abort, throwExc, closePromise);
        }

    }

    private void abortTransmitPacketOnClose(final boolean abort, final MutableObject<Throwable> throwExc,
            final CompletableFuture<Void> closePromise) {
        LOG.info(
                "Closing BKPerStreamLogWriter (abort={}) for {} :"
                        + " lastDLSN = {} outstandingTransmits = {} writesPendingTransmit = {}",
                new Object[] { abort, fullyQualifiedLogSegment, getLastDLSN(),
                        outstandingTransmitsUpdater.get(this), getWritesPendingTransmit() });

        // Save the current packet so it can be aborted, and leave a new empty packet
        // in its place to avoid a race with addCompleteDeferredProcessing.
        final BKTransmitPacket packetPreviousSaved;
        final BKTransmitPacket packetCurrentSaved;
        synchronized (this) {
            packetPreviousSaved = packetPrevious;
            packetCurrentSaved = new BKTransmitPacket(recordSetWriter);
            recordSetWriter = REJECT_WRITES_WRITER;
        }

        // Once the last packet has been transmitted, apply any remaining promises asynchronously
        // to avoid blocking close if the bk client is slow for some reason.
        if (null != packetPreviousSaved) {
            packetPreviousSaved.addTransmitCompleteListener(new FutureEventListener<Integer>() {
                @Override
                public void onSuccess(Integer transmitResult) {
                    abortPacket(packetCurrentSaved);
                }

                @Override
                public void onFailure(Throwable cause) {
                    LOG.error("Unexpected error on transmit completion ", cause);
                }
            });
        } else {
            // In this case there are no pending add completes, but we still need to abort the
            // current packet.
            abortPacket(packetCurrentSaved);
        }
        closeLedgerOnClose(abort, throwExc, closePromise);
    }

    private void closeLedgerOnClose(final boolean abort, final MutableObject<Throwable> throwExc,
            final CompletableFuture<Void> closePromise) {
        // close the log segment if it isn't in error state, so all the outstanding addEntry(s) will callback.
        if (null == throwExc.getValue() && !isLogSegmentInError()) {
            // Close the ledger handle. If we can't close it successfully, we should
            // propagate the exception to #closeToFinalize, so it would fail completing the log segment.
            entryWriter.asyncClose(new CloseCallback() {
                @Override
                public void closeComplete(int rc, LedgerHandle lh, Object ctx) {
                    if (BKException.Code.OK != rc && BKException.Code.LedgerClosedException != rc) {
                        if (!abort) {
                            throwExc.setValue(new IOException("Failed to close ledger for "
                                    + fullyQualifiedLogSegment + " : " + BKException.getMessage(rc)));
                        }
                    }
                    completeClosePromise(abort, throwExc, closePromise);
                }
            }, null);
        } else {
            completeClosePromise(abort, throwExc, closePromise);
        }
    }

    private void completeClosePromise(final boolean abort, final MutableObject<Throwable> throwExc,
            final CompletableFuture<Void> closePromise) {
        // If add entry failed because of closing ledger above, we don't need to fail the close operation
        if (!abort && null == throwExc.getValue() && shouldFailCompleteLogSegment()) {
            throwExc.setValue(
                    new BKTransmitException("Closing an errored stream : ", transmitResultUpdater.get(this)));
        }

        if (null == throwExc.getValue()) {
            FutureUtils.complete(closePromise, null);
        } else {
            FutureUtils.completeExceptionally(closePromise, throwExc.getValue());
        }
    }

    @Override
    public synchronized void write(LogRecord record) throws IOException {
        writeUserRecord(record);
        flushIfNeeded();
    }

    @Override
    public synchronized CompletableFuture<DLSN> asyncWrite(LogRecord record) {
        return asyncWrite(record, true);
    }

    public synchronized CompletableFuture<DLSN> asyncWrite(LogRecord record, boolean flush) {
        CompletableFuture<DLSN> result = null;
        try {
            if (record.isControl()) {
                // we don't pack control records with user records together
                // so transmit current output buffer if possible
                try {
                    transmit();
                } catch (IOException ioe) {
                    return FutureUtils.exception(new WriteCancelledException(fullyQualifiedLogSegment, ioe));
                }
                result = writeControlLogRecord(record);
                transmit();
            } else {
                result = writeUserRecord(record);
                if (!isDurableWriteEnabled) {
                    // we have no idea about the DLSN if durability is turned off.
                    result = FutureUtils.value(DLSN.InvalidDLSN);
                }
                if (flush) {
                    flushIfNeeded();
                }
            }
        } catch (IOException ioe) {
            // We may incorrectly report transmit failure here, but only if we happened to hit
            // packet/xmit size limit conditions AND fail flush above, which should happen rarely
            if (null != result) {
                LOG.error("Overriding first result with flush failure {}", result);
            }
            result = FutureUtils.exception(ioe);

            // Flush to ensure any prev. writes with flush=false are flushed despite failure.
            flushIfNeededNoThrow();
        }
        return result;
    }

    private synchronized CompletableFuture<DLSN> writeUserRecord(LogRecord record) throws IOException {
        if (null != closeFuture) {
            throw new WriteException(fullyQualifiedLogSegment,
                    BKException.getMessage(BKException.Code.LedgerClosedException));
        }

        if (BKException.Code.OK != transmitResultUpdater.get(this)) {
            // Failfast if the stream already encountered error with safe retry on the client
            throw new WriteException(fullyQualifiedLogSegment,
                    BKException.getMessage(transmitResultUpdater.get(this)));
        }

        if (streamEnded) {
            throw new EndOfStreamException("Writing to a stream after it has been marked as completed");
        }

        if ((record.getTransactionId() < 0) || (record.getTransactionId() == DistributedLogConstants.MAX_TXID)) {
            throw new TransactionIdOutOfOrderException(record.getTransactionId());
        }

        // Inject write delay if configured to do so
        writeDelayInjector.inject();

        // Will check write rate limits and throw if exceeded.
        writeLimiter.acquire();
        pendingWrites.inc();

        // The count represents the number of user records up to the
        // current record
        // Increment the record count only when writing a user log record
        // Internally generated log records don't increment the count
        // writeInternal will always set a count regardless of whether it was
        // incremented or not.
        CompletableFuture<DLSN> future = null;
        try {
            // increment the position for the record to write
            // if the record is failed to write, it would be decremented.
            positionWithinLogSegment++;
            int numRecords = 1;
            if (record.isRecordSet()) {
                numRecords = LogRecordSet.numRecords(record);
            }
            future = writeInternal(record);
            // After the record (record set) is written, the position should advance
            // by {numRecords}; we already advanced it by 1 above, so advance it by
            // the remaining {numRecords - 1}.
            positionWithinLogSegment += (numRecords - 1);
        } catch (IOException ex) {
            writeLimiter.release();
            pendingWrites.dec();
            positionWithinLogSegment--;
            throw ex;
        }

        // Track outstanding requests and return the future.
        return FutureUtils.ensure(future, () -> {
            pendingWrites.dec();
            writeLimiter.release();
        });
    }

    boolean isLogSegmentInError() {
        return (transmitResultUpdater.get(this) != BKException.Code.OK);
    }

    boolean shouldFailCompleteLogSegment() {
        return (transmitResultUpdater.get(this) != BKException.Code.OK)
                && (transmitResultUpdater.get(this) != BKException.Code.LedgerClosedException);
    }

    public synchronized CompletableFuture<DLSN> writeInternal(LogRecord record) throws LogRecordTooLongException,
            LockingException, BKTransmitException, WriteException, InvalidEnvelopedEntryException {
        int logRecordSize = record.getPersistentSize();

        if (logRecordSize > MAX_LOGRECORD_SIZE) {
            throw new LogRecordTooLongException(String.format(
                    "Log Record of size %d written when only %d is allowed", logRecordSize, MAX_LOGRECORD_SIZE));
        }

        // If we will exceed the max number of bytes allowed per entry
        // initiate a transmit before accepting the new log record
        if ((recordSetWriter.getNumBytes() + logRecordSize) > MAX_LOGRECORDSET_SIZE) {
            checkStateAndTransmit();
        }

        checkWriteLock();

        if (enableRecordCounts) {
            // Set the count here. The caller would appropriately increment it
            // if this log record is to be counted
            record.setPositionWithinLogSegment(positionWithinLogSegment);
        }

        CompletableFuture<DLSN> writePromise = new CompletableFuture<DLSN>();
        writePromise.whenComplete(new OpStatsListener<DLSN>(writeTime));
        recordSetWriter.writeRecord(record, writePromise);

        if (record.getTransactionId() < lastTxId) {
            LOG.info("Log Segment {} TxId decreased Last: {} Record: {}",
                    new Object[] { fullyQualifiedLogSegment, lastTxId, record.getTransactionId() });
        }
        if (!record.isControl()) {
            // only update last tx id for user records
            lastTxId = record.getTransactionId();
            outstandingBytes += (20 + record.getPayload().length);
        }
        return writePromise;
    }

    private synchronized CompletableFuture<DLSN> writeControlLogRecord() throws BKTransmitException, WriteException,
            InvalidEnvelopedEntryException, LockingException, LogRecordTooLongException {
        LogRecord controlRec = new LogRecord(lastTxId, DistributedLogConstants.CONTROL_RECORD_CONTENT);
        controlRec.setControl();
        return writeControlLogRecord(controlRec);
    }

    private synchronized CompletableFuture<DLSN> writeControlLogRecord(LogRecord record) throws BKTransmitException,
            WriteException, InvalidEnvelopedEntryException, LockingException, LogRecordTooLongException {
        return writeInternal(record);
    }

    /**
     * We write a special log record that marks the end of the stream. Since this is the last
     * log record in the stream, it is marked with MAX_TXID. MAX_TXID also has the useful
     * side-effect of disallowing future startLogSegment calls through the MaxTxID check.
     *
     * @throws IOException
     */
    private synchronized void writeEndOfStreamMarker() throws IOException {
        LogRecord endOfStreamRec = new LogRecord(DistributedLogConstants.MAX_TXID, "endOfStream".getBytes(UTF_8));
        endOfStreamRec.setEndOfStream();
        writeInternal(endOfStreamRec);
    }

    /**
     * Flushes all the data up to this point,
     * adds the end of stream marker and marks the stream
     * as read-only in the metadata. No appends to the
     * stream will be allowed after this point
     */
    public CompletableFuture<Long> markEndOfStream() {
        synchronized (this) {
            try {
                writeEndOfStreamMarker();
            } catch (IOException e) {
                return FutureUtils.exception(e);
            }
            streamEnded = true;
        }
        return flushAndCommit();
    }

    /**
     * Write a bulk of records.
     * (TODO: move this method to the log writer level)
     *
     * @param records list of records to write
     * @return number of records that have been written
     * @throws IOException when there are I/O errors while writing the records.
     */
    public synchronized int writeBulk(List<LogRecord> records) throws IOException {
        int numRecords = 0;
        for (LogRecord r : records) {
            write(r);
            numRecords++;
        }
        return numRecords;
    }

    private void checkStateBeforeTransmit() throws WriteException {
        try {
            FailpointUtils.checkFailPoint(FailpointUtils.FailPointName.FP_TransmitBeforeAddEntry);
        } catch (IOException e) {
            throw new WriteException(streamName, "Fail transmit before adding entries");
        }
    }

    /**
     * Transmit the output buffer data to the backend.
     *
     * @throws BKTransmitException if the segment writer is already in an error state
     * @throws LockingException if the segment writer lost its lock before transmit
     * @throws WriteException if it failed to create the envelope for the data to transmit
     * @throws InvalidEnvelopedEntryException when an invalid enveloped entry was built
     */
    synchronized void checkStateAndTransmit()
            throws BKTransmitException, WriteException, InvalidEnvelopedEntryException, LockingException {
        checkStateBeforeTransmit();
        transmit();
    }

    @Override
    public synchronized CompletableFuture<Long> flush() {
        try {
            checkStateBeforeTransmit();
        } catch (WriteException e) {
            return FutureUtils.exception(e);
        }

        CompletableFuture<Integer> transmitFuture;
        try {
            transmitFuture = transmit();
        } catch (BKTransmitException | LockingException
                | WriteException | InvalidEnvelopedEntryException e) {
            return FutureUtils.exception(e);
        }

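        // transmit() returns null when there was nothing buffered to send. In that
        // case, wait on the previous packet's transmit future if one is in flight;
        // otherwise the last acknowledged txid can be reported immediately.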
        if (null == transmitFuture) {
            if (null != packetPrevious) {
                transmitFuture = packetPrevious.getTransmitFuture();
            } else {
                return FutureUtils.value(getLastTxIdAcknowledged());
            }
        }

        return transmitFuture.thenCompose(GET_LAST_TXID_ACKNOWLEDGED_AFTER_TRANSMIT_FUNC);
    }

    @Override
    public synchronized CompletableFuture<Long> commit() {
        // we don't pack control records with user records together
        // so transmit current output buffer if possible
        CompletableFuture<Integer> transmitFuture;
        try {
            try {
                transmitFuture = transmit();
            } catch (IOException ioe) {
                return FutureUtils.exception(ioe);
            }
            if (null == transmitFuture) {
                writeControlLogRecord();
                return flush();
            }
        } catch (IOException ioe) {
            return FutureUtils.exception(ioe);
        }
        return transmitFuture.thenCompose(GET_LAST_TXID_ACKNOWLEDGED_AFTER_TRANSMIT_FUNC);
    }

    CompletableFuture<Long> flushAndCommit() {
        return flush().thenCompose(COMMIT_AFTER_FLUSH_FUNC);
    }

    void flushIfNeededNoThrow() {
        try {
            flushIfNeeded();
        } catch (IOException ioe) {
            LOG.error("Encountered exception while flushing log records to stream {}", fullyQualifiedLogSegment,
                    ioe);
        }
    }

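    // Schedule the given flush callable to run once the configured minimum delay since
    // the last transmit has elapsed, unless a flush is already scheduled through the
    // given atomic field updater.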
    void scheduleFlushWithDelayIfNeeded(final Callable<?> callable,
            final AtomicReferenceFieldUpdater<BKLogSegmentWriter, ScheduledFuture> scheduledFutureRefUpdater) {
        final long delayMs = Math.max(0,
                minDelayBetweenImmediateFlushMs - lastTransmit.elapsed(TimeUnit.MILLISECONDS));
        final ScheduledFuture scheduledFuture = scheduledFutureRefUpdater.get(this);
        if ((null == scheduledFuture) || scheduledFuture.isDone()) {
            scheduledFutureRefUpdater.set(this, scheduler.schedule(new Runnable() {
                @Override
                public void run() {
                    synchronized (this) {
                        scheduledFutureRefUpdater.set(BKLogSegmentWriter.this, null);
                        try {
                            callable.call();

                            // Flush was successful or wasn't needed, the exception should be unset.
                            scheduledFlushExceptionUpdater.set(BKLogSegmentWriter.this, null);
                        } catch (Exception exc) {
                            scheduledFlushExceptionUpdater.set(BKLogSegmentWriter.this, exc);
                            LOG.error("Delayed flush failed", exc);
                        }
                    }
                }
            }, delayMs, TimeUnit.MILLISECONDS));
        }
    }

    // Decide, based on transmit buffer size, immediate flush settings, etc., whether
    // we should flush the current packet now.
    void flushIfNeeded() throws BKTransmitException, WriteException, InvalidEnvelopedEntryException,
            LockingException, FlushException {
        if (outstandingBytes > transmissionThreshold) {
            // If flush delay is disabled, flush immediately, else schedule appropriately.
            if (0 == minDelayBetweenImmediateFlushMs) {
                checkStateAndTransmit();
            } else {
                scheduleFlushWithDelayIfNeeded(new Callable<Void>() {
                    @Override
                    public Void call() throws Exception {
                        checkStateAndTransmit();
                        return null;
                    }
                }, transmitSchedFutureRefUpdater);

                // Timing here is not very important: the last flush failed and we should
                // indicate this to the caller. The next flush may succeed and unset the
                // scheduledFlushException, in which case the next write will succeed (if the caller
                // hasn't already closed the writer).
                Exception exec = scheduledFlushExceptionUpdater.get(this);
                if (exec != null) {
                    throw new FlushException("Last flush encountered an error while writing data to the backend",
                            getLastTxId(), getLastTxIdAcknowledged(), exec);
                }
            }
        }
    }

    private void checkWriteLock() throws LockingException {
        try {
            if (FailpointUtils.checkFailPoint(FailpointUtils.FailPointName.FP_WriteInternalLostLock)) {
                throw new LockingException("/failpoint/lockpath",
                        "failpoint is simulating a lost lock" + getFullyQualifiedLogSegment());
            }
        } catch (IOException e) {
            throw new LockingException("/failpoint/lockpath",
                    "failpoint is simulating a lost lock for " + getFullyQualifiedLogSegment());
        }
        if (enforceLock) {
            lock.checkOwnershipAndReacquire();
        }
    }

    /**
     * Transmit the current buffer to bookkeeper.
     * Synchronized on the class so that #write() and #flush()
     * are never called at the same time.
     * NOTE: This method should only throw known exceptions so that we don't accidentally
     *       add new code that throws in an inappropriate place.
     *
     * @return a transmit future the caller can wait on for the transmit result if we transmit successfully,
     *         or null if there is no data to transmit
     * @throws BKTransmitException if the segment writer is already in an error state
     * @throws LockingException if the segment writer lost its lock before transmit
     * @throws WriteException if it failed to create the envelope for the data to transmit
     * @throws InvalidEnvelopedEntryException when an invalid enveloped entry was built
     */
    private CompletableFuture<Integer> transmit()
            throws BKTransmitException, LockingException, WriteException, InvalidEnvelopedEntryException {
        EntryBuffer recordSetToTransmit;
        transmitLock.lock();
        try {
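            // transmitLock serializes entire transmits (buffer swap, envelope build,
            // and asyncAddEntry), while the object monitor below only guards shared
            // state, so addComplete callbacks are not blocked for a whole transmit.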
            synchronized (this) {
                checkWriteLock();
                // If transmitResult is anything other than BKException.Code.OK, it means that the
                // stream has encountered an error and cannot be written to.
                if (!transmitResultUpdater.compareAndSet(this, BKException.Code.OK, BKException.Code.OK)) {
                    LOG.error("Log Segment {} Trying to write to an errored stream; Error is {}",
                            fullyQualifiedLogSegment, BKException.getMessage(transmitResultUpdater.get(this)));
                    throw new BKTransmitException(
                            "Trying to write to an errored stream;" + " Error code : ("
                                    + transmitResultUpdater.get(this) + ") "
                                    + BKException.getMessage(transmitResultUpdater.get(this)),
                            transmitResultUpdater.get(this));
                }

                if (recordSetWriter.getNumRecords() == 0) {
                    // Control flushes always have at least the control record to flush
                    transmitDataMisses.inc();
                    return null;
                }

                recordSetToTransmit = recordSetWriter;
                recordSetWriter = newRecordSetWriter();
                outstandingBytes = 0;

                if (recordSetToTransmit.hasUserRecords()) {
                    numBytes += recordSetToTransmit.getNumBytes();
                    numFlushesSinceRestart++;
                }
            }

            ByteBuf toSend;
            try {
                toSend = recordSetToTransmit.getBuffer();
                FailpointUtils.checkFailPoint(FailpointUtils.FailPointName.FP_TransmitFailGetBuffer);
            } catch (IOException e) {
                LOG.error("Exception while enveloping entries for segment: {}",
                        fullyQualifiedLogSegment, e);
                // If a write fails here, we need to set the transmit result to an error so that
                // no future writes go through and violate ordering guarantees.
                transmitResultUpdater.set(this, BKException.Code.WriteException);
                if (e instanceof InvalidEnvelopedEntryException) {
                    // Surface invalid enveloped entries via the alert logger before rethrowing.
                    alertStatsLogger.raise("Invalid enveloped entry for segment {} : ", fullyQualifiedLogSegment,
                            e);
                    throw (InvalidEnvelopedEntryException) e;
                } else {
                    throw new WriteException(streamName, "Envelope Error");
                }
            }

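            // The envelope was built successfully; hand it to the entry writer and
            // record the transmit bookkeeping under the object monitor.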
            synchronized (this) {
                // update the transmit timestamp
                lastTransmitNanos = MathUtils.nowInNano();

                BKTransmitPacket packet = new BKTransmitPacket(recordSetToTransmit);
                packetPrevious = packet;
                entryWriter.asyncAddEntry(toSend, this, packet);

                if (recordSetToTransmit.hasUserRecords()) {
                    transmitDataSuccesses.inc();
                } else {
                    transmitControlSuccesses.inc();
                }

                lastTransmit.reset().start();
                outstandingTransmitsUpdater.incrementAndGet(this);
                controlFlushNeeded = false;
                return packet.getTransmitFuture();
            }
        } finally {
            transmitLock.unlock();
        }
    }

    /**
     *  Checks if there is any data to transmit so that the periodic
     *  flush task can determine if there is anything it needs to do.
     */
    private synchronized boolean haveDataToTransmit() {
        if (!transmitResultUpdater.compareAndSet(this, BKException.Code.OK, BKException.Code.OK)) {
            // Even if there is data it cannot be transmitted, so effectively nothing to send
            return false;
        }

        return (recordSetWriter.getNumRecords() > 0);
    }

    @Override
    public void addComplete(final int rc, LedgerHandle handle, final long entryId, final Object ctx) {
        int rcAfterFailPoint = rc;
        try {
            if (FailpointUtils.checkFailPoint(FailpointUtils.FailPointName.FP_TransmitComplete)) {
                rcAfterFailPoint = BKException.Code.UnexpectedConditionException;
            }
        } catch (Exception exc) {
            rcAfterFailPoint = BKException.Code.UnexpectedConditionException;
        }
        final int effectiveRC = rcAfterFailPoint;

        // Sanity check to make sure we're receiving these callbacks in order.
        if (entryId > -1 && lastEntryId >= entryId) {
            LOG.error("Log segment {} saw out of order entry {} lastEntryId {}",
                    new Object[] { fullyQualifiedLogSegment, entryId, lastEntryId });
        }
        lastEntryId = entryId;

        assert (ctx instanceof BKTransmitPacket);
        final BKTransmitPacket transmitPacket = (BKTransmitPacket) ctx;

        // Time from transmit until receipt of addComplete callback
        addCompleteTime.registerSuccessfulEvent(TimeUnit.MICROSECONDS.convert(
                System.nanoTime() - transmitPacket.getTransmitTime(), TimeUnit.NANOSECONDS), TimeUnit.MICROSECONDS);

        if (BKException.Code.OK == rc) {
            EntryBuffer recordSet = transmitPacket.getRecordSet();
            if (recordSet.hasUserRecords()) {
                synchronized (this) {
                    lastTxIdAcknowledged = Math.max(lastTxIdAcknowledged, recordSet.getMaxTxId());
                }
            }
        }

        if (null != scheduler) {
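            // Hand the remaining completion work to the stream's ordered executor so
            // callbacks run in submission order and the BookKeeper callback thread is
            // released quickly.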
            final Stopwatch queuedTime = Stopwatch.createStarted();
            Futures.addCallback(scheduler.submitOrdered(streamName, new Callable<Void>() {
                @Override
                public Void call() {
                    final Stopwatch deferredTime = Stopwatch.createStarted();
                    addCompleteQueuedTime.registerSuccessfulEvent(queuedTime.elapsed(TimeUnit.MICROSECONDS),
                            TimeUnit.MICROSECONDS);
                    addCompleteDeferredProcessing(transmitPacket, entryId, effectiveRC);
                    addCompleteDeferredTime.registerSuccessfulEvent(deferredTime.elapsed(TimeUnit.MICROSECONDS),
                            TimeUnit.MICROSECONDS);
                    return null;
                }

                @Override
                public String toString() {
                    return String.format("AddComplete(Stream=%s, entryId=%d, rc=%d)", fullyQualifiedLogSegment,
                            entryId, rc);
                }
            }), new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void done) {
                }

                @Override
                public void onFailure(Throwable cause) {
                    LOG.error("addComplete processing failed for {} entry {} lastTxId {} rc {} with error",
                            new Object[] { fullyQualifiedLogSegment, entryId,
                                    transmitPacket.getRecordSet().getMaxTxId(), rc, cause });
                }
            });
            // Race condition if we notify before the addComplete is enqueued.
            transmitPacket.notifyTransmitComplete(effectiveRC);
            outstandingTransmitsUpdater.getAndDecrement(this);
        } else {
            // Notify transmit complete must be called before deferred processing in the
            // sync case since otherwise callbacks in deferred processing may deadlock.
            transmitPacket.notifyTransmitComplete(effectiveRC);
            outstandingTransmitsUpdater.getAndDecrement(this);
            addCompleteDeferredProcessing(transmitPacket, entryId, effectiveRC);
        }
    }

    private void addCompleteDeferredProcessing(final BKTransmitPacket transmitPacket, final long entryId,
            final int rc) {
        boolean cancelPendingPromises = false;
        EntryBuffer recordSet = transmitPacket.getRecordSet();
        synchronized (this) {
            if (transmitResultUpdater.compareAndSet(this, BKException.Code.OK, rc)) {
                // If this is the first time we are setting an error code in the transmitResult then
                // we must cancel pending promises; once this error has been set, more records will not
                // be enqueued; they will be failed with WriteException
                cancelPendingPromises = (BKException.Code.OK != rc);
            } else {
                LOG.warn("Log segment {} entryId {}: Tried to set transmit result to ({}) but is already ({})",
                        new Object[] { fullyQualifiedLogSegment, entryId, rc, transmitResultUpdater.get(this) });
            }

            if (transmitResultUpdater.get(this) != BKException.Code.OK) {
                if (recordSet.hasUserRecords()) {
                    transmitDataPacketSize.registerFailedEvent(recordSet.getNumBytes(), TimeUnit.MICROSECONDS);
                }
            } else {
                // If we flushed user data, the background flush in the next pass needs to
                // make those writes visible to readers by advancing the last add
                // confirmed, so mark that a control flush is needed.
                if (recordSet.hasUserRecords()) {
                    transmitDataPacketSize.registerSuccessfulEvent(recordSet.getNumBytes(), TimeUnit.MICROSECONDS);
                    controlFlushNeeded = true;
                    if (immediateFlushEnabled) {
                        if (0 == minDelayBetweenImmediateFlushMs) {
                            backgroundFlush(true);
                        } else {
                            scheduleFlushWithDelayIfNeeded(new Callable<Void>() {
                                @Override
                                public Void call() throws Exception {
                                    backgroundFlush(true);
                                    return null;
                                }
                            }, immFlushSchedFutureRefUpdater);
                        }
                    }
                }
            }

            // update last dlsn before satisfying future
            if (BKException.Code.OK == transmitResultUpdater.get(this)) {
                DLSN lastDLSNInPacket = recordSet.finalizeTransmit(logSegmentSequenceNumber, entryId);
                if (recordSet.hasUserRecords()) {
                    if (null != lastDLSNInPacket && lastDLSN.compareTo(lastDLSNInPacket) < 0) {
                        lastDLSN = lastDLSNInPacket;
                    }
                }
            }
        }

        if (BKException.Code.OK == transmitResultUpdater.get(this)) {
            recordSet.completeTransmit(logSegmentSequenceNumber, entryId);
        } else {
            recordSet.abortTransmit(Utils.transmitException(transmitResultUpdater.get(this)));
        }

        if (cancelPendingPromises) {
            // Since the writer is in a bad state, no more packets will be transmitted, and it's
            // safe to assign a new empty packet. This avoids a race with closeInternal, which may
            // also try to cancel the current packet.
            final BKTransmitPacket packetCurrentSaved;
            synchronized (this) {
                packetCurrentSaved = new BKTransmitPacket(recordSetWriter);
                recordSetWriter = newRecordSetWriter();
            }
            packetCurrentSaved.getRecordSet().abortTransmit(new WriteCancelledException(streamName,
                    Utils.transmitException(transmitResultUpdater.get(this))));
        }
    }

    @Override
    public synchronized void run() {
        backgroundFlush(false);
    }

    private synchronized void backgroundFlush(boolean controlFlushOnly) {
        if (null != closeFuture) {
            // if the log segment is closing, skip any background flushing
            LOG.debug("Skip background flushing since log segment {} is closing.", getFullyQualifiedLogSegment());
            return;
        }
        try {
            boolean newData = haveDataToTransmit();

            if (controlFlushNeeded || (!controlFlushOnly && newData)) {
                // If we need this periodic transmit to persist previously written data but
                // there is no new data (which would cause the transmit to be skipped) generate
                // a control record
                if (!newData) {
                    writeControlLogRecord();
                }

                transmit();
                pFlushSuccesses.inc();
            } else {
                pFlushMisses.inc();
            }
        } catch (IOException exc) {
            LOG.error("Log Segment {}: Error encountered by the periodic flush", fullyQualifiedLogSegment, exc);
        }
    }

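    // Periodic keep-alive: if nothing has been transmitted within the keep-alive
    // interval, write a control record so that readers' last add confirmed keeps
    // advancing on an otherwise idle stream.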
    private synchronized void keepAlive() {
        if (null != closeFuture) {
            // if the log segment is closing, skip sending any keep alive records.
            LOG.debug("Skip sending keepAlive control record since log segment {} is closing.",
                    getFullyQualifiedLogSegment());
            return;
        }

        if (MathUtils.elapsedMSec(lastTransmitNanos) < periodicKeepAliveMs) {
            return;
        }

        LogRecord controlRec = new LogRecord(lastTxId, DistributedLogConstants.KEEPALIVE_RECORD_CONTENT);
        controlRec.setControl();
        asyncWrite(controlRec);
    }

}