com.mellanox.r4h.DFSOutputStream.java Source code

Java tutorial

Introduction

Here is the source code for com.mellanox.r4h.DFSOutputStream.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mellanox.r4h;

import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.SUCCESS;

import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.channels.ClosedChannelException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.crypto.CryptoProtocolVersion;
import org.apache.hadoop.fs.CanSetDropBehind;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSOutputSummer;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileEncryptionInfo;
import org.apache.hadoop.fs.ParentNotDirectoryException;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.fs.Syncable;
import org.apache.hadoop.hdfs.DFSClientFaultInjector;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException;
import org.apache.hadoop.hdfs.client.HdfsDataOutputStream;
import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
import org.apache.hadoop.hdfs.protocol.UnresolvedPathException;
import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;
import org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockStoragePolicySuite;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException;
import org.apache.hadoop.hdfs.server.namenode.RetryStartFileException;
import org.apache.hadoop.hdfs.server.namenode.SafeModeException;
import org.apache.hadoop.hdfs.server.namenode.UnsupportedActionException;
import org.apache.hadoop.io.EnumSetWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.DataChecksum.Type;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Time;
import org.apache.htrace.NullScope;
import org.apache.htrace.Span;
import org.apache.htrace.TraceScope;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;

import org.accelio.jxio.ClientSession;
import org.accelio.jxio.EventName;
import org.accelio.jxio.EventReason;
import org.accelio.jxio.Msg;
import org.accelio.jxio.MsgPool;

import com.mellanox.r4h.client.HdfsDataOutputStream.SyncFlag;

/****************************************************************
 * DFSOutputStream creates files from a stream of bytes.
 * 
 * The client application writes data that is cached internally by
 * this stream. Data is broken up into packets, each packet is
 * typically 64K in size. A packet comprises of chunks. Each chunk
 * is typically 512 bytes and has an associated checksum with it.
 * 
 * When a client application fills up the currentPacket, it is
 * enqueued into dataQueue. The DataStreamer thread picks up
 * packets from the dataQueue, sends it to the first datanode in
 * the pipeline and moves it from the dataQueue to the ackQueue.
 * The ResponseProcessor receives acks from the datanodes. When an
 * successful ack for a packet is received from all datanodes, the
 * ResponseProcessor removes the corresponding packet from the
 * ackQueue.
 * 
 * In case of error, all outstanding packets and moved from
 * ackQueue. A new pipeline is setup by eliminating the bad
 * datanode from the original pipeline. The DataStreamer now
 * starts sending packets from the dataQueue.
 ****************************************************************/
@InterfaceAudience.Private
public class DFSOutputStream extends FSOutputSummer implements Syncable, CanSetDropBehind {
    // R4H stuff begins here.
    private static final Log LOG = LogFactory.getLog(DFSOutputStream.class);
    private static boolean toPrintBreakdown = false;
    private static long lastOperationTS = 0;
    private Object lastOperationTSLock = new Object();
    // The client session. A new client session is created per block.
    private ClientSession currentClientSession;
    // The event queue handler. We have only one such, and reuse it for all blocks.
    private R4HEventHandler eventQHandler;
    // The name of next DN node we're talking to (we hold it here for logging purposes only).
    private String nextDnName;
    // The message pool that is used for connection to data nodes.
    private final MsgPool msgPool;
    // The sequence number of the current message being SENT (excluding the header).
    private long sentSeqenceNum = -1;
    private ConcurrentLinkedQueue<DFSPacket> dataQueue = new ConcurrentLinkedQueue<DFSPacket>();
    private ConcurrentLinkedQueue<DFSPacket> ackQueue = new ConcurrentLinkedQueue<DFSPacket>();
    private String name; // used as the toString value
    // The time to wait for header ack before pronouncing failure:
    private long headerAckTimeoutUsec;
    private static final long CLOSE_WAIT_TIMEOUT_IN_USEC = 100000; // very sensitive and might affect significantly job duration
    // R4H stuff ends here.

    private final DFSClient dfsClient;
    /**
     * Number of times to retry creating a file when there are transient 
     * errors (typically related to encryption zones and KeyProvider operations).
     */
    @VisibleForTesting
    static final int CREATE_RETRY_COUNT = 10;
    @VisibleForTesting
    static CryptoProtocolVersion[] SUPPORTED_CRYPTO_VERSIONS = CryptoProtocolVersion.supported();
    private Socket s;
    // closed is accessed by different threads under different locks.
    private volatile boolean closed = false;

    private String src;
    private final long fileId;
    private final long blockSize;
    /** Only for DataTransferProtocol.writeBlock(..) */
    private final int bytesPerChecksum;
    private final DataChecksum checksum;
    private DFSPacket currentPacket = null;
    private DataStreamer streamer;
    private long currentSeqno = 0;
    private long lastQueuedSeqno = -1;
    private long lastAckedSeqno = -1;
    private long bytesCurBlock = 0; // bytes written in current block
    private int packetSize = 0; // write packet size, not including the header.
    private int chunksPerPacket = 0;
    private final AtomicReference<IOException> lastException = new AtomicReference<IOException>();
    private long artificialSlowdown = 0;
    private long lastFlushOffset = 0; // offset when flush was invoked
    // persist blocks on namenode
    private final AtomicBoolean persistBlocks = new AtomicBoolean(false);
    private volatile boolean appendChunk = false; // appending to existing partial block
    private long initialFileSize = 0; // at time of file open
    private final Progressable progress;
    private final short blockReplication; // replication factor of file
    private boolean shouldSyncBlock = false; // force blocks to disk upon close
    private final AtomicReference<CachingStrategy> cachingStrategy;
    private boolean failPacket = false;
    private final JXIOClientResource jxioResource;
    private boolean usingJxioClientResource = true;
    private FileEncryptionInfo fileEncryptionInfo;
    private static final BlockStoragePolicySuite blockStoragePolicySuite = BlockStoragePolicySuite
            .createDefaultSuite();

    /** Use {@link ByteArrayManager} to create buffer for non-heartbeat packets.*/
    private DFSPacket createPacket(int packetSize, int chunksPerPkt, long offsetInBlock, long seqno,
            boolean lastPacketInBlock) throws InterruptedIOException {
        Msg newMsg = DFSOutputStream.this.getMsg();
        final int bufferSize = PacketHeader.PKT_MAX_HEADER_LEN + packetSize;
        return new DFSPacket(newMsg, chunksPerPkt, offsetInBlock, seqno, getChecksumSize(), lastPacketInBlock);
    }

    private static class DFSPacket {
        public static final long HEART_BEAT_SEQNO = -1L;
        private static long[] EMPTY = new long[0];
        private final long seqno; // sequence number of buffer in block
        private final long offsetInBlock; // offset in block
        private boolean syncBlock; // this packet forces the current block to disk
        private int numChunks; // number of chunks currently in packet
        private final int maxChunks; // max chunks in packet
        private boolean lastPacketInBlock; // is this the last packet in block?

        /**
         * buf is pointed into like follows:
         * (C is checksum data, D is payload data)
         * 
         * [_________CCCCCCCCC________________DDDDDDDDDDDDDDDD___]
         * ^ ^ ^ ^
         * | checksumPos dataStart dataPos
         * checksumStart
         * 
         * Right before sending, we move the checksum data to immediately precede
         * the actual data, and then insert the header into the buffer immediately
         * preceding the checksum data, so we make sure to keep enough space in
         * front of the checksum data to support the largest conceivable header.
         */
        private int checksumStart;
        private int checksumPos;
        private final int dataStart;
        private int dataPos;
        private long[] traceParents = EMPTY;
        private int traceParentsUsed;
        private Span span;

        // R4H stuff begins here.
        private Msg msg;
        private ByteBuffer byteBuff;

        // R4H stuff ends here.

        /**
         * Create a new packet.
         * 
         * @param pktSize
         *            maximum size of the packet,
         *            including checksum data and actual data.
         * @param chunksPerPkt
         *            maximum number of chunks per packet.
         * @param offsetInBlock
         *            offset in bytes into the HDFS block.
         * @param lastPacketInBlock 
         *            if this is the last packet
         */
        private DFSPacket(Msg newMsg, int chunksPerPkt, long offsetInBlock, long seqno, int checksumSize,
                boolean lastPacketInBlock) {
            this.lastPacketInBlock = lastPacketInBlock;
            this.numChunks = 0;
            this.offsetInBlock = offsetInBlock;
            this.seqno = seqno;

            // R4H stuff begins here.
            msg = newMsg;
            byteBuff = msg.getOut();
            byteBuff.clear();
            // R4H stuff ends here.

            checksumStart = PacketHeader.PKT_MAX_HEADER_LEN;
            checksumPos = checksumStart;
            dataStart = checksumStart + (chunksPerPkt * checksumSize);
            dataPos = dataStart;
            maxChunks = chunksPerPkt;
        }

        /**
         * Write data to this packet.
         *
         * @param inarray input array of data
         * @param off the offset of data to write
         * @param len the length of data to write
         * @throws ClosedChannelException
         */
        // R4H made here few changes.
        // Synchronization is not necessary because we do not use ByteArrayManager. see HDFS-7358 for more info.
        void writeData(byte[] inarray, int off, int len) {
            if (dataPos + len > byteBuff.capacity()) {
                throw new BufferOverflowException();
            }
            byteBuff.position(dataPos);
            byteBuff.put(inarray, off, len);
            dataPos += len;
        }

        /**
         * Write checksums to this packet
         *
         * @param inarray input array of checksums
         * @param off the offset of checksums to write
         * @param len the length of checksums to write
         * @throws ClosedChannelException
         */
        // R4H made here few changes.
        // Synchronization is not necessary because we do not use ByteArrayManager. see HDFS-7358 for more info.
        void writeChecksum(byte[] inarray, int off, int len) {
            if (checksumPos + len > dataStart) {
                throw new BufferOverflowException();
            }

            byteBuff.position(checksumPos);
            byteBuff.put(inarray, off, len);
            checksumPos += len;
        }

        /**
         * Write the full packet, including the header, to the given output stream.
         *
         * @param stm
         * @throws IOException
         */
        // R4H made here few changes, including renaming the function.
        // Synchronization is not necessary because we do not use ByteArrayManager. see HDFS-7358 for more info.
        PacketHeader prepareBeforeSend() throws IOException {
            final int dataLen = dataPos - dataStart;
            final int checksumLen = checksumPos - checksumStart;
            final int pktLen = HdfsConstants.BYTES_IN_INTEGER + dataLen + checksumLen;

            PacketHeader header = new PacketHeader(pktLen, offsetInBlock, seqno, lastPacketInBlock, dataLen,
                    syncBlock);

            if (checksumPos != dataStart) {
                // Move the checksum to cover the gap. This can happen for the last
                // packet or during an hflush/hsync call.
                byte[] csBuff = new byte[checksumLen];
                byteBuff.position(checksumStart);
                byteBuff.get(csBuff);
                byteBuff.position(dataStart - checksumLen);
                byteBuff.put(csBuff);
                checksumPos = dataStart;
                checksumStart = checksumPos - checksumLen;
            }

            final int headerStart = checksumStart - header.getSerializedSize();
            assert checksumStart + 1 >= header.getSerializedSize();
            assert checksumPos == dataStart;
            assert headerStart >= 0;
            assert headerStart + header.getSerializedSize() == checksumStart;

            // corrupt the data for testing.
            if (DFSClientFaultInjector.get().corruptPacket()) {
                byteBuff.position(headerStart + header.getSerializedSize() + checksumLen + dataLen - 1);
                byte currContents = byteBuff.get();
                currContents ^= 0xff;
                byteBuff.put(currContents);
            }

            byteBuff.position(headerStart);
            byteBuff.put(header.getBytes(), 0, header.getSerializedSize());
            if (headerStart > 0) {
                byteBuff.position(0);
                DataOutput out = new DataOutputStream(new ByteBufferOutputStream(byteBuff));
                WritableUtils.writeVInt(out, headerStart);
            }

            int sendLen = header.getSerializedSize() + checksumLen + dataLen + headerStart;
            byteBuff.position(sendLen);

            // undo corruption.
            if (DFSClientFaultInjector.get().uncorruptPacket()) {
                byteBuff.position(headerStart + header.getSerializedSize() + checksumLen + dataLen - 1);
                byte currContents = byteBuff.get();
                currContents ^= 0xff;
                byteBuff.put(currContents);
                byteBuff.position(sendLen);
            }

            return header;
        }

        // get the packet's last byte's offset in the block
        // Synchronization is not necessary because we do not have ResponseProcessor. see HDFS-7399 for more info.
        long getLastByteOffsetBlock() {
            return offsetInBlock + dataPos - dataStart;
        }

        /**
         * Check if this packet is a heart beat packet
         * 
         * @return true if the sequence number is HEART_BEAT_SEQNO
         */
        boolean isHeartbeatPacket() {
            return seqno == HEART_BEAT_SEQNO;
        }

        /**
         * check if this packet is the last packet in block
         *
         * @return true if the packet is the last packet
         */
        boolean isLastPacketInBlock() {
            return lastPacketInBlock;
        }

        /**
         * get sequence number of this packet
         *
         * @return the sequence number of this packet
         */
        long getSeqno() {
            return seqno;
        }

        /**
         * get the number of chunks this packet contains
         *
         * @return the number of chunks in this packet
         */
        synchronized int getNumChunks() {
            return numChunks;
        }

        /**
         * increase the number of chunks by one
         */
        synchronized void incNumChunks() {
            numChunks++;
        }

        /**
         * get the maximum number of packets
         *
         * @return the maximum number of packets
         */
        int getMaxChunks() {
            return maxChunks;
        }

        /**
         * set if to sync block
         *
         * @param syncBlock if to sync block
         */
        synchronized void setSyncBlock(boolean syncBlock) {
            this.syncBlock = syncBlock;
        }

        @Override
        public String toString() {
            return "packet seqno:" + this.seqno + " offsetInBlock:" + this.offsetInBlock + " lastPacketInBlock:"
                    + this.lastPacketInBlock + " lastByteOffsetInBlock: " + this.getLastByteOffsetBlock();
        }

        /**
         * Add a trace parent span for this packet.<p/>
         *
         * Trace parent spans for a packet are the trace spans responsible for
         * adding data to that packet.  We store them as an array of longs for
         * efficiency.<p/>
         *
         * Protected by the DFSOutputStream dataQueue lock.
         */
        public void addTraceParent(Span span) {
            if (span == null) {
                return;
            }
            addTraceParent(span.getSpanId());
        }

        public void addTraceParent(long id) {
            if (traceParentsUsed == traceParents.length) {
                int newLength = (traceParents.length == 0) ? 8 : traceParents.length * 2;
                traceParents = Arrays.copyOf(traceParents, newLength);
            }
            traceParents[traceParentsUsed] = id;
            traceParentsUsed++;
        }

        /**
         * Get the trace parent spans for this packet.<p/>
         *
         * Will always be non-null.<p/>
         *
         * Protected by the DFSOutputStream dataQueue lock.
         */
        public long[] getTraceParents() {
            // Remove duplicates from the array.
            int len = traceParentsUsed;
            Arrays.sort(traceParents, 0, len);
            int i = 0, j = 0;
            long prevVal = 0; // 0 is not a valid span id
            while (true) {
                if (i == len) {
                    break;
                }
                long val = traceParents[i];
                if (val != prevVal) {
                    traceParents[j] = val;
                    j++;
                    prevVal = val;
                }
                i++;
            }
            if (j < traceParents.length) {
                traceParents = Arrays.copyOf(traceParents, j);
                traceParentsUsed = traceParents.length;
            }
            return traceParents;
        }

        public void setTraceSpan(Span span) {
            this.span = span;
        }

        public Span getTraceSpan() {
            return span;
        }
    }

    class CSCallbacks implements org.accelio.jxio.ClientSession.Callbacks {

        private long countPacketArrived;
        private boolean wasSessionEstablished;
        private boolean isSessionClosed;
        private boolean closeEventExpected;
        private boolean errorFlowInTheMiddle;
        private String firstBadLink;
        private ClientSession clientSession;
        private boolean didHeaderFail;
        private boolean wasHeaderAckReceived;
        private boolean wasLastPacketAcked;
        private Status pipelineStatus;
        private boolean currResult; // Current status of reply from next DN.

        public CSCallbacks() {
            this.countPacketArrived = 0;
            this.wasSessionEstablished = false;
            this.closeEventExpected = false;
            this.errorFlowInTheMiddle = false;
            this.firstBadLink = "";
            this.clientSession = null;
            this.didHeaderFail = false;
            this.wasHeaderAckReceived = false;
            this.wasLastPacketAcked = false;
            this.isSessionClosed = true;
            this.pipelineStatus = SUCCESS;
            this.currResult = false;
        }

        @Override
        /**
         * The callback upon receiving a reply for message sent previously.
         * @param message
         *       The ack message. Contains also the originally sent message.
         * @todo
         *       To make sure we catch all exceptions here inside. Alternatively, do this in all places where we do 
         *       eqh.runeventloop().
         */
        public void onResponse(Msg message) {
            boolean needToReturnMsgToPool = true;
            if (DFSOutputStream.LOG.isTraceEnabled()) {
                DFSOutputStream.LOG.trace("DFSClient got a reply message.");
            }

            message.getIn().clear();
            message.getOut().clear();

            do {
                if (!this.wasHeaderAckReceived) {
                    // Handle header ack:
                    ByteBuffer msgBuf = message.getIn();

                    // receive ack for connect
                    BlockOpResponseProto resp = null;
                    try {
                        resp = BlockOpResponseProto
                                .parseFrom(PBHelper.vintPrefixed(new ByteBufferInputStream(msgBuf)));
                    } catch (IOException e) {
                        this.didHeaderFail = true;
                        this.currResult = false;
                        streamer.setLastException((IOException) e);
                        DFSOutputStream.LOG.error(
                                DFSOutputStream.this.toString() + "Got exception while parsing header: " + e);
                        break;
                    }
                    this.pipelineStatus = resp.getStatus();
                    this.firstBadLink = resp.getFirstBadLink();
                    needToReturnMsgToPool = false; // never return header msg to pool

                    if (this.pipelineStatus != SUCCESS) {
                        this.didHeaderFail = true;
                        this.currResult = false;
                        if (this.pipelineStatus == Status.ERROR_ACCESS_TOKEN) {
                            streamer.setLastException(new InvalidBlockTokenException(
                                    "Got access token error for connect ack with firstBadLink as "
                                            + this.firstBadLink));
                            DFSOutputStream.LOG.error(DFSOutputStream.this.toString()
                                    + "Got access token error for connect ack with firstBadLink as "
                                    + this.firstBadLink);
                        } else {
                            streamer.setLastException(
                                    new IOException("Bad connect ack with firstBadLink as " + this.firstBadLink));
                            DFSOutputStream.LOG.error(DFSOutputStream.this.toString()
                                    + "Bad connect ack with firstBadLink as " + this.firstBadLink);
                        }

                    } else {
                        if (DFSOutputStream.LOG.isDebugEnabled()) {
                            DFSOutputStream.LOG.debug(DFSOutputStream.this.toString()
                                    + String.format("Got header ack and parsed it successfully"));
                        }
                        this.currResult = true;
                        this.wasHeaderAckReceived = true;

                        if (toPrintBreakdown) {
                            long now = System.nanoTime();
                            DFSOutputStream.LOG.info(DFSOutputStream.this.toString()
                                    + String.format("%.3f", (float) now / 1000000000.) + ", "
                                    + (now - lastOperationTS) / 1000000000.
                                    + " : header ack was parsed successfully.");
                            synchronized (lastOperationTSLock) {
                                lastOperationTS = now;
                            }
                        }
                    }
                } else {

                    if (DFSOutputStream.LOG.isTraceEnabled()) {
                        DFSOutputStream.LOG.trace("Handling message ack...");
                    }

                    // Handle regular message ack:
                    PipelineAck ack = new PipelineAck();

                    try {
                        // read an ack from the pipeline
                        InputStream ackInputStream = new DataInputStream(
                                new ByteBufferInputStream(message.getIn()));
                        ack.readFields(ackInputStream);
                        // // DEBUG
                        // DFSOutputStream.LOG.info("DFSClient for block " + DFSOutputStream.this.streamer.block + " " + ack);
                        // // DEBUG

                        if (DFSOutputStream.LOG.isDebugEnabled()) {
                            DFSOutputStream.LOG.debug(
                                    "DFSClient for block " + DFSOutputStream.this.streamer.block + " " + ack);
                        }
                        // processes response status from all datanodes.
                        for (int i = ack.getNumOfReplies() - 1; i >= 0 && dfsClient.clientRunning; i--) {
                            final Status reply = PipelineAck.getStatusFromHeader(ack.getHeaderFlag(i));

                            // Restart will not be treated differently unless it is
                            // the local node or the only one in the pipeline.
                            if (PipelineAck.isRestartOOBStatus(reply)
                                    && DFSOutputStream.this.streamer.shouldWaitForRestart(i)) {
                                DFSOutputStream.this.streamer.restartDeadline = dfsClient.getConf()
                                        .getDatanodeRestartTimeout() + Time.now();
                                DFSOutputStream.this.streamer.setRestartingNodeIndex(i);
                                String errMessage = "A datanode is restarting: "
                                        + DFSOutputStream.this.streamer.nodes[i];
                                DFSClient.LOG.info(errMessage);
                                throw new IOException(errMessage);
                            }
                            // node error

                            if (reply != SUCCESS) {
                                // Set first bad datanode
                                String errorMsg = "Bad response " + reply + " for block "
                                        + DFSOutputStream.this.streamer.block + " from datanode ";
                                if (i < DFSOutputStream.this.streamer.nodes.length) { // the usual and expected case
                                    streamer.setErrorIndex(i);
                                    errorMsg += DFSOutputStream.this.streamer.nodes[streamer.errorIndex].getName();
                                } // the unexpected case
                                else {
                                    // It's a lie, but this is the best lie we can think of ;)
                                    streamer.errorIndex = DFSOutputStream.this.streamer.nodes.length - 1;
                                    errorMsg += DFSOutputStream.this.streamer.nodes[streamer.errorIndex].getName()
                                            + ". Actually it was " + i + "'s datanode.";
                                }

                                streamer.setLastException(new IOException(errorMsg));
                                this.currResult = false;
                                break;
                            }
                        }

                        if (!this.currResult) {
                            break;
                        }

                        long seqno = ack.getSeqno();

                        if (seqno == DFSPacket.HEART_BEAT_SEQNO) { // a heartbeat ack
                            break;
                        }

                        PacketHeader sentPcktHeader = (PacketHeader) message.getUserContext();

                        if (DFSOutputStream.LOG.isTraceEnabled()) {
                            DFSOutputStream.LOG.trace(
                                    String.format("Received ack with seq #%d, last pckt in block:%b. Sent seq #%d",
                                            seqno, sentPcktHeader.isLastPacketInBlock(), sentSeqenceNum));
                        }

                        DFSPacket one = ackQueue.peek();

                        if (one.seqno != seqno) {
                            throw new IOException("PacketReply: Expecting seqno " + " for block "
                                    + DFSOutputStream.this.streamer.block + " " + one.seqno + " but received "
                                    + seqno);
                        }

                        // update bytesAcked
                        streamer.block.setNumBytes(one.getLastByteOffsetBlock());

                        lastAckedSeqno = ack.getSeqno();

                        returnMsgToPool(message);
                        needToReturnMsgToPool = false;

                        ackQueue.poll();

                        // Check if this is ack for last message in block:
                        if (sentPcktHeader.isLastPacketInBlock()) {
                            // Got ack for last packet in block.
                            this.wasLastPacketAcked = true;
                            // Reset for next block
                            this.wasHeaderAckReceived = false;
                            this.currResult = false;
                            if (toPrintBreakdown) {
                                long now5 = System.nanoTime();
                                DFSOutputStream.LOG.info(String.format("%.3f", (float) now5 / 1000000000.) + ", "
                                        + (now5 - lastOperationTS) / 1000000000.
                                        + " : last packet in block ack was parsed successfully.");
                                synchronized (lastOperationTSLock) {
                                    lastOperationTS = now5;
                                }
                            }
                            closeEventExpected = true;
                            if ((this.clientSession != null) && !this.clientSession.getIsClosing()) {
                                this.clientSession.close();
                                if (this.clientSession == DFSOutputStream.this.currentClientSession) {
                                    DFSOutputStream.this.currentClientSession = null;
                                }
                            }
                        }

                        if (LOG.isDebugEnabled()) {
                            if (this.wasLastPacketAcked) {
                                LOG.debug(
                                        "Got message ack and parsed it successfully. This was the last message in the block.");
                            } else {
                                LOG.debug("Got message ack and parsed it successfully.");
                            }
                        }

                    } catch (Exception e) {
                        DFSOutputStream.LOG.error(DFSOutputStream.this.toString()
                                + "Got exception during processing of reply message: "
                                + StringUtils.stringifyException(e));

                        if (!closed) {
                            DFSOutputStream.this.streamer.hasError = true;
                            if (e instanceof IOException) {
                                streamer.setLastException((IOException) e);
                            }
                            DFSOutputStream.LOG
                                    .error(DFSOutputStream.this.toString() + "DFSOutputStream onMsg exception "
                                            + " for block " + DFSOutputStream.this.streamer.block + e);
                            closed = true;
                        }
                    }
                }
            } while (false);

            if (needToReturnMsgToPool) {
                returnMsgToPool(message);
            }

            synchronized (ackQueue) {
                ackQueue.notifyAll();
            }

        }

        @Override
        /**
         * @todo
         * To make sure we catch all exceptions here inside. Alternatively, do this in all places where we do 
         * eqh.runeventloop().
         */
        public void onSessionEstablished() {
            this.wasSessionEstablished = true;
            this.isSessionClosed = false;
            if (toPrintBreakdown) {
                long now = System.nanoTime();
                DFSOutputStream.LOG.info(DFSOutputStream.this.toString()
                        + String.format("%.3f", (float) now / 1000000000.) + ", "
                        + (now - lastOperationTS) / 1000000000. + " : Successfully established client session.");
                synchronized (lastOperationTSLock) {
                    lastOperationTS = now;
                }
            } else {
                LOG.debug(DFSOutputStream.this.toString() + String
                        .format("Successfully established client session on %s", DFSOutputStream.this.nextDnName));
            }
        }

        @Override
        /**
         * @todo
         * To make sure we catch all exceptions here inside. Alternatively, do this in all places where we do
         * eqh.runeventloop().
         */
        public void onSessionEvent(EventName session_event, EventReason reason) {
            boolean errorFlow = false;

            switch (session_event) {
            case SESSION_CLOSED:
                this.isSessionClosed = true;
                String logmsg = String.format("Client Session event=%s, reason=%s", session_event, reason);
                if (closeEventExpected) {
                    if (DFSOutputStream.LOG.isDebugEnabled()) {
                        DFSOutputStream.LOG.debug(DFSOutputStream.this.toString() + logmsg);
                    }
                } else {
                    DFSOutputStream.LOG.error(DFSOutputStream.this.toString() + logmsg);
                    errorFlow = true;
                }
                break;
            case SESSION_ERROR:
            case SESSION_REJECT:
                errorFlow = true;
                if (!this.wasHeaderAckReceived) {
                    this.didHeaderFail = true;
                    DFSOutputStream.LOG.error(DFSOutputStream.this.toString() + String.format(
                            "Session error occurred before header ack was received: session=%s, event=%s, reason=%s",
                            this.clientSession, session_event, reason));
                } else {
                    DFSOutputStream.LOG.error(DFSOutputStream.this.toString() + String.format(
                            "Session error occurred in the middle of the block: session=%s, event=%s, reason=%s",
                            this.clientSession, session_event, reason));
                }
                break;
            default:
                break;
            }
            if (errorFlow) {
                errorFlowInTheMiddle = true;
                // Break the event loop, returning the context to where the event loop was run originally
                DFSOutputStream.this.eventQHandler.breakEventLoop();
            }
        }

        /**
         * @param msg
         * @param reason
         */
        @Override
        public void onMsgError(Msg msg, EventReason reason) {
            if (msg != streamer.headerMsg) {
                returnMsgToPool(msg);
            }
            DFSOutputStream.LOG.error(DFSOutputStream.this.toString() + String
                    .format("Msg error occurred: reason=%s, countPacketArrived=%d", reason, countPacketArrived));
            if ((this.clientSession != null) && !this.clientSession.getIsClosing()) {
                this.clientSession.close();
                closeEventExpected = true;
                if (this.clientSession == DFSOutputStream.this.currentClientSession) {
                    DFSOutputStream.this.currentClientSession = null;
                }
            }
            if (!this.wasHeaderAckReceived) {
                this.didHeaderFail = true;
            }
            errorFlowInTheMiddle = true;
            DFSOutputStream.this.eventQHandler.breakEventLoop();
        }

        public void setClientSession(ClientSession clientSession) {
            this.clientSession = clientSession;
        }
    }

    //
    // The DataStreamer class is responsible for sending data packets to the
    // datanodes in the pipeline. It retrieves a new blockid and block locations
    // from the namenode, and starts streaming packets to the pipeline of
    // Datanodes. Every packet has a sequence number associated with
    // it. When all the packets for a block are sent out and acks for each
    // if them are received, the DataStreamer closes the current block.
    //
    class DataStreamer extends Daemon {
        private volatile boolean streamerClosed = false;
        private ExtendedBlock block; // its length is number of bytes acked
        private Token<BlockTokenIdentifier> accessToken;
        private DataOutputStream blockStream;
        private DataInputStream blockReplyStream;
        private volatile DatanodeInfo[] nodes = null; // list of targets for current block
        private volatile StorageType[] storageTypes = null;
        private volatile String[] storageIDs = null;
        private CSCallbacks currentCSCallbacks;
        private final LoadingCache<DatanodeInfo, DatanodeInfo> excludedNodes = CacheBuilder.newBuilder()
                .expireAfterWrite(dfsClient.getConf().getExcludedNodesCacheExpiry(), TimeUnit.MILLISECONDS)
                .removalListener(new RemovalListener<DatanodeInfo, DatanodeInfo>() {
                    @Override
                    public void onRemoval(RemovalNotification<DatanodeInfo, DatanodeInfo> notification) {
                        DFSClient.LOG
                                .info("Removing node " + notification.getKey() + " from the excluded nodes list");
                    }
                }).build(new CacheLoader<DatanodeInfo, DatanodeInfo>() {
                    @Override
                    public DatanodeInfo load(DatanodeInfo key) throws Exception {
                        return key;
                    }
                });
        private String[] favoredNodes;
        volatile boolean hasError = false;
        volatile int errorIndex = -1;
        // Restarting node index
        AtomicInteger restartingNodeIndex = new AtomicInteger(-1);
        private long restartDeadline = 0; // Deadline of DN restart
        private BlockConstructionStage stage; // block construction stage
        private long bytesSent = 0; // number of bytes that've been sent
        private final boolean isLazyPersistFile;

        /** Nodes have been used in the pipeline before and have failed. */
        private final List<DatanodeInfo> failed = new ArrayList<DatanodeInfo>();
        /** The last ack sequence number before pipeline failure. */
        private long lastAckedSeqnoBeforeFailure = -1;
        private int pipelineRecoveryCount = 0;
        /** Has the current block been hflushed? */
        private boolean isHflushed = false;
        /** Append on an existing block? */
        private final boolean isAppend;

        final int ACK_Q_THRESHOLD_HIGH = R4HProtocol.MAX_SEND_PACKETS / 2;
        final int ACK_Q_THRESHOLD_LOW = ACK_Q_THRESHOLD_HIGH / 2;

        private final Msg headerMsg = getMsg();

        private DataStreamer(HdfsFileStatus stat, ExtendedBlock block) {
            isAppend = false;
            isLazyPersistFile = isLazyPersist(stat);
            this.block = block;
            stage = BlockConstructionStage.PIPELINE_SETUP_CREATE;
        }

        /**
         * Construct a data streamer for append
         * 
         * @param lastBlock
         *            last block of the file to be appended
         * @param stat
         *            status of the file to be appended
         * @param bytesPerChecksum
         *            number of bytes per checksum
         * @throws IOException
         *             if error occurs
         */
        private DataStreamer(LocatedBlock lastBlock, HdfsFileStatus stat, int bytesPerChecksum) throws IOException {
            isAppend = true;
            stage = BlockConstructionStage.PIPELINE_SETUP_APPEND;
            block = lastBlock.getBlock();
            bytesSent = block.getNumBytes();
            accessToken = lastBlock.getBlockToken();
            isLazyPersistFile = isLazyPersist(stat);
            long usedInLastBlock = stat.getLen() % blockSize;
            int freeInLastBlock = (int) (blockSize - usedInLastBlock);

            // calculate the amount of free space in the pre-existing
            // last crc chunk
            int usedInCksum = (int) (stat.getLen() % bytesPerChecksum);
            int freeInCksum = bytesPerChecksum - usedInCksum;

            // if there is space in the last block, then we have to
            // append to that block
            if (freeInLastBlock == blockSize) {
                throw new IOException("The last block for file " + src + " is full.");
            }

            if (usedInCksum > 0 && freeInCksum > 0) {
                // if there is space in the last partial chunk, then
                // setup in such a way that the next packet will have only
                // one chunk that fills up the partial chunk.
                //
                computePacketChunkSize(0, freeInCksum);
                setChecksumBufSize(freeInCksum);
                appendChunk = true;
            } else {
                // if the remaining space in the block is smaller than
                // that expected size of of a packet, then create
                // smaller size packet.
                //
                computePacketChunkSize(Math.min(dfsClient.getConf().getWritePacketSize(), freeInLastBlock),
                        bytesPerChecksum);
            }

            // setup pipeline to append to the last block XXX retries??
            setPipeline(lastBlock);
            errorIndex = -1; // no errors yet.
            if (nodes.length < 1) {
                throw new IOException(
                        "Unable to retrieve blocks locations " + " for last block " + block + "of file " + src);

            }
        }

        private void setPipeline(LocatedBlock lb) {
            setPipeline(lb.getLocations(), lb.getStorageTypes(), lb.getStorageIDs());
        }

        private void setPipeline(DatanodeInfo[] nodes, StorageType[] storageTypes, String[] storageIDs) {
            this.nodes = nodes;
            this.storageTypes = storageTypes;
            this.storageIDs = storageIDs;
        }

        private void setFavoredNodes(String[] favoredNodes) {
            this.favoredNodes = favoredNodes;
        }

        /**
         * Initialize for data streaming
         */
        private void initDataStreaming() {
            this.setName("DataStreamer for file " + src + " block " + block);
            stage = BlockConstructionStage.DATA_STREAMING;
        }

        private void endBlock() {
            if (DFSClient.LOG.isDebugEnabled()) {
                DFSClient.LOG.debug("Closing old block " + block);
            }
            this.setName("DataStreamer for file " + src);
            closeStream();
            setPipeline(null, null, null);
            stage = BlockConstructionStage.PIPELINE_SETUP_CREATE;
        }

        int getNumOfAcksToProcess() {
            int onFlight = ackQueue.size();
            if (dataQueue.isEmpty()) {
                return onFlight;
            } else if (onFlight >= ACK_Q_THRESHOLD_HIGH) {
                return onFlight - ACK_Q_THRESHOLD_LOW;
            } else {
                return 0;
            }
        }

        /*
         * streamer thread is the only thread that opens streams to datanode,
         * and closes them. Any error recovery is also done by this thread.
         */
        @Override
        public void run() {
            boolean waslpib = false;
            if (LOG.isTraceEnabled()) {
                LOG.trace("Started running streamer with msg pool " + DFSOutputStream.this.msgPool);
            }

            if (toPrintBreakdown) {
                long now2 = System.nanoTime();
                LOG.info(String.format("%.3f", (float) now2 / 1000000000.) + ", "
                        + (now2 - lastOperationTS) / 1000000000. + " : Started streamer.");
                synchronized (lastOperationTSLock) {
                    lastOperationTS = now2;
                }
            }

            long lastPacket = Time.now();
            TraceScope traceScope = NullScope.INSTANCE;

            while (!streamerClosed && dfsClient.clientRunning) {

                DFSPacket one = null;

                try {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Entering streamer main loop...");
                    }
                    // process datanode IO errors if any
                    boolean doSleep = false;
                    if (hasError && errorIndex >= 0) {
                        doSleep = processDatanodeError();
                    }

                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Before streamer's inner while loop: |dataQ|=" + dataQueue.size() + ", |ackQ|="
                                + ackQueue.size());
                        LOG.trace("streamerClosed:" + streamerClosed + ", hasError:" + hasError
                                + ", dfsClient.clientRunning:" + dfsClient.clientRunning + ", doSleep:" + doSleep);
                    }

                    while ((!streamerClosed && !hasError && dfsClient.clientRunning
                            && ((dataQueue.isEmpty()) || (ackQueue.size() >= ACK_Q_THRESHOLD_HIGH))) || doSleep) {

                        int events = getNumOfAcksToProcess();

                        if (events > 0) {
                            DFSOutputStream.this.eventQHandler.runEventLoop(events, 10 * 1000 * 1000);
                            if (this.currentCSCallbacks.errorFlowInTheMiddle) {
                                throw new IOException(
                                        "Error in message/session while running streamer, client cannot continue.");
                            }
                        } else {
                            synchronized (dataQueue) {
                                if (dataQueue.isEmpty() && !streamerClosed && !hasError
                                        && dfsClient.clientRunning) {
                                    // dataQ&ackQ are empty - going to wait on dataQ for user to insert new packet
                                    dataQueue.wait();
                                }
                            }
                        }
                    }

                    if (streamerClosed || hasError || !dfsClient.clientRunning) {
                        continue;
                    }
                    // get packet to be sent.
                    if (!(dataQueue.isEmpty())) {
                        one = dataQueue.peek(); // regular data packet

                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Fetched a packet from dataQ.");
                        }
                    } else {
                        continue;
                    }

                    assert one != null;

                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Inside our streamer.run(), get new block");
                    }
                    // get new block from namenode.
                    if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Allocating new block");
                        }

                        if (toPrintBreakdown) {
                            long now2 = System.nanoTime();
                            LOG.info(String.format("%.3f", (float) now2 / 1000000000.) + ", "
                                    + (now2 - lastOperationTS) / 1000000000.
                                    + " : Before call to nextBlockOutputStream.");
                            synchronized (lastOperationTSLock) {
                                lastOperationTS = now2;
                            }
                        }

                        setPipeline(nextBlockOutputStream());
                        initDataStreaming();
                    } else if (stage == BlockConstructionStage.PIPELINE_SETUP_APPEND) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Append to block " + block);
                        }
                        setupPipelineForAppendOrRecovery();
                        initDataStreaming();
                    }

                    if (one.lastPacketInBlock) {
                        // wait for all sent packet acks
                        while (!ackQueue.isEmpty()) {
                            DFSOutputStream.this.eventQHandler.runEventLoop(ackQueue.size(), 10 * 1000 * 1000);
                            if (this.currentCSCallbacks.errorFlowInTheMiddle) {
                                throw new IOException(
                                        "Error in message/session while running streamer, client cannot continue.");
                            }
                        }
                    }

                    long lastByteOffsetInBlock = one.getLastByteOffsetBlock();
                    if (lastByteOffsetInBlock > blockSize) {
                        throw new IOException("BlockSize " + blockSize + " is smaller than data size. "
                                + " Offset of packet in block " + lastByteOffsetInBlock + " Aborting file " + src);
                    }

                    if (LOG.isDebugEnabled()) {
                        LOG.debug("DataStreamer block " + block + " sending packet " + one);
                    }

                    // write out data to remote datanode
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Inside our streamer.run(), send packt to DN");
                    }

                    long now0 = System.nanoTime();
                    long now1 = System.nanoTime();
                    PacketHeader header = one.prepareBeforeSend();
                    one.msg.setUserContext(header);
                    R4HProtocol.wrappedSendRequest(DFSOutputStream.this.currentClientSession, one.msg, LOG);
                    long now2 = System.nanoTime();
                    sentSeqenceNum = one.seqno;

                    // sent the packet, handle the queues:
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Inside our streamer.run(), move packt from dataQ to ackQ");
                        LOG.trace("Before move: |dataQ|=" + dataQueue.size() + ", |ackQ|=" + ackQueue.size());
                    }
                    // move packet from dataQueue to ackQueue
                    if (!one.isHeartbeatPacket()) {
                        dataQueue.poll();
                        ackQueue.add(one);
                        synchronized (dataQueue) {
                            dataQueue.notifyAll();
                        }
                    }
                    lastPacket = Time.now();

                    if (one.isHeartbeatPacket()) { // heartbeat packet
                        LOG.warn(DFSOutputStream.this.toString() + "Unexpectedly bumped into heartbeat packet.");
                    }

                    // update bytesSent
                    long tmpBytesSent = one.getLastByteOffsetBlock();
                    if (bytesSent < tmpBytesSent) {
                        bytesSent = tmpBytesSent;
                    }

                    if (streamerClosed || hasError || !dfsClient.clientRunning) {
                        continue;
                    }

                    // Is this block full?
                    if (one.lastPacketInBlock) {
                        // wait for the close packet has been acked

                        while (!hasError && !this.currentCSCallbacks.wasLastPacketAcked
                                && dfsClient.clientRunning) {
                            DFSOutputStream.this.eventQHandler.runEventLoop(1, 1000 * 1000);
                            if (this.currentCSCallbacks.errorFlowInTheMiddle) {
                                throw new IOException(
                                        "Error in message/session while waiting for last packet ack in streamer, client cannot continue.");
                            }
                        }
                        waslpib = this.currentCSCallbacks.wasLastPacketAcked;
                        this.currentCSCallbacks.wasLastPacketAcked = false; // reset for next block

                        if (streamerClosed || hasError || !dfsClient.clientRunning) {
                            continue;
                        }

                        endBlock();
                    }

                } catch (Throwable e) {
                    // Log warning if there was a real error.
                    if (restartingNodeIndex.get() == -1) {
                        DFSClient.LOG.warn("DataStreamer Exception", e);
                    }
                    if (e instanceof IOException) {
                        setLastException((IOException) e);
                    } else {
                        setLastException(new IOException("DataStreamer Exception: ", e));
                    }
                    hasError = true;
                    if (errorIndex == -1 && restartingNodeIndex.get() == -1) {
                        // Not a datanode issue
                        streamerClosed = true;
                    }
                }
            }
            closeInternal();
        }

        private void closeInternal() {
            if (this.currentCSCallbacks != null && !this.currentCSCallbacks.isSessionClosed) {
                long start = System.nanoTime();
                DFSOutputStream.this.eventQHandler.runEventLoop(1, CLOSE_WAIT_TIMEOUT_IN_USEC);
                long durationUsec = (System.nanoTime() - start) / 1000;
                if (!this.currentCSCallbacks.isSessionClosed) {
                    LOG.error("Did not receive client session closed event");
                }
                if (durationUsec >= CLOSE_WAIT_TIMEOUT_IN_USEC) {
                    LOG.warn(String.format("Spent %d milliSec waiting for session close", durationUsec / 1000));
                }
            }
            closeStream();
            streamerClosed = true;
            closed = true;
            if (LOG.isTraceEnabled()) {
                LOG.trace("closing streamer");
            }
            DFSOutputStream.this.returnMsgToPool(headerMsg);
            synchronized (dataQueue) {
                dataQueue.notifyAll();
            }
            synchronized (ackQueue) {
                ackQueue.notifyAll();
            }
        }

        /*
         * close both streamer and DFSOutputStream, should be called only
         * by an external thread and only after all data to be sent has
         * been flushed to datanode.
         * 
         * Interrupt this data streamer if force is true
         * 
         * @param force if this data stream is forced to be closed
         */
        void close(boolean force) {
            streamerClosed = true;
            synchronized (dataQueue) {
                dataQueue.notifyAll();
            }
            if (force) {
                this.interrupt();
            }
        }

        private void closeStream() {
            if (blockStream != null) {
                try {
                    blockStream.close();
                } catch (IOException e) {
                    setLastException(e);
                } finally {
                    blockStream = null;
                }
            }
            if (blockReplyStream != null) {
                try {
                    blockReplyStream.close();
                } catch (IOException e) {
                    setLastException(e);
                } finally {
                    blockReplyStream = null;
                }
            }
            if (null != s) {
                try {
                    s.close();
                } catch (IOException e) {
                    setLastException(e);
                } finally {
                    s = null;
                }
            }
        }

        // The following synchronized methods are used whenever
        // errorIndex or restartingNodeIndex is set. This is because
        // check & set needs to be atomic. Simply reading variables
        // does not require a synchronization. When responder is
        // not running (e.g. during pipeline recovery), there is no
        // need to use these methods.

        /** Set the error node index. Called by responder */
        synchronized void setErrorIndex(int idx) {
            errorIndex = idx;
        }

        /** Set the restarting node index. Called by responder */
        synchronized void setRestartingNodeIndex(int idx) {
            restartingNodeIndex.set(idx);
            // If the data streamer has already set the primary node
            // bad, clear it. It is likely that the write failed due to
            // the DN shutdown. Even if it was a real failure, the pipeline
            // recovery will take care of it.
            errorIndex = -1;
        }

        /**
         * This method is used when no explicit error report was received,
         * but something failed. When the primary node is a suspect or
         * unsure about the cause, the primary node is marked as failed.
         */
        synchronized void tryMarkPrimaryDatanodeFailed() {
            // There should be no existing error and no ongoing restart.
            if ((errorIndex == -1) && (restartingNodeIndex.get() == -1)) {
                errorIndex = 0;
            }
        }

        /**
         * Examine whether it is worth waiting for a node to restart.
         * 
         * @param index
         *            the node index
         */
        boolean shouldWaitForRestart(int index) {
            // Only one node in the pipeline.
            if (nodes.length == 1) {
                return true;
            }

            // Is it a local node?
            InetAddress addr = null;
            try {
                addr = InetAddress.getByName(nodes[index].getIpAddr());
            } catch (java.net.UnknownHostException e) {
                // we are passing an ip address. this should not happen.
                assert false;
            }

            if (addr != null && NetUtils.isLocalAddress(addr)) {
                return true;
            }
            return false;
        }

        // If this stream has encountered any errors so far, shutdown
        // threads and mark stream as closed. Returns true if we should
        // sleep for a while after returning from this call.
        //
        private boolean processDatanodeError() throws IOException {
            closeStream();

            // move packets from ack queue to front of the data queue
            dataQueue.addAll(ackQueue);
            ackQueue.clear();

            // Record the new pipeline failure recovery.
            if (lastAckedSeqnoBeforeFailure != lastAckedSeqno) {
                lastAckedSeqnoBeforeFailure = lastAckedSeqno;
                pipelineRecoveryCount = 1;
            } else {
                // If we had to recover the pipeline five times in a row for the
                // same packet, this client likely has corrupt data or corrupting
                // during transmission.
                if (++pipelineRecoveryCount > 5) {
                    DFSClient.LOG.warn("Error recovering pipeline for writing " + block
                            + ". Already retried 5 times for the same packet.");
                    lastException.set(new IOException(
                            "Failing write. Tried pipeline " + "recovery 5 times without success."));
                    streamerClosed = true;
                    return false;
                }
            }
            boolean doSleep = setupPipelineForAppendOrRecovery();

            if (!streamerClosed && dfsClient.clientRunning) {
                if (stage == BlockConstructionStage.PIPELINE_CLOSE) {

                    // If we had an error while closing the pipeline, we go through a fast-path
                    // where the BlockReceiver does not run. Instead, the DataNode just finalizes
                    // the block immediately during the 'connect ack' process. So, we want to pull
                    // the end-of-block packet from the dataQueue, since we don't actually have
                    // a true pipeline to send it over.
                    //
                    // We also need to set lastAckedSeqno to the end-of-block Packet's seqno, so that
                    // a client waiting on close() will be aware that the flush finished.
                    DFSPacket endOfBlockPacket = dataQueue.remove(); // remove the end of block packet
                    assert endOfBlockPacket.lastPacketInBlock;
                    assert lastAckedSeqno == endOfBlockPacket.seqno - 1;
                    synchronized (ackQueue) {
                        lastAckedSeqno = endOfBlockPacket.seqno;
                        ackQueue.notifyAll();
                    }
                    synchronized (dataQueue) {
                        dataQueue.notifyAll();
                    }
                    endBlock();
                } else {
                    initDataStreaming();
                }
            }

            return doSleep;
        }

        private void setHflush() {
            isHflushed = true;
        }

        private int findNewDatanode(final DatanodeInfo[] original) throws IOException {
            if (nodes.length != original.length + 1) {
                throw new IOException(new StringBuilder()
                        .append("Failed to replace a bad datanode on the existing pipeline ")
                        .append("due to no more good datanodes being available to try. ").append("(Nodes: current=")
                        .append(Arrays.asList(nodes)).append(", original=").append(Arrays.asList(original))
                        .append("). ").append("The current failed datanode replacement policy is ")
                        .append(dfsClient.dtpReplaceDatanodeOnFailure).append(", and ")
                        .append("a client may configure this via '")
                        .append(DFSConfigKeys.DFS_CLIENT_WRITE_REPLACE_DATANODE_ON_FAILURE_POLICY_KEY)
                        .append("' in its configuration.").toString());
            }
            for (int i = 0; i < nodes.length; i++) {
                int j = 0;
                for (; j < original.length && !nodes[i].equals(original[j]); j++)
                    ;
                if (j == original.length) {
                    return i;
                }
            }
            throw new IOException("Failed: new datanode not found: nodes=" + Arrays.asList(nodes) + ", original="
                    + Arrays.asList(original));
        }

        private void addDatanode2ExistingPipeline() throws IOException {
            if (DataTransferProtocol.LOG.isDebugEnabled()) {
                DataTransferProtocol.LOG.debug("lastAckedSeqno = " + lastAckedSeqno);
            }
            /*
             * Is data transfer necessary? We have the following cases.
             * 
             * Case 1: Failure in Pipeline Setup
             * - Append
             * + Transfer the stored replica, which may be a RBW or a finalized.
             * - Create
             * + If no data, then no transfer is required.
             * + If there are data written, transfer RBW. This case may happens
             * when there are streaming failure earlier in this pipeline.
             * 
             * Case 2: Failure in Streaming
             * - Append/Create:
             * + transfer RBW
             * 
             * Case 3: Failure in Close
             * - Append/Create:
             * + no transfer, let NameNode replicates the block.
             */
            if (!isAppend && lastAckedSeqno < 0 && stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
                // no data have been written
                return;
            } else if (stage == BlockConstructionStage.PIPELINE_CLOSE
                    || stage == BlockConstructionStage.PIPELINE_CLOSE_RECOVERY) {
                // pipeline is closing
                return;
            }

            // get a new datanode
            final DatanodeInfo[] original = nodes;
            final LocatedBlock lb = dfsClient.namenode.getAdditionalDatanode(src, fileId, block, nodes, storageIDs,
                    failed.toArray(new DatanodeInfo[failed.size()]), 1, dfsClient.clientName);
            setPipeline(lb);

            // find the new datanode
            final int d = findNewDatanode(original);

            // transfer replica
            final DatanodeInfo src = d == 0 ? nodes[1] : nodes[d - 1];
            final DatanodeInfo[] targets = { nodes[d] };
            final StorageType[] targetStorageTypes = { storageTypes[d] };
            transfer(src, targets, targetStorageTypes, lb.getBlockToken());
        }

        private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
                final StorageType[] targetStorageTypes, final Token<BlockTokenIdentifier> blockToken)
                throws IOException {
            // transfer replica to the new datanode
            Socket sock = null;
            DataOutputStream out = null;
            DataInputStream in = null;
            try {
                sock = createSocketForPipeline(src, 2, dfsClient);
                final long writeTimeout = dfsClient.getDatanodeWriteTimeout(2);

                OutputStream unbufOut = NetUtils.getOutputStream(sock, writeTimeout);
                InputStream unbufIn = NetUtils.getInputStream(sock);
                IOStreamPair saslStreams = dfsClient.saslClient.socketSend(sock, unbufOut, unbufIn, dfsClient,
                        blockToken, src);
                unbufOut = saslStreams.out;
                unbufIn = saslStreams.in;
                out = new DataOutputStream(new BufferedOutputStream(unbufOut, HdfsConstants.SMALL_BUFFER_SIZE));
                in = new DataInputStream(unbufIn);

                // send the TRANSFER_BLOCK request
                new Sender(out).transferBlock(block, blockToken, dfsClient.clientName, targets, targetStorageTypes);
                out.flush();

                // ack
                BlockOpResponseProto response = BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));
                if (SUCCESS != response.getStatus()) {
                    throw new IOException("Failed to add a datanode");
                }
            } finally {
                IOUtils.closeStream(in);
                IOUtils.closeStream(out);
                IOUtils.closeSocket(sock);
            }
        }

        /**
         * Open a DataOutputStream to a DataNode pipeline so that
         * it can be written to.
         * This happens when a file is appended or data streaming fails
         * It keeps on trying until a pipeline is setup
         */
        private boolean setupPipelineForAppendOrRecovery() throws IOException {
            // check number of datanodes
            if (nodes == null || nodes.length == 0) {
                String msg = "Could not get block locations. " + "Source file \"" + src + "\" - Aborting...";
                DFSClient.LOG.warn(msg);
                setLastException(new IOException(msg));
                streamerClosed = true;
                return false;
            }

            boolean success = false;
            long newGS = 0L;
            while (!success && !streamerClosed && dfsClient.clientRunning) {
                // Sleep before reconnect if a dn is restarting.
                // This process will be repeated until the deadline or the datanode
                // starts back up.
                if (restartingNodeIndex.get() >= 0) {
                    // 4 seconds or the configured deadline period, whichever is shorter.
                    // This is the retry interval and recovery will be retried in this
                    // interval until timeout or success.
                    long delay = Math.min(dfsClient.getConf().getDatanodeRestartTimeout(), 4000L);
                    try {
                        Thread.sleep(delay);
                    } catch (InterruptedException ie) {
                        lastException.set(new IOException("Interrupted while waiting for " + "datanode to restart. "
                                + nodes[restartingNodeIndex.get()]));
                        streamerClosed = true;
                        return false;
                    }
                }
                boolean isRecovery = hasError;
                // remove bad datanode from list of datanodes.
                // If errorIndex was not set (i.e. appends), then do not remove
                // any datanodes
                //
                if (errorIndex >= 0) {
                    StringBuilder pipelineMsg = new StringBuilder();
                    for (int j = 0; j < nodes.length; j++) {
                        pipelineMsg.append(nodes[j]);
                        if (j < nodes.length - 1) {
                            pipelineMsg.append(", ");
                        }
                    }
                    if (nodes.length <= 1) {
                        lastException
                                .set(new IOException("All datanodes " + pipelineMsg + " are bad. Aborting..."));
                        streamerClosed = true;
                        return false;
                    }
                    DFSClient.LOG.warn("Error Recovery for block " + block + " in pipeline " + pipelineMsg
                            + ": bad datanode " + nodes[errorIndex]);
                    failed.add(nodes[errorIndex]);

                    DatanodeInfo[] newnodes = new DatanodeInfo[nodes.length - 1];
                    arraycopy(nodes, newnodes, errorIndex);

                    final StorageType[] newStorageTypes = new StorageType[newnodes.length];
                    arraycopy(storageTypes, newStorageTypes, errorIndex);

                    final String[] newStorageIDs = new String[newnodes.length];
                    arraycopy(storageIDs, newStorageIDs, errorIndex);

                    setPipeline(newnodes, newStorageTypes, newStorageIDs);

                    // Just took care of a node error while waiting for a node restart
                    if (restartingNodeIndex.get() >= 0) {
                        // If the error came from a node further away than the restarting
                        // node, the restart must have been complete.
                        if (errorIndex > restartingNodeIndex.get()) {
                            restartingNodeIndex.set(-1);
                        } else if (errorIndex < restartingNodeIndex.get()) {
                            // the node index has shifted.
                            restartingNodeIndex.decrementAndGet();
                        } else {
                            // this shouldn't happen...
                            assert false;
                        }
                    }

                    if (restartingNodeIndex.get() == -1) {
                        hasError = false;
                    }
                    lastException.set(null);
                    errorIndex = -1;
                }

                // Check if replace-datanode policy is satisfied.
                if (dfsClient.dtpReplaceDatanodeOnFailure.satisfy(blockReplication, nodes, isAppend, isHflushed)) {
                    try {
                        addDatanode2ExistingPipeline();
                    } catch (IOException ioe) {
                        if (!dfsClient.dtpReplaceDatanodeOnFailure.isBestEffort()) {
                            throw ioe;
                        }
                        DFSClient.LOG.warn(
                                "Failed to replace datanode." + " Continue with the remaining datanodes since "
                                        + DFSConfigKeys.DFS_CLIENT_WRITE_REPLACE_DATANODE_ON_FAILURE_BEST_EFFORT_KEY
                                        + " is set to true.",
                                ioe);
                    }
                }

                // get a new generation stamp and an access token
                LocatedBlock lb = dfsClient.namenode.updateBlockForPipeline(block, dfsClient.clientName);
                newGS = lb.getBlock().getGenerationStamp();
                accessToken = lb.getBlockToken();

                // set up the pipeline again with the remaining nodes
                if (failPacket) { // for testing
                    success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery);
                    failPacket = false;
                    try {
                        // Give DNs time to send in bad reports. In real situations,
                        // good reports should follow bad ones, if client committed
                        // with those nodes.
                        Thread.sleep(2000);
                    } catch (InterruptedException ie) {
                    }
                } else {
                    success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery);
                }

                if (restartingNodeIndex.get() >= 0) {
                    assert hasError == true;
                    // check errorIndex set above
                    if (errorIndex == restartingNodeIndex.get()) {
                        // ignore, if came from the restarting node
                        errorIndex = -1;
                    }
                    // still within the deadline
                    if (Time.monotonicNow() < restartDeadline) {
                        continue; // with in the deadline
                    }
                    // expired. declare the restarting node dead
                    restartDeadline = 0;
                    int expiredNodeIndex = restartingNodeIndex.get();
                    restartingNodeIndex.set(-1);
                    DFSClient.LOG.warn("Datanode did not restart in time: " + nodes[expiredNodeIndex]);
                    // Mark the restarting node as failed. If there is any other failed
                    // node during the last pipeline construction attempt, it will not be
                    // overwritten/dropped. In this case, the restarting node will get
                    // excluded in the following attempt, if it still does not come up.
                    if (errorIndex == -1) {
                        errorIndex = expiredNodeIndex;
                    }
                    // From this point on, normal pipeline recovery applies.
                }
            } // while

            if (success) {
                // update pipeline at the namenode
                ExtendedBlock newBlock = new ExtendedBlock(block.getBlockPoolId(), block.getBlockId(),
                        block.getNumBytes(), newGS);
                dfsClient.namenode.updatePipeline(dfsClient.clientName, block, newBlock, nodes, storageIDs);
                // update client side generation stamp
                block = newBlock;
            }
            return false; // do not sleep, continue processing
        }

        /**
         * Open a DataOutputStream to a DataNode so that it can be written to.
         * This happens when a file is created and each time a new block is allocated.
         * Must get block ID and the IDs of the destinations from the namenode.
         * Returns the list of target datanodes.
         */
        private LocatedBlock nextBlockOutputStream() throws IOException {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Entered nextBlockOutputStream");
            }

            LocatedBlock lb = null;
            DatanodeInfo[] nodes = null;
            StorageType[] storageTypes = null;
            int count = dfsClient.getConf().getnBlockWriteRetry();
            boolean success = false;
            ExtendedBlock oldBlock = block;
            do {
                hasError = false;
                lastException.set(null);
                errorIndex = -1;
                success = false;

                long startTime = Time.now();
                DatanodeInfo[] excluded = excludedNodes.getAllPresent(excludedNodes.asMap().keySet()).keySet()
                        .toArray(new DatanodeInfo[0]);
                block = oldBlock;
                lb = locateFollowingBlock(excluded.length > 0 ? excluded : null);
                block = lb.getBlock();
                block.setNumBytes(0);
                bytesSent = 0;
                accessToken = lb.getBlockToken();
                nodes = lb.getLocations();
                storageTypes = lb.getStorageTypes();

                //
                // Connect to first DataNode in the list.
                //
                if (toPrintBreakdown) {
                    long now2 = System.nanoTime();
                    LOG.info(String.format("%.3f", (float) now2 / 1000000000.) + ", "
                            + (now2 - lastOperationTS) / 1000000000.
                            + " : Before call to createBlockOutputStream.");
                    synchronized (lastOperationTSLock) {
                        lastOperationTS = now2;
                    }
                }

                success = createBlockOutputStream(nodes, storageTypes, 0L, false);

                if (!success) {
                    DFSOutputStream.LOG.info(DFSOutputStream.this.toString() + " Abandoning " + block);
                    dfsClient.namenode.abandonBlock(block, fileId, src, dfsClient.clientName);
                    block = null;
                    DFSOutputStream.LOG
                            .info(DFSOutputStream.this.toString() + "Excluding datanode " + nodes[errorIndex]);
                    excludedNodes.put(nodes[errorIndex], nodes[errorIndex]);
                }
            } while (!success && --count >= 0);

            if (!success) {
                throw new IOException("Unable to create new block.");
            }
            return lb;
        }

        /*
         * If client session hasn't been established yet, run the event loop and check whether we encountered an error flow.
         * 
         * @param clientSessionCallbacks
         * The callbacks that hold the client session.
         * 
         * @param eventLoopRunDurationUsec
         * For how many uSeconds to run the event loop.
         * 
         * @return
         * false if encountered error flow, true otherwise.
         */
        private boolean runEventLoopAndCheckErrorFlow(int eventLoopRunDurationUsec) {
            if (!this.currentCSCallbacks.wasSessionEstablished) {
                DFSOutputStream.this.eventQHandler.runEventLoop(1, eventLoopRunDurationUsec);
                if (this.currentCSCallbacks.errorFlowInTheMiddle) {
                    LOG.error(
                            "Error in message/session while waiting for session establishment, client cannot continue.");
                    return false;
                }
            }
            return true;
        }

        // connects to the first datanode in the pipeline
        // Returns true if success, otherwise return failure.
        //
        private boolean createBlockOutputStream(DatanodeInfo[] nodes, StorageType[] nodeStorageTypes, long newGS,
                boolean recoveryFlag) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Inside our createBlockOutputStream()");
            }

            if (nodes.length == 0) {
                LOG.info("nodes are empty for write pipeline of block " + block);
                return false;
            }

            boolean checkRestart = false;
            if (LOG.isDebugEnabled()) {
                for (int i = 0; i < nodes.length; i++) {
                    LOG.debug("pipeline = " + nodes[i]);
                }
            }

            // persist blocks on namenode on next flush
            persistBlocks.set(true);

            int refetchEncryptionKey = 1;
            while (true) {
                boolean result = false;
                DataOutputStream out = null;
                try {
                    assert null == blockReplyStream : "Previous blockReplyStream unclosed";
                    long writeTimeout = dfsClient.getDatanodeWriteTimeout(nodes.length);
                    DFSOutputStream.this.nextDnName = nodes[0].getName();
                    String hash = R4HProtocol.createSessionHash();
                    DFSOutputStream.this.name = String.format("[hash=%s] ", hash);

                    URI uri;
                    try {
                        uri = R4HProtocol.createInitialURI(nodes, hash);
                    } catch (URISyntaxException e) {
                        LOG.error(DFSOutputStream.this.toString() + "Failed on uri creation:" + e);
                        return false;
                    }

                    if (toPrintBreakdown) {
                        long now = System.nanoTime();
                        LOG.info(DFSOutputStream.this.toString() + String.format("%.3f", (float) now / 1000000000.)
                                + ", " + (now - lastOperationTS) / 1000000000.
                                + " : Going to establish client session.");
                        synchronized (lastOperationTSLock) {
                            lastOperationTS = now;
                        }
                    }

                    // Close/Clean old session
                    if ((DFSOutputStream.this.currentClientSession != null)
                            && !DFSOutputStream.this.currentClientSession.getIsClosing()) {
                        this.currentCSCallbacks.closeEventExpected = true;
                        currentClientSession.close();
                        currentClientSession = null;
                    }

                    this.currentCSCallbacks = new CSCallbacks();

                    if (LOG.isDebugEnabled()) {
                        LOG.debug(DFSOutputStream.this.toString() + String.format("Connecting to %s", uri));
                    }

                    DFSOutputStream.this.currentClientSession = new ClientSession(eventQHandler, uri,
                            this.currentCSCallbacks);
                    this.currentCSCallbacks.setClientSession(DFSOutputStream.this.currentClientSession);

                    final int eventLoopRunDurationUsec = 1000;
                    if (!runEventLoopAndCheckErrorFlow(eventLoopRunDurationUsec)) {
                        return false;
                    }

                    //
                    // Xmit header info to datanode
                    //
                    headerMsg.resetPositions();
                    out = new DataOutputStream(
                            new BufferedOutputStream(new ByteBufferOutputStream(headerMsg.getOut()), 512));
                    if (!runEventLoopAndCheckErrorFlow(eventLoopRunDurationUsec)) {
                        return false;
                    }

                    if (toPrintBreakdown) {
                        long now = System.nanoTime();
                        LOG.info(String.format("%.3f", (float) now / 1000000000.) + ", "
                                + (now - lastOperationTS) / 1000000000. + " : After get msg from pool + new DOS.");
                        synchronized (lastOperationTSLock) {
                            lastOperationTS = now;
                        }
                    }

                    if (dfsClient.shouldEncryptData()) {
                        throw new UnsupportedActionException(
                                "Encryption is not supported yet in RDFS. Please disable it in conf.");
                    }
                    if (toPrintBreakdown) {
                        long now = System.nanoTime();
                        LOG.info(String.format("%.3f", (float) now / 1000000000.) + ", "
                                + (now - lastOperationTS) / 1000000000.
                                + " : Going to write block with protoBuff.");
                        synchronized (lastOperationTSLock) {
                            lastOperationTS = now;
                        }
                    }

                    boolean[] targetPinnings = getPinnings(nodes, true);
                    // send the request
                    new Sender(out).writeBlock(block, nodeStorageTypes[0], accessToken, dfsClient.clientName, nodes,
                            nodeStorageTypes, null, recoveryFlag ? stage.getRecoveryStage() : stage, nodes.length,
                            block.getNumBytes(), bytesSent, newGS, checksum, cachingStrategy.get(),
                            isLazyPersistFile, (targetPinnings == null ? false : targetPinnings[0]),
                            targetPinnings);

                    if (toPrintBreakdown) {
                        long now = System.nanoTime();
                        LOG.info(String.format("%.3f", (float) now / 1000000000.) + ", "
                                + (now - lastOperationTS) / 1000000000. + " : After write block protobuf.");
                        synchronized (lastOperationTSLock) {
                            lastOperationTS = now;
                        }
                    }

                    out.flush();
                    if (!runEventLoopAndCheckErrorFlow(eventLoopRunDurationUsec)) {
                        return false;
                    }

                    if (toPrintBreakdown) {
                        long now = System.nanoTime();
                        LOG.info(String.format("%.3f", (float) now / 1000000000.) + ", "
                                + (now - lastOperationTS) / 1000000000.
                                + " : After protoBuff.flush , before sendMessage.");
                        synchronized (lastOperationTSLock) {
                            lastOperationTS = now;
                        }
                    }

                    long now1 = System.nanoTime();
                    R4HProtocol.wrappedSendRequest(DFSOutputStream.this.currentClientSession, headerMsg, LOG);
                    long now2 = System.nanoTime();

                    if (!runEventLoopAndCheckErrorFlow(eventLoopRunDurationUsec)) {
                        return false;
                    }

                    if (toPrintBreakdown) {
                        long now = System.nanoTime();
                        LOG.info(String.format("%.3f", (float) now / 1000000000.) + ", "
                                + (now - lastOperationTS) / 1000000000. + " : Sent header to datanode.");
                        synchronized (lastOperationTSLock) {
                            lastOperationTS = now;
                        }
                    }

                    // Now we run the event queue handler. Ack/nack will be handled in the respective callbacks of the client
                    // session.
                    long startWaitingForAckTSMicro;
                    long endWaitingForAckTSMicro;
                    long timeToWaitForHeaderAckUsec = headerAckTimeoutUsec;

                    while (!this.currentCSCallbacks.wasHeaderAckReceived && !this.currentCSCallbacks.didHeaderFail
                            && (timeToWaitForHeaderAckUsec > 0)) {
                        startWaitingForAckTSMicro = System.nanoTime() / 1000;
                        DFSOutputStream.this.eventQHandler.runEventLoop(1, timeToWaitForHeaderAckUsec);
                        endWaitingForAckTSMicro = System.nanoTime() / 1000;
                        timeToWaitForHeaderAckUsec -= (endWaitingForAckTSMicro - startWaitingForAckTSMicro);

                        if (this.currentCSCallbacks.errorFlowInTheMiddle) {
                            LOG.error(String.format(
                                    "Error in message/session while waiting for header ack, client cannot continue. %s",
                                    name));
                            return false;
                        }
                    }

                    if (!this.currentCSCallbacks.wasHeaderAckReceived) {
                        this.currentCSCallbacks.errorFlowInTheMiddle = true;
                        LOG.error(String.format(
                                "Waited for header ack longer than %d seconds, client cannot continue. %s",
                                headerAckTimeoutUsec / 1000000, name));
                        return false;
                    }

                    if (this.currentCSCallbacks.didHeaderFail) {
                        DFSOutputStream.LOG.warn(DFSOutputStream.this.toString() + String.format(
                                "Header failed (packets arrived=%d, nodes.len=%d, errorIndex=%d, firstBadLink.length=%d)",
                                this.currentCSCallbacks.countPacketArrived, nodes.length, errorIndex,
                                this.currentCSCallbacks.firstBadLink.length()));
                        this.currentCSCallbacks.currResult = false;
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format(
                                "After waiting for header ACK: didHeaderFail=%S, currResult=%s, wasHeaderAckReceived=%s",
                                this.currentCSCallbacks.didHeaderFail ? "TRUE" : "FALSE",
                                this.currentCSCallbacks.currResult ? "TRUE" : "FALSE",
                                this.currentCSCallbacks.wasHeaderAckReceived ? "TRUE" : "FALSE"));
                    }

                    DFSOutputStream.LOG.debug(DFSOutputStream.this.toString() + String.format(
                            "createBlockOutputStream retunrs with: currResult=%b, wasHeaderAckReceived=%b",
                            this.currentCSCallbacks.currResult, this.currentCSCallbacks.wasHeaderAckReceived));
                    return this.currentCSCallbacks.currResult && this.currentCSCallbacks.wasHeaderAckReceived;

                } catch (IOException ie) {
                    if (restartingNodeIndex.get() == -1) {
                        LOG.info("Exception in createBlockOutputStream", ie);
                    }
                    if (ie instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
                        LOG.info("Will fetch a new encryption key and retry, "
                                + "encryption key was invalid when connecting to " + nodes[0] + " : " + ie);
                        // The encryption key used is invalid.
                        refetchEncryptionKey--;
                        dfsClient.clearDataEncryptionKey();
                        // Don't close the socket/exclude this node just yet. Try again with
                        // a new encryption key.
                        continue;
                    }

                    updateBadLinkAndErrorIndex(nodes, this.currentCSCallbacks, checkRestart);

                    // Check whether there is a restart worth waiting for.
                    if (checkRestart && shouldWaitForRestart(errorIndex)) {
                        restartDeadline = dfsClient.getConf().getDatanodeRestartTimeout() + Time.now();
                        restartingNodeIndex.set(errorIndex);
                        errorIndex = -1;
                        LOG.info("Waiting for the datanode to be restarted: " + nodes[restartingNodeIndex.get()]);
                    }
                    hasError = true;
                    setLastException(ie);
                    result = false; // error
                } finally {
                    if (!result) {
                        updateBadLinkAndErrorIndex(nodes, this.currentCSCallbacks, checkRestart);
                        IOUtils.closeStream(out);
                        out = null;
                        IOUtils.closeStream(blockReplyStream);
                        blockReplyStream = null;
                    }
                }
                if (DFSOutputStream.LOG.isDebugEnabled())
                    DFSOutputStream.LOG.debug(DFSOutputStream.this.toString()
                            + String.format("createBlockOutputStream returns with: result=%b)", result));

                return result;
            }
        }

        /**
         * Updates bad DN, if exists and error index, if needed.
         */
        private void updateBadLinkAndErrorIndex(DatanodeInfo[] nodes, CSCallbacks clientSessionCallbacks,
                boolean checkRestart) {
            // find the datanode that matches
            if (clientSessionCallbacks.firstBadLink.length() != 0) {
                for (int i = 0; i < nodes.length; i++) {
                    // NB: Unconditionally using the xfer addr w/o hostname
                    if (clientSessionCallbacks.firstBadLink.equals(nodes[i].getXferAddr())) {
                        errorIndex = i;
                        break;
                    }
                }
            } else {
                assert checkRestart == false;
                errorIndex = 0;
            }
        }

        private boolean[] getPinnings(DatanodeInfo[] nodes, boolean shouldLog) {
            if (favoredNodes == null) {
                return null;
            } else {
                boolean[] pinnings = new boolean[nodes.length];
                HashSet<String> favoredSet = new HashSet<String>(Arrays.asList(favoredNodes));
                for (int i = 0; i < nodes.length; i++) {
                    pinnings[i] = favoredSet.remove(nodes[i].getXferAddrWithHostname());
                    if (DFSClient.LOG.isDebugEnabled()) {
                        DFSClient.LOG.debug(nodes[i].getXferAddrWithHostname()
                                + " was chosen by name node (favored=" + pinnings[i] + ").");
                    }
                }
                if (shouldLog && !favoredSet.isEmpty()) {
                    // There is one or more favored nodes that were not allocated.
                    DFSClient.LOG.warn("These favored nodes were specified but not chosen: " + favoredSet
                            + " Specified favored nodes: " + Arrays.toString(favoredNodes));

                }
                return pinnings;
            }
        }

        private LocatedBlock locateFollowingBlock(DatanodeInfo[] excludedNodes) throws IOException {
            int retries = dfsClient.getConf().getnBlockWriteLocateFollowingRetry();
            long sleeptime = 400;
            while (true) {
                long localstart = Time.monotonicNow();
                while (true) {
                    try {
                        return dfsClient.namenode.addBlock(src, dfsClient.clientName, block, excludedNodes, fileId,
                                favoredNodes);
                    } catch (RemoteException e) {
                        IOException ue = e.unwrapRemoteException(FileNotFoundException.class,
                                AccessControlException.class, NSQuotaExceededException.class,
                                DSQuotaExceededException.class, UnresolvedPathException.class);
                        if (ue != e) {
                            throw ue; // no need to retry these exceptions
                        }

                        if (NotReplicatedYetException.class.getName().equals(e.getClassName())) {
                            if (retries == 0) {
                                throw e;
                            } else {
                                --retries;
                                DFSClient.LOG.info("Exception while adding a block", e);
                                long elapsed = Time.monotonicNow() - localstart;
                                if (elapsed > 5000) {
                                    DFSClient.LOG
                                            .info("Waiting for replication for " + (elapsed / 1000) + " seconds");
                                }
                                try {
                                    DFSClient.LOG.warn("NotReplicatedYetException sleeping " + src
                                            + " retries left " + retries);
                                    Thread.sleep(sleeptime);
                                    sleeptime *= 2;
                                } catch (InterruptedException ie) {
                                    DFSClient.LOG.warn("Caught exception ", ie);
                                }
                            }
                        } else {
                            throw e;
                        }

                    }
                }
            }
        }

        ExtendedBlock getBlock() {
            return block;
        }

        DatanodeInfo[] getNodes() {
            return nodes;
        }

        Token<BlockTokenIdentifier> getBlockToken() {
            return accessToken;
        }

        private void setLastException(IOException e) {
            lastException.compareAndSet(null, e);
        }
    }

    /**
     * Create a socket for a write pipeline
     * 
     * @param first
     *            the first datanode
     * @param length
     *            the pipeline length
     * @param client
     *            client
     * @return the socket connected to the first datanode
     */
    static Socket createSocketForPipeline(final DatanodeInfo first, final int length, final DFSClient client)
            throws IOException {
        final String dnAddr = first.getXferAddr(client.getConf().getConnectToDnViaHostname());
        if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
        }
        final InetSocketAddress isa = NetUtils.createSocketAddr(dnAddr);
        final Socket sock = client.socketFactory.createSocket();
        final int timeout = client.getDatanodeReadTimeout(length);
        NetUtils.connect(sock, isa, client.getRandomLocalInterfaceAddr(), client.getConf().getSocketTimeout());
        sock.setSoTimeout(timeout);
        sock.setSendBufferSize(HdfsConstants.DEFAULT_DATA_SOCKET_SIZE);
        if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Send buf size " + sock.getSendBufferSize());
        }
        return sock;
    }

    @Override
    protected void checkClosed() throws IOException {
        if (closed) {
            IOException e = lastException.get();
            throw e != null ? e : new ClosedChannelException();
        }
    }

    //
    // returns the list of targets, if any, that is being currently used.
    //
    @VisibleForTesting
    public synchronized DatanodeInfo[] getPipeline() {
        if (streamer == null) {
            return null;
        }
        DatanodeInfo[] currentNodes = streamer.getNodes();
        if (currentNodes == null) {
            return null;
        }
        DatanodeInfo[] value = new DatanodeInfo[currentNodes.length];
        for (int i = 0; i < currentNodes.length; i++) {
            value[i] = currentNodes[i];
        }
        return value;
    }

    /**
     * @return the object for computing checksum.
     *         The type is NULL if checksum is not computed.
     */
    private static DataChecksum getChecksum4Compute(DataChecksum checksum, HdfsFileStatus stat) {
        if (isLazyPersist(stat) && stat.getReplication() == 1) {
            // do not compute checksum for writing to single replica to memory
            return DataChecksum.newDataChecksum(Type.NULL, checksum.getBytesPerChecksum());
        }
        return checksum;
    }

    protected DFSOutputStream(DFSClient dfsClient, String src, Progressable progress, HdfsFileStatus stat,
            DataChecksum checksum) throws IOException {
        super(getChecksum4Compute(checksum, stat));
        this.dfsClient = dfsClient;
        this.src = src;
        this.fileId = stat.getFileId();
        this.blockSize = stat.getBlockSize();
        this.blockReplication = stat.getReplication();
        this.fileEncryptionInfo = stat.getFileEncryptionInfo();
        this.progress = progress;
        this.cachingStrategy = new AtomicReference<CachingStrategy>(dfsClient.getDefaultWriteCachingStrategy());
        this.jxioResource = dfsClient.getJXIOResource();
        this.usingJxioClientResource = true;
        this.eventQHandler = jxioResource.getEqh();
        this.msgPool = jxioResource.getMsgPool();
        this.headerAckTimeoutUsec = dfsClient.getConf().getHeaderAckTimeoutUsec();

        if ((progress != null) && DFSOutputStream.LOG.isDebugEnabled()) {
            DFSOutputStream.LOG.debug("Set non-null progress callback on DFSOutputStream " + src);
        }

        this.bytesPerChecksum = checksum.getBytesPerChecksum();
        if (bytesPerChecksum < 1 || blockSize % bytesPerChecksum != 0) {
            throw new IOException("io.bytes.per.checksum(" + bytesPerChecksum + ") and blockSize(" + blockSize
                    + ") do not match. " + "blockSize should be a " + "multiple of io.bytes.per.checksum");

        }
        this.name = String.format("[hash=%X] ", DFSOutputStream.this.hashCode());
        this.checksum = checksum;

        if (toPrintBreakdown) {
            long now = System.nanoTime();
            DFSOutputStream.LOG.info(String.format("%.3f", (float) now / 1000000000.) + ", "
                    + (now - lastOperationTS) / 1000000000. + " : DFSOutputStream constructed successfully.");
            synchronized (lastOperationTSLock) {
                lastOperationTS = now;
            }
            // } else {
            // LOG.info("Mellanox RDMA-accelerated DFSOutputStream constructed successfully.");
        }
    }

    /** Construct a new output stream for creating a file. */
    private DFSOutputStream(DFSClient dfsClient, String src, HdfsFileStatus stat, EnumSet<CreateFlag> flag,
            Progressable progress, DataChecksum checksum, String[] favoredNodes) throws IOException {
        this(dfsClient, src, progress, stat, checksum);
        this.shouldSyncBlock = flag.contains(CreateFlag.SYNC_BLOCK);

        computePacketChunkSize(dfsClient.getConf().getWritePacketSize(), bytesPerChecksum);

        streamer = new DataStreamer(stat, null);
        if (favoredNodes != null && favoredNodes.length != 0) {
            streamer.setFavoredNodes(favoredNodes);
        }
    }

    static DFSOutputStream newStreamForCreate(DFSClient dfsClient, String src, FsPermission masked,
            EnumSet<CreateFlag> flag, boolean createParent, short replication, long blockSize,
            Progressable progress, int buffersize, DataChecksum checksum, String[] favoredNodes)
            throws IOException {
        TraceScope scope = dfsClient.getPathTraceScope("newStreamForCreate", src);
        try {
            HdfsFileStatus stat = null;

            // Retry the create if we get a RetryStartFileException up to a maximum
            // number of times
            boolean shouldRetry = true;
            int retryCount = CREATE_RETRY_COUNT;
            while (shouldRetry) {
                shouldRetry = false;
                try {
                    stat = dfsClient.namenode.create(src, masked, dfsClient.clientName,
                            new EnumSetWritable<CreateFlag>(flag), createParent, replication, blockSize,
                            SUPPORTED_CRYPTO_VERSIONS);
                    break;
                } catch (RemoteException re) {
                    IOException e = re.unwrapRemoteException(AccessControlException.class,
                            DSQuotaExceededException.class, FileAlreadyExistsException.class,
                            FileNotFoundException.class, ParentNotDirectoryException.class,
                            NSQuotaExceededException.class, RetryStartFileException.class, SafeModeException.class,
                            UnresolvedPathException.class, SnapshotAccessControlException.class,
                            UnknownCryptoProtocolVersionException.class);
                    if (e instanceof RetryStartFileException) {
                        if (retryCount > 0) {
                            shouldRetry = true;
                            retryCount--;
                        } else {
                            throw new IOException("Too many retries because of encryption" + " zone operations", e);
                        }
                    } else {
                        throw e;
                    }
                }
            }
            Preconditions.checkNotNull(stat, "HdfsFileStatus should not be null!");
            final DFSOutputStream out = new DFSOutputStream(dfsClient, src, stat, flag, progress, checksum,
                    favoredNodes);
            out.start();
            return out;
        } finally {
            scope.close();
        }
    }

    static DFSOutputStream newStreamForCreate(DFSClient dfsClient, String src, FsPermission masked,
            EnumSet<CreateFlag> flag, boolean createParent, short replication, long blockSize,
            Progressable progress, int buffersize, DataChecksum checksum) throws IOException {
        return newStreamForCreate(dfsClient, src, masked, flag, createParent, replication, blockSize, progress,
                buffersize, checksum, null);
    }

    /** Construct a new output stream for append. */
    private DFSOutputStream(DFSClient dfsClient, String src, EnumSet<CreateFlag> flags, Progressable progress,
            LocatedBlock lastBlock, HdfsFileStatus stat, DataChecksum checksum) throws IOException {
        this(dfsClient, src, progress, stat, checksum);
        initialFileSize = stat.getLen(); // length of file when opened

        this.shouldSyncBlock = flags.contains(CreateFlag.SYNC_BLOCK);
        boolean toNewBlock = flags.contains(CreateFlag.NEW_BLOCK);

        // The last partial block of the file has to be filled.
        if (!toNewBlock && lastBlock != null) {
            // indicate that we are appending to an existing block
            bytesCurBlock = lastBlock.getBlockSize();
            streamer = new DataStreamer(lastBlock, stat, bytesPerChecksum);
        } else {
            computePacketChunkSize(dfsClient.getConf().getWritePacketSize(), bytesPerChecksum);
            streamer = new DataStreamer(stat, lastBlock != null ? lastBlock.getBlock() : null);
        }
        this.fileEncryptionInfo = stat.getFileEncryptionInfo();
    }

    static DFSOutputStream newStreamForAppend(DFSClient dfsClient, String src, EnumSet<CreateFlag> flags,
            int bufferSize, Progressable progress, LocatedBlock lastBlock, HdfsFileStatus stat,
            DataChecksum checksum, String[] favoredNodes) throws IOException {
        TraceScope scope = dfsClient.getPathTraceScope("newStreamForAppend", src);
        try {
            final DFSOutputStream out = new DFSOutputStream(dfsClient, src, flags, progress, lastBlock, stat,
                    checksum);
            if (favoredNodes != null && favoredNodes.length != 0) {
                out.streamer.setFavoredNodes(favoredNodes);
            }
            out.start();
            return out;
        } finally {
            scope.close();
        }
    }

    private static boolean isLazyPersist(HdfsFileStatus stat) {
        final BlockStoragePolicy p = blockStoragePolicySuite.getPolicy(HdfsConstants.MEMORY_STORAGE_POLICY_NAME);
        return p != null && stat.getStoragePolicy() == p.getId();
    }

    private void computePacketChunkSize(int psize, int csize) {
        final int bodySize = psize - PacketHeader.PKT_MAX_HEADER_LEN;
        final int chunkSize = csize + getChecksumSize();
        chunksPerPacket = Math.max(bodySize / chunkSize, 1);
        packetSize = chunkSize * chunksPerPacket;
        if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("computePacketChunkSize: src=" + src + ", chunkSize=" + chunkSize
                    + ", chunksPerPacket=" + chunksPerPacket + ", packetSize=" + packetSize);
        }
    }

    private void queueCurrentPacket() {
        if (currentPacket == null)
            return;

        dataQueue.add(currentPacket);

        lastQueuedSeqno = currentPacket.seqno;
        if (DFSOutputStream.LOG.isDebugEnabled()) {
            DFSOutputStream.LOG.debug("Queued packet " + currentPacket.seqno);
        }
        currentPacket = null;
        synchronized (dataQueue) {
            dataQueue.notifyAll();
        }
    }

    private void waitAndQueueCurrentPacket() throws IOException {

        try {
            // If queue is full, then wait till we have enough space
            while (!closed && ((dataQueue.size() + ackQueue.size()) >= R4HProtocol.MAX_DATA_QUEUE_PACKETS)) {
                try {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Going to wait on dataQ indefinitely...");
                    }
                    synchronized (dataQueue) {
                        dataQueue.wait(1000);
                    }
                } catch (InterruptedException e) {
                    // If we get interrupted while waiting to queue data, we still need to get rid
                    // of the current packet. This is because we have an invariant that if
                    // currentPacket gets full, it will get queued before the next writeChunk.
                    //
                    // Rather than wait around for space in the queue, we should instead try to
                    // return to the caller as soon as possible, even though we slightly overrun
                    // the MAX_PACKETS length.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            checkClosed();
            queueCurrentPacket();
        } catch (ClosedChannelException e) {
        }

    }

    // @see FSOutputSummer#writeChunk()
    @Override
    protected synchronized void writeChunk(byte[] b, int offset, int len, byte[] checksum, int ckoff, int cklen)
            throws IOException {
        TraceScope scope = dfsClient.getPathTraceScope("DFSOutputStream#writeChunk", src);
        try {
            writeChunkImpl(b, offset, len, checksum, ckoff, cklen);
        } finally {
            scope.close();
        }
    }

    private synchronized void writeChunkImpl(byte[] b, int offset, int len, byte[] checksum, int ckoff, int cklen)
            throws IOException {
        dfsClient.checkOpen();
        checkClosed();

        if (len > bytesPerChecksum) {
            throw new IOException("writeChunk() buffer size is " + len
                    + " is larger than supported  bytesPerChecksum " + bytesPerChecksum);
        }
        if (cklen != 0 && cklen != getChecksumSize()) {
            throw new IOException("writeChunk() checksum size is supposed to be " + getChecksumSize()
                    + " but found to be " + cklen);
        }

        if (currentPacket == null) {
            currentPacket = createPacket(packetSize, chunksPerPacket, bytesCurBlock, currentSeqno++, false);
            if (DFSClient.LOG.isDebugEnabled()) {
                DFSClient.LOG.debug("DFSClient writeChunk allocating new packet seqno=" + currentPacket.getSeqno()
                        + ", src=" + src + ", packetSize=" + packetSize + ", chunksPerPacket=" + chunksPerPacket
                        + ", bytesCurBlock=" + bytesCurBlock);
            }
        }

        currentPacket.writeChecksum(checksum, ckoff, cklen);
        currentPacket.writeData(b, offset, len);
        currentPacket.incNumChunks();
        bytesCurBlock += len;

        // If packet is full, enqueue it for transmission
        //
        if (currentPacket.getNumChunks() == currentPacket.getMaxChunks() || bytesCurBlock == blockSize) {
            if (DFSClient.LOG.isDebugEnabled()) {
                DFSClient.LOG.debug("DFSClient writeChunk packet full seqno=" + currentPacket.getSeqno() + ", src="
                        + src + ", bytesCurBlock=" + bytesCurBlock + ", blockSize=" + blockSize + ", appendChunk="
                        + appendChunk);
            }
            waitAndQueueCurrentPacket();

            // If the reopened file did not end at chunk boundary and the above
            // write filled up its partial chunk. Tell the summer to generate full
            // crc chunks from now on.
            if (appendChunk && bytesCurBlock % bytesPerChecksum == 0) {
                appendChunk = false;
                resetChecksumBufSize();
            }

            if (!appendChunk) {
                int psize = Math.min((int) (blockSize - bytesCurBlock), dfsClient.getConf().getWritePacketSize());
                computePacketChunkSize(psize, bytesPerChecksum);
            }
            //
            // if encountering a block boundary, send an empty packet to
            // indicate the end of block and reset bytesCurBlock.
            //
            if (bytesCurBlock == blockSize) {
                currentPacket = createPacket(0, 0, bytesCurBlock, currentSeqno++, true);
                currentPacket.setSyncBlock(shouldSyncBlock);
                waitAndQueueCurrentPacket();
                bytesCurBlock = 0;
                lastFlushOffset = 0;
            }
        }
    }

    @Deprecated
    public void sync() throws IOException {
        hflush();
    }

    /**
     * Flushes out to all replicas of the block. The data is in the buffers
     * of the DNs but not necessarily in the DN's OS buffers.
     * 
     * It is a synchronous operation. When it returns,
     * it guarantees that flushed data become visible to new readers.
     * It is not guaranteed that data has been flushed to
     * persistent store on the datanode.
     * Block allocations are persisted on namenode.
     */
    @Override
    public void hflush() throws IOException {
        TraceScope scope = dfsClient.getPathTraceScope("hflush", src);
        try {
            flushOrSync(false, EnumSet.noneOf(SyncFlag.class));
        } finally {
            scope.close();
        }
    }

    @Override
    public void hsync() throws IOException {
        TraceScope scope = dfsClient.getPathTraceScope("hsync", src);
        try {
            flushOrSync(true, EnumSet.noneOf(SyncFlag.class));
        } finally {
            scope.close();
        }
    }

    /**
     * The expected semantics is all data have flushed out to all replicas
     * and all replicas have done posix fsync equivalent - ie the OS has
     * flushed it to the disk device (but the disk may have it in its cache).
     * 
     * Note that only the current block is flushed to the disk device.
     * To guarantee durable sync across block boundaries the stream should
     * be created with {@link CreateFlag#SYNC_BLOCK}.
     * 
     * @param syncFlags
     *            Indicate the semantic of the sync. Currently used to specify
     *            whether or not to update the block length in NameNode.
     */
    public void hsync(EnumSet<SyncFlag> syncFlags) throws IOException {
        TraceScope scope = dfsClient.getPathTraceScope("hsync", src);
        try {
            flushOrSync(true, syncFlags);
        } finally {
            scope.close();
        }
    }

    /**
     * Flush/Sync buffered data to DataNodes.
     * 
     * @param isSync
     *            Whether or not to require all replicas to flush data to the disk
     *            device
     * @param syncFlags
     *            Indicate extra detailed semantic of the flush/sync. Currently
     *            mainly used to specify whether or not to update the file length in
     *            the NameNode
     * @throws IOException
     */
    private void flushOrSync(boolean isSync, EnumSet<SyncFlag> syncFlags) throws IOException {
        dfsClient.checkOpen();
        checkClosed();
        try {
            long toWaitFor;
            long lastBlockLength = -1L;
            boolean updateLength = syncFlags.contains(SyncFlag.UPDATE_LENGTH);
            synchronized (this) {
                /*
                 * Record current blockOffset. This might be changed inside
                 * flushBuffer() where a partial checksum chunk might be flushed.
                 * After the flush, reset the bytesCurBlock back to its previous value,
                 * any partial checksum chunk will be sent now and in next packet.
                 */
                long saveOffset = bytesCurBlock;
                DFSPacket oldCurrentPacket = currentPacket;
                // flush checksum buffer, but keep checksum buffer intact
                int numKept = flushBuffer(true, true);
                // bytesCurBlock potentially incremented if there was buffered data

                if (DFSClient.LOG.isDebugEnabled()) {
                    DFSClient.LOG.debug("DFSClient flush() : saveOffset " + saveOffset + " bytesCurBlock "
                            + bytesCurBlock + " lastFlushOffset " + lastFlushOffset);
                }
                // Flush only if we haven't already flushed till this offset.
                if (lastFlushOffset != bytesCurBlock) {
                    assert bytesCurBlock > lastFlushOffset;
                    // record the valid offset of this flush
                    lastFlushOffset = bytesCurBlock;
                    if (isSync && currentPacket == null) {
                        // Nothing to send right now,
                        // but sync was requested.
                        // Send an empty packet
                        currentPacket = createPacket(packetSize, chunksPerPacket, bytesCurBlock, currentSeqno++,
                                false);
                    }
                } else {
                    // We already flushed up to this offset.
                    // This means that we haven't written anything since the last flush
                    // (or the beginning of the file). Hence, we should not have any
                    // packet queued prior to this call, since the last flush set
                    // currentPacket = null.
                    assert oldCurrentPacket == null : "Empty flush should not occur with a currentPacket";

                    if (isSync && bytesCurBlock > 0) {
                        // Nothing to send right now,
                        // and the block was partially written,
                        // and sync was requested.
                        // So send an empty sync packet.
                        currentPacket = createPacket(packetSize, chunksPerPacket, bytesCurBlock, currentSeqno++,
                                false);
                    } else {
                        // just discard the current packet since it is already been sent.
                        currentPacket = null;
                    }
                }
                if (currentPacket != null) {
                    currentPacket.syncBlock = isSync;
                    waitAndQueueCurrentPacket();
                }
                // Restore state of stream. Record the last flush offset
                // of the last full chunk that was flushed.
                //
                bytesCurBlock -= numKept;
                toWaitFor = lastQueuedSeqno;
            } // end synchronized

            waitForAckedSeqno(toWaitFor);

            // update the block length first time irrespective of flag
            if (updateLength || persistBlocks.get()) {
                synchronized (this) {
                    if (streamer != null && streamer.block != null) {
                        lastBlockLength = streamer.block.getNumBytes();
                    }
                }
            }
            // If 1) any new blocks were allocated since the last flush, or 2) to
            // update length in NN is required, then persist block locations on
            // namenode.
            if (persistBlocks.getAndSet(false) || updateLength) {
                try {
                    dfsClient.namenode.fsync(src, fileId, dfsClient.clientName, lastBlockLength);
                } catch (IOException ioe) {
                    DFSClient.LOG.warn("Unable to persist blocks in hflush for " + src, ioe);
                    // If we got an error here, it might be because some other thread called
                    // close before our hflush completed. In that case, we should throw an
                    // exception that the stream is closed.
                    checkClosed();
                    // If we aren't closed but failed to sync, we should expose that to the
                    // caller.
                    throw ioe;
                }
            }

            synchronized (this) {
                if (streamer != null) {
                    streamer.setHflush();
                }
            }
        } catch (InterruptedIOException interrupt) {
            // This kind of error doesn't mean that the stream itself is broken - just the
            // flushing thread got interrupted. So, we shouldn't close down the writer,
            // but instead just propagate the error
            throw interrupt;
        } catch (IOException e) {
            DFSClient.LOG.warn("Error while syncing", e);
            synchronized (this) {
                if (!closed) {
                    lastException.set(new IOException("IOException flush:" + e));
                    closeThreads(true);
                }
            }
            throw e;
        }
    }

    /**
     * @deprecated use {@link HdfsDataOutputStream#getCurrentBlockReplication()}.
     */
    @Deprecated
    public synchronized int getNumCurrentReplicas() throws IOException {
        return getCurrentBlockReplication();
    }

    /**
     * Note that this is not a public API;
     * use {@link HdfsDataOutputStream#getCurrentBlockReplication()} instead.
     * 
     * @return the number of valid replicas of the current block
     */
    public synchronized int getCurrentBlockReplication() throws IOException {
        dfsClient.checkOpen();
        checkClosed();
        if (streamer == null) {
            return blockReplication; // no pipeline, return repl factor of file
        }
        DatanodeInfo[] currentNodes = streamer.getNodes();
        if (currentNodes == null) {
            return blockReplication; // no pipeline, return repl factor of file
        }
        return currentNodes.length;
    }

    /**
     * Waits till all existing data is flushed and confirmations
     * received from datanodes.
     */
    private void flushInternal() throws IOException {
        long toWaitFor;
        synchronized (this) {
            dfsClient.checkOpen();
            checkClosed();
            //
            // If there is data in the current buffer, send it across
            //
            queueCurrentPacket();
            toWaitFor = lastQueuedSeqno;
        }

        waitForAckedSeqno(toWaitFor);
    }

    private void waitForAckedSeqno(long seqno) throws IOException {
        if (DFSOutputStream.LOG.isDebugEnabled()) {
            DFSOutputStream.LOG.debug("Waiting for ack for: " + seqno);
        }
        try {
            while (!closed) {
                checkClosed();
                if (lastAckedSeqno >= seqno) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("going to break loop for waiting to lastAckedSeqno(=" + lastAckedSeqno
                                + ", seqnumToWaitFor=" + seqno + ")");
                    }
                    break;
                }
                try {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("going to wait on ackQueue.wait() for lastAckedSeqno(=" + lastAckedSeqno
                                + ", seqnumToWaitFor=" + seqno + ")");
                    }
                    synchronized (ackQueue) { // in orig cdh wait on dataQ
                        if (lastAckedSeqno < seqno) {
                            ackQueue.wait();
                        }
                    }
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("After waiting on ackQueue.wait() for lastAckedSeqno(=" + lastAckedSeqno
                                + ", seqnumToWaitFor=" + seqno + ")");
                    }
                } catch (InterruptedException ie) {
                    throw new InterruptedIOException(
                            "Interrupted while waiting for data to be acknowledged by pipeline");
                }
            }
            checkClosed();
        } catch (ClosedChannelException e) {
        }
    }

    private synchronized void start() {
        streamer.start();
    }

    /**
     * Aborts this output stream and releases any system
     * resources associated with this stream.
     */
    synchronized void abort() throws IOException {
        if (closed) {
            if (usingJxioClientResource) {
                this.eventQHandler = null;
                dfsClient.returnJXIOResource(jxioResource);
                usingJxioClientResource = false;
            }
            return;
        }
        streamer.setLastException(
                new IOException("Lease timeout of " + (dfsClient.getHdfsTimeout() / 1000) + " seconds expired."));
        closeThreads(true);
        dfsClient.endFileLease(fileId);
    }

    // shutdown datastreamer and responseprocessor threads.
    // interrupt datastreamer if force is true
    private void closeThreads(boolean force) throws IOException {
        try {
            streamer.close(force);
            streamer.join();
            if (s != null) {
                s.close();
            }
        } catch (InterruptedException e) {
            throw new IOException("Failed to shutdown streamer");
        } finally {
            streamer = null;
            s = null;
            closed = true;
            this.eventQHandler = null;
            dfsClient.returnJXIOResource(jxioResource);
            usingJxioClientResource = false;
        }
    }

    /**
     * Closes this output stream and releases any system
     * resources associated with this stream.
     */
    @Override
    public synchronized void close() throws IOException {
        if (closed) {
            if (usingJxioClientResource) {
                this.eventQHandler = null;
                dfsClient.returnJXIOResource(jxioResource);
                usingJxioClientResource = false;
            }
            IOException e = lastException.getAndSet(null);
            if (e == null)
                return;
            else
                throw e;
        }

        try {
            flushBuffer(); // flush from all upper layers

            if (currentPacket != null) {
                waitAndQueueCurrentPacket();
            }

            if (bytesCurBlock != 0) {
                // send an empty packet to mark the end of the block
                currentPacket = createPacket(0, 0, bytesCurBlock, currentSeqno++, true);
                currentPacket.lastPacketInBlock = true;
                currentPacket.syncBlock = shouldSyncBlock;
            }

            flushInternal(); // flush all data to Datanodes
            // get last block before destroying the streamer
            ExtendedBlock lastBlock = streamer.getBlock();
            closeThreads(false);
            completeFile(lastBlock);
            dfsClient.endFileLease(fileId);
        } catch (ClosedChannelException e) {
        } finally {
            closed = true;
        }
    }

    // should be called holding (this) lock since setTestFilename() may
    // be called during unit tests
    private void completeFile(ExtendedBlock last) throws IOException {
        long localstart = Time.monotonicNow();
        long localTimeout = 400;
        boolean fileComplete = false;
        int retries = dfsClient.getConf().getnBlockWriteLocateFollowingRetry();
        while (!fileComplete) {
            fileComplete = dfsClient.namenode.complete(src, dfsClient.clientName, last, fileId);
            if (!fileComplete) {
                final int hdfsTimeout = dfsClient.getHdfsTimeout();
                if (!dfsClient.clientRunning
                        || (hdfsTimeout > 0 && localstart + hdfsTimeout < Time.monotonicNow())) {
                    String msg = "Unable to close file because dfsclient "
                            + " was unable to contact the HDFS servers." + " clientRunning "
                            + dfsClient.clientRunning + " hdfsTimeout " + hdfsTimeout;
                    DFSClient.LOG.info(msg);
                    throw new IOException(msg);
                }
                try {
                    if (retries == 0) {
                        throw new IOException("Unable to close file because the last block"
                                + " does not have enough number of replicas.");
                    }
                    retries--;
                    Thread.sleep(localTimeout);
                    localTimeout *= 2;
                    if (Time.monotonicNow() - localstart > 5000) {
                        DFSClient.LOG.info("Could not complete " + src + " retrying...");
                    }
                } catch (InterruptedException ie) {
                    DFSClient.LOG.warn("Caught exception ", ie);
                }
            }
        }
    }

    @VisibleForTesting
    public void setArtificialSlowdown(long period) {
        artificialSlowdown = period;
    }

    @VisibleForTesting
    public synchronized void setChunksPerPacket(int value) {
        chunksPerPacket = Math.min(chunksPerPacket, value);
        packetSize = (bytesPerChecksum + getChecksumSize()) * chunksPerPacket;
    }

    synchronized void setTestFilename(String newname) {
        src = newname;
    }

    /**
     * Returns the size of a file as it was when this stream was opened
     */
    public long getInitialLen() {
        return initialFileSize;
    }

    /**
     * @return the FileEncryptionInfo for this stream, or null if not encrypted.
     */
    public FileEncryptionInfo getFileEncryptionInfo() {
        return fileEncryptionInfo;
    }

    /**
     * Returns the access token currently used by streamer, for testing only
     */
    synchronized Token<BlockTokenIdentifier> getBlockToken() {
        return streamer.getBlockToken();
    }

    @Override
    public void setDropBehind(Boolean dropBehind) throws IOException {
        CachingStrategy prevStrategy, nextStrategy;
        // CachingStrategy is immutable. So build a new CachingStrategy with the
        // modifications we want, and compare-and-swap it in.
        do {
            prevStrategy = this.cachingStrategy.get();
            nextStrategy = new CachingStrategy.Builder(prevStrategy).setDropBehind(dropBehind).build();
        } while (!this.cachingStrategy.compareAndSet(prevStrategy, nextStrategy));
    }

    @VisibleForTesting
    ExtendedBlock getBlock() {
        return streamer.getBlock();
    }

    @VisibleForTesting
    public long getFileId() {
        return fileId;
    }

    private static <T> void arraycopy(T[] srcs, T[] dsts, int skipIndex) {
        System.arraycopy(srcs, 0, dsts, 0, skipIndex);
        System.arraycopy(srcs, skipIndex + 1, dsts, skipIndex, dsts.length - skipIndex);
    }

    private Msg getMsg() {
        Msg msg = null;
        synchronized (DFSOutputStream.this.msgPool) {
            while (msg == null) {
                msg = DFSOutputStream.this.msgPool.getMsg();
                if (msg == null) {
                    LOG.warn(Thread.currentThread() + " " + toString() + " MsgPool " + msgPool
                            + " is empty. Going to wait for Msg.");
                    try {
                        DFSOutputStream.this.msgPool.wait();
                    } catch (InterruptedException e) {
                        throw new RuntimeException("Interrupted while waiting for free Msg", e);
                    }
                }
            }
        }
        return msg;
    }

    private void returnMsgToPool(Msg message) {
        synchronized (DFSOutputStream.this.msgPool) {
            message.returnToParentPool();
            DFSOutputStream.this.msgPool.notifyAll();
        }
    }
}