Java tutorial: inside org.apache.hadoop.hdfs.DataStreamer, the client-side daemon thread that streams write packets to the DataNode pipeline
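The listing below is the Hadoop HDFS client class org.apache.hadoop.hdfs.DataStreamer (this variant also carries small-file-in-database and erasure-coding extensions). Application code never constructs it directly; it is a package-private daemon driven by DFSOutputStream, which you reach through the public FileSystem API. As a minimal sketch of where this class sits in a write, the following example creates a file and pushes a few bytes through the pipeline the streamer manages. The namenode address and file path are placeholders, not taken from the source.

import java.net.URI;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WriteExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder cluster address and file name; substitute your own.
    try (FileSystem fs = FileSystem.get(URI.create("hdfs://namenode:8020"), conf);
         FSDataOutputStream out = fs.create(new Path("/tmp/example.txt"))) {
      out.write("hello, pipeline".getBytes(StandardCharsets.UTF_8));
      out.hflush(); // push buffered packets out to the datanode pipeline
    }               // close() blocks until the last packet has been acked
  }
}

Every write() above ends up as DFSPacket objects on the dataQueue that the DataStreamer thread ships to the datanodes. The full listing follows.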
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.SUCCESS;

import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.channels.ClosedChannelException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
import org.apache.hadoop.hdfs.protocol.UnresolvedPathException;
import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtoUtil;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;
import org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockStoragePolicySuite;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException;
import org.apache.hadoop.hdfs.util.ByteArrayManager;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Time;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;
import com.google.common.collect.ImmutableSet;

import java.util.Collection;
import java.util.Collections;
import javax.net.SocketFactory;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.net.StandardSocketFactory;
import org.apache.htrace.core.Span;
import org.apache.htrace.core.SpanId;
import org.apache.htrace.core.TraceScope;
import org.apache.htrace.core.Tracer;

/*********************************************************************
 *
 * The DataStreamer class is responsible for sending data packets to the
 * datanodes in the pipeline. It retrieves a new blockid and block locations
 * from the namenode, and starts streaming packets to the pipeline of
 * Datanodes. Every packet has a sequence number associated with
 * it. When all the packets for a block are sent out and acks for each
 * of them are received, the DataStreamer closes the current block.
 *
 * The DataStreamer thread picks up packets from the dataQueue, sends each
 * one to the first datanode in the pipeline, and moves it from the dataQueue
 * to the ackQueue. The ResponseProcessor receives acks from the datanodes.
 * When a successful ack for a packet is received from all datanodes, the
 * ResponseProcessor removes the corresponding packet from the ackQueue.
 *
 * In case of error, all outstanding packets are moved from the ackQueue back
 * to the front of the dataQueue. A new pipeline is set up by eliminating the
 * bad datanode from the original pipeline. The DataStreamer then resumes
 * sending packets from the dataQueue.
* *********************************************************************/ @InterfaceAudience.Private class DataStreamer extends Daemon { public static final Log LOG = LogFactory.getLog(DataStreamer.class); /** * Create a socket for a write pipeline * * @param first the first datanode * @param length the pipeline length * @param client client * @return the socket connected to the first datanode */ static Socket createSocketForPipeline(final DatanodeInfo first, final int length, final DFSClient client) throws IOException { final String dnAddr = first.getXferAddr(client.getConf().connectToDnViaHostname); if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Connecting to datanode " + dnAddr); } final InetSocketAddress isa = NetUtils.createSocketAddr(dnAddr); SocketFactory socketFactory = new StandardSocketFactory(); final Socket sock = socketFactory.createSocket(); final int timeout = client.getDatanodeReadTimeout(length); NetUtils.connect(sock, isa, client.getRandomLocalInterfaceAddr(), client.getConf().socketTimeout); sock.setSoTimeout(timeout); sock.setSendBufferSize(HdfsConstants.DEFAULT_DATA_SOCKET_SIZE); if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Send buf size " + sock.getSendBufferSize()); } return sock; } /** * release a list of packets to ByteArrayManager * * @param packets packets to be release * @param bam ByteArrayManager */ private static void releaseBuffer(List<DFSPacket> packets, ByteArrayManager bam) { for (DFSPacket p : packets) { p.releaseBuffer(bam); } packets.clear(); } private volatile boolean streamerClosed = false; private ExtendedBlock block; // its length is number of bytes acked private Token<BlockTokenIdentifier> accessToken; private DataOutputStream blockStream; private DataInputStream blockReplyStream; private ResponseProcessor response = null; private volatile DatanodeInfo[] nodes = null; // list of targets for current block private volatile StorageType[] storageTypes = null; private volatile String[] storageIDs = null; private String[] favoredNodes; volatile boolean hasError = false; volatile int errorIndex = -1; // Restarting node index AtomicInteger restartingNodeIndex = new AtomicInteger(-1); private long restartDeadline = 0; // Deadline of DN restart private BlockConstructionStage stage; // block construction stage private long bytesSent = 0; // number of bytes that've been sent /** Nodes have been used in the pipeline before and have failed. */ private final List<DatanodeInfo> failed = new ArrayList<>(); /** The last ack sequence number before pipeline failure. */ private long lastAckedSeqnoBeforeFailure = -1; private int pipelineRecoveryCount = 0; /** Has the current block been hflushed? */ private boolean isHflushed = false; /** Append on an existing block? */ private boolean isAppend; private long currentSeqno = 0; private long lastQueuedSeqno = -1; private long lastAckedSeqno = -1; private long bytesCurBlock = 0; // bytes written in current block private final AtomicReference<IOException> lastException = new AtomicReference<>(); private Socket s; private final DFSClient dfsClient; private final String src; /** Only for DataTransferProtocol.writeBlock(..) 
*/ private final DataChecksum checksum; private final Progressable progress; private final HdfsFileStatus stat; // appending to existing partial block private volatile boolean appendChunk = false; // both dataQueue and ackQueue are protected by dataQueue lock private final LinkedList<DFSPacket> dataQueue = new LinkedList<>(); private final LinkedList<DFSPacket> ackQueue = new LinkedList<>(); private final AtomicReference<CachingStrategy> cachingStrategy; private final ByteArrayManager byteArrayManager; private static final BlockStoragePolicySuite blockStoragePolicySuite = BlockStoragePolicySuite .createDefaultSuite(); //persist blocks on namenode private final AtomicBoolean persistBlocks = new AtomicBoolean(false); private boolean failPacket = false; private final long dfsclientSlowLogThresholdMs; private long artificialSlowdown = 0; // List of congested data nodes. The stream will back off if the DataNodes // are congested private final ArrayList<DatanodeInfo> congestedNodes = new ArrayList<>(); private static final int CONGESTION_BACKOFF_MEAN_TIME_IN_MS = 5000; private static final int CONGESTION_BACK_OFF_MAX_TIME_IN_MS = CONGESTION_BACKOFF_MEAN_TIME_IN_MS * 10; private int lastCongestionBackoffTime; private final LoadingCache<DatanodeInfo, DatanodeInfo> excludedNodes; private final LinkedList<DFSPacket> smallFileDataQueue = new LinkedList<>(); // Information for sending a single block private LocatedBlock lb; //erasure coding private boolean erasureCodingSourceStream = false; private int currentBlockIndex = 0; private int stripeLength; private HashSet<DatanodeInfo> usedNodes = new HashSet<>(); private int parityLength; private boolean erasureCodingParityStream = false; private List<DatanodeInfo> stripeNodes = new LinkedList<>(); private List<LocatedBlock> sourceBlocks = Collections.emptyList(); private List<DatanodeInfo> parityStripeNodes = new LinkedList<>(); //samll files in db private final int dbFileMaxSize; private final boolean forceClientToWriteSFToDisk; private boolean isThisFileStoredInDB = false; //if the client calls sync/flush method then the file will be stored on the //datanodes irrespective of the file size. The reason is that before the file //is close we are not sure about the final size of the file. If we store the // the data in the database and later on the file size exceeds the "dbFileMaxSize" // limit then we will have to transfer the data stored in the databse to the // datanodes. This will slow down the file creations and put unnecessary stress // on the NameNodes. 
private boolean syncOrFlushCalled = false; private DataStreamer(HdfsFileStatus stat, DFSClient dfsClient, String src, Progressable progress, DataChecksum checksum, AtomicReference<CachingStrategy> cachingStrategy, ByteArrayManager byteArrayManage, int dbFileMaxSize, boolean forceClientToWriteSFToDisk) { this.dfsClient = dfsClient; this.src = src; this.progress = progress; this.stat = stat; this.cachingStrategy = cachingStrategy; this.byteArrayManager = byteArrayManage; this.dfsclientSlowLogThresholdMs = dfsClient.getConf().dfsclientSlowIoWarningThresholdMs; excludedNodes = initExcludedNodes(); this.checksum = checksum; this.dbFileMaxSize = dbFileMaxSize; this.forceClientToWriteSFToDisk = forceClientToWriteSFToDisk; if (this.forceClientToWriteSFToDisk) { isThisFileStoredInDB = false; } else { isThisFileStoredInDB = stat.isFileStoredInDB(); } } /** * construction with tracing info */ DataStreamer(HdfsFileStatus stat, ExtendedBlock block, DFSClient dfsClient, String src, Progressable progress, DataChecksum checksum, AtomicReference<CachingStrategy> cachingStrategy, ByteArrayManager byteArrayManage, int dbFileMaxSize, boolean forceClientToWriteSFToDisk) { this(stat, dfsClient, src, progress, checksum, cachingStrategy, byteArrayManage, dbFileMaxSize, forceClientToWriteSFToDisk); isAppend = false; this.block = block; stage = BlockConstructionStage.PIPELINE_SETUP_CREATE; } /** * Construct a data streamer for appending to the last partial block * @param lastBlock last block of the file to be appended * @param stat status of the file to be appended * @throws IOException if error occurs */ DataStreamer(LocatedBlock lastBlock, HdfsFileStatus stat, DFSClient dfsClient, String src, Progressable progress, DataChecksum checksum, AtomicReference<CachingStrategy> cachingStrategy, ByteArrayManager byteArrayManage, int dbFileMaxSize, boolean forceClientToWriteSFToDisk) throws IOException { this(stat, dfsClient, src, progress, checksum, cachingStrategy, byteArrayManage, dbFileMaxSize, forceClientToWriteSFToDisk); isAppend = true; stage = BlockConstructionStage.PIPELINE_SETUP_APPEND; block = lastBlock.getBlock(); bytesSent = block.getNumBytes(); accessToken = lastBlock.getBlockToken(); } /** * Construct a data streamer for single block transfer */ DataStreamer(HdfsFileStatus stat, LocatedBlock lb, boolean sigleBlock, DFSClient dfsClient, String src, Progressable progress, DataChecksum checksum, AtomicReference<CachingStrategy> cachingStrategy, ByteArrayManager byteArrayManage, int dbFileMaxSize, boolean saveSmallFilesInDB) { this(stat, dfsClient, src, progress, checksum, cachingStrategy, byteArrayManage, dbFileMaxSize, saveSmallFilesInDB); isAppend = false; stage = BlockConstructionStage.PIPELINE_SETUP_SINGLE_BLOCK; this.lb = lb; } /** * Set pipeline in construction * * @param lastBlock the last block of a file * @throws IOException */ void setPipelineInConstruction(LocatedBlock lastBlock) throws IOException { // setup pipeline to append to the last block XXX retries?? setPipeline(lastBlock); errorIndex = -1; // no errors yet. 
if (nodes.length < 1) { throw new IOException( "Unable to retrieve blocks locations " + " for last block " + block + "of file " + src); } } private void setPipeline(LocatedBlock lb) { setPipeline(lb.getLocations(), lb.getStorageTypes(), lb.getStorageIDs()); } private void setPipeline(DatanodeInfo[] nodes, StorageType[] storageTypes, String[] storageIDs) { this.nodes = nodes; this.storageTypes = storageTypes; this.storageIDs = storageIDs; } /** * Set favored nodes * * @param favoredNodes favored nodes */ void setFavoredNodes(String[] favoredNodes) { this.favoredNodes = favoredNodes; } /** * Initialize for data streaming */ private void initDataStreaming() { this.setName("DataStreamer for file " + src + " block " + block); response = new ResponseProcessor(nodes); response.start(); stage = BlockConstructionStage.DATA_STREAMING; } private void endBlock() { if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Closing old block " + block); } this.setName("DataStreamer for file " + src); closeResponder(); closeStream(); setPipeline(null, null, null); stage = BlockConstructionStage.PIPELINE_SETUP_CREATE; } /* * streamer thread is the only thread that opens streams to datanode, * and closes them. Any error recovery is also done by this thread. */ @Override public void run() { long lastPacket = Time.monotonicNow(); TraceScope scope = null; while (!streamerClosed && dfsClient.clientRunning) { // if the Responder encountered an error, shutdown Responder if (hasError && response != null) { try { response.close(); response.join(); response = null; } catch (InterruptedException e) { DFSClient.LOG.warn("Caught exception ", e); } } DFSPacket one; try { // process datanode IO errors if any boolean doSleep = false; if (hasError && (errorIndex >= 0 || restartingNodeIndex.get() >= 0)) { doSleep = processDatanodeError(); } synchronized (dataQueue) { // wait for a packet to be sent. long now = Time.monotonicNow(); while ((!streamerClosed && !hasError && dfsClient.clientRunning && dataQueue.size() == 0 && (stage != BlockConstructionStage.DATA_STREAMING || stage == BlockConstructionStage.DATA_STREAMING && now - lastPacket < dfsClient.getConf().socketTimeout / 2)) || doSleep) { long timeout = dfsClient.getConf().socketTimeout / 2 - (now - lastPacket); timeout = timeout <= 0 ? 1000 : timeout; timeout = (stage == BlockConstructionStage.DATA_STREAMING) ? timeout : 1000; try { dataQueue.wait(timeout); } catch (InterruptedException e) { DFSClient.LOG.warn("Caught exception ", e); } doSleep = false; now = Time.monotonicNow(); } if (streamerClosed || hasError || !dfsClient.clientRunning) { continue; } // get packet to be sent. if (dataQueue.isEmpty()) { one = createHeartbeatPacket(); assert one != null; } else { try { backOffIfNecessary(); } catch (InterruptedException e) { DFSClient.LOG.warn("Caught exception ", e); } one = dataQueue.getFirst(); // regular data packet SpanId[] parents = one.getTraceParents(); if (parents.length > 0) { scope = dfsClient.getTracer().newScope("dataStreamer", parents[0]); scope.getSpan().setParents(parents); } } } // get new block from namenode. 
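// Stage dispatch, in detail:
//  - PIPELINE_SETUP_CREATE: nextBlockOutputStream() asks the namenode for a
//    fresh block (addBlock) and connects to the first datanode it returns.
//  - PIPELINE_SETUP_APPEND: setupPipelineForAppendOrRecovery() reuses the
//    locations of the file's last partial block.
//  - PIPELINE_SETUP_SINGLE_BLOCK: setupPipelineForSingleBlock(lb) streams one
//    already-located block (used by the small-file and erasure-coding paths).
// In each case initDataStreaming() then starts the ResponseProcessor thread
// and switches the stage to DATA_STREAMING.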
if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) { if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Allocating new block"); } setPipeline(nextBlockOutputStream()); initDataStreaming(); } else if (stage == BlockConstructionStage.PIPELINE_SETUP_APPEND) { if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Append to block " + block); } setupPipelineForAppendOrRecovery(); initDataStreaming(); } else if (stage == BlockConstructionStage.PIPELINE_SETUP_SINGLE_BLOCK) { // TODO This is sent by protobuf and somehow a hack stage = BlockConstructionStage.PIPELINE_SETUP_CREATE; if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Send single block " + block); } setPipeline(lb); nodes = setupPipelineForSingleBlock(lb); initDataStreaming(); } long lastByteOffsetInBlock = one.getLastByteOffsetBlock(); if (lastByteOffsetInBlock > stat.getBlockSize()) { throw new IOException("BlockSize " + stat.getBlockSize() + " is smaller than data size. " + " Offset of packet in block " + lastByteOffsetInBlock + " Aborting file " + src); } if (one.isLastPacketInBlock()) { // wait for all data packets have been successfully acked synchronized (dataQueue) { while (!streamerClosed && !hasError && ackQueue.size() != 0 && dfsClient.clientRunning) { try { // wait for acks to arrive from datanodes dataQueue.wait(1000); } catch (InterruptedException e) { DFSClient.LOG.warn("Caught exception ", e); } } } if (streamerClosed || hasError || !dfsClient.clientRunning) { continue; } stage = BlockConstructionStage.PIPELINE_CLOSE; } // send the packet SpanId spanId = SpanId.INVALID; synchronized (dataQueue) { // move packet from dataQueue to ackQueue if (!one.isHeartbeatPacket()) { if (scope != null) { spanId = scope.getSpanId(); scope.detach(); one.setTraceScope(scope); } scope = null; dataQueue.removeFirst(); ackQueue.addLast(one); dataQueue.notifyAll(); } } if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("DataStreamer block " + block + " sending packet " + one); } // write out data to remote datanode try (TraceScope ignored = dfsClient.getTracer().newScope("DataStreamer#writeTo", spanId)) { one.writeTo(blockStream); blockStream.flush(); } catch (IOException e) { // HDFS-3398 treat primary DN is down since client is unable to // write to primary DN. If a failed or restarting node has already // been recorded by the responder, the following call will have no // effect. Pipeline recovery can handle only one node error at a // time. If the primary node fails again during the recovery, it // will be taken out then. tryMarkPrimaryDatanodeFailed(); throw e; } lastPacket = Time.monotonicNow(); // update bytesSent long tmpBytesSent = one.getLastByteOffsetBlock(); if (bytesSent < tmpBytesSent) { bytesSent = tmpBytesSent; } if (streamerClosed || hasError || !dfsClient.clientRunning) { continue; } // Is this block full? if (one.isLastPacketInBlock()) { // wait for the close packet has been acked synchronized (dataQueue) { while (!streamerClosed && !hasError && ackQueue.size() != 0 && dfsClient.clientRunning) { dataQueue.wait(1000);// wait for acks to arrive from datanodes } } if (streamerClosed || hasError || !dfsClient.clientRunning) { continue; } endBlock(); } if (progress != null) { progress.progress(); } // This is used by unit test to trigger race conditions. if (artificialSlowdown != 0 && dfsClient.clientRunning) { Thread.sleep(artificialSlowdown); } } catch (Throwable e) { // Log warning if there was a real error. 
if (restartingNodeIndex.get() == -1) { // Since their messages are descriptive enough, do not always // log a verbose stack-trace WARN for quota exceptions. if (e instanceof QuotaExceededException) { DFSClient.LOG.debug("DataStreamer Quota Exception", e); } else { DFSClient.LOG.warn("DataStreamer Exception", e); } } if (e instanceof IOException) { setLastException((IOException) e); } else { setLastException(new IOException("DataStreamer Exception: ", e)); } hasError = true; if (errorIndex == -1 && restartingNodeIndex.get() == -1) { // Not a datanode issue streamerClosed = true; } } finally { if (scope != null) { scope.close(); scope = null; } } } closeInternal(); } private void closeInternal() { closeResponder(); // close and join closeStream(); streamerClosed = true; release(); synchronized (dataQueue) { dataQueue.notifyAll(); } } /** * release the DFSPackets in the two queues * */ void release() { synchronized (dataQueue) { releaseBuffer(dataQueue, byteArrayManager); releaseBuffer(ackQueue, byteArrayManager); } } /** * wait for the ack of seqno * * @param seqno the sequence number to be acked * @throws IOException */ void waitForAckedSeqno(long seqno) throws IOException { TraceScope scope = dfsClient.getTracer().newScope("waitForAckedSeqno"); try { if (canStoreFileInDB()) { LOG.debug( "Stuffed Inode: Closing File. Datanode ack skipped. All the data will be stored in the database"); } else { if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Waiting for ack for: " + seqno); } long begin = Time.monotonicNow(); try { synchronized (dataQueue) { while (!streamerClosed) { checkClosed(); if (lastAckedSeqno >= seqno) { break; } try { dataQueue.wait(1000); // when we receive an ack, we notify on // dataQueue } catch (InterruptedException ie) { throw new InterruptedIOException( "Interrupted while waiting for data to be acknowledged by pipeline"); } } } checkClosed(); } catch (ClosedChannelException e) { } long duration = Time.monotonicNow() - begin; if (duration > dfsclientSlowLogThresholdMs) { DFSClient.LOG.warn("Slow waitForAckedSeqno took " + duration + "ms (threshold=" + dfsclientSlowLogThresholdMs + "ms)"); } } } finally { scope.close(); } } /** * wait for space of dataQueue and queue the packet * * @param packet the DFSPacket to be queued * @throws IOException */ void waitAndQueuePacket(DFSPacket packet) throws IOException { synchronized (dataQueue) { try { // If queue is full, then wait till we have enough space boolean firstWait = true; try { while (!streamerClosed && dataQueue.size() + ackQueue.size() > dfsClient.getConf().writeMaxPackets) { if (firstWait) { Span span = Tracer.getCurrentSpan(); if (span != null) { span.addTimelineAnnotation("dataQueue.wait"); } firstWait = false; } try { dataQueue.wait(); } catch (InterruptedException e) { // If we get interrupted while waiting to queue data, we still need to get rid // of the current packet. This is because we have an invariant that if // currentPacket gets full, it will get queued before the next writeChunk. // // Rather than wait around for space in the queue, we should instead try to // return to the caller as soon as possible, even though we slightly overrun // the MAX_PACKETS length. 
Thread.currentThread().interrupt(); break; } } } finally { Span span = Tracer.getCurrentSpan(); if ((span != null) && (!firstWait)) { span.addTimelineAnnotation("end.wait"); } } checkClosed(); queuePacket(packet); } catch (ClosedChannelException e) { } } } /* * close the streamer, should be called only by an external thread * and only after all data to be sent has been flushed to datanode. * * Interrupt this data streamer if force is true * * @param force if this data stream is forced to be closed */ void close(boolean force) { streamerClosed = true; synchronized (dataQueue) { dataQueue.notifyAll(); } if (force) { this.interrupt(); } } private void checkClosed() throws IOException { if (streamerClosed) { IOException e = lastException.get(); throw e != null ? e : new ClosedChannelException(); } } private void closeResponder() { if (response != null) { try { response.close(); response.join(); } catch (InterruptedException e) { DFSClient.LOG.warn("Caught exception ", e); } finally { response = null; } } } private void closeStream() { if (blockStream != null) { try { blockStream.close(); } catch (IOException e) { setLastException(e); } finally { blockStream = null; } } if (blockReplyStream != null) { try { blockReplyStream.close(); } catch (IOException e) { setLastException(e); } finally { blockReplyStream = null; } } if (null != s) { try { s.close(); } catch (IOException e) { setLastException(e); } finally { s = null; } } } // The following synchronized methods are used whenever // errorIndex or restartingNodeIndex is set. This is because // check & set needs to be atomic. Simply reading variables // does not require a synchronization. When responder is // not running (e.g. during pipeline recovery), there is no // need to use these methods. /** Set the error node index. Called by responder */ synchronized void setErrorIndex(int idx) { errorIndex = idx; } /** Set the restarting node index. Called by responder */ synchronized void setRestartingNodeIndex(int idx) { restartingNodeIndex.set(idx); // If the data streamer has already set the primary node // bad, clear it. It is likely that the write failed due to // the DN shutdown. Even if it was a real failure, the pipeline // recovery will take care of it. errorIndex = -1; } /** * This method is used when no explicit error report was received, * but something failed. When the primary node is a suspect or * unsure about the cause, the primary node is marked as failed. */ synchronized void tryMarkPrimaryDatanodeFailed() { // There should be no existing error and no ongoing restart. if ((errorIndex == -1) && (restartingNodeIndex.get() == -1)) { errorIndex = 0; } } /** * Examine whether it is worth waiting for a node to restart. * @param index the node index */ boolean shouldWaitForRestart(int index) { // Only one node in the pipeline. if (nodes.length == 1) { return true; } // Is it a local node? InetAddress addr = null; try { addr = InetAddress.getByName(nodes[index].getIpAddr()); } catch (java.net.UnknownHostException e) { // we are passing an ip address. this should not happen. assert false; } if (addr != null && NetUtils.isLocalAddress(addr)) { return true; } return false; } // // Processes responses from the datanodes. A packet is removed // from the ackQueue when its response arrives. 
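// Each PipelineAck read from blockReplyStream carries one reply per datanode
// plus per-node flags: an ECN flag marks congested nodes (the streamer later
// backs off in backOffIfNecessary()), and a restart OOB status arms a restart
// deadline instead of being treated as a plain failure. Heartbeat acks
// (HEART_BEAT_SEQNO) are ignored. On a successful data ack the packet is
// removed from the ackQueue, lastAckedSeqno advances, and threads waiting on
// the dataQueue lock are notified.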
// private class ResponseProcessor extends Daemon { private volatile boolean responderClosed = false; private DatanodeInfo[] targets = null; private boolean isLastPacketInBlock = false; ResponseProcessor(DatanodeInfo[] targets) { this.targets = targets; } @Override public void run() { setName("ResponseProcessor for block " + block); PipelineAck ack = new PipelineAck(); TraceScope scope = null; while (!responderClosed && dfsClient.clientRunning && !isLastPacketInBlock) { // process responses from datanodes. try { // read an ack from the pipeline long begin = Time.monotonicNow(); ack.readFields(blockReplyStream); long duration = Time.monotonicNow() - begin; if (duration > dfsclientSlowLogThresholdMs && ack.getSeqno() != DFSPacket.HEART_BEAT_SEQNO) { DFSClient.LOG.warn("Slow ReadProcessor read fields took " + duration + "ms (threshold=" + dfsclientSlowLogThresholdMs + "ms); ack: " + ack + ", targets: " + Arrays.asList(targets)); } else if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("DFSClient " + ack); } long seqno = ack.getSeqno(); // processes response status from datanodes. ArrayList<DatanodeInfo> congestedNodesFromAck = new ArrayList<>(); for (int i = ack.getNumOfReplies() - 1; i >= 0 && dfsClient.clientRunning; i--) { final Status reply = PipelineAck.getStatusFromHeader(ack.getHeaderFlag(i)); if (PipelineAck.getECNFromHeader(ack.getHeaderFlag(i)) == PipelineAck.ECN.CONGESTED) { congestedNodesFromAck.add(targets[i]); } // Restart will not be treated differently unless it is // the local node or the only one in the pipeline. if (PipelineAck.isRestartOOBStatus(reply) && shouldWaitForRestart(i)) { restartDeadline = dfsClient.getConf().datanodeRestartTimeout + Time.monotonicNow(); setRestartingNodeIndex(i); String message = "A datanode is restarting: " + targets[i]; DFSClient.LOG.info(message); throw new IOException(message); } // node error if (reply != SUCCESS) { setErrorIndex(i); // first bad datanode throw new IOException("Bad response " + reply + " for block " + block + " from datanode " + targets[i]); } } if (!congestedNodesFromAck.isEmpty()) { synchronized (congestedNodes) { congestedNodes.clear(); congestedNodes.addAll(congestedNodesFromAck); } } else { synchronized (congestedNodes) { congestedNodes.clear(); lastCongestionBackoffTime = 0; } } assert seqno != PipelineAck.UNKOWN_SEQNO : "Ack for unknown seqno should be a failed ack: " + ack; if (seqno == DFSPacket.HEART_BEAT_SEQNO) { // a heartbeat ack continue; } // a success ack for a data packet DFSPacket one; synchronized (dataQueue) { one = ackQueue.getFirst(); } if (one.getSeqno() != seqno) { throw new IOException("ResponseProcessor: Expecting seqno " + " for block " + block + one.getSeqno() + " but received " + seqno); } isLastPacketInBlock = one.isLastPacketInBlock(); // Fail the packet write for testing in order to force a // pipeline recovery. if (DFSClientFaultInjector.get().failPacket() && isLastPacketInBlock) { failPacket = true; throw new IOException("Failing the last packet for testing."); } // update bytesAcked block.setNumBytes(one.getLastByteOffsetBlock()); synchronized (dataQueue) { scope = one.getTraceScope(); if (scope != null) { scope.reattach(); one.setTraceScope(null); } lastAckedSeqno = seqno; ackQueue.removeFirst(); dataQueue.notifyAll(); one.releaseBuffer(byteArrayManager); } } catch (Exception e) { if (!responderClosed) { if (e instanceof IOException) { setLastException((IOException) e); } hasError = true; // If no explicit error report was received, mark the primary // node as failed. 
tryMarkPrimaryDatanodeFailed(); synchronized (dataQueue) { dataQueue.notifyAll(); } if (restartingNodeIndex.get() == -1) { DFSClient.LOG.warn("DataStreamer ResponseProcessor exception " + " for block " + block, e); } responderClosed = true; } } finally { if (scope != null) { scope.close(); } scope = null; } } } void close() { responderClosed = true; this.interrupt(); } } // If this stream has encountered any errors so far, shutdown // threads and mark stream as closed. Returns true if we should // sleep for a while after returning from this call. // private boolean processDatanodeError() throws IOException { if (response != null) { DFSClient.LOG.info("Error Recovery for " + block + " waiting for responder to exit. "); return true; } closeStream(); // move packets from ack queue to front of the data queue synchronized (dataQueue) { dataQueue.addAll(0, ackQueue); ackQueue.clear(); } // Record the new pipeline failure recovery. if (lastAckedSeqnoBeforeFailure != lastAckedSeqno) { lastAckedSeqnoBeforeFailure = lastAckedSeqno; pipelineRecoveryCount = 1; } else { // If we had to recover the pipeline five times in a row for the // same packet, this client likely has corrupt data or corrupting // during transmission. if (++pipelineRecoveryCount > 5) { DFSClient.LOG.warn("Error recovering pipeline for writing " + block + ". Already retried 5 times for the same packet."); lastException.set( new IOException("Failing write. Tried pipeline " + "recovery 5 times without success.")); streamerClosed = true; return false; } } boolean doSleep = setupPipelineForAppendOrRecovery(); if (!streamerClosed && dfsClient.clientRunning) { if (stage == BlockConstructionStage.PIPELINE_CLOSE) { // If we had an error while closing the pipeline, we go through a fast-path // where the BlockReceiver does not run. Instead, the DataNode just finalizes // the block immediately during the 'connect ack' process. So, we want to pull // the end-of-block packet from the dataQueue, since we don't actually have // a true pipeline to send it over. // // We also need to set lastAckedSeqno to the end-of-block Packet's seqno, so that // a client waiting on close() will be aware that the flush finished. synchronized (dataQueue) { DFSPacket endOfBlockPacket = dataQueue.remove(); // remove the end of block packet TraceScope scope = endOfBlockPacket.getTraceScope(); if (scope != null) { scope.reattach(); scope.close(); endOfBlockPacket.setTraceScope(null); } assert endOfBlockPacket.isLastPacketInBlock(); assert lastAckedSeqno == endOfBlockPacket.getSeqno() - 1; lastAckedSeqno = endOfBlockPacket.getSeqno(); dataQueue.notifyAll(); } endBlock(); } else { initDataStreaming(); } } return doSleep; } void setHflush() { isHflushed = true; } private int findNewDatanode(final DatanodeInfo[] original) throws IOException { if (nodes.length != original.length + 1) { throw new IOException(new StringBuilder() .append("Failed to replace a bad datanode on the existing pipeline ") .append("due to no more good datanodes being available to try. ").append("(Nodes: current=") .append(Arrays.asList(nodes)).append(", original=").append(Arrays.asList(original)) .append("). 
").append("The current failed datanode replacement policy is ") .append(dfsClient.dtpReplaceDatanodeOnFailure).append(", and ") .append("a client may configure this via '") .append(DFSConfigKeys.DFS_CLIENT_WRITE_REPLACE_DATANODE_ON_FAILURE_POLICY_KEY) .append("' in its configuration.").toString()); } for (int i = 0; i < nodes.length; i++) { int j = 0; for (; j < original.length && !nodes[i].equals(original[j]); j++) ; if (j == original.length) { return i; } } throw new IOException("Failed: new datanode not found: nodes=" + Arrays.asList(nodes) + ", original=" + Arrays.asList(original)); } private void addDatanode2ExistingPipeline() throws IOException { if (DataTransferProtocol.LOG.isDebugEnabled()) { DataTransferProtocol.LOG.debug("lastAckedSeqno = " + lastAckedSeqno); } /* * Is data transfer necessary? We have the following cases. * * Case 1: Failure in Pipeline Setup * - Append * + Transfer the stored replica, which may be a RBW or a finalized. * - Create * + If no data, then no transfer is required. * + If there are data written, transfer RBW. This case may happens * when there are streaming failure earlier in this pipeline. * * Case 2: Failure in Streaming * - Append/Create: * + transfer RBW * * Case 3: Failure in Close * - Append/Create: * + no transfer, let NameNode replicates the block. */ if (!isAppend && lastAckedSeqno < 0 && stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) { //no data have been written return; } else if (stage == BlockConstructionStage.PIPELINE_CLOSE || stage == BlockConstructionStage.PIPELINE_CLOSE_RECOVERY) { //pipeline is closing return; } //get a new datanode final DatanodeInfo[] original = nodes; final LocatedBlock lb = dfsClient.namenode.getAdditionalDatanode(src, stat.getFileId(), block, nodes, storageIDs, failed.toArray(new DatanodeInfo[failed.size()]), 1, dfsClient.clientName); setPipeline(lb); //find the new datanode final int d = findNewDatanode(original); //transfer replica final DatanodeInfo src = d == 0 ? nodes[1] : nodes[d - 1]; final DatanodeInfo[] targets = { nodes[d] }; final StorageType[] targetStorageTypes = { storageTypes[d] }; transfer(src, targets, targetStorageTypes, lb.getBlockToken()); } private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets, final StorageType[] targetStorageTypes, final Token<BlockTokenIdentifier> blockToken) throws IOException { //transfer replica to the new datanode Socket sock = null; DataOutputStream out = null; DataInputStream in = null; try { sock = createSocketForPipeline(src, 2, dfsClient); final long writeTimeout = dfsClient.getDatanodeWriteTimeout(2); OutputStream unbufOut = NetUtils.getOutputStream(sock, writeTimeout); InputStream unbufIn = NetUtils.getInputStream(sock); IOStreamPair saslStreams = dfsClient.saslClient.socketSend(sock, unbufOut, unbufIn, dfsClient, blockToken, src); unbufOut = saslStreams.out; unbufIn = saslStreams.in; out = new DataOutputStream(new BufferedOutputStream(unbufOut, HdfsConstants.SMALL_BUFFER_SIZE)); in = new DataInputStream(unbufIn); //send the TRANSFER_BLOCK request new Sender(out).transferBlock(block, blockToken, dfsClient.clientName, targets, targetStorageTypes); out.flush(); //ack BlockOpResponseProto response = BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in)); if (SUCCESS != response.getStatus()) { throw new IOException("Failed to add a datanode"); } } finally { IOUtils.closeStream(in); IOUtils.closeStream(out); IOUtils.closeSocket(sock); } } /** * Open a DataStreamer to a DataNode pipeline so that * it can be written to. 
* This happens when a file is appended or data streaming fails * It keeps on trying until a pipeline is setup */ private boolean setupPipelineForAppendOrRecovery() throws IOException { // check number of datanodes if (nodes == null || nodes.length == 0) { String msg = "Could not get block locations. " + "Source file \"" + src + "\" - Aborting..."; DFSClient.LOG.warn(msg); setLastException(new IOException(msg)); streamerClosed = true; return false; } boolean success = false; long newGS = 0L; while (!success && !streamerClosed && dfsClient.clientRunning) { // Sleep before reconnect if a dn is restarting. // This process will be repeated until the deadline or the datanode // starts back up. if (restartingNodeIndex.get() >= 0) { // 4 seconds or the configured deadline period, whichever is shorter. // This is the retry interval and recovery will be retried in this // interval until timeout or success. long delay = Math.min(dfsClient.getConf().datanodeRestartTimeout, 4000L); try { Thread.sleep(delay); } catch (InterruptedException ie) { lastException.set(new IOException("Interrupted while waiting for " + "datanode to restart. " + nodes[restartingNodeIndex.get()])); streamerClosed = true; return false; } } boolean isRecovery = hasError; // remove bad datanode from list of datanodes. // If errorIndex was not set (i.e. appends), then do not remove // any datanodes // if (errorIndex >= 0) { StringBuilder pipelineMsg = new StringBuilder(); for (int j = 0; j < nodes.length; j++) { pipelineMsg.append(nodes[j]); if (j < nodes.length - 1) { pipelineMsg.append(", "); } } if (nodes.length <= 1) { lastException.set(new IOException("All datanodes " + pipelineMsg + " are bad. Aborting...")); streamerClosed = true; return false; } DFSClient.LOG.warn("Error Recovery for block " + block + " in pipeline " + pipelineMsg + ": bad datanode " + nodes[errorIndex]); failed.add(nodes[errorIndex]); DatanodeInfo[] newnodes = new DatanodeInfo[nodes.length - 1]; arraycopy(nodes, newnodes, errorIndex); final StorageType[] newStorageTypes = new StorageType[newnodes.length]; arraycopy(storageTypes, newStorageTypes, errorIndex); final String[] newStorageIDs = new String[newnodes.length]; arraycopy(storageIDs, newStorageIDs, errorIndex); setPipeline(newnodes, newStorageTypes, newStorageIDs); // Just took care of a node error while waiting for a node restart if (restartingNodeIndex.get() >= 0) { // If the error came from a node further away than the restarting // node, the restart must have been complete. if (errorIndex > restartingNodeIndex.get()) { restartingNodeIndex.set(-1); } else if (errorIndex < restartingNodeIndex.get()) { // the node index has shifted. restartingNodeIndex.decrementAndGet(); } else { // this shouldn't happen... assert false; } } if (restartingNodeIndex.get() == -1) { hasError = false; } lastException.set(null); errorIndex = -1; } // Check if replace-datanode policy is satisfied. if (dfsClient.dtpReplaceDatanodeOnFailure.satisfy(stat.getReplication(), nodes, isAppend, isHflushed)) { try { addDatanode2ExistingPipeline(); } catch (IOException ioe) { if (!dfsClient.dtpReplaceDatanodeOnFailure.isBestEffort()) { throw ioe; } DFSClient.LOG .warn("Failed to replace datanode." 
+ " Continue with the remaining datanodes since " + DFSConfigKeys.DFS_CLIENT_WRITE_REPLACE_DATANODE_ON_FAILURE_BEST_EFFORT_KEY + " is set to true.", ioe); } } // get a new generation stamp and an access token LocatedBlock lb = dfsClient.namenode.updateBlockForPipeline(block, dfsClient.clientName); newGS = lb.getBlock().getGenerationStamp(); accessToken = lb.getBlockToken(); // set up the pipeline again with the remaining nodes if (failPacket) { // for testing success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery); failPacket = false; try { // Give DNs time to send in bad reports. In real situations, // good reports should follow bad ones, if client committed // with those nodes. Thread.sleep(2000); } catch (InterruptedException ie) { } } else { success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery); } if (restartingNodeIndex.get() >= 0) { assert hasError == true; // check errorIndex set above if (errorIndex == restartingNodeIndex.get()) { // ignore, if came from the restarting node errorIndex = -1; } // still within the deadline if (Time.monotonicNow() < restartDeadline) { continue; // with in the deadline } // expired. declare the restarting node dead restartDeadline = 0; int expiredNodeIndex = restartingNodeIndex.get(); restartingNodeIndex.set(-1); DFSClient.LOG.warn("Datanode did not restart in time: " + nodes[expiredNodeIndex]); // Mark the restarting node as failed. If there is any other failed // node during the last pipeline construction attempt, it will not be // overwritten/dropped. In this case, the restarting node will get // excluded in the following attempt, if it still does not come up. if (errorIndex == -1) { errorIndex = expiredNodeIndex; } // From this point on, normal pipeline recovery applies. } } // while if (success) { // update pipeline at the namenode ExtendedBlock newBlock = new ExtendedBlock(block.getBlockPoolId(), block.getBlockId(), block.getNumBytes(), newGS); dfsClient.namenode.updatePipeline(dfsClient.clientName, block, newBlock, nodes, storageIDs); // update client side generation stamp block = newBlock; } return false; // do not sleep, continue processing } /** * Open a DataStreamer to a DataNode so that it can be written to. * This happens when a file is created and each time a new block is allocated. * Must get block ID and the IDs of the destinations from the namenode. * Returns the list of target datanodes. 
*/ private LocatedBlock nextBlockOutputStream() throws IOException { LocatedBlock lb = null; DatanodeInfo[] nodes = null; StorageType[] storageTypes = null; int count = dfsClient.getConf().nBlockWriteRetry; boolean success = false; ExtendedBlock oldBlock = block; do { hasError = false; lastException.set(null); errorIndex = -1; success = false; DatanodeInfo[] excluded; long startTime = Time.now(); if ((erasureCodingSourceStream && currentBlockIndex % stripeLength == 0)) { usedNodes.clear(); LOG.info("Stripe length " + stripeLength + " parity length " + parityLength); LOG.info("Source write block index " + currentBlockIndex); } if (erasureCodingParityStream && currentBlockIndex % parityLength == 0) { usedNodes.clear(); stripeNodes.clear(); int stripe = (int) Math.ceil(currentBlockIndex / (float) parityLength); int index = stripe * stripeLength; LOG.info("Stripe length " + stripeLength + " parity length " + parityLength); LOG.info("Parity write block index " + currentBlockIndex + " found index " + index + " end " + (index + stripeLength)); for (int j = index; j < sourceBlocks.size() && j < index + stripeLength; j++) { DatanodeInfo[] nodeInfos = sourceBlocks.get(j).getLocations(); Collections.addAll(stripeNodes, nodeInfos); } } if (erasureCodingSourceStream || erasureCodingParityStream) { ImmutableSet<DatanodeInfo> excludedSet = excludedNodes.getAllPresent(excludedNodes.asMap().keySet()) .keySet(); excluded = new DatanodeInfo[excludedSet.size() + usedNodes.size() + stripeNodes.size() + parityStripeNodes.size()]; int i = 0; for (DatanodeInfo node : excludedSet) { excluded[i] = node; LOG.info("Excluding node " + node); i++; } for (DatanodeInfo node : usedNodes) { excluded[i] = node; LOG.info((erasureCodingSourceStream ? "Source stream: " : " Parity stream: ") + "Block " + currentBlockIndex + " excluding used node " + node); i++; } for (DatanodeInfo node : stripeNodes) { excluded[i] = node; LOG.info((erasureCodingSourceStream ? "Source stream: " : " Parity stream: ") + "Block " + currentBlockIndex + " excluding stripe node " + node); i++; } for (DatanodeInfo node : parityStripeNodes) { excluded[i] = node; LOG.info((erasureCodingSourceStream ? "Source stream: " : " Parity stream: ") + "Block " + currentBlockIndex + " excluding parity node " + node); i++; } currentBlockIndex++; } else { excluded = excludedNodes.getAllPresent(excludedNodes.asMap().keySet()).keySet() .toArray(new DatanodeInfo[0]); } block = oldBlock; lb = locateFollowingBlock(excluded.length > 0 ? excluded : null); block = lb.getBlock(); block.setNumBytes(0); bytesSent = 0; accessToken = lb.getBlockToken(); nodes = lb.getLocations(); storageTypes = lb.getStorageTypes(); // // Connect to first DataNode in the list. // success = createBlockOutputStream(nodes, storageTypes, 0L, false); if (!success) { DFSClient.LOG.info("Abandoning " + block); dfsClient.namenode.abandonBlock(block, stat.getFileId(), src, dfsClient.clientName); block = null; DFSClient.LOG.info("Excluding datanode " + nodes[errorIndex]); excludedNodes.put(nodes[errorIndex], nodes[errorIndex]); } } while (!success && --count >= 0); if (!success) { throw new IOException("Unable to create new block."); } if (erasureCodingSourceStream || erasureCodingParityStream) { Collections.addAll(usedNodes, nodes); } return lb; } // connects to the first datanode in the pipeline // Returns true if success, otherwise return failure. 
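// The connection is made in three steps: open a socket to nodes[0] and wrap
// it in SASL-negotiated streams, send a Sender#writeBlock request describing
// the whole pipeline (block token, generation stamp, checksum, caching
// strategy, favored-node pinnings), then block on the BlockOpResponseProto
// connect-ack. A non-SUCCESS status or a non-empty firstBadLink identifies
// the datanode to blame via errorIndex, while a restart OOB status arms the
// restart deadline instead.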
// private boolean createBlockOutputStream(DatanodeInfo[] nodes, StorageType[] nodeStorageTypes, long newGS, boolean recoveryFlag) { if (nodes.length == 0) { DFSClient.LOG.info("nodes are empty for write pipeline of block " + block); return false; } Status pipelineStatus = SUCCESS; String firstBadLink = ""; boolean checkRestart = false; if (DFSClient.LOG.isDebugEnabled()) { for (int i = 0; i < nodes.length; i++) { DFSClient.LOG.debug("pipeline = " + nodes[i]); } } // persist blocks on namenode on next flush persistBlocks.set(true); int refetchEncryptionKey = 1; while (true) { boolean result = false; DataOutputStream out = null; try { assert null == s : "Previous socket unclosed"; assert null == blockReplyStream : "Previous blockReplyStream unclosed"; s = createSocketForPipeline(nodes[0], nodes.length, dfsClient); long writeTimeout = dfsClient.getDatanodeWriteTimeout(nodes.length); OutputStream unbufOut = NetUtils.getOutputStream(s, writeTimeout); InputStream unbufIn = NetUtils.getInputStream(s); IOStreamPair saslStreams = dfsClient.saslClient.socketSend(s, unbufOut, unbufIn, dfsClient, accessToken, nodes[0]); unbufOut = saslStreams.out; unbufIn = saslStreams.in; out = new DataOutputStream(new BufferedOutputStream(unbufOut, HdfsConstants.SMALL_BUFFER_SIZE)); blockReplyStream = new DataInputStream(unbufIn); // // Xmit header info to datanode // BlockConstructionStage bcs = recoveryFlag ? stage.getRecoveryStage() : stage; // We cannot change the block length in 'block' as it counts the number // of bytes ack'ed. ExtendedBlock blockCopy = new ExtendedBlock(block); blockCopy.setNumBytes(stat.getBlockSize()); boolean[] targetPinnings = getPinnings(nodes, true); // send the request new Sender(out).writeBlock(blockCopy, nodeStorageTypes[0], accessToken, dfsClient.clientName, nodes, nodeStorageTypes, null, bcs, nodes.length, block.getNumBytes(), bytesSent, newGS, checksum, cachingStrategy.get(), (targetPinnings == null ? false : targetPinnings[0]), targetPinnings); // receive ack for connect BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(blockReplyStream)); pipelineStatus = resp.getStatus(); firstBadLink = resp.getFirstBadLink(); // Got an restart OOB ack. // If a node is already restarting, this status is not likely from // the same node. If it is from a different node, it is not // from the local datanode. Thus it is safe to treat this as a // regular node error. if (PipelineAck.isRestartOOBStatus(pipelineStatus) && restartingNodeIndex.get() == -1) { checkRestart = true; throw new IOException("A datanode is restarting."); } String logInfo = "ack with firstBadLink as " + firstBadLink; DataTransferProtoUtil.checkBlockOpStatus(resp, logInfo); assert null == blockStream : "Previous blockStream unclosed"; blockStream = out; result = true; // success restartingNodeIndex.set(-1); hasError = false; } catch (IOException ie) { if (restartingNodeIndex.get() == -1) { DFSClient.LOG.info("Exception in createBlockOutputStream", ie); } if (ie instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) { DFSClient.LOG.info("Will fetch a new encryption key and retry, " + "encryption key was invalid when connecting to " + nodes[0] + " : " + ie); // The encryption key used is invalid. refetchEncryptionKey--; dfsClient.clearDataEncryptionKey(); // Don't close the socket/exclude this node just yet. Try again with // a new encryption key. 
continue; } // find the datanode that matches if (firstBadLink.length() != 0) { for (int i = 0; i < nodes.length; i++) { // NB: Unconditionally using the xfer addr w/o hostname if (firstBadLink.equals(nodes[i].getXferAddr())) { errorIndex = i; break; } } } else { assert checkRestart == false; errorIndex = 0; } // Check whether there is a restart worth waiting for. if (checkRestart && shouldWaitForRestart(errorIndex)) { restartDeadline = dfsClient.getConf().datanodeRestartTimeout + Time.monotonicNow(); restartingNodeIndex.set(errorIndex); errorIndex = -1; DFSClient.LOG .info("Waiting for the datanode to be restarted: " + nodes[restartingNodeIndex.get()]); } hasError = true; setLastException(ie); result = false; // error } finally { if (!result) { IOUtils.closeSocket(s); s = null; IOUtils.closeStream(out); out = null; IOUtils.closeStream(blockReplyStream); blockReplyStream = null; } } return result; } } private boolean[] getPinnings(DatanodeInfo[] nodes, boolean shouldLog) { if (favoredNodes == null) { return null; } else { boolean[] pinnings = new boolean[nodes.length]; HashSet<String> favoredSet = new HashSet<String>(Arrays.asList(favoredNodes)); for (int i = 0; i < nodes.length; i++) { pinnings[i] = favoredSet.remove(nodes[i].getXferAddrWithHostname()); if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug(nodes[i].getXferAddrWithHostname() + " was chosen by name node (favored=" + pinnings[i] + ")."); } } if (shouldLog && !favoredSet.isEmpty()) { // There is one or more favored nodes that were not allocated. DFSClient.LOG.warn("These favored nodes were specified but not chosen: " + favoredSet + " Specified favored nodes: " + Arrays.toString(favoredNodes)); } return pinnings; } } protected LocatedBlock locateFollowingBlock(DatanodeInfo[] excludedNodes) throws IOException { int retries = dfsClient.getConf().nBlockWriteLocateFollowingRetry; long sleeptime = dfsClient.getConf().blockWriteLocateFollowingInitialDelayMs; while (true) { long localstart = Time.monotonicNow(); while (true) { try { return dfsClient.namenode.addBlock(src, dfsClient.clientName, block, excludedNodes, stat.getFileId(), favoredNodes); } catch (RemoteException e) { IOException ue = e.unwrapRemoteException(FileNotFoundException.class, AccessControlException.class, NSQuotaExceededException.class, DSQuotaExceededException.class, UnresolvedPathException.class); if (ue != e) { throw ue; // no need to retry these exceptions } if (NotReplicatedYetException.class.getName().equals(e.getClassName())) { if (retries == 0) { throw e; } else { --retries; DFSClient.LOG.info("Exception while adding a block", e); long elapsed = Time.monotonicNow() - localstart; if (elapsed > 5000) { DFSClient.LOG.info("Waiting for replication for " + (elapsed / 1000) + " seconds"); } try { DFSClient.LOG.warn( "NotReplicatedYetException sleeping " + src + " retries left " + retries); Thread.sleep(sleeptime); sleeptime *= 2; } catch (InterruptedException ie) { DFSClient.LOG.warn("Caught exception ", ie); } } } else { throw e; } } } } } /** * This function sleeps for a certain amount of time when the writing * pipeline is congested. The function calculates the time based on a * decorrelated filter. 
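 *
 * Concretely, with mean = 5000 ms and cap = 50000 ms:
 *   base  = min(3 * lastBackoff, mean)
 *   range = |3 * lastBackoff - mean|
 *   sleep = min(cap, base + random() * range)
 * So the first congested ack (lastBackoff = 0) sleeps somewhere in [0, 5 s),
 * and a repeat with lastBackoff = 4000 ms sleeps in [5 s, 12 s).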
* * @see * <a href="http://www.awsarchitectureblog.com/2015/03/backoff.html"> * http://www.awsarchitectureblog.com/2015/03/backoff.html</a> */ private void backOffIfNecessary() throws InterruptedException { int t = 0; synchronized (congestedNodes) { if (!congestedNodes.isEmpty()) { StringBuilder sb = new StringBuilder("DataNode"); for (DatanodeInfo i : congestedNodes) { sb.append(' ').append(i); } int range = Math.abs(lastCongestionBackoffTime * 3 - CONGESTION_BACKOFF_MEAN_TIME_IN_MS); int base = Math.min(lastCongestionBackoffTime * 3, CONGESTION_BACKOFF_MEAN_TIME_IN_MS); t = Math.min(CONGESTION_BACK_OFF_MAX_TIME_IN_MS, (int) (base + Math.random() * range)); lastCongestionBackoffTime = t; sb.append(" are congested. Backing off for ").append(t).append(" ms"); DFSClient.LOG.info(sb.toString()); congestedNodes.clear(); } } if (t != 0) { Thread.sleep(t); } } /** * get the block this streamer is writing to * * @return the block this streamer is writing to */ ExtendedBlock getBlock() { return block; } /** * return the target datanodes in the pipeline * * @return the target datanodes in the pipeline */ DatanodeInfo[] getNodes() { return nodes; } /** * return the token of the block * * @return the token of the block */ Token<BlockTokenIdentifier> getBlockToken() { return accessToken; } /** * set last exception * * @param e an exception */ void setLastException(IOException e) { lastException.compareAndSet(null, e); } /** * Put a packet to the data queue * * @param packet the packet to be put into the data queued */ void queuePacket(DFSPacket packet) { synchronized (dataQueue) { if (packet == null) return; packet.addTraceParent(Tracer.getCurrentSpanId()); // put it is the small files buffer if (canStoreFileInDB() && (packet.getLastByteOffsetBlock() <= dbFileMaxSize)) { LOG.debug("Stuffed Inode: Temporarily withholding the packet in a buffer for small files"); smallFileDataQueue.addLast(packet); } else { //Some condition for storing the data in the database has failed. Store the data on the datanodes forwardSmallFilesPacketsToDataNodes(); dataQueue.addLast(packet); if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Queued packet " + packet.getSeqno()); } } lastQueuedSeqno = packet.getSeqno(); dataQueue.notifyAll(); } } /** * For heartbeat packets, create buffer directly by new byte[] * since heartbeats should not be blocked. 
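 * A heartbeat packet carries only a header (zero data bytes) and the reserved
 * HEART_BEAT_SEQNO, which the ResponseProcessor recognizes and discards
 * without touching the ackQueue.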
*/ private DFSPacket createHeartbeatPacket() throws InterruptedIOException { final byte[] buf = new byte[PacketHeader.PKT_MAX_HEADER_LEN]; return new DFSPacket(buf, 0, 0, DFSPacket.HEART_BEAT_SEQNO, 0, false); } private LoadingCache<DatanodeInfo, DatanodeInfo> initExcludedNodes() { return CacheBuilder.newBuilder() .expireAfterWrite(dfsClient.getConf().excludedNodesCacheExpiry, TimeUnit.MILLISECONDS) .removalListener(new RemovalListener<DatanodeInfo, DatanodeInfo>() { @Override public void onRemoval(RemovalNotification<DatanodeInfo, DatanodeInfo> notification) { DFSClient.LOG .info("Removing node " + notification.getKey() + " from the excluded nodes list"); } }).build(new CacheLoader<DatanodeInfo, DatanodeInfo>() { @Override public DatanodeInfo load(DatanodeInfo key) throws Exception { return key; } }); } private static <T> void arraycopy(T[] srcs, T[] dsts, int skipIndex) { System.arraycopy(srcs, 0, dsts, 0, skipIndex); System.arraycopy(srcs, skipIndex + 1, dsts, skipIndex, dsts.length - skipIndex); } /** * check if to persist blocks on namenode * * @return if to persist blocks on namenode */ AtomicBoolean getPersistBlocks() { return persistBlocks; } /** * check if to append a chunk * * @param appendChunk if to append a chunk */ void setAppendChunk(boolean appendChunk) { this.appendChunk = appendChunk; } /** * get if to append a chunk * * @return if to append a chunk */ boolean getAppendChunk() { return appendChunk; } /** * get the last exception * * @return the last exception */ AtomicReference<IOException> getLastException() { return lastException; } /** * set socket to null */ void setSocketToNull() { this.s = null; } /** * return current sequence number and then increase it by 1 * * @return current sequence number before increasing */ long getAndIncCurrentSeqno() { long old = this.currentSeqno; this.currentSeqno++; return old; } /** * get last queued sequence number * * @return last queued sequence number */ long getLastQueuedSeqno() { return lastQueuedSeqno; } /** * get the number of bytes of current block * * @return the number of bytes of current block */ long getBytesCurBlock() { return bytesCurBlock; } /** * set the bytes of current block that have been written * * @param bytesCurBlock bytes of current block that have been written */ void setBytesCurBlock(long bytesCurBlock) { this.bytesCurBlock = bytesCurBlock; } /** * increase bytes of current block by len. * * @param len how many bytes to increase to current block */ void incBytesCurBlock(long len) { this.bytesCurBlock += len; } /** * set artificial slow down for unit test * * @param period artificial slow down */ void setArtificialSlowdown(long period) { this.artificialSlowdown = period; } /** * if this streamer is to terminate * * @return if this streamer is to terminate */ boolean streamerClosed() { return streamerClosed; } void closeSocket() throws IOException { if (s != null) { s.close(); } } private DatanodeInfo[] setupPipelineForSingleBlock(LocatedBlock lb) throws IOException { DatanodeInfo[] nodes; int count = dfsClient.getConf().nBlockWriteRetry; boolean success; do { hasError = false; lastException.set(null); errorIndex = -1; block = lb.getBlock(); block.setNumBytes(0); bytesSent = 0; accessToken = lb.getBlockToken(); nodes = lb.getLocations(); // // Connect to first DataNode in the list. 
// success = createBlockOutputStream(nodes, storageTypes, 0L, false); if (!success) { // TODO Request another location from the NameNode } } while (!success && --count >= 0); if (!success) { throw new IOException("Unable to initiate single block send."); } return nodes; } public void enableSourceStream(int stripeLength) { this.erasureCodingSourceStream = true; this.stripeLength = stripeLength; } public void enableParityStream(int stripeLength, int parityLength, String sourceFile) throws IOException { this.erasureCodingParityStream = true; this.stripeLength = stripeLength; this.parityLength = parityLength; if (sourceFile != null) { this.sourceBlocks = new ArrayList( dfsClient.getLocatedBlocks(sourceFile, 0, Long.MAX_VALUE).getLocatedBlocks()); Collections.sort(sourceBlocks, LocatedBlock.blockIdComparator); } } public void setParityStripeNodesForNextStripe(Collection<DatanodeInfo> locations) { parityStripeNodes.clear(); parityStripeNodes.addAll(locations); } public Collection<DatanodeInfo> getUsedNodes() { return usedNodes; } public boolean canStoreFileInDB() { return isThisFileStoredInDB && !syncOrFlushCalled; } public void forwardSmallFilesPacketsToDataNodes() { // can not save the data in the database if (isThisFileStoredInDB) { LOG.debug("Stuffed Inode: The file can not be stored in the database"); isThisFileStoredInDB = false; if (!smallFileDataQueue.isEmpty()) { for (DFSPacket packet : smallFileDataQueue) { packet.addTraceParent(Tracer.getCurrentSpanId()); dataQueue.addLast(packet); if (DFSClient.LOG.isDebugEnabled()) { DFSClient.LOG.debug("Queued packet " + packet.getSeqno()); } } smallFileDataQueue.clear(); } } } public void syncOrFlushCalled() { syncOrFlushCalled = true; } public List<DFSPacket> getSmallFileDataQueue() { return smallFileDataQueue; } public void setFileStoredInDB(boolean isThisFileStoredInDB) { this.isThisFileStoredInDB = isThisFileStoredInDB; } }
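To make the dataQueue/ackQueue choreography described in the class comment easier to follow, here is a deliberately simplified, self-contained model of the handoff. It is not HDFS code; every name in it is invented for illustration, and it collapses pipelines, packets, and acks down to bare sequence numbers.

import java.util.LinkedList;

public class MiniStreamerModel {
  private final LinkedList<Long> dataQueue = new LinkedList<>(); // seqnos waiting to be sent
  private final LinkedList<Long> ackQueue = new LinkedList<>();  // seqnos sent but not yet acked
  private long lastAckedSeqno = -1;
  private volatile boolean closed = false;

  // Producer side: queue a packet, like DataStreamer#queuePacket.
  void queuePacket(long seqno) {
    synchronized (dataQueue) {
      dataQueue.addLast(seqno);
      dataQueue.notifyAll();
    }
  }

  // Streamer side: move the packet from dataQueue to ackQueue before "sending" it.
  Long takeForSending() throws InterruptedException {
    synchronized (dataQueue) {
      while (!closed && dataQueue.isEmpty()) {
        dataQueue.wait(1000);
      }
      if (dataQueue.isEmpty()) {
        return null;
      }
      Long seqno = dataQueue.removeFirst();
      ackQueue.addLast(seqno);
      return seqno;
    }
  }

  // Responder side: a successful ack retires the head of the ackQueue.
  void ackReceived(long seqno) {
    synchronized (dataQueue) {            // both queues share the dataQueue lock
      if (!ackQueue.isEmpty() && ackQueue.getFirst() == seqno) {
        ackQueue.removeFirst();
        lastAckedSeqno = seqno;
        dataQueue.notifyAll();            // wake waitForAckedSeqno()
      }
    }
  }

  // Error path: push everything in flight back to the front of the dataQueue.
  void recover() {
    synchronized (dataQueue) {
      dataQueue.addAll(0, ackQueue);
      ackQueue.clear();
      dataQueue.notifyAll();
    }
  }

  // Caller side: block until a given seqno has been acknowledged.
  void waitForAckedSeqno(long seqno) throws InterruptedException {
    synchronized (dataQueue) {
      while (!closed && lastAckedSeqno < seqno) {
        dataQueue.wait(1000);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    MiniStreamerModel m = new MiniStreamerModel();
    m.queuePacket(0);
    m.queuePacket(1);
    Long first = m.takeForSending(); // 0 moves to the ackQueue
    m.ackReceived(first);            // 0 is retired, lastAckedSeqno == 0
    m.waitForAckedSeqno(0);          // returns immediately
    System.out.println("acked up to 0, still queued: 1");
  }
}

The real class layers the pipeline setup stages, heartbeat packets, congestion backoff, and the recovery path that pushes the ackQueue back onto the dataQueue on top of this skeleton, but the locking discipline is the same: a single lock (the dataQueue) guards both queues, with notifyAll on every state change.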