/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.blockmanagement;

import com.google.common.primitives.Longs;
import io.hops.common.INodeUtil;
import io.hops.exception.StorageException;
import io.hops.exception.TransactionContextException;
import io.hops.exception.TransientStorageException;
import io.hops.leader_election.node.ActiveNode;
import io.hops.metadata.HdfsStorageFactory;
import io.hops.metadata.HdfsVariables;
import io.hops.metadata.blockmanagement.ExcessReplicasMap;
import io.hops.metadata.common.entity.Variable;
import io.hops.metadata.hdfs.dal.MisReplicatedRangeQueueDataAccess;
import io.hops.metadata.hdfs.entity.EncodingStatus;
import io.hops.metadata.hdfs.entity.HashBucket;
import io.hops.metadata.hdfs.entity.INodeIdentifier;
import io.hops.metadata.hdfs.entity.MisReplicatedRange;
import io.hops.metadata.security.token.block.NameNodeBlockTokenSecretManager;
import io.hops.transaction.EntityManager;
import io.hops.transaction.handler.HDFSOperationType;
import io.hops.transaction.handler.HopsTransactionalRequestHandler;
import io.hops.transaction.handler.LightWeightRequestHandler;
import io.hops.transaction.lock.LockFactory;
import io.hops.transaction.lock.TransactionLockTypes.INodeLockType;
import io.hops.transaction.lock.TransactionLockTypes.LockType;
import io.hops.transaction.lock.TransactionLocks;
import io.hops.util.Slicer;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
import org.apache.hadoop.hdfs.server.blockmanagement.CorruptReplicasMap.Reason;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo.AddBlockResult;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockReport;
import org.apache.hadoop.hdfs.server.protocol.Bucket;
import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage.State;
import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
import org.apache.hadoop.hdfs.util.LightWeightLinkedSet;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.Time;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;

import static io.hops.transaction.lock.LockFactory.BLK;

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.security.UserGroupInformation;

import static org.apache.hadoop.util.ExitUtil.terminate;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Keeps information related to the blocks stored in the Hadoop cluster.
 */
@InterfaceAudience.Private
public class BlockManager {

  public static final Logger LOG = LoggerFactory.getLogger(BlockManager.class);
  public static final Logger blockLog = NameNode.blockStateChangeLog;

  private final Namesystem namesystem;
  private final DatanodeManager datanodeManager;
  private final HeartbeatManager heartbeatManager;
  private final NameNodeBlockTokenSecretManager blockTokenSecretManager;

  private volatile long pendingReplicationBlocksCount = 0L;
  private volatile long corruptReplicaBlocksCount = 0L;
  private volatile long underReplicatedBlocksCount = 0L;
  private volatile long scheduledReplicationBlocksCount = 0L;
  private AtomicLong postponedMisreplicatedBlocksCount = new AtomicLong(0L);
  private final long startupDelayBlockDeletionInMs;

  private ExecutorService datanodeRemover = Executors.newSingleThreadExecutor();

  /** Used by metrics */
  public long getPendingReplicationBlocksCount() {
    return pendingReplicationBlocksCount;
  }

  /** Used by metrics */
  public long getUnderReplicatedBlocksCount() {
    return underReplicatedBlocksCount;
  }

  /** Used by metrics */
  public long getCorruptReplicaBlocksCount() {
    return corruptReplicaBlocksCount;
  }

  /** Used by metrics */
  public long getScheduledReplicationBlocksCount() {
    return scheduledReplicationBlocksCount;
  }

  /** Used by metrics */
  public long getPendingDeletionBlocksCount() throws IOException {
    return invalidateBlocks.numBlocks();
  }

  /** Used by metrics */
  public long getStartupDelayBlockDeletionInMs() {
    return startupDelayBlockDeletionInMs;
  }

  /** Used by metrics */
  public long getExcessBlocksCount() throws IOException {
    return excessReplicateMap.size();
  }

  /** Used by metrics */
  public long getPostponedMisreplicatedBlocksCount() {
    return postponedMisreplicatedBlocksCount.get();
  }

  /**
   * replicationRecheckInterval is how often namenode checks for new
   * replication work.
   */
  private final long replicationRecheckInterval;

  /**
   * Mapping: Block -> { BlockCollection, datanodes, self ref }
   * Updated only in response to client-sent information.
   */
  final BlocksMap blocksMap;

  /** Replication thread. */
  final Daemon replicationThread = new Daemon(new ReplicationMonitor());

  /** Store blocks -> datanodedescriptor(s) map of corrupt replicas. */
  final CorruptReplicasMap corruptReplicas;

  /** Blocks to be invalidated. */
  private final InvalidateBlocks invalidateBlocks;

  /**
   * After a failover, over-replicated blocks may not be handled
   * until all of the replicas have done a block report to the
   * new active. This is to make sure that this NameNode has been
   * notified of all block deletions that might have been pending
   * when the failover happened.
   */
  private final Set<Block> postponedMisreplicatedBlocks = Sets.newConcurrentHashSet();

  /**
   * Maps a StorageID to the set of blocks that are "extra" for this
   * DataNode. We'll eventually remove these extras.
   */
  public final ExcessReplicasMap excessReplicateMap;

  /**
   * Store set of Blocks that need to be replicated 1 or more times.
   * We also store pending replication-orders.
   */
  public final UnderReplicatedBlocks neededReplications = new UnderReplicatedBlocks();

  @VisibleForTesting
  final PendingReplicationBlocks pendingReplications;

  /** The maximum number of replicas allowed for a block. */
  public final short maxReplication;

  /**
   * The maximum number of outgoing replication streams a given node should
   * have at one time considering all but the highest priority replications
   * needed.
   */
  int maxReplicationStreams;

  /**
   * The maximum number of outgoing replication streams a given node should
   * have at one time.
   */
  int replicationStreamsHardLimit;

  /** Minimum copies needed or else write is disallowed. */
  public final short minReplication;

  /** Default number of replicas. */
  public final int defaultReplication;

  /** Value returned by MAX_CORRUPT_FILES_RETURNED. */
  final int maxCorruptFilesReturned;

  final float blocksInvalidateWorkPct;
  final int blocksReplWorkMultiplier;

  /** Variable to enable the check for enough racks. */
  final boolean shouldCheckForEnoughRacks;

  // Whether or not to issue block encryption keys.
  final boolean encryptDataTransfer;

  // Max number of blocks to log info about during a block report.
  private final long maxNumBlocksToLog;

  /**
   * Process replication queues asynchronously to allow namenode safemode exit
   * and failover to be faster. HDFS-5496
   */
  private Daemon replicationQueuesInitializer = null;

  /** Progress of the replication queues initialisation. */
  private double replicationQueuesInitProgress = 0.0;

  /** For block replica placement. */
  private BlockPlacementPolicy blockplacement;
  private final BlockStoragePolicySuite storagePolicySuite;

  /** Check whether the name system is running before terminating. */
  private boolean checkNSRunning = true;

  /** Number of blocks to process in one batch. */
  private final int slicerBatchSize;

  /** Number of batches to be processed by this namenode at one time. */
  private final int processMisReplicatedNoOfBatchs;

  /** Number of threads for slicers. */
  private final int slicerNbThreads;

  private final int numBuckets;
  private final int blockFetcherNBThreads;
  private final int blockFetcherBucketsPerThread;

  public BlockManager(final Namesystem namesystem, final Configuration conf)
      throws IOException {
    this.namesystem = namesystem;
    this.numBuckets = conf.getInt(DFSConfigKeys.DFS_NUM_BUCKETS_KEY,
        DFSConfigKeys.DFS_NUM_BUCKETS_DEFAULT);
    HashBuckets.initialize(numBuckets);

    this.blockFetcherNBThreads = conf.getInt(DFSConfigKeys.DFS_BLOCK_FETCHER_NB_THREADS,
        DFSConfigKeys.DFS_BLOCK_FETCHER_NB_THREADS_DEFAULT);
    this.blockFetcherBucketsPerThread = conf.getInt(
        DFSConfigKeys.DFS_BLOCK_FETCHER_BUCKETS_PER_THREAD,
        DFSConfigKeys.DFS_BLOCK_FETCHER_BUCKETS_PER_THREADS_DEFAULT);

    datanodeManager = new DatanodeManager(this, namesystem, conf);
    corruptReplicas = new CorruptReplicasMap(datanodeManager);
    heartbeatManager = datanodeManager.getHeartbeatManager();

    startupDelayBlockDeletionInMs = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_STARTUP_DELAY_BLOCK_DELETION_SEC_KEY,
        DFSConfigKeys.DFS_NAMENODE_STARTUP_DELAY_BLOCK_DELETION_SEC_DEFAULT) * 1000L;
    invalidateBlocks = new InvalidateBlocks(datanodeManager.blockInvalidateLimit,
        startupDelayBlockDeletionInMs);
    excessReplicateMap = new ExcessReplicasMap(datanodeManager);

    blocksMap = new BlocksMap(datanodeManager);
    blockplacement = BlockPlacementPolicy.getInstance(conf,
        datanodeManager.getFSClusterStats(),
        datanodeManager.getNetworkTopology(),
        datanodeManager.getHost2DatanodeMap());
    storagePolicySuite = BlockStoragePolicySuite.createDefaultSuite();
    pendingReplications = new PendingReplicationBlocks(
        conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
            DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_DEFAULT) * 1000L);

    blockTokenSecretManager = createBlockTokenSecretManager(conf);

    this.maxCorruptFilesReturned = conf.getInt(
        DFSConfigKeys.DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED_KEY,
        DFSConfigKeys.DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED);
    this.defaultReplication =
        conf.getInt(DFSConfigKeys.DFS_REPLICATION_KEY,
            DFSConfigKeys.DFS_REPLICATION_DEFAULT);

    final int maxR = conf.getInt(DFSConfigKeys.DFS_REPLICATION_MAX_KEY,
        DFSConfigKeys.DFS_REPLICATION_MAX_DEFAULT);
    final int minR = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY,
        DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
    if (minR <= 0) {
      throw new IOException("Unexpected configuration parameters: "
          + DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY + " = " + minR + " <= 0");
    }
    if (maxR > Short.MAX_VALUE) {
      throw new IOException("Unexpected configuration parameters: "
          + DFSConfigKeys.DFS_REPLICATION_MAX_KEY + " = " + maxR + " > " + Short.MAX_VALUE);
    }
    if (minR > maxR) {
      throw new IOException("Unexpected configuration parameters: "
          + DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY + " = " + minR + " > "
          + DFSConfigKeys.DFS_REPLICATION_MAX_KEY + " = " + maxR);
    }
    this.minReplication = (short) minR;
    this.maxReplication = (short) maxR;

    this.maxReplicationStreams = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY,
        DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_DEFAULT);
    this.replicationStreamsHardLimit = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_KEY,
        DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_DEFAULT);
    this.shouldCheckForEnoughRacks =
        conf.get(DFSConfigKeys.NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY) != null;
    this.blocksInvalidateWorkPct = DFSUtil.getInvalidateWorkPctPerIteration(conf);
    this.blocksReplWorkMultiplier = DFSUtil.getReplWorkMultiplier(conf);
    this.replicationRecheckInterval =
        conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
            DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_DEFAULT) * 1000L;
    this.encryptDataTransfer = conf.getBoolean(DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY,
        DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT);
    this.maxNumBlocksToLog = conf.getLong(DFSConfigKeys.DFS_MAX_NUM_BLOCKS_TO_LOG_KEY,
        DFSConfigKeys.DFS_MAX_NUM_BLOCKS_TO_LOG_DEFAULT);
    this.slicerBatchSize = conf.getInt(DFSConfigKeys.DFS_NAMENODE_SLICER_BATCH_SIZE,
        DFSConfigKeys.DFS_NAMENODE_SLICER_BATCH_SIZE_DEFAULT);
    this.processMisReplicatedNoOfBatchs = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_PROCESS_MISREPLICATED_NO_OF_BATCHS,
        DFSConfigKeys.DFS_NAMENODE_PROCESS_MISREPLICATED_NO_OF_BATCHS_DEFAULT);
    this.slicerNbThreads = conf.getInt(DFSConfigKeys.DFS_NAMENODE_SLICER_NB_OF_THREADS,
        DFSConfigKeys.DFS_NAMENODE_SLICER_NB_OF_THREADS_DEFAULT);

    LOG.info("defaultReplication = " + defaultReplication);
    LOG.info("maxReplication = " + maxReplication);
    LOG.info("minReplication = " + minReplication);
    LOG.info("maxReplicationStreams = " + maxReplicationStreams);
    LOG.info("shouldCheckForEnoughRacks = " + shouldCheckForEnoughRacks);
    LOG.info("replicationRecheckInterval = " + replicationRecheckInterval);
    LOG.info("encryptDataTransfer = " + encryptDataTransfer);
    LOG.info("maxNumBlocksToLog = " + maxNumBlocksToLog);
    LOG.info("slicerBatchSize = " + slicerBatchSize);
    LOG.info("misReplicatedNoOfBatchs = " + processMisReplicatedNoOfBatchs);
    LOG.info("slicerNbThreads = " + slicerNbThreads);
  }

  private NameNodeBlockTokenSecretManager createBlockTokenSecretManager(
      final Configuration conf) throws IOException {
    final boolean isEnabled = conf.getBoolean(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY,
        DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_DEFAULT);
    LOG.info(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY + "=" + isEnabled);

    if (!isEnabled) {
      if (UserGroupInformation.isSecurityEnabled()) {
        LOG.error("Security is enabled but block access tokens "
            + "(via " + DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY + ") "
            + "aren't enabled. This may cause issues "
            + "when clients attempt to talk to a DataNode.");
      }
      return null;
    }

    final long updateMin = conf.getLong(DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY,
        DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_DEFAULT);
    final long lifetimeMin = conf.getLong(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY,
        DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_DEFAULT);
    final String encryptionAlgorithm = conf.get(DFSConfigKeys.DFS_DATA_ENCRYPTION_ALGORITHM_KEY);
    LOG.info(DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY + "=" + updateMin
        + " min(s), " + DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY + "=" + lifetimeMin
        + " min(s), " + DFSConfigKeys.DFS_DATA_ENCRYPTION_ALGORITHM_KEY + "="
        + encryptionAlgorithm);

    return new NameNodeBlockTokenSecretManager(updateMin * 60 * 1000L,
        lifetimeMin * 60 * 1000L, null, encryptionAlgorithm, namesystem);
  }

  public BlockStoragePolicy getDefaultStoragePolicy() {
    return storagePolicySuite.getDefaultPolicy();
  }

  public BlockStoragePolicy getStoragePolicy(final String policyName) {
    return storagePolicySuite.getPolicy(policyName);
  }

  public BlockStoragePolicy getStoragePolicy(final byte policyId) {
    return storagePolicySuite.getPolicy(policyId);
  }

  public BlockStoragePolicy[] getStoragePolicies() {
    return storagePolicySuite.getAllPolicies();
  }

  public void setBlockPoolId(String blockPoolId) {
    if (isBlockTokenEnabled()) {
      blockTokenSecretManager.setBlockPoolId(blockPoolId);
    }
  }

  public BlockStoragePolicySuite getStoragePolicySuite() {
    return storagePolicySuite;
  }

  /** Get the BlockTokenSecretManager. */
  @VisibleForTesting
  public BlockTokenSecretManager getBlockTokenSecretManager() {
    return blockTokenSecretManager;
  }

  /** Allow silent termination of replication monitor for testing. */
  @VisibleForTesting
  void enableRMTerminationForTesting() {
    checkNSRunning = false;
  }

  private boolean isBlockTokenEnabled() {
    return blockTokenSecretManager != null;
  }

  /** Should the access keys be updated? */
  boolean shouldUpdateBlockKey(final long updateTime) throws IOException {
    return isBlockTokenEnabled() ? blockTokenSecretManager.updateKeys(updateTime) : false;
  }

  public void activate(Configuration conf) throws IOException {
    pendingReplications.start();
    datanodeManager.activate(conf);
    this.replicationThread.start();
    if (isBlockTokenEnabled()) {
      this.blockTokenSecretManager.generateKeysIfNeeded();
    }
  }

  public void close() {
    try {
      replicationThread.interrupt();
      replicationThread.join(3000);
    } catch (InterruptedException ie) {
    }
    datanodeManager.close();
    pendingReplications.stop();
    blocksMap.close();
  }

  /** @return the datanodeManager */
  public DatanodeManager getDatanodeManager() {
    return datanodeManager;
  }

  @VisibleForTesting
  public BlockPlacementPolicy getBlockPlacementPolicy() {
    return blockplacement;
  }

  /** Set BlockPlacementPolicy. */
  public void setBlockPlacementPolicy(BlockPlacementPolicy newpolicy) {
    if (newpolicy == null) {
      throw new HadoopIllegalArgumentException("newpolicy == null");
    }
    this.blockplacement = newpolicy;
  }

  /**
   * Dump the metadata for the given block in a human-readable
   * form.
*/ private void dumpBlockMeta(Block block, PrintWriter out) throws IOException { List<DatanodeDescriptor> containingNodes = new ArrayList<>(); List<DatanodeStorageInfo> containingLiveReplicasNodes = new ArrayList<DatanodeStorageInfo>(); NumberReplicas numReplicas = new NumberReplicas(); // source node returned is not used chooseSourceDatanode(block, containingNodes, containingLiveReplicasNodes, numReplicas, UnderReplicatedBlocks.LEVEL); // containingLiveReplicasNodes can include READ_ONLY_SHARED replicas which are // not included in the numReplicas.liveReplicas() count assert containingLiveReplicasNodes.size() >= numReplicas.liveReplicas(); int usableReplicas = numReplicas.liveReplicas() + numReplicas.decommissionedReplicas(); if (block instanceof BlockInfoContiguous) { BlockCollection bc = ((BlockInfoContiguous) block).getBlockCollection(); String fileName = (bc == null) ? "[orphaned]" : bc.getName(); out.print(fileName + ": "); } // l: == live:, d: == decommissioned c: == corrupt e: == excess out.print(block + ((usableReplicas > 0) ? "" : " MISSING") + " (replicas:" + " l: " + numReplicas.liveReplicas() + " d: " + numReplicas.decommissionedReplicas() + " c: " + numReplicas.corruptReplicas() + " e: " + numReplicas.excessReplicas() + ") "); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(getBlockInfo(block)); for (DatanodeStorageInfo storage : blocksMap.storageList(block)) { final DatanodeDescriptor node = storage.getDatanodeDescriptor(); String state = ""; if (corruptNodes != null && corruptNodes.contains(node)) { state = "(corrupt)"; } else if (node.isDecommissioned() || node.isDecommissionInProgress()) { state = "(decommissioned)"; } if (storage.areBlockContentsStale()) { state += " (block deletions maybe out of date)"; } out.print(" " + node + state + " : "); } out.println(""); } /** * @return maxReplicationStreams */ public int getMaxReplicationStreams() { return maxReplicationStreams; } /** * @param block * @return true if the block has minimum replicas */ public boolean checkMinReplication(Block block) throws IOException { return (countNodes(block).liveReplicas() >= minReplication); } /** * Commit a block of a file * * @param block * block to be committed * @param commitBlock * - contains client reported block length and generation * @return true if the block is changed to committed state. * @throws IOException * if the block does not have at least a minimal number * of replicas reported from data-nodes. */ private static boolean commitBlock(final BlockInfoContiguousUnderConstruction block, final Block commitBlock, DatanodeManager datanodeMgr) throws IOException { if (block.getBlockUCState() == BlockUCState.COMMITTED) return false; assert block.getNumBytes() <= commitBlock.getNumBytes() : "commitBlock length is less than the stored one " + commitBlock.getNumBytes() + " vs. " + block.getNumBytes(); block.commitBlock(commitBlock, datanodeMgr); return true; } /** * Commit the last block of the file and mark it as complete if it has * meets the minimum replication requirement * * @param bc * block collection * @param commitBlock * - contains client reported block length and generation * @return true if the last block is changed to committed state. * @throws IOException * if the block does not have at least a minimal number * of replicas reported from data-nodes. 
*/ public boolean commitOrCompleteLastBlock(BlockCollection bc, Block commitBlock) throws IOException, StorageException { if (commitBlock == null) return false; // not committing, this is a block allocation retry BlockInfoContiguous lastBlock = bc.getLastBlock(); if (lastBlock == null) return false; // no blocks in file yet if (lastBlock.isComplete()) { return false; // already completed (e.g. by syncBlock) } final boolean b = commitBlock((BlockInfoContiguousUnderConstruction) lastBlock, commitBlock, getDatanodeManager()); int numReplicas = countNodes(lastBlock).liveReplicas(); if (numReplicas >= minReplication) completeBlock(bc, lastBlock.getBlockIndex(), false); return b; } /** * Convert a specified block of the file to a complete block. * * @param bc * file * @param blkIndex * block index in the file * @throws IOException * if the block does not have at least a minimal number * of replicas reported from data-nodes. */ private BlockInfoContiguous completeBlock(final BlockCollection bc, final int blkIndex, boolean force) throws IOException, StorageException { if (blkIndex < 0) return null; BlockInfoContiguous curBlock = bc.getBlock(blkIndex); if (curBlock.isComplete()) return curBlock; BlockInfoContiguousUnderConstruction ucBlock = (BlockInfoContiguousUnderConstruction) curBlock; int numNodes = ucBlock.numNodes(datanodeManager); if (!force && numNodes < minReplication) throw new IOException( "Cannot complete block: " + "block does not satisfy minimal replication requirement."); if (!force && ucBlock.getBlockUCState() != BlockUCState.COMMITTED) throw new IOException("Cannot complete block: block has not been COMMITTED by the client"); BlockInfoContiguous completeBlock = ucBlock.convertToCompleteBlock(); // replace penultimate block in file bc.setBlock(blkIndex, completeBlock); // Since safe-mode only counts complete blocks, and we now have // one more complete block, we need to adjust the total up, and // also count it as safe, if we have at least the minimum replica // count. (We may not have the minimum replica count yet if this is // a "forced" completion when a file is getting closed by an // OP_CLOSE edit on the standby). namesystem.adjustSafeModeBlockTotals(null, 1); namesystem.incrementSafeBlockCount(Math.min(numNodes, minReplication), curBlock); return completeBlock; } private BlockInfoContiguous completeBlock(final BlockCollection bc, final BlockInfoContiguous block, boolean force) throws IOException, StorageException { BlockInfoContiguous blk = bc.getBlock(block.getBlockIndex()); if (blk == block) { return completeBlock(bc, blk.getBlockIndex(), force); } return block; } /** * Force the given block in the given file to be marked as complete, * regardless of whether enough replicas are present. This is necessary * when tailing edit logs as a Standby. */ public BlockInfoContiguous forceCompleteBlock(final BlockCollection bc, final BlockInfoContiguousUnderConstruction block) throws IOException { block.commitBlock(block, getDatanodeManager()); return completeBlock(bc, block, true); } /** * Convert the last block of the file to an under construction block.<p> * The block is converted only if the file has blocks and the last one * is a partial block (its size is less than the preferred block size). * The converted block is returned to the client. * The client uses the returned block locations to form the data pipeline * for this block.<br> * The methods returns null if there is no partial block at the end. * The client is supposed to allocate a new block with the next call. 
* * @param bc file * @param bytesToRemove num of bytes to remove from block * @return the last block locations if the block is partial or null otherwise */ public LocatedBlock convertLastBlockToUnderConstruction(BlockCollection bc, long bytesToRemove) throws IOException { BlockInfoContiguous oldBlock = bc.getLastBlock(); if (oldBlock == null || bc.getPreferredBlockSize() == oldBlock.getNumBytes() - bytesToRemove) return null; assert oldBlock == getStoredBlock(oldBlock) : "last block of the file is not in blocksMap"; DatanodeStorageInfo[] targets = getStorages(oldBlock); BlockInfoContiguousUnderConstruction ucBlock = bc.setLastBlock(oldBlock, targets); // Remove block from replication queue. NumberReplicas replicas = countNodes(ucBlock); neededReplications.remove(ucBlock, replicas.liveReplicas(), replicas.decommissionedReplicas(), getReplication(ucBlock)); pendingReplications.remove(ucBlock); // remove this block from the list of pending blocks to be deleted. for (DatanodeStorageInfo target : targets) { invalidateBlocks.remove(target, oldBlock); } // Adjust safe-mode totals, since under-construction blocks don't // count in safe-mode. List<Block> deltaSafe = new ArrayList<>(); // decrement safe if we had enough if (targets.length >= minReplication) { deltaSafe.add(oldBlock); } namesystem.adjustSafeModeBlockTotals(deltaSafe, // always decrement total blocks -1); final long fileLength = bc.computeContentSummary(getStoragePolicySuite()).getLength(); final long pos = fileLength - ucBlock.getNumBytes(); return createLocatedBlock(ucBlock, pos, AccessMode.WRITE); } /** * Get all valid locations of the block */ private List<DatanodeStorageInfo> getValidLocations(BlockInfoContiguous block) throws StorageException, TransactionContextException { ArrayList<DatanodeStorageInfo> storageSet = new ArrayList<DatanodeStorageInfo>(); for (DatanodeStorageInfo storage : blocksMap.storageList(block)) { // filter invalid replicas if (!invalidateBlocks.contains(storage, block)) { storageSet.add(storage); } } return storageSet; } private List<LocatedBlock> createLocatedBlockList(final BlockInfoContiguous[] blocks, final long offset, final long length, final int nrBlocksToReturn, final AccessMode mode) throws IOException, StorageException { int curBlk = 0; long curPos = 0, blkSize = 0; int nrBlocks = (blocks[0].getNumBytes() == 0) ? 0 : blocks.length; for (curBlk = 0; curBlk < nrBlocks; curBlk++) { blkSize = blocks[curBlk].getNumBytes(); assert blkSize > 0 : "Block of size 0"; if (curPos + blkSize > offset) { break; } curPos += blkSize; } if (nrBlocks > 0 && curBlk == nrBlocks) // offset >= end of file { return Collections.<LocatedBlock>emptyList(); } long endOff = offset + length; List<LocatedBlock> results = new ArrayList<>(blocks.length); do { results.add(createLocatedBlock(blocks[curBlk], curPos, mode)); curPos += blocks[curBlk].getNumBytes(); curBlk++; } while (curPos < endOff && curBlk < blocks.length && results.size() < nrBlocksToReturn); return results; } private LocatedBlock createLocatedBlock(final BlockInfoContiguous[] blocks, final long endPos, final AccessMode mode) throws IOException { int curBlk = 0; long curPos = 0; int nrBlocks = (blocks[0].getNumBytes() == 0) ? 
0 : blocks.length; for (curBlk = 0; curBlk < nrBlocks; curBlk++) { long blkSize = blocks[curBlk].getNumBytes(); if (curPos + blkSize >= endPos) { break; } curPos += blkSize; } return createLocatedBlock(blocks[curBlk], curPos, mode); } private List<LocatedBlock> createPhantomLocatedBlockList(INodeFile file, final byte[] data, final AccessMode mode) throws IOException, StorageException { List<LocatedBlock> results = new ArrayList<>(1); BlockInfoContiguous fakeBlk = new BlockInfoContiguous(); fakeBlk.setBlockIdNoPersistance(-file.getId()); fakeBlk.setINodeIdNoPersistance(-file.getId()); fakeBlk.setBlockIndexNoPersistance(0); fakeBlk.setNumBytesNoPersistance(file.getSize()); fakeBlk.setTimestampNoPersistance(file.getModificationTime()); final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), fakeBlk); // create fake DatanodeInfos pointing to NameNodes /*DatanodeID phantomDatanodID = new DatanodeID( namesystem.getNameNode().getServiceRpcAddress().getAddress().getHostAddress(), namesystem.getNameNode().getServiceRpcAddress().getAddress().getCanonicalHostName(), namesystem.getBlockPoolId(), DFSConfigKeys.DFS_DATANODE_DEFAULT_PORT, DFSConfigKeys.DFS_DATANODE_HTTP_DEFAULT_PORT, DFSConfigKeys.DFS_DATANODE_IPC_DEFAULT_PORT); DatanodeInfo phantomDatanode = new DatanodeInfo(phantomDatanodID); phantomDatanode.setPhantomDatanode(true); DatanodeInfo[] machines = new DatanodeInfo[1]; machines[0] = phantomDatanode; */ List<DatanodeInfo> machines = new ArrayList<>(file.getBlockReplication()); for (int i = 0; i < file.getBlockReplication(); i++) { DatanodeInfo randomDatanode = datanodeManager.getRandomDN(machines, file.getBlockReplication()); if (randomDatanode != null) { machines.add(randomDatanode); } else { DatanodeID phantomDatanodID = new DatanodeID( namesystem.getNameNode().getServiceRpcAddress().getAddress().getHostAddress(), namesystem.getNameNode().getServiceRpcAddress().getAddress().getCanonicalHostName(), namesystem.getBlockPoolId(), DFSConfigKeys.DFS_DATANODE_DEFAULT_PORT, DFSConfigKeys.DFS_DATANODE_HTTP_DEFAULT_PORT, DFSConfigKeys.DFS_DATANODE_HTTPS_DEFAULT_PORT, DFSConfigKeys.DFS_DATANODE_IPC_DEFAULT_PORT); DatanodeInfo phantomDatanode = new DatanodeInfo(phantomDatanodID); machines.add(phantomDatanode); } } LocatedBlock locatedBlock = new LocatedBlock(eb, machines.toArray(new DatanodeInfo[file.getBlockReplication()]), 0, false); locatedBlock.setData(data); results.add(locatedBlock); return results; } private LocatedBlock createLocatedBlock(final BlockInfoContiguous blk, final long pos, final BlockTokenSecretManager.AccessMode mode) throws IOException { final LocatedBlock lb = createLocatedBlock(blk, pos); if (mode != null) { setBlockToken(lb, mode); } return lb; } /** @return a LocatedBlock for the given block */ private LocatedBlock createLocatedBlock(final BlockInfoContiguous blk, final long pos) throws IOException { if (blk instanceof BlockInfoContiguousUnderConstruction) { if (blk.isComplete()) { throw new IOException( "blk instanceof BlockInfoUnderConstruction && blk.isComplete()" + ", blk=" + blk); } final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction) blk; final DatanodeStorageInfo[] storages = uc.getExpectedStorageLocations(datanodeManager); final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk); return new LocatedBlock(eb, storages, pos, false); } // get block locations final int numCorruptNodes = countNodes(blk).corruptReplicas(); final int numCorruptReplicas = corruptReplicas.numCorruptReplicas(blk); if 
(numCorruptNodes != numCorruptReplicas) {
      LOG.warn("Inconsistent number of corrupt replicas for " + blk
          + " blockMap has " + numCorruptNodes
          + " but corrupt replicas map has " + numCorruptReplicas);
    }

    final int numNodes = blocksMap.numNodes(blk);
    final boolean isCorrupt = numCorruptNodes == numNodes;
    final int numMachines = isCorrupt ? numNodes : numNodes - numCorruptNodes;
    final DatanodeStorageInfo[] storages = new DatanodeStorageInfo[numMachines];
    int j = 0;
    if (numMachines > 0) {
      for (final DatanodeStorageInfo storage : blocksMap.storageList(blk)) {
        final boolean replicaCorrupt =
            corruptReplicas.isReplicaCorrupt(blk, storage.getDatanodeDescriptor());
        if (isCorrupt || (!replicaCorrupt)) {
          storages[j++] = storage;
        }
      }
    }
    assert j == storages.length : "isCorrupt: " + isCorrupt
        + " numStorages: " + numMachines
        + " numNodes: " + numNodes
        + " numCorrupt: " + numCorruptNodes
        + " numCorruptRepls: " + numCorruptReplicas;
    final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk);
    return new LocatedBlock(eb, storages, pos, isCorrupt);
  }

  /**
   * Create a PhantomLocatedBlocks.
   */
  public LocatedBlocks createPhantomLocatedBlocks(INodeFile file, byte[] data,
      final boolean isFileUnderConstruction, final boolean needBlockToken)
      throws IOException, StorageException {
    if (needBlockToken) {
      throw new IOException(
          "Block Tokens are not currently supported for files stored in the database");
    }
    final AccessMode mode = needBlockToken ? AccessMode.READ : null;
    final List<LocatedBlock> locatedblocks = createPhantomLocatedBlockList(file, data, mode);
    return new LocatedBlocks(file.getSize(), isFileUnderConstruction, locatedblocks,
        null, false /* last block is not complete */);
  }

  /** Create a LocatedBlocks. */
  public LocatedBlocks createLocatedBlocks(final BlockInfoContiguous[] blocks,
      final long fileSizeExcludeBlocksUnderConstruction,
      final boolean isFileUnderConstruction, final long offset, final long length,
      final boolean needBlockToken) throws IOException, StorageException {
    if (blocks == null) {
      return null;
    } else if (blocks.length == 0) {
      return new LocatedBlocks(0, isFileUnderConstruction,
          Collections.<LocatedBlock>emptyList(), null, false);
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug("blocks = " + java.util.Arrays.asList(blocks));
      }
      final AccessMode mode = needBlockToken ? AccessMode.READ : null;
      final List<LocatedBlock> locatedblocks =
          createLocatedBlockList(blocks, offset, length, Integer.MAX_VALUE, mode);

      final BlockInfoContiguous last = blocks[blocks.length - 1];
      final long lastPos = last.isComplete()
          ? fileSizeExcludeBlocksUnderConstruction - last.getNumBytes()
          : fileSizeExcludeBlocksUnderConstruction;
      final LocatedBlock lastlb = createLocatedBlock(last, lastPos, mode);
      final boolean isComplete = last.isComplete();
      return new LocatedBlocks(fileSizeExcludeBlocksUnderConstruction,
          isFileUnderConstruction, locatedblocks, lastlb, isComplete);
    }
  }

  /**
   * @return current access keys.
   */
  public ExportedBlockKeys getBlockKeys() throws IOException {
    return isBlockTokenEnabled() ? blockTokenSecretManager.exportKeys()
        : ExportedBlockKeys.DUMMY_KEYS;
  }

  /**
   * Generate a block token for the located block.
   */
  public void setBlockToken(final LocatedBlock b,
      final BlockTokenSecretManager.AccessMode mode) throws IOException {
    if (isBlockTokenEnabled()) {
      // Use cached UGI if serving RPC calls.
b.setBlockToken(blockTokenSecretManager.generateToken(NameNode.getRemoteUser().getShortUserName(), b.getBlock(), EnumSet.of(mode))); } } void addKeyUpdateCommand(final List<DatanodeCommand> cmds, final DatanodeDescriptor nodeinfo) throws IOException { // check access key update if (isBlockTokenEnabled() && nodeinfo.needKeyUpdate) { cmds.add(new KeyUpdateCommand(blockTokenSecretManager.exportKeys())); nodeinfo.needKeyUpdate = false; } } public DataEncryptionKey generateDataEncryptionKey() throws IOException { if (isBlockTokenEnabled() && encryptDataTransfer) { return blockTokenSecretManager.generateDataEncryptionKey(); } else { return null; } } /** * Clamp the specified replication between the minimum and the maximum * replication levels. */ public short adjustReplication(short replication) { return replication < minReplication ? minReplication : replication > maxReplication ? maxReplication : replication; } /** * Check whether the replication parameter is within the range * determined by system configuration. */ public void verifyReplication(String src, short replication, String clientName) throws IOException { if (replication >= minReplication && replication <= maxReplication) { //common case. avoid building 'text' return; } String text = "file " + src + ((clientName != null) ? " on client " + clientName : "") + ".\n" + "Requested replication " + replication; if (replication > maxReplication) { throw new IOException(text + " exceeds maximum " + maxReplication); } if (replication < minReplication) { throw new IOException(text + " is less than the required minimum " + minReplication); } } /** * Check if a block is replicated to at least the minimum replication. */ public boolean isSufficientlyReplicated(BlockInfoContiguous b) throws IOException { // Compare against the lesser of the minReplication and number of live DNs. final int replication = Math.min(minReplication, getDatanodeManager().getNumLiveDataNodes()); return countLiveNodes(b) >= replication; //countNodes(b).liveReplicas() >= replication; } /** * // used in the namenode protocol * return a list of blocks & their locations on <code>datanode</code> whose * total size is <code>size</code> * * @param datanode * on which blocks are located * @param size * total size of blocks */ public BlocksWithLocations getBlocks(DatanodeID datanode, long size // used in the namenode protocol ) throws IOException { namesystem.checkSuperuserPrivilege(); return getBlocksWithLocations(datanode, size); } /** * Get all blocks with location information from a datanode. 
*/ private BlocksWithLocations getBlocksWithLocations(final DatanodeID datanode, final long size) throws UnregisteredNodeException, IOException { final DatanodeDescriptor node = getDatanodeManager().getDatanode(datanode); if (node == null) { blockLog.warn("BLOCK* getBlocks: Asking for blocks from an" + " unrecorded node {}", datanode); throw new HadoopIllegalArgumentException("Datanode " + datanode + " not found."); } int numBlocks = node.numBlocks(); if (numBlocks == 0) { return new BlocksWithLocations(new BlockWithLocations[0]); } int startBlock = DFSUtil.getRandom().nextInt(numBlocks); // starting from a random block Iterator<BlockInfoContiguous> iter = node.getBlockIterator(startBlock); List<BlockWithLocations> results = new ArrayList<>(); long totalSize = 0; BlockInfoContiguous curBlock; while (totalSize < size && iter.hasNext()) { List<Block> toAdd = new ArrayList<>(); long estimatedSize = 0; while (totalSize + estimatedSize < size && iter.hasNext()) { curBlock = iter.next(); if (!curBlock.isComplete()) { continue; } toAdd.add(curBlock); estimatedSize += curBlock.getNumBytes(); } totalSize += addBlocks(toAdd, results); } if (totalSize < size) { iter = node.getBlockIterator(); // start from the beginning for (int i = 0; i < startBlock && totalSize < size;) { List<Block> toAdd = new ArrayList<>(); long estimatedSize = 0; while (totalSize + estimatedSize < size && i < startBlock) { curBlock = iter.next(); i++; if (!curBlock.isComplete()) { continue; } toAdd.add(curBlock); estimatedSize += curBlock.getNumBytes(); } totalSize += addBlocks(toAdd, results); } } return new BlocksWithLocations(results.toArray(new BlockWithLocations[results.size()])); } /** * Remove the blocks associated to the given datanode. * Removing blocks in the database can take a lot of time. To avoid the all NN hanging on this function we make it * asynchronous. If the node is reconnected while this function is running, some block may be reported and then removed * this will result in these block being wrongly seen as under replicated. Which in the worse case will result in the * blocks being replicated and detected as over replicated the next time the node does a block report. This is not * ideal for disk usage, but this will not result in any data lost. */ void datanodeRemoved(final DatanodeDescriptor node, boolean async) throws IOException { Future future = datanodeRemover.submit(new Callable<Object>() { @Override public Object call() throws Exception { try { Map<Long, Long> allBlocksAndInodesIds = node.getAllStorageReplicas(numBuckets, blockFetcherNBThreads, blockFetcherBucketsPerThread, ((FSNamesystem) namesystem).getFSOperationsExecutor()); removeBlocks(allBlocksAndInodesIds, node); DatanodeStorageInfo[] storageInfos = node.getStorageInfos(); for (DatanodeStorageInfo storageInfo : storageInfos) { HashBuckets.getInstance().resetBuckets(storageInfo.getSid()); } return null; } catch (Throwable t) { LOG.error(t.getMessage(), t); throw t; } } }); node.resetBlocks(); List<Integer> sids = datanodeManager.getSidsOnDatanode(node.getDatanodeUuid()); invalidateBlocks.remove(sids); if (!async) { try { future.get(); } catch (Exception e) { if (e instanceof IOException) { throw (IOException) e; } else { throw new IOException(e); } } } } /** Remove the blocks associated to the given DatanodeStorageInfo. * Removing blocks in the database can take a lot of time. To avoid the all NN hanging on this function we make it * asynchronous. 
If the node is reconnected while this function is running, some block may be reported and then removed * this will result in these block being wrongly seen as under replicated. Which in the worse case will result in the * blocks being replicated and detected as over replicated the next time the node does a block report. This is not * ideal for disk usage, but this will not result in any data lost. */ void removeBlocksAssociatedTo(final DatanodeStorageInfo storageInfo) throws IOException { datanodeRemover.submit(new Callable<Object>() { @Override public Object call() throws Exception { try { Map<Long, Long> allBlocksAndInodesIds = storageInfo.getAllStorageReplicas(numBuckets, blockFetcherNBThreads, blockFetcherBucketsPerThread, ((FSNamesystem) namesystem).getFSOperationsExecutor()); final DatanodeDescriptor node = storageInfo.getDatanodeDescriptor(); removeBlocks(allBlocksAndInodesIds, node); HashBuckets.getInstance().resetBuckets(storageInfo.getSid()); namesystem.checkSafeMode(); return null; } catch (Throwable t) { LOG.error(t.getMessage(), t); throw t; } } }); invalidateBlocks.remove(storageInfo.getSid()); } /* * Removing blocks in the database can take a lot of time. To avoid the all NN hanging on this function we make it * asynchronous. If the node is reconnected while this function is running, some block may be reported and then removed * this will result in these block being wrongly seen as under replicated. Which in the worse case will result in the * blocks being replicated and detected as over replicated the next time the node does a block report. This is not * ideal for disk usage, but this will not result in any data lost. */ void removeBlocksAssociatedTo(final int sid) throws IOException { datanodeRemover.submit(new Callable<Object>() { @Override public Object call() throws Exception { try { Map<Long, Long> allBlocksAndInodesIds = DatanodeStorageInfo.getAllStorageReplicas(numBuckets, sid, blockFetcherNBThreads, blockFetcherBucketsPerThread, ((FSNamesystem) namesystem).getFSOperationsExecutor()); removeBlocks(allBlocksAndInodesIds, sid); HashBuckets.getInstance().resetBuckets(sid); namesystem.checkSafeMode(); return null; } catch (Throwable t) { LOG.error(t.getMessage(), t); throw t; } } }); invalidateBlocks.remove(sid); } /** * Adds block to list of blocks which will be invalidated on specified * datanode and log the operation */ void addToInvalidates(final Block block, final DatanodeInfo datanode) throws StorageException, TransactionContextException, UnregisteredNodeException, IOException { if (!namesystem.isPopulatingReplQueues()) { return; } DatanodeDescriptor dn = datanodeManager.getDatanode(datanode); DatanodeStorageInfo storage = getBlockInfo(block).getStorageOnNode(dn); if (storage != null) { addToInvalidates(block, storage); } } void addToInvalidates(Block block, DatanodeStorageInfo storage) throws TransactionContextException, StorageException, IOException { if (!namesystem.isPopulatingReplQueues()) { return; } BlockInfoContiguous temp = getBlockInfo(block); invalidateBlocks.add(temp, storage, true); } /** * Adds block to list of blocks which will be invalidated on all its * datanodes. 
*/ private void addToInvalidates(Block b) throws StorageException, TransactionContextException, IOException { if (!namesystem.isPopulatingReplQueues()) { return; } StringBuilder datanodes = new StringBuilder(); BlockInfoContiguous block = getBlockInfo(b); DatanodeStorageInfo[] storages = getBlockInfo(block).getStorages(datanodeManager, DatanodeStorage.State.NORMAL); for (DatanodeStorageInfo storage : storages) { final DatanodeDescriptor node = storage.getDatanodeDescriptor(); invalidateBlocks.add(block, storage, false); datanodes.append(node).append(" "); } if (datanodes.length() != 0) { blockLog.info("BLOCK* addToInvalidates: {} {}", block, datanodes.toString()); } } /** * Remove all block invalidation tasks under this datanode UUID; * used when a datanode registers with a new UUID and the old one * is wiped. */ void removeFromInvalidates(final DatanodeDescriptor datanode) throws IOException { if (!namesystem.isPopulatingReplQueues()) { return; } for (int sid : datanode.getSidsOnNode()) { invalidateBlocks.remove(sid); } } /** * Mark the block belonging to datanode as corrupt * @param blk Block to be marked as corrupt * @param dn Datanode which holds the corrupt replica * @param storageID if known, null otherwise. * @param reason a textual reason why the block should be marked corrupt, * for logging purposes */ public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk, final DatanodeInfo dn, final String storageID, final String reason) throws IOException { final DatanodeDescriptor node = getDatanodeManager().getDatanode(dn); final DatanodeStorageInfo storage = storageID == null ? null : node.getStorageInfo(storageID); new HopsTransactionalRequestHandler(HDFSOperationType.FIND_AND_MARK_BLOCKS_AS_CORRUPT) { INodeIdentifier inodeIdentifier; @Override public void setUp() throws StorageException { Block b = blk.getLocalBlock(); inodeIdentifier = INodeUtil.resolveINodeFromBlock(b); } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier)) .add(lf.getIndividualBlockLock(blk.getBlockId(), inodeIdentifier)) .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.UR, BLK.UC, BLK.IV)); if (((FSNamesystem) namesystem).isErasureCodingEnabled() && inodeIdentifier != null) { locks.add(lf.getIndivdualEncodingStatusLock(LockType.WRITE, inodeIdentifier.getInodeId())); } } @Override public Object performTask() throws StorageException, IOException { final BlockInfoContiguous storedBlock = getStoredBlock(blk.getLocalBlock()); if (storedBlock == null) { // Check if the replica is in the blockMap, if not // ignore the request for now. This could happen when BlockScanner // thread of Datanode reports bad block before Block reports are sent // by the Datanode on startup blockLog.info("BLOCK* findAndMarkBlockAsCorrupt: " + blk + " not found"); return null; } BlockToMarkCorrupt b = new BlockToMarkCorrupt(storedBlock, blk.getGenerationStamp(), reason, Reason.CORRUPTION_REPORTED); markBlockAsCorrupt(b, storage, node); return null; } }.handle(namesystem); } /** * * @param b * @param storageInfo storage that contains the block, if known. null otherwise. * @param node the node that contains the block. 
* @throws IOException */ private void markBlockAsCorrupt(BlockToMarkCorrupt b, DatanodeStorageInfo storageInfo, DatanodeDescriptor node) throws IOException, StorageException { BlockCollection bc = b.corrupted.getBlockCollection(); if (bc == null) { blockLog.info("BLOCK markBlockAsCorrupt: {} cannot be marked as" + " corrupt as it does not belong to any file", b); addToInvalidates(b.corrupted, node); return; } // Lookup which storage we are working on if we didn't know it yet if (storageInfo == null) { storageInfo = b.corrupted.getStorageOnNode(node); } // Add replica to the storage if it is not already there if (storageInfo != null) { storageInfo.addBlock(b.stored); } // Add this replica to corruptReplicas Map corruptReplicas.addToCorruptReplicasMap(b.corrupted, storageInfo, b.reason, b.reasonCode); NumberReplicas numberOfReplicas = countNodes(b.stored); boolean hasEnoughLiveReplicas = numberOfReplicas.liveReplicas() >= bc.getBlockReplication(); boolean minReplicationSatisfied = numberOfReplicas.liveReplicas() >= minReplication; boolean hasMoreCorruptReplicas = minReplicationSatisfied && (numberOfReplicas.liveReplicas() + numberOfReplicas.corruptReplicas()) > bc .getBlockReplication(); boolean corruptedDuringWrite = minReplicationSatisfied && (b.stored.getGenerationStamp() > b.corrupted.getGenerationStamp()); // case 1: have enough number of live replicas // case 2: corrupted replicas + live replicas > Replication factor // case 3: Block is marked corrupt due to failure while writing. In this // case genstamp will be different than that of valid block. // In all these cases we can delete the replica. // In case of 3, rbw block will be deleted and valid block can be replicated if (hasEnoughLiveReplicas || hasMoreCorruptReplicas || corruptedDuringWrite) { // the block is over-replicated so invalidate the replicas immediately invalidateBlock(b, node); } else if (namesystem.isPopulatingReplQueues()) { // add the block to neededReplication updateNeededReplications(b.stored, -1, 0); } // HDFS stops here, but we have to check if Erasure Coding is enabled, // and if we need to use it to restore this block. If there are no // replicas of this block, we can still restore using the parity blocks: FSNamesystem fsNamesystem = (FSNamesystem) namesystem; if (!fsNamesystem.isErasureCodingEnabled()) { return; } if (numberOfReplicas.liveReplicas() == 0) { EncodingStatus status = EntityManager.find(EncodingStatus.Finder.ByInodeId, bc.getId()); if (status != null) { if (status.isCorrupt() == false) { status.setStatus(EncodingStatus.Status.REPAIR_REQUESTED); status.setStatusModificationTime(System.currentTimeMillis()); } status.setLostBlocks(status.getLostBlocks() + 1); EntityManager.update(status); } else { status = EntityManager.find(EncodingStatus.Finder.ByParityInodeId, bc.getId()); if (status != null) { if (status.isParityCorrupt() == false) { status.setParityStatus(EncodingStatus.ParityStatus.REPAIR_REQUESTED); status.setParityStatusModificationTime(System.currentTimeMillis()); } status.setLostParityBlocks(status.getLostParityBlocks() + 1); EntityManager.update(status); LOG.info("markBlockAsCorrupt updated parity status to repair requested"); } } } } /** * Invalidates the given block on the given datanode. * @return true if the block was successfully invalidated and no longer * present in the BlocksMap */ private boolean invalidateBlock(BlockToMarkCorrupt b, DatanodeInfo dn) throws IOException { // /** // * Invalidates the given block on the given storage. 
// */ // private void invalidateBlock(BlockToMarkCorrupt b, // DatanodeStorageInfo storage) throws IOException, StorageException { blockLog.info("BLOCK* invalidateBlock: {} on {}", b, dn); DatanodeDescriptor node = getDatanodeManager().getDatanode(dn); if (node == null) { throw new IOException("Cannot invalidate " + b + " because datanode " + dn + " does not exist."); } // Check how many copies we have of the block NumberReplicas nr = countNodes(b.stored); if (nr.replicasOnStaleNodes() > 0) { blockLog.info("BLOCK* invalidateBlocks: postponing " + "invalidation of {} on {} because {} replica(s) are located on " + "nodes with potentially out-of-date block reports", b, dn, nr.replicasOnStaleNodes()); postponeBlock(b.corrupted); return false; } else if (nr.liveReplicas() >= 1) { // If we have at least one copy on a live node, then we can delete it. addToInvalidates(b.corrupted, dn); removeStoredBlock(b.stored, node); if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* invalidateBlocks: {} on {} listed for deletion.", b, dn); } return true; } else { blockLog.info("BLOCK* invalidateBlocks: {} on {} is the only copy and" + " was not deleted", b, dn); return false; } } private void postponeBlock(Block blk) { if (postponedMisreplicatedBlocks.add(blk)) { postponedMisreplicatedBlocksCount.incrementAndGet(); } } void updateState() throws IOException { pendingReplicationBlocksCount = pendingReplications.size(); underReplicatedBlocksCount = neededReplications.size(); corruptReplicaBlocksCount = corruptReplicas.size(); } /** * Return number of under-replicated but not missing blocks */ public int getUnderReplicatedNotMissingBlocks() throws IOException { return neededReplications.getUnderReplicatedBlockCount(); } /** * Schedule blocks for deletion at datanodes * * @param nodesToProcess * number of datanodes to schedule deletion work * @return total number of block for deletion */ int computeInvalidateWork(int nodesToProcess) throws IOException { final Map<DatanodeInfo, List<Integer>> nodesToSids = invalidateBlocks.getDatanodes(datanodeManager); List<Map.Entry<DatanodeInfo, List<Integer>>> nodes = new ArrayList<>(nodesToSids.entrySet()); Collections.shuffle(nodes); nodesToProcess = Math.min(nodes.size(), nodesToProcess); int blockCnt = 0; for (Map.Entry<DatanodeInfo, List<Integer>> dnInfo : nodes) { int blocks = invalidateWorkForOneNode(dnInfo); if (blocks > 0) { blockCnt += blocks; if (--nodesToProcess == 0) { break; } } } return blockCnt; } /** * Scan blocks in {@link #neededReplications} and assign replication * work to data-nodes they belong to. * <p/> * The number of process blocks equals either twice the number of live * data-nodes or the number of under-replicated blocks whichever is less. * * @return number of blocks scheduled for replication during this iteration. */ int computeReplicationWork(int blocksToProcess) throws IOException { List<List<Block>> blocksToReplicate = neededReplications.chooseUnderReplicatedBlocks(blocksToProcess); return computeReplicationWorkForBlocks(blocksToReplicate); } /** * Replicate a set of blocks * Calls {@link #computeReplicationWorkForBlock(Block, int)} for every block. 
* * @param blocksToReplicate blocks to be replicated, for each priority * @return the number of blocks scheduled for replication */ @VisibleForTesting int computeReplicationWorkForBlocks(List<List<Block>> blocksToReplicate) throws IOException { int scheduledWork = 0; for (int priority = 0; priority < blocksToReplicate.size(); priority++) { for (Block block : blocksToReplicate.get(priority)) { scheduledWork += computeReplicationWorkForBlock(block, priority); } } return scheduledWork; } /** * Replicate a set of blocks * * @return the number of blocks scheduled for replication */ private int computeReplicationWorkForBlockInternal(Block blk, int priority1) throws StorageException, IOException { int requiredReplication, numEffectiveReplicas; List<DatanodeDescriptor> containingNodes; DatanodeDescriptor srcNode; BlockCollection bc = null; int additionalReplRequired; int scheduledWork = 0; List<ReplicationWork> work = new LinkedList<>(); synchronized (neededReplications) { // block should belong to a file bc = blocksMap.getBlockCollection(blk); // abandoned block or block reopened for append if (bc == null || (bc.isUnderConstruction() && getBlockInfo(blk).equals(bc.getLastBlock()))) { // remove from neededReplications neededReplications.remove(getBlockInfo(blk)); neededReplications.decrementReplicationIndex(priority1); return scheduledWork; } requiredReplication = bc.getBlockReplication(); // get a source data-node containingNodes = new ArrayList<>(); List<DatanodeStorageInfo> liveReplicaNodes = new ArrayList<>(); NumberReplicas numReplicas = new NumberReplicas(); srcNode = chooseSourceDatanode(blk, containingNodes, liveReplicaNodes, numReplicas, priority1); if (srcNode == null) { // block can not be replicated from any storage LOG.debug("Block " + blk + " cannot be repl from any storage"); return scheduledWork; } // liveReplicaNodes can include READ_ONLY_SHARED replicas which are // not included in the numReplicas.liveReplicas() count assert liveReplicaNodes.size() >= numReplicas.liveReplicas(); // do not schedule more if enough replicas is already pending numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(getBlockInfo(blk)); if (numEffectiveReplicas >= requiredReplication) { if ((pendingReplications.getNumReplicas(getBlockInfo(blk)) > 0) || (blockHasEnoughRacks(blk))) { neededReplications.remove(getBlockInfo(blk)); // remove from neededReplications neededReplications.decrementReplicationIndex(priority1); blockLog.info("BLOCK* Removing " + blk + " from neededReplications as it has enough replicas"); return scheduledWork; } } if (numReplicas.liveReplicas() < requiredReplication) { additionalReplRequired = requiredReplication - numEffectiveReplicas; } else { additionalReplRequired = 1; // Needed on a new rack } work.add(new ReplicationWork(blk, bc, srcNode, containingNodes, liveReplicaNodes, additionalReplRequired, priority1)); } final Set<Node> excludedNodes = new HashSet<>(); for (ReplicationWork rw : work) { // Exclude all of the containing nodes from being targets. // This list includes decommissioning or corrupt nodes. excludedNodes.clear(); for (DatanodeDescriptor dn : rw.containingNodes) { excludedNodes.add(dn); } // choose replication targets: NOT HOLDING THE GLOBAL LOCK // It is costly to extract the filename for which chooseTargets is called, // so for now we pass in the blk collection itself. 
rw.chooseTargets(blockplacement, storagePolicySuite, excludedNodes); } for (ReplicationWork rw : work) { final DatanodeStorageInfo[] targets = rw.targets; if (targets == null || targets.length == 0) { rw.targets = null; continue; } synchronized (neededReplications) { Block block = rw.block; int priority = rw.priority; // Recheck since global lock was released // block should belong to a file bc = blocksMap.getBlockCollection(block); // abandoned block or block reopened for append if (bc == null || (bc.isUnderConstruction() && getBlockInfo(blk).equals(bc.getLastBlock()))) { neededReplications.remove(getBlockInfo(block)); // remove from neededReplications rw.targets = null; neededReplications.decrementReplicationIndex(priority); continue; } requiredReplication = bc.getBlockReplication(); // do not schedule more if enough replicas is already pending NumberReplicas numReplicas = countNodes(block); numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(getBlockInfo(block)); if (numEffectiveReplicas >= requiredReplication) { if ((pendingReplications.getNumReplicas(getBlockInfo(block)) > 0) || (blockHasEnoughRacks(block))) { neededReplications.remove(getBlockInfo(block)); // remove from neededReplications neededReplications.decrementReplicationIndex(priority); rw.targets = null; blockLog.info("BLOCK* Removing {} from neededReplications as" + " it has enough replicas", block); continue; } } if ((numReplicas.liveReplicas() >= requiredReplication) && (!blockHasEnoughRacks(block))) { if (rw.srcNode.getNetworkLocation() .equals(targets[0].getDatanodeDescriptor().getNetworkLocation())) { //No use continuing, unless a new rack in this case continue; } } // Add block to the to be replicated list rw.srcNode.addBlockToBeReplicated(block, targets); scheduledWork++; DatanodeStorageInfo.incrementBlocksScheduled(targets); // Move the block-replication into a "pending" state. // The reason we use 'pending' is so we can retry // replications that fail after an appropriate amount of time. pendingReplications.increment(getBlockInfo(block), DatanodeStorageInfo.toDatanodeDescriptors(targets)); if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* block {} is moved from neededReplications to " + "pendingReplications", block); } // remove from neededReplications if (numEffectiveReplicas + targets.length >= requiredReplication) { neededReplications.remove(getBlockInfo(block)); // remove from neededReplications neededReplications.decrementReplicationIndex(priority); } } } if (blockLog.isInfoEnabled()) { // log which blocks have been scheduled for replication for (ReplicationWork rw : work) { DatanodeStorageInfo[] targets = rw.targets; if (targets != null && targets.length != 0) { StringBuilder targetList = new StringBuilder("datanode(s)"); for (DatanodeStorageInfo target : targets) { targetList.append(' '); targetList.append(target); } blockLog.info("BLOCK* ask {} to replicate {} to {}", rw.srcNode, rw.block, targetList); } } } if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* neededReplications = {} pendingReplications = {}", neededReplications.size(), pendingReplications.size()); } return scheduledWork; } /** * Choose target datanodes for creating a new block. * * @throws IOException * if the number of targets < minimum replication. 
* @see BlockPlacementPolicy#chooseTarget(String, int, Node, List, boolean, Set, long, BlockStoragePolicy) */ public DatanodeStorageInfo[] chooseTarget4NewBlock(final String src, final int numOfReplicas, final Node client, final Set<Node> excludedNodes, final long blocksize, final List<String> favoredNodes, final byte storagePolicyID) throws IOException { List<DatanodeDescriptor> favoredDatanodeDescriptors = getDatanodeDescriptors(favoredNodes); BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(storagePolicyID); final DatanodeStorageInfo[] targets = blockplacement.chooseTarget(src, numOfReplicas, client, excludedNodes, blocksize, favoredDatanodeDescriptors, storagePolicy); if (targets.length < minReplication) { throw new IOException("File " + src + " could only be replicated to " + targets.length + " nodes " + "instead of minReplication (=" + minReplication + "). " + "There are " + getDatanodeManager().getNetworkTopology().getNumOfLeaves() + " datanode(s) running and " + (excludedNodes == null ? "no" : excludedNodes.size()) + " node(s) are excluded in this operation. " + (excludedNodes != null ? Arrays.toString(excludedNodes.toArray(new Node[excludedNodes.size()])) : "[]")); } return targets; } /** Choose target for WebHDFS redirection. */ public DatanodeStorageInfo[] chooseTarget4WebHDFS(String src, DatanodeDescriptor clientnode, Set<Node> excludes, long blocksize) { return blockplacement.chooseTarget(src, 1, clientnode, Collections.<DatanodeStorageInfo>emptyList(), false, excludes, blocksize, storagePolicySuite.getDefaultPolicy()); } /** Choose target for getting additional datanodes for an existing pipeline. */ public DatanodeStorageInfo[] chooseTarget4AdditionalDatanode(String src, int numAdditionalNodes, Node clientnode, List<DatanodeStorageInfo> chosen, Set<Node> excludes, long blocksize, byte storagePolicyID) { final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(storagePolicyID); return blockplacement.chooseTarget(src, numAdditionalNodes, clientnode, chosen, true, excludes, blocksize, storagePolicy); } public DatanodeStorageInfo[] chooseTarget4ParityRepair(String src, int numOfReplicas, Node clientnode, List<DatanodeStorageInfo> chosen, Set<Node> excludes, long blocksize, byte storagePolicyID) { final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(storagePolicyID); return blockplacement.chooseTarget(src, numOfReplicas, clientnode, chosen, false, excludes, blocksize, storagePolicy); } /** * Get list of datanode descriptors for given list of nodes. Nodes are * hostaddress:port or just hostaddress. */ List<DatanodeDescriptor> getDatanodeDescriptors(List<String> nodes) { List<DatanodeDescriptor> datanodeDescriptors = null; if (nodes != null) { datanodeDescriptors = new ArrayList<DatanodeDescriptor>(nodes.size()); for (int i = 0; i < nodes.size(); i++) { DatanodeDescriptor node = datanodeManager.getDatanodeDescriptor(nodes.get(i)); if (node != null) { datanodeDescriptors.add(node); } } } return datanodeDescriptors; } /** * Parse the data-nodes the block belongs to and choose one, * which will be the replication source. * <p/> * We prefer nodes that are in DECOMMISSION_INPROGRESS state to other nodes * since the former do not have write traffic and hence are less busy. * We do not use already decommissioned nodes as a source. * Otherwise we choose a random node among those that did not reach their * replication limits. 
However, if the replication is of the highest * priority * and all nodes have reached their replication limits, we will choose a * random node despite the replication limit. * <p/> * In addition form a list of all nodes containing the block * and calculate its replication numbers. * * @param b * Block for which a replication source is needed * @param containingNodes * List to be populated with nodes found to contain the * given block * @param nodesContainingLiveReplicas * List to be populated with nodes found to * contain live replicas of the given block * @param numReplicas * NumberReplicas instance to be initialized with the * counts of live, corrupt, excess, and * decommissioned replicas of the given * block. * @param priority * integer representing replication priority of the given * block * @return the DatanodeStorageInfo of the chosen storage from which to * replicate the given block */ @VisibleForTesting DatanodeDescriptor chooseSourceDatanode(Block b, List<DatanodeDescriptor> containingNodes, List<DatanodeStorageInfo> nodesContainingLiveReplicas, NumberReplicas numReplicas, int priority) throws IOException { containingNodes.clear(); nodesContainingLiveReplicas.clear(); DatanodeDescriptor srcNode = null; int live = 0; int decommissioned = 0; int corrupt = 0; int excess = 0; final BlockInfoContiguous block = getBlockInfo(b); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(block); for (DatanodeStorageInfo storage : block.getStorages(datanodeManager)) { final DatanodeDescriptor node = storage.getDatanodeDescriptor(); int countableReplica = storage.getState() == State.NORMAL ? 1 : 0; if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) { corrupt += countableReplica; } else if (node.isDecommissionInProgress() || node.isDecommissioned()) { decommissioned += countableReplica; } else if (excessReplicateMap.contains(storage, block)) { excess += countableReplica; } else { nodesContainingLiveReplicas.add(storage); live += countableReplica; } if (!containingNodes.contains(node)) { containingNodes.add(node); } // Check if this replica is corrupt // If so, do not select the node as src node if ((nodesCorrupt != null) && nodesCorrupt.contains(node)) { continue; } if (priority != UnderReplicatedBlocks.QUEUE_HIGHEST_PRIORITY && !node.isDecommissionInProgress() && node.getNumberOfBlocksToBeReplicated() >= maxReplicationStreams) { continue; // already reached replication limit } if (node.getNumberOfBlocksToBeReplicated() >= replicationStreamsHardLimit) { continue; } // the block must not be scheduled for removal on srcNode if (excessReplicateMap.contains(storage, block)) { continue; } // never use already decommissioned nodes if (node.isDecommissioned()) { continue; } // We got this far, current node is a reasonable choice if (srcNode == null) { srcNode = node; continue; } // switch to a different node randomly // this to prevent from deterministically selecting the same node even // if the node failed to replicate the block on previous iterations if (DFSUtil.getRandom().nextBoolean()) { srcNode = node; } } if (numReplicas != null) { numReplicas.initialize(live, decommissioned, corrupt, excess, 0); } return srcNode; } /** * If there were any replication requests that timed out, reap them * and put them back into the neededReplication queue */ @VisibleForTesting void processPendingReplications() throws IOException { long[] timedOutItems = pendingReplications.getTimedOutBlocks(); if (timedOutItems != null) { for (long timedOutItem : timedOutItems) { 
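// Each timed-out item is handed to processTimedOutPendingBlock, which puts the block back into the neededReplications queue so the replication can be retried.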
processTimedOutPendingBlock(timedOutItem); } /* If we know the target datanodes where the replication timedout, * we could invoke decBlocksScheduled() on it. Its ok for now. */ } } /** * StatefulBlockInfo is used to build the "toUC" list, which is a list of * updates to the information about under-construction blocks. * Besides the block in question, it provides the ReplicaState * reported by the datanode in the block report. */ static class StatefulBlockInfo { final BlockInfoContiguousUnderConstruction storedBlock; final Block reportedBlock; final ReplicaState reportedState; StatefulBlockInfo(BlockInfoContiguousUnderConstruction storedBlock, Block reportedBlock, ReplicaState reportedState) { this.storedBlock = storedBlock; this.reportedBlock = reportedBlock; this.reportedState = reportedState; } } /** * BlockToMarkCorrupt is used to build the "toCorrupt" list, which is a * list of blocks that should be considered corrupt due to a block report. */ private static class BlockToMarkCorrupt { /** The corrupted block in a datanode. */ final BlockInfoContiguous corrupted; /** The corresponding block stored in the BlockManager. */ final BlockInfoContiguous stored; /** The reason to mark corrupt. */ final String reason; /** The reason code to be stored */ final Reason reasonCode; BlockToMarkCorrupt(BlockInfoContiguous corrupted, BlockInfoContiguous stored, String reason, Reason reasonCode) { Preconditions.checkNotNull(corrupted, "corrupted is null"); Preconditions.checkNotNull(stored, "stored is null"); this.corrupted = corrupted; this.stored = stored; this.reason = reason; this.reasonCode = reasonCode; } BlockToMarkCorrupt(BlockInfoContiguous stored, String reason, Reason reasonCode) { this(stored, stored, reason, reasonCode); } BlockToMarkCorrupt(BlockInfoContiguous stored, long gs, String reason, Reason reasonCode) { this(new BlockInfoContiguous(stored), stored, reason, reasonCode); //the corrupted block in datanode has a different generation stamp corrupted.setGenerationStampNoPersistance(gs); } @Override public String toString() { return corrupted + "(" + (corrupted == stored ? "same as stored" : "stored=" + stored) + ")"; } } /** * The given storage is reporting all its hashes. */ public List<Integer> checkHashes(final DatanodeID nodeID, final DatanodeStorage storage, final BlockReport newReport) throws IOException { final long startTime = Time.now(); //after acquiring write lock DatanodeDescriptor node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isAlive) { throw new IOException("ReportHashes from dead or unregistered node: " + nodeID); } DatanodeStorageInfo storageInfo = node.getStorageInfo(storage.getStorageID()); if (storageInfo == null) { // We handle this for backwards compatibility. storageInfo = node.updateStorage(storage); } final boolean firstBlockReport = namesystem.isInStartupSafeMode() || storageInfo.getBlockReportCount() == 0; if (storageInfo.getBlockReportCount() == 0) { HashBuckets.getInstance().createBucketsForStorage(storageInfo); } HashMatchingResult matchingResult = calculateMismatchedHashes(storageInfo, newReport, firstBlockReport); blockLog.debug("BLOCK* checkHashes: Number of mismatches buckets for storage: " + storageInfo.getStorageID() + " are: " + matchingResult.mismatchedBuckets); return matchingResult.mismatchedBuckets; } /** * The given storage is reporting all its blocks. * Update the (storage-->block list) and (block-->storage list) maps. 
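 * Only the hash buckets that the DataNode did not mark as "skip" are re-processed; for those buckets the reported replicas are diffed against the replicas currently recorded for this storage (see the reportDiff helper below).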
*/ public boolean processReport(final DatanodeID nodeID, final DatanodeStorage storage, final BlockReport newReport, BlockReportContext context, boolean lastStorageInRpc) throws IOException { final long startTime = Time.monotonicNow(); //after acquiring write lock DatanodeDescriptor node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isAlive) { throw new IOException("ProcessReport from dead or unregistered node: " + nodeID); } DatanodeStorageInfo storageInfo = node.getStorageInfo(storage.getStorageID()); if (storageInfo == null) { // We handle this for backwards compatibility. storageInfo = node.updateStorage(storage); } // To minimize startup time, we discard any second (or later) block reports // that we receive while still in startup phase. if (namesystem.isInStartupSafeMode() && storageInfo.getBlockReportCount() > 0) { blockLog.info("BLOCK* processReport: " + "discarded non-initial block report from {}" + " because namenode still in startup phase", nodeID); return !node.hasStaleStorages(); } ReportStatistics reportStatistics = null; try { // Get the storageinfo object that we are updating in this processreport reportStatistics = processReport(storageInfo, newReport); if (context != null) { storageInfo.setLastBlockReportId(context.getReportId()); if (lastStorageInRpc) { int rpcsSeen = node.updateBlockReportContext(context); if (rpcsSeen >= context.getTotalRpcs()) { List<DatanodeStorageInfo> zombies = node.removeZombieStorages(); if (zombies.isEmpty()) { LOG.debug("processReport 0x{}: no zombie storages found.", Long.toHexString(context.getReportId())); } else { for (DatanodeStorageInfo zombie : zombies) { removeZombieReplicas(context, zombie); } } node.clearBlockReportContext(); } else { LOG.debug("processReport 0x{}: {} more RPCs remaining in this " + "report.", Long.toHexString(context.getReportId()), (context.getTotalRpcs() - rpcsSeen)); } } } final long endTime = Time.monotonicNow(); // Log the block report processing stats from Namenode perspective final NameNodeMetrics metrics = NameNode.getNameNodeMetrics(); if (metrics != null) { metrics.addBlockReport((int) (endTime - startTime)); } blockLog.info("BLOCK* processReport success: from " + nodeID + " storage: " + storage + ", blocks: " + newReport.getNumberOfBlocks() + ", hasStaleStorages: " + node.hasStaleStorages() + ", processing time: " + (endTime - startTime) + " ms. " + reportStatistics); return !node.hasStaleStorages(); } catch (Throwable t) { final long endTime = Time.monotonicNow(); blockLog.error("BLOCK* processReport fail: from " + nodeID + " storage: " + storage + ", blocks: " + newReport.getNumberOfBlocks() + ", processing time: " + (endTime - startTime) + " ms. " + reportStatistics, t); throw t; } } private void removeZombieReplicas(BlockReportContext context, DatanodeStorageInfo zombie) throws IOException { LOG.warn("processReport 0x{}: removing zombie storage {}, which no " + "longer exists on the DataNode.", Long.toHexString(context.getReportId()), zombie.getStorageID()); int prevBlocks = zombie.numBlocks(); removeBlocksAssociatedTo(zombie); assert (zombie.numBlocks() == 0); LOG.warn( "processReport 0x{}: removed {} replicas from storage {}, " + "which no longer exists on the DataNode.", Long.toHexString(context.getReportId()), prevBlocks, zombie.getStorageID()); } /** * Rescan the list of blocks which were previously postponed. 
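 * To keep each pass bounded, only a window of at most blocksPerRescan blocks, starting at a randomly chosen position in the postponed set, is re-examined per invocation; blocks whose mis-replication status is resolved are removed from the set.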
*/ void rescanPostponedMisreplicatedBlocks() throws IOException { if (getPostponedMisreplicatedBlocksCount() == 0) { return; } long startTimeRescanPostponedMisReplicatedBlocks = Time.now(); long startPostponedMisReplicatedBlocksCount = getPostponedMisreplicatedBlocksCount(); try { // blocksPerRescan is the configured number of blocks per rescan. // Randomly select blocksPerRescan consecutive blocks from the HashSet // when the number of blocks remaining is larger than blocksPerRescan. // The reason we don't always pick the first blocksPerRescan blocks is to // handle the case if for some reason some datanodes remain in // content stale state for a long time and only impact the first // blocksPerRescan blocks. int i = 0; long startIndex = 0; long blocksPerRescan = datanodeManager.getBlocksPerPostponedMisreplicatedBlocksRescan(); long base = getPostponedMisreplicatedBlocksCount() - blocksPerRescan; if (base > 0) { startIndex = DFSUtil.getRandom().nextLong() % (base + 1); if (startIndex < 0) { startIndex += (base + 1); } } Iterator<Block> it = postponedMisreplicatedBlocks.iterator(); for (int tmp = 0; tmp < startIndex; tmp++) { it.next(); } final Set<Block> toRemove = new HashSet<>(); for (; it.hasNext(); i++) { Block b = it.next(); if (i >= blocksPerRescan) { break; } HopsTransactionalRequestHandler rescanPostponedMisreplicatedBlocksHandler = new HopsTransactionalRequestHandler( HDFSOperationType.RESCAN_MISREPLICATED_BLOCKS) { INodeIdentifier inodeIdentifier; @Override public void setUp() throws StorageException { Block b = (Block) getParams()[0]; inodeIdentifier = INodeUtil.resolveINodeFromBlock(b); } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); Block b = (Block) getParams()[0]; locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier, true)) .add(lf.getIndividualBlockLock(b.getBlockId(), inodeIdentifier)) .add(lf.getBlockRelated(BLK.RE, BLK.IV, BLK.CR, BLK.UR, BLK.ER)); } @Override public Object performTask() throws IOException { Block b = (Block) getParams()[0]; BlockInfoContiguous bi = blocksMap.getStoredBlock(b); Set<Block> toRemoveSet = (Set<Block>) getParams()[1]; if (bi == null) { if (LOG.isDebugEnabled()) { LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " + "Postponed mis-replicated block " + b + " no longer found " + "in block map."); } toRemoveSet.add(b); postponedMisreplicatedBlocksCount.decrementAndGet(); return null; } MisReplicationResult res = processMisReplicatedBlock(bi); if (LOG.isDebugEnabled()) { LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " + "Re-scanned block " + b + ", result is " + res); } if (res != MisReplicationResult.POSTPONE) { toRemoveSet.add(b); postponedMisreplicatedBlocksCount.decrementAndGet(); } return null; } }; rescanPostponedMisreplicatedBlocksHandler.setParams(b, toRemove); rescanPostponedMisreplicatedBlocksHandler.handle(namesystem); } postponedMisreplicatedBlocks.removeAll(toRemove); } finally { long endPostponedMisReplicatedBlocksCount = getPostponedMisreplicatedBlocksCount(); LOG.info("Rescan of postponedMisreplicatedBlocks completed in " + (Time.now() - startTimeRescanPostponedMisReplicatedBlocks) + " msecs. " + endPostponedMisReplicatedBlocksCount + " blocks are left. " + (startPostponedMisReplicatedBlocksCount - endPostponedMisReplicatedBlocksCount) + " blocks are removed."); } } /** * Mark block replicas as corrupt except those on the storages in * newStorages list. 
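 * A replica is considered corrupt here either because the block's generation stamp no longer matches the expected value or because its length differs from the committed length; replicas on the storages in newStorages are exempt.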
*/ public void markBlockReplicasAsCorrupt(BlockInfoContiguous block, long oldGenerationStamp, long oldNumBytes, DatanodeStorageInfo[] newStorages) throws IOException { BlockToMarkCorrupt b = null; if (block.getGenerationStamp() != oldGenerationStamp) { b = new BlockToMarkCorrupt(block, oldGenerationStamp, "genstamp does not match " + oldGenerationStamp + " : " + block.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } else if (block.getNumBytes() != oldNumBytes) { b = new BlockToMarkCorrupt(block, "length does not match " + oldNumBytes + " : " + block.getNumBytes(), Reason.SIZE_MISMATCH); } else { return; } for (DatanodeStorageInfo storage : getStorages(block)) { boolean isCorrupt = true; if (newStorages != null) { for (DatanodeStorageInfo newStorage : newStorages) { if (newStorage != null && storage.equals(newStorage)) { isCorrupt = false; break; } } } if (isCorrupt) { blockLog.info("BLOCK* markBlockReplicasAsCorrupt: mark block replica" + " {} on {} as corrupt because the dn is not in the new committed " + "storage list.", b, storage.getDatanodeDescriptor()); markBlockAsCorrupt(b, storage, storage.getDatanodeDescriptor()); } } } @VisibleForTesting public ReportStatistics processReport(final DatanodeStorageInfo storage, final BlockReport report) throws IOException { // Normal case: // Modify the (block-->datanode) map, according to the difference // between the old and new block report. // ConcurrentHashMap<BlockInfoContiguous, Boolean> mapToAdd = new ConcurrentHashMap<BlockInfoContiguous, Boolean>(); ConcurrentHashMap<Long, Boolean> mapToRemove = new ConcurrentHashMap<Long, Boolean>(); ConcurrentHashMap<Block, Boolean> mapToInvalidate = new ConcurrentHashMap<Block, Boolean>(); ConcurrentHashMap<BlockToMarkCorrupt, Boolean> mapToCorrupt = new ConcurrentHashMap<BlockToMarkCorrupt, Boolean>(); ConcurrentHashMap<StatefulBlockInfo, Boolean> mapToUC = new ConcurrentHashMap<StatefulBlockInfo, Boolean>(); Collection<BlockInfoContiguous> toAdd = Collections.newSetFromMap(mapToAdd); Collection<Long> toRemove = Collections.newSetFromMap(mapToRemove); Collection<Block> toInvalidate = Collections.newSetFromMap(mapToInvalidate); Collection<BlockToMarkCorrupt> toCorrupt = Collections.newSetFromMap(mapToCorrupt); Collection<StatefulBlockInfo> toUC = Collections.newSetFromMap(mapToUC); final boolean firstBlockReport = namesystem.isInStartupSafeMode() || storage.getBlockReportCount() == 0; if (storage.getBlockReportCount() == 0) { HashBuckets.getInstance().createBucketsForStorage(storage); } ReportStatistics reportStatistics = reportDiff(storage, report, toAdd, toRemove, toInvalidate, toCorrupt, toUC, firstBlockReport); // Process the blocks on each queue for (StatefulBlockInfo b : toUC) { if (firstBlockReport) { addStoredBlockUnderConstructionImmediateTx(b.storedBlock, storage, b.reportedState); } else { addStoredBlockUnderConstructionTx(b, storage); } } final List<Callable<Object>> addTasks = new ArrayList<>(); int numBlocksLogged = 0; final Map<Long, List<BlockInfoContiguous>> blocksToAddPerInodeId = new HashMap<>(); for (final BlockInfoContiguous b : toAdd) { List<BlockInfoContiguous> blocksToAddList = blocksToAddPerInodeId.get(b.getInodeId()); if (blocksToAddList == null) { blocksToAddList = new ArrayList<>(); blocksToAddPerInodeId.put(b.getInodeId(), blocksToAddList); } blocksToAddList.add(b); } final Map<Integer, List<BlockInfoContiguous>> blocksToAdd = new HashMap(); final Map<Integer, List<Long>> blockIdsToAdd = new HashMap(); final Map<Integer, List<Long>> inodeIdsToAdd = new HashMap(); int 
index = 0; for (List<BlockInfoContiguous> entry : blocksToAddPerInodeId.values()) { List<BlockInfoContiguous> blocksToAddList = blocksToAdd.get(index); List<Long> blockIdsToAddList = blockIdsToAdd.get(index); List<Long> inodeIdsToAddList = inodeIdsToAdd.get(index); if (blocksToAddList == null) { blocksToAddList = new ArrayList<>(); blockIdsToAddList = new ArrayList<>(); inodeIdsToAddList = new ArrayList<>(); blocksToAdd.put(index, blocksToAddList); blockIdsToAdd.put(index, blockIdsToAddList); inodeIdsToAdd.put(index, inodeIdsToAddList); } for (BlockInfoContiguous b : entry) { blocksToAddList.add(b); blockIdsToAddList.add(b.getBlockId()); inodeIdsToAddList.add(b.getInodeId()); } if (blocksToAddList.size() >= slicerBatchSize) { index++; } } for (final int ind : blocksToAdd.keySet()) { if (firstBlockReport) { final boolean logIt = numBlocksLogged < maxNumBlocksToLog; addTasks.add(new Callable<Object>() { @Override public Object call() throws Exception { addStoredBlockImmediateTx(blocksToAdd.get(ind), blockIdsToAdd.get(ind), inodeIdsToAdd.get(ind), storage, logIt); return null; } }); } else { final boolean logIt = numBlocksLogged < maxNumBlocksToLog; addTasks.add(new Callable<Object>() { @Override public Object call() throws Exception { List<BlockInfoContiguous> l = blocksToAdd.get(ind); List<Long> list = blockIdsToAdd.get(ind); addStoredBlockTx(blocksToAdd.get(ind), blockIdsToAdd.get(ind), inodeIdsToAdd.get(ind), storage, null, logIt); return null; } }); } numBlocksLogged++; } if (numBlocksLogged > maxNumBlocksToLog) { blockLog.info("BLOCK* processReport: logged info for {} of {} " + "reported.", maxNumBlocksToLog, numBlocksLogged); } try { List<Future<Object>> futures = ((FSNamesystem) namesystem).getFSOperationsExecutor() .invokeAll(addTasks); //Check for exceptions for (Future<Object> maybeException : futures) { maybeException.get(); } } catch (InterruptedException e) { LOG.error(e.getMessage(), e); throw new IOException(e); } catch (ExecutionException e) { if (e.getCause() instanceof IOException) { throw (IOException) e.getCause(); } else { throw new IOException(e.getCause()); } } for (BlockToMarkCorrupt b : toCorrupt) { markBlockAsCorruptTx(b, storage); } for (Block b : toInvalidate) { blockLog.info("BLOCK* processReport: " + b + " on " + storage + " " + storage + " size " + b.getNumBytes() + " does not belong to any file"); } addToInvalidates(toInvalidate, storage); removeBlocks(new ArrayList<Long>(toRemove), storage.getDatanodeDescriptor()); return reportStatistics; } @VisibleForTesting public void removeBlocks(List<Long> allBlockIds, final DatanodeDescriptor node) throws IOException { final Map<Long, List<Long>> inodeIdsToBlockMap = INodeUtil.getINodeIdsForBlockIds(allBlockIds, slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor()); final List<Long> inodeIds = new ArrayList<>(inodeIdsToBlockMap.keySet()); try { Slicer.slice(inodeIds.size(), slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor(), new Slicer.OperationHandler() { @Override public void handle(int startIndex, int endIndex) throws Exception { List<Long> ids = inodeIds.subList(startIndex, endIndex); removeStoredBlocksTx(ids, inodeIdsToBlockMap, node); } }); } catch (Exception ex) { throw new IOException(ex); } } public void removeBlocks(Map<Long, Long> allBlocksAndInodesIds, final DatanodeDescriptor node) throws IOException { final Map<Long, List<Long>> inodeIdsToBlockMap = new HashMap<>(); for (Map.Entry<Long, Long> entry : allBlocksAndInodesIds.entrySet()) 
{ List<Long> list = inodeIdsToBlockMap.get(entry.getValue()); if (list == null) { list = new ArrayList<>(); inodeIdsToBlockMap.put(entry.getValue(), list); } list.add(entry.getKey()); } final List<Long> inodeIds = new ArrayList<>(inodeIdsToBlockMap.keySet()); try { Slicer.slice(inodeIds.size(), slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor(), new Slicer.OperationHandler() { @Override public void handle(int startIndex, int endIndex) throws Exception { List<Long> ids = inodeIds.subList(startIndex, endIndex); removeStoredBlocksTx(ids, inodeIdsToBlockMap, node); } }); } catch (Exception ex) { throw new IOException(ex); } } private void removeBlocks(Map<Long, Long> allBlocksAndInodesIds, final int sid) throws IOException { final Map<Long, List<Long>> inodeIdsToBlockMap = new HashMap<>(); for (Map.Entry<Long, Long> entry : allBlocksAndInodesIds.entrySet()) { List<Long> list = inodeIdsToBlockMap.get(entry.getValue()); if (list == null) { list = new ArrayList<>(); inodeIdsToBlockMap.put(entry.getValue(), list); } list.add(entry.getKey()); } final List<Long> inodeIds = new ArrayList<>(inodeIdsToBlockMap.keySet()); try { Slicer.slice(inodeIds.size(), slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor(), new Slicer.OperationHandler() { @Override public void handle(int startIndex, int endIndex) throws Exception { List<Long> ids = inodeIds.subList(startIndex, endIndex); removeStoredBlocksTx(ids, inodeIdsToBlockMap, sid); } }); } catch (Exception ex) { throw new IOException(ex); } } private static class HashMatchingResult { private final List<Integer> matchingBuckets; private final List<Integer> mismatchedBuckets; HashMatchingResult(List<Integer> matchingBuckets, List<Integer> mismatchedBuckets) { this.matchingBuckets = matchingBuckets; this.mismatchedBuckets = mismatchedBuckets; } } public class ReportStatistics { int numBuckets; public int numBucketsMatching; int numBlocks; int numToRemove; int numToInvalidate; int numToCorrupt; int numToUC; int numToAdd; int numConsideredSafeIfInSafemode; @Override public String toString() { return String.format( "(buckets,bucketsMatching,blocks,toRemove,toInvalidate,toCorrupt,toUC,toAdd," + "safeBlocksIfSafeMode)=(%d,%d,%d,%d,%d,%d,%d,%d,%d)", numBuckets, numBucketsMatching, numBlocks, numToRemove, numToInvalidate, numToCorrupt, numToUC, numToAdd, numConsideredSafeIfInSafemode); } } Map<Long, Long> replicasInBucketsMT(final DatanodeStorageInfo storage, List<Integer> mismatchedBuckets) throws IOException { Map<Long, Long> mismatchedBlocksAndInodes = new ConcurrentHashMap<>(); final Collection<Callable<Map<Long, Long>>> subTasks = new ArrayList<>(); for (final Integer bucket : mismatchedBuckets) { final Callable<Map<Long, Long>> subTask = new Callable<Map<Long, Long>>() { @Override public Map<Long, Long> call() throws IOException { List<Integer> buckets = new ArrayList(); buckets.add(bucket); final Map<Long, Long> mismatchedBlocksAndInodes = storage .getAllStorageReplicasInBuckets(buckets); return mismatchedBlocksAndInodes; } }; subTasks.add(subTask); // collect subtasks } try { List<Future<Map<Long, Long>>> futures = ((FSNamesystem) namesystem).getFSOperationsExecutor() .invokeAll(subTasks); for (Future<Map<Long, Long>> maybeException : futures) { mismatchedBlocksAndInodes.putAll(maybeException.get()); } } catch (InterruptedException e) { LOG.error("Exception was thrown during block report processing", e); throw new IOException(e); } catch (ExecutionException e) { throw (IOException) 
e.getCause(); } return mismatchedBlocksAndInodes; } private ReportStatistics reportDiff(final DatanodeStorageInfo storage, final BlockReport newReport, final Collection<BlockInfoContiguous> toAdd, // add to DatanodeStorageInfo final Collection<Long> toRemove, // remove from DatanodeStorageInfo final Collection<Block> toInvalidate, // should be removed from Storage final Collection<BlockToMarkCorrupt> toCorrupt, // add to corrupt replicas list final Collection<StatefulBlockInfo> toUC, final boolean firstBlockReport) throws IOException { // add to under-construction list if (newReport == null) { return null; } // Get all invalidated replica's final Map<Long, Long> invalidatedReplicas = storage.getAllStorageInvalidatedReplicasWithGenStamp(); ReportStatistics stats = new ReportStatistics(); stats.numBuckets = newReport.getBuckets().length; stats.numBlocks = newReport.getNumberOfBlocks(); List<Integer> mismatchedBuckets = getReportedBucketList(newReport); stats.numBucketsMatching = newReport.getBuckets().length - mismatchedBuckets.size(); if (LOG.isDebugEnabled()) { LOG.debug(String.format("%d/%d reported hashes matched", newReport.getBuckets().length - mismatchedBuckets.size(), newReport.getBuckets().length)); } final Set<Long> aggregatedSafeBlocks = new HashSet<>(); final Map<Long, Long> mismatchedBlocksAndInodes = replicasInBucketsMT(storage, mismatchedBuckets); //Safe mode report and first report for storage will have all buckets mismatched. aggregatedSafeBlocks.addAll(mismatchedBlocksAndInodes.keySet()); processMisMatchingBuckets(storage, newReport, mismatchedBuckets, toAdd, toInvalidate, toCorrupt, toUC, firstBlockReport, mismatchedBlocksAndInodes, aggregatedSafeBlocks, invalidatedReplicas); stats.numToAdd = toAdd.size(); stats.numToInvalidate = toInvalidate.size(); stats.numToCorrupt = toCorrupt.size(); stats.numToUC = toUC.size(); toRemove.addAll(mismatchedBlocksAndInodes.keySet()); stats.numToRemove = toRemove.size(); if (namesystem.isInStartupSafeMode()) { aggregatedSafeBlocks.removeAll(toRemove); LOG.debug("AGGREGATED SAFE BLOCK #: " + aggregatedSafeBlocks.size() + " REPORTED BLOCK #: " + newReport.getNumberOfBlocks()); namesystem.adjustSafeModeBlocks(aggregatedSafeBlocks); stats.numConsideredSafeIfInSafemode = aggregatedSafeBlocks.size(); } return stats; } private void processMisMatchingBuckets(final DatanodeStorageInfo storage, final BlockReport newReport, final List<Integer> mismatchedBuckets, final Collection<BlockInfoContiguous> toAdd, final Collection<Block> toInvalidate, final Collection<BlockToMarkCorrupt> toCorrupt, final Collection<StatefulBlockInfo> toUC, final boolean firstBlockReport, final Map<Long, Long> mismatchedBlocksAndInodes, final Set<Long> aggregatedSafeBlocks, final Map<Long, Long> invalidatedReplicas) throws IOException { final Collection<Callable<Void>> subTasks = new ArrayList<>(); for (final int bucketId : mismatchedBuckets) { final Bucket bucket = newReport.getBuckets()[bucketId]; final BlockListAsLongs bucketBlocks = bucket.getBlocks(); final Callable<Void> subTask = new Callable<Void>() { @Override public Void call() throws IOException { final HopsTransactionalRequestHandler processReportHandler = processBucketInternal(storage, bucketId, toAdd, toInvalidate, toCorrupt, toUC, firstBlockReport, mismatchedBlocksAndInodes, aggregatedSafeBlocks, invalidatedReplicas, bucketBlocks); processReportHandler.handle(); return null; } }; subTasks.add(subTask); // collect subtasks } try { List<Future<Void>> futures = ((FSNamesystem) 
namesystem).getFSOperationsExecutor().invokeAll(subTasks); for (Future<Void> maybeException : futures) { maybeException.get(); } } catch (InterruptedException e) { LOG.error("Exception was thrown during block report processing", e); throw new IOException(e); } catch (ExecutionException e) { throw (IOException) e.getCause(); } } private HopsTransactionalRequestHandler processBucketInternal(final DatanodeStorageInfo storage, final int bucketId, final Collection<BlockInfoContiguous> toAdd, final Collection<Block> toInvalidate, final Collection<BlockToMarkCorrupt> toCorrupt, final Collection<StatefulBlockInfo> toUC, final boolean firstBlockReport, final Map<Long, Long> mismatchedBlocksAndInodes, final Set<Long> aggregatedSafeBlocks, final Map<Long, Long> invalidatedReplicas, final BlockListAsLongs reportedBlocks) { return new HopsTransactionalRequestHandler(HDFSOperationType.PROCESS_REPORT) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); if (reportedBlocks.getNumberOfBlocks() != 0) { List<Long> resolvedBlockIds = new ArrayList<>(); List<Long> inodeIds = new ArrayList<>(); List<Long> unResolvedBlockIds = new ArrayList<>(); for (BlockListAsLongs.BlockReportReplica reportedBlock : reportedBlocks) { Long inodeId = mismatchedBlocksAndInodes.get(reportedBlock.getBlockId()); if (inodeId != null) { resolvedBlockIds.add(reportedBlock.getBlockId()); inodeIds.add(inodeId); } else { unResolvedBlockIds.add(reportedBlock.getBlockId()); } } locks.add(lf.getBlockReportingLocks(Longs.toArray(resolvedBlockIds), Longs.toArray(inodeIds), Longs.toArray(unResolvedBlockIds), storage.getSid())); } locks.add(lf.getIndividualHashBucketLock(storage.getSid(), bucketId)); } @Override public Object performTask() throws IOException { // scan the report and process newly reported blocks byte[] hash = HashBuckets.initalizeHash(); // Our updated hash should only consider // finalized, stored blocks for (BlockListAsLongs.BlockReportReplica brb : reportedBlocks) { Block block = new Block(); block.setNoPersistance(brb.getBlockId(), brb.getBytesOnDisk(), brb.getGenerationStamp()); BlockInfoContiguous storedBlock = processReportedBlock(storage, block, brb.getState(), toAdd, toInvalidate, toCorrupt, toUC, aggregatedSafeBlocks, firstBlockReport, mismatchedBlocksAndInodes.containsKey(brb.getBlockId()), invalidatedReplicas); if (storedBlock != null) { mismatchedBlocksAndInodes.remove(storedBlock.getBlockId()); if (brb.getState() == ReplicaState.FINALIZED) { // Only update hash with blocks that should not // be removed and are finalized. This helps catch excess // replicas as well. 
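// XOR-ing the per-replica hashes keeps the bucket hash independent of the order in which replicas are reported (H(r1) ^ H(r2) == H(r2) ^ H(r1)), so the value computed here can later be compared directly against the hash the DataNode reports for this bucket.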
HashBuckets.XORHashes(hash, BlockReport.hashAsFinalized(storedBlock)); } } } //update bucket hash HashBucket bucket = HashBuckets.getInstance().getBucket(storage.getSid(), bucketId); bucket.setHash(hash); return null; } }; } private List<Integer> getReportedBucketList(BlockReport report) throws IOException { List<Integer> missMatchingBuckets = new ArrayList(); for (int i = 0; i < report.getBuckets().length; i++) { Bucket b = report.getBuckets()[i]; if (!b.isSkip()) { missMatchingBuckets.add(i); } } return missMatchingBuckets; } private HashMatchingResult calculateMismatchedHashes(DatanodeStorageInfo storage, BlockReport report, Boolean firstBlockReport) throws IOException { List<HashBucket> storedHashes = HashBuckets.getInstance().getBucketsForStorage(storage); Map<Integer, HashBucket> storedHashesMap = new HashMap<>(); for (HashBucket allStorageHash : storedHashes) { storedHashesMap.put(allStorageHash.getBucketId(), allStorageHash); } List<Integer> matchedBuckets = new ArrayList<>(); List<Integer> mismatchedBuckets = new ArrayList<>(); for (int i = 0; i < report.getBuckets().length; i++) { if (!storedHashesMap.containsKey(i)) { //escape early mismatchedBuckets.add(i); continue; } byte[] storedHash = storedHashesMap.get(i).getHash(); //First block report, or report in safe mode, should always process complete report. if (firstBlockReport) { mismatchedBuckets.add(i); continue; } byte[] reportedHash = report.getBuckets()[i].getHash(); if (HashBuckets.hashEquals(storedHash, reportedHash)) { matchedBuckets.add(i); } else { mismatchedBuckets.add(i); } } assert matchedBuckets.size() + mismatchedBuckets.size() == report.getBuckets().length; return new HashMatchingResult(matchedBuckets, mismatchedBuckets); } /** * Process a block replica reported by the data-node. * No side effects except adding to the passed-in Collections. * <p/> * <ol> * <li>If the block is not known to the system (not in blocksMap) then the * data-node should be notified to invalidate this block.</li> * <li>If the reported replica is valid that is has the same generation stamp * and length as recorded on the name-node, then the replica location should * be added to the name-node.</li> * <li>If the reported replica is not valid, then it is marked as corrupt, * which triggers replication of the existing valid replicas. * Corrupt replicas are removed from the system when the block * is fully replicated.</li> * <li>If the reported replica is for a block currently marked "under * construction" in the NN, then it should be added to the * BlockInfoUnderConstruction's list of replicas.</li> * </ol> * * @param storageInfo * the storage that made the report * @param block * reported block replica * @param reportedState * reported replica state * @param toAdd * add to DatanodeDescriptor * @param toInvalidate * missing blocks (not in the blocks map) * should be removed from the data-node * @param toCorrupt * replicas with unexpected length or generation stamp; * add to corrupt replicas * @param toUC * replicas of blocks currently under construction * @return the up-to-date stored block, if it should be kept. * Otherwise, null. 
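 * This variant is used for incrementally reported replicas; processReportedBlock below applies the same checks to the blocks of a full-report bucket and additionally maintains the set of blocks counted as safe during startup safe mode.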
*/ private BlockInfoContiguous processIncrementallyReportedBlock(final DatanodeStorageInfo storageInfo, final Block block, final ReplicaState reportedState, final Collection<BlockInfoContiguous> toAdd, final Collection<Block> toInvalidate, final Collection<BlockToMarkCorrupt> toCorrupt, final Collection<StatefulBlockInfo> toUC) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug("Reported block " + block + " on " + storageInfo.getStorageID() + " size " + block.getNumBytes() + " replicaState = " + reportedState); } // find block by blockId BlockInfoContiguous storedBlock = blocksMap.getStoredBlock(block); if (storedBlock == null) { // If blocksMap does not contain reported block id, // the replica should be removed from the data-node. blockLog.info("BLOCK* processReport: " + block + " on " + storageInfo + " size " + block.getNumBytes() + " does not belong to any file"); toInvalidate.add(new Block(block)); return null; } BlockUCState ucState = storedBlock.getBlockUCState(); // Block is on the NN if (LOG.isDebugEnabled()) { LOG.debug("In memory blockUCState = " + ucState + " bid=" + storedBlock.getBlockIndex()); } // Ignore replicas already scheduled to be removed from the DN if (invalidateBlocks.contains(storageInfo, getBlockInfo(block))) { /* TODO: following assertion is incorrect, see HDFS-2668 assert storedBlock.findDatanode(dn) < 0 : "Block " + block + " in recentInvalidatesSet should not appear in DN " + dn; */ return storedBlock; } BlockToMarkCorrupt c = checkReplicaCorrupt(block, reportedState, storedBlock, ucState, storageInfo); if (c != null) { toCorrupt.add(c); return storedBlock; } if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { toUC.add(new StatefulBlockInfo((BlockInfoContiguousUnderConstruction) storedBlock, block, reportedState)); return storedBlock; } // Add replica if appropriate. If the replica was previously corrupt // but now okay, it might need to be updated. if (reportedState == ReplicaState.FINALIZED && (!storedBlock.isReplicatedOnStorage(storageInfo) || corruptReplicas.isReplicaCorrupt(storedBlock, storageInfo.getDatanodeDescriptor()))) { toAdd.add(storedBlock); } return storedBlock; } private BlockInfoContiguous processReportedBlock(final DatanodeStorageInfo storageInfo, final Block block, final ReplicaState reportedState, final Collection<BlockInfoContiguous> toAdd, final Collection<Block> toInvalidate, final Collection<BlockToMarkCorrupt> toCorrupt, final Collection<StatefulBlockInfo> toUC, final Set<Long> safeBlocks, final boolean firstBlockReport, final boolean replicaAlreadyExists, final Map<Long, Long> allMachineInvalidatedBlocks) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug("Reported block " + block + " on " + storageInfo.getStorageID() + " size " + block.getNumBytes() + " replicaState = " + reportedState); } // find block by blockId BlockInfoContiguous storedBlock = blocksMap.getStoredBlock(block); if (storedBlock == null) { // If blocksMap does not contain reported block id, // the replica should be removed from the data-node. 
blockLog.info("BLOCK* processReport: " + block + " on " + storageInfo.getStorageID() + " size " + block.getNumBytes() + " does not belong to any file"); toInvalidate.add(new Block(block)); safeBlocks.remove(block.getBlockId()); return null; } BlockUCState ucState = storedBlock.getBlockUCState(); // Block is on the NN if (LOG.isDebugEnabled()) { LOG.debug("In memory blockUCState = " + ucState + " bid=" + storedBlock.getBlockIndex()); } // TODO: I see that this is done to "cache" the invalidated blocks before // processing the report. Can't we move this outside this method instead, // and keep the processBlockReport shared between incremental and full // reports? RE: no point, we don't have "safe blocks" either in the other // version. if (!firstBlockReport) { // Ignore replicas already scheduled to be removed from the DN if (allMachineInvalidatedBlocks.containsKey(block.getBlockId()) && allMachineInvalidatedBlocks.get(block.getBlockId()) == block.getGenerationStamp()) { /* TODO: following assertion is incorrect, see HDFS-2668 assert storedBlock.findDatanode(dn) < 0 : "Block " + block + " in recentInvalidatesSet should not appear in DN " + dn; */ return storedBlock; } } BlockToMarkCorrupt c = checkReplicaCorrupt(block, reportedState, storedBlock, ucState, storageInfo); if (c != null) { toCorrupt.add(c); safeBlocks.remove(block.getBlockId()); return storedBlock; } if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { toUC.add(new StatefulBlockInfo((BlockInfoContiguousUnderConstruction) storedBlock, block, reportedState)); safeBlocks.remove(block.getBlockId()); return storedBlock; } //add replica if appropriate if (reportedState == ReplicaState.FINALIZED) { if (replicaAlreadyExists || storedBlock.isReplicatedOnStorage(storageInfo)) { return storedBlock; } toAdd.add(storedBlock); safeBlocks.remove(block.getBlockId()); } return storedBlock; } /** * The next two methods test the various cases under which we must conclude * the replica is corrupt, or under construction. These are laid out * as switch statements, on the theory that it is easier to understand * the combinatorics of reportedState and ucState that way. It should be * at least as efficient as boolean expressions. 
* * @return a BlockToMarkCorrupt object, or null if the replica is not corrupt */ private BlockToMarkCorrupt checkReplicaCorrupt(Block reported, ReplicaState reportedState, BlockInfoContiguous storedBlock, BlockUCState ucState, DatanodeStorageInfo storage) { switch (reportedState) { case FINALIZED: switch (ucState) { case COMPLETE: case COMMITTED: if (storedBlock.getGenerationStamp() != reported.getGenerationStamp()) { final long reportedGS = reported.getGenerationStamp(); return new BlockToMarkCorrupt(storedBlock, reportedGS, "block is " + ucState + " and reported genstamp " + reportedGS + " does not match genstamp in block map " + storedBlock.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } else if (storedBlock.getNumBytes() != reported.getNumBytes()) { return new BlockToMarkCorrupt(storedBlock, "block is " + ucState + " and reported length " + reported.getNumBytes() + " does not match " + "length in block map " + storedBlock.getNumBytes(), Reason.SIZE_MISMATCH); } else { return null; // not corrupt } case UNDER_CONSTRUCTION: if (storedBlock.getGenerationStamp() > reported.getGenerationStamp()) { final long reportedGS = reported.getGenerationStamp(); return new BlockToMarkCorrupt(storedBlock, reportedGS, "block is " + ucState + " and reported state " + reportedState + ", But reported genstamp " + reportedGS + " does not match genstamp in block map " + storedBlock.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } return null; default: return null; } case RBW: case RWR: if (!storedBlock.isComplete()) { return null; // not corrupt } else if (storedBlock.getGenerationStamp() != reported.getGenerationStamp()) { final long reportedGS = reported.getGenerationStamp(); return new BlockToMarkCorrupt(storedBlock, reportedGS, "reported " + reportedState + " replica with genstamp " + reportedGS + " does not match COMPLETE block's genstamp in block map " + storedBlock.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } else { // COMPLETE block, same genstamp if (reportedState == ReplicaState.RBW) { // If it's a RBW report for a COMPLETE block, it may just be that // the block report got a little bit delayed after the pipeline // closed. So, ignore this report, assuming we will get a // FINALIZED replica later. 
See HDFS-2791 LOG.info("Received an RBW replica for " + storedBlock + " on " + storage + ": ignoring it, since it is " + "complete with the same genstamp"); return null; } else { return new BlockToMarkCorrupt(storedBlock, "reported replica has invalid state " + reportedState, Reason.INVALID_STATE); } } case RUR: // should not be reported case TEMPORARY: // should not be reported default: String msg = "Unexpected replica state " + reportedState + " for block: " + storedBlock + " on " + storage + " size " + storedBlock.getNumBytes(); // log here at WARN level since this is really a broken HDFS invariant LOG.warn(msg); return new BlockToMarkCorrupt(storedBlock, msg, Reason.INVALID_STATE); } } private boolean isBlockUnderConstruction(BlockInfoContiguous storedBlock, BlockUCState ucState, ReplicaState reportedState) { switch (reportedState) { case FINALIZED: switch (ucState) { case UNDER_CONSTRUCTION: case UNDER_RECOVERY: return true; default: return false; } case RBW: case RWR: return (!storedBlock.isComplete()); case RUR: // should not be reported case TEMPORARY: // should not be reported default: return false; } } void addStoredBlockUnderConstruction(StatefulBlockInfo ucBlock, DatanodeStorageInfo storageInfo) throws IOException { BlockInfoContiguousUnderConstruction block = ucBlock.storedBlock; block.addReplicaIfNotPresent(storageInfo, ucBlock.reportedState, ucBlock.reportedBlock.getGenerationStamp()); if (ucBlock.reportedState == ReplicaState.FINALIZED && !block.isReplicatedOnStorage(storageInfo)) { addStoredBlock(block, storageInfo, null, true); } } /** * Faster version of {@link #addStoredBlock}, * intended for use with initial block report at startup. If not in startup * safe mode, will call standard addStoredBlock(). Assumes this method is * called "immediately" so there is no need to refresh the storedBlock from * blocksMap. Doesn't handle underReplication/overReplication, or worry about * pendingReplications or corruptReplicas, because it's in startup safe mode. * Doesn't log every block, because there are typically millions of them. * * @throws IOException */ private void addStoredBlockImmediate(BlockInfoContiguous storedBlock, DatanodeStorageInfo storage, boolean logEveryBlock) throws IOException { assert (storedBlock != null); if (!namesystem.isInStartupSafeMode() || namesystem.isPopulatingReplQueues()) { addStoredBlock(storedBlock, storage, null, logEveryBlock); return; } // just add it storage.addBlock(storedBlock); // Now check for completion of blocks and safe block count int numCurrentReplica = countLiveNodes(storedBlock); if (storedBlock.getBlockUCState() == BlockUCState.COMMITTED && numCurrentReplica >= minReplication) { completeBlock(storedBlock.getBlockCollection(), storedBlock, false); } else if (storedBlock.isComplete()) { // check whether safe replication is reached for the block // only complete blocks are counted towards that. // In the case that the block just became complete above, completeBlock() // handles the safe block count maintenance. namesystem.incrementSafeBlockCount(numCurrentReplica, storedBlock); } } /** * Modify (block-->datanode) map. Remove block from set of * needed replications if this takes care of the problem. * * @return the block that is stored in blockMap. 
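 * Besides updating the maps, this may complete a COMMITTED block, adjust neededReplications, and trigger processing of over-replicated or corrupt replicas once the desired replication is reached.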
*/ private Block addStoredBlock(final BlockInfoContiguous block, DatanodeStorageInfo storageInfo, DatanodeDescriptor delNodeHint, boolean logEveryBlock) throws IOException { assert block != null; BlockInfoContiguous storedBlock; if (block instanceof BlockInfoContiguousUnderConstruction) { //refresh our copy in case the block got completed in another thread storedBlock = blocksMap.getStoredBlock(block); } else { storedBlock = block; } if (storedBlock == null || storedBlock.getBlockCollection() == null) { // If this block does not belong to anyfile, then we are done. blockLog.info("BLOCK* addStoredBlock: {} on {} size {} but it does not" + " belong to any file", block, storageInfo.getStorageID(), block.getNumBytes()); // we could add this block to invalidate set of this datanode. // it will happen in next block report otherwise. return block; } BlockCollection bc = storedBlock.getBlockCollection(); assert bc != null : "Block must belong to a file"; FSNamesystem fsNamesystem = (FSNamesystem) namesystem; NumberReplicas numBeforeAdding = null; if (fsNamesystem.isErasureCodingEnabled()) { numBeforeAdding = countNodes(block); } // add block to the datanode AddBlockResult result = storageInfo.addBlock(storedBlock); int curReplicaDelta; if (result == AddBlockResult.ADDED) { curReplicaDelta = 1; if (logEveryBlock) { logAddStoredBlock(storedBlock, storageInfo); } } else if (result == AddBlockResult.REPLACED) { curReplicaDelta = 0; blockLog.warn("BLOCK* addStoredBlock: block {} moved to storageType " + "{} on storage {}", storedBlock, storageInfo.getStorageType(), storageInfo.getStorageID()); } else { // if the same block is added again and the replica was corrupt // previously because of a wrong gen stamp, remove it from the // corrupt block list. corruptReplicas.removeFromCorruptReplicasMap(block, storageInfo.getDatanodeDescriptor(), Reason.GENSTAMP_MISMATCH); curReplicaDelta = 0; blockLog.warn( "BLOCK* addStoredBlock: Redundant addStoredBlock request" + " received for {} on node {} size {}", storedBlock, storageInfo.getStorageID(), storedBlock.getNumBytes()); } // Now check for completion of blocks and safe block count NumberReplicas num = countNodes(storedBlock); int numLiveReplicas = num.liveReplicas(); int numCurrentReplica = numLiveReplicas + pendingReplications.getNumReplicas(storedBlock); if (storedBlock.getBlockUCState() == BlockUCState.COMMITTED && numLiveReplicas >= minReplication) { storedBlock = completeBlock(bc, storedBlock, false); } else if (storedBlock.isComplete() && result == AddBlockResult.ADDED) { // check whether safe replication is reached for the block // only complete blocks are counted towards that // Is no-op if not in safe mode. // In the case that the block just became complete above, completeBlock() // handles the safe block count maintenance. 
namesystem.incrementSafeBlockCount(numCurrentReplica, storedBlock); } // if file is under construction, then done for now if (bc.isUnderConstruction()) { return storedBlock; } // do not try to handle over/under-replicated blocks during first safe mode if (!namesystem.isPopulatingReplQueues()) { return storedBlock; } // handle underReplication/overReplication short fileReplication = bc.getBlockReplication(); if (!isNeededReplication(storedBlock, fileReplication, numCurrentReplica)) { neededReplications.remove(storedBlock, numCurrentReplica, num.decommissionedReplicas(), fileReplication); } else { updateNeededReplications(storedBlock, curReplicaDelta, 0); } if (numCurrentReplica > fileReplication) { processOverReplicatedBlock(storedBlock, fileReplication, storageInfo.getDatanodeDescriptor(), delNodeHint); } // If the file replication has reached desired value // we can remove any corrupt replicas the block may have int corruptReplicasCount = corruptReplicas.numCorruptReplicas(storedBlock); int numCorruptNodes = num.corruptReplicas(); if (numCorruptNodes != corruptReplicasCount) { LOG.warn("Inconsistent number of corrupt replicas for " + storedBlock + "blockMap has " + numCorruptNodes + " but corrupt replicas map has " + corruptReplicasCount); } if ((corruptReplicasCount > 0) && (numLiveReplicas >= fileReplication)) { invalidateCorruptReplicas(storedBlock); } if (fsNamesystem.isErasureCodingEnabled()) { INode iNode = EntityManager.find(INode.Finder.ByINodeIdFTIS, bc.getId()); if (iNode.isUnderConstruction() == false && numBeforeAdding.liveReplicas() == 0 && numLiveReplicas > 0) { EncodingStatus status = EntityManager.find(EncodingStatus.Finder.ByInodeId, bc.getId()); if (status != null && status.isCorrupt()) { int lostBlockCount = status.getLostBlocks() - 1; status.setLostBlocks(lostBlockCount); if (lostBlockCount == 0) { status.setStatus(EncodingStatus.Status.ENCODED); status.setStatusModificationTime(System.currentTimeMillis()); } EntityManager.update(status); } else { status = EntityManager.find(EncodingStatus.Finder.ByParityInodeId, bc.getId()); if (status == null) { LOG.info("addStoredBlock returned null for " + bc.getId()); } else { LOG.info("addStoredBlock found " + bc.getId() + " with status " + status); } if (status != null && status.isParityCorrupt()) { int lostParityBlockCount = status.getLostParityBlocks() - 1; status.setLostParityBlocks(lostParityBlockCount); if (lostParityBlockCount == 0) { status.setParityStatus(EncodingStatus.ParityStatus.HEALTHY); status.setParityStatusModificationTime(System.currentTimeMillis()); } EntityManager.update(status); LOG.info("addStoredBlock found set status to potentially fixed"); } } } } return storedBlock; } private void logAddStoredBlock(BlockInfoContiguous storedBlock, DatanodeStorageInfo storage) { if (!blockLog.isInfoEnabled()) { return; } StringBuilder sb = new StringBuilder(500); sb.append("BLOCK* addStoredBlock: blockMap updated: ").append(storage).append(" is added to ") .append(storedBlock).append(" size ").append(storedBlock.getNumBytes()).append(" byte"); blockLog.info(sb.toString()); } /** * Invalidate corrupt replicas. * <p/> * This will remove the replicas from the block's location list, * add them to {@link #invalidateBlocks} so that they could be further * deleted from the respective data-nodes, * and remove the block from corruptReplicasMap. * <p/> * This method should be called when the block has sufficient * number of live replicas. 
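 * If the last attempted invalidation does not succeed (for example because the replica is the only remaining copy), the block is kept in corruptReplicasMap so the cleanup can be retried later.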
* * @param blk * Block whose corrupt replicas need to be invalidated */ private void invalidateCorruptReplicas(BlockInfoContiguous blk) throws StorageException, TransactionContextException { Collection<DatanodeDescriptor> nodes = corruptReplicas.getNodes(blk); boolean removedFromBlocksMap = true; if (nodes == null) { return; } // make a copy of the array of nodes in order to avoid // ConcurrentModificationException, when the block is removed from the node DatanodeDescriptor[] nodesCopy = nodes.toArray(new DatanodeDescriptor[0]); for (DatanodeDescriptor node : nodesCopy) { try { removedFromBlocksMap = invalidateBlock(new BlockToMarkCorrupt(blk, null, Reason.ANY), node); } catch (IOException e) { blockLog.info("invalidateCorruptReplicas error in deleting bad block" + " {} on {}", blk, node, e); removedFromBlocksMap = false; } } // Remove the block from corruptReplicasMap if (removedFromBlocksMap) { corruptReplicas.removeFromCorruptReplicasMap(blk); } } /** * For each block in the name-node verify whether it belongs to any file, * over or under replicated. Place it into the respective queue. */ public synchronized void processMisReplicatedBlocks() throws IOException { //this normaly reinitialize the block scanning, when should we reinitialize the block scanning and //how do we propagate it to all NN? stopReplicationInitializer(); if (namesystem.isLeader()) { //it should be ok to reset even if other NN did not restart //at worse we will have blocks in neededReplication that should not be there //this would only result in these block getting transiantly over replicated HdfsVariables.resetMisReplicatedIndex(); neededReplications.clear(); excessReplicateMap.clear(); } replicationQueuesInitializer = new Daemon() { @Override public void run() { try { processMisReplicatesAsync(); } catch (InterruptedException ie) { LOG.info("Interrupted while processing replication queues."); } catch (Exception e) { LOG.error("Error while processing replication queues async", e); } } }; replicationQueuesInitializer.setName("Replication Queue Initializer"); replicationQueuesInitializer.start(); } /* * Stop the ongoing initialisation of replication queues */ private void stopReplicationInitializer() { if (replicationQueuesInitializer != null) { replicationQueuesInitializer.interrupt(); try { replicationQueuesInitializer.join(); } catch (final InterruptedException e) { LOG.warn("Interrupted while waiting for replicationQueueInitializer. 
Returning.."); return; } finally { replicationQueuesInitializer = null; } } } // private void restetMisReplicatesIndex() throws IOException{ // while(namesystem.isLeader() && sizeOfMisReplicatedRangeQueue()>0){ // cleanMisReplicatedRangeQueue(); // } // } // private void lockMisReplicatedRangeQueue(long nnId) throws IOException { // new LightWeightRequestHandler( // HDFSOperationType.UPDATE_MIS_REPLICATED_RANGE_QUEUE) { // @Override // public Object performTask() throws IOException { // HdfsStorageFactory.getConnector().writeLock(); // MisReplicatedRangeQueueDataAccess da = // (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory // .getDataAccess(MisReplicatedRangeQueueDataAccess.class); // da.insert(nnId, -2, -2); // return null; // } // }.handle(); // } // // private void unlockMisReplicatedRangeQueue(long nnId) throws IOException { // new LightWeightRequestHandler( // HDFSOperationType.UPDATE_MIS_REPLICATED_RANGE_QUEUE) { // @Override // public Object performTask() throws IOException { // HdfsStorageFactory.getConnector().writeLock(); // MisReplicatedRangeQueueDataAccess da = // (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory // .getDataAccess(MisReplicatedRangeQueueDataAccess.class); // da.remove(nnId, -2, -2); // return null; // } // }.handle(); // } // // private boolean isMisReplicatedRangeQueueLocked() throws IOException { // return (boolean) new LightWeightRequestHandler( // HDFSOperationType.UPDATE_MIS_REPLICATED_RANGE_QUEUE) { // @Override // public Object performTask() throws IOException { // HdfsStorageFactory.getConnector().writeLock(); // MisReplicatedRangeQueueDataAccess da = (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory // .getDataAccess(MisReplicatedRangeQueueDataAccess.class); // Long leaderIndex = da.getStartIndex(namesystem.getNameNode().getActiveNameNodes().getLeader().getId()); // if (leaderIndex < -1) { // return true; // } else { // return false; // } // } // }.handle(); // } // private void waitOnMisReplicatedRangeQueueLock() throws InterruptedException, IOException{ // while(isMisReplicatedRangeQueueLocked()){ // Thread.sleep(500); // } // } private List<MisReplicatedRange> checkMisReplicatedRangeQueue() throws IOException { final LightWeightRequestHandler cleanMisReplicatedRangeQueueHandler = new LightWeightRequestHandler( HDFSOperationType.UPDATE_MIS_REPLICATED_RANGE_QUEUE) { @Override public Object performTask() throws IOException { HdfsStorageFactory.getConnector().writeLock(); MisReplicatedRangeQueueDataAccess da = (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory .getDataAccess(MisReplicatedRangeQueueDataAccess.class); List<MisReplicatedRange> ranges = da.getAll(); Set<Long> activeNodes = new HashSet<>(); for (ActiveNode nn : namesystem.getNameNode().getLeaderElectionInstance().getActiveNamenodes() .getActiveNodes()) { activeNodes.add(nn.getId()); } List<MisReplicatedRange> toRemove = new ArrayList<>(); for (MisReplicatedRange range : ranges) { if (!activeNodes.contains(range.getNnId())) { toRemove.add(range); } } da.remove(toRemove); return toRemove; } }; List<MisReplicatedRange> toProcess = new ArrayList<>(); while (namesystem.isLeader() && sizeOfMisReplicatedRangeQueue() > 0) { toProcess.addAll((List<MisReplicatedRange>) cleanMisReplicatedRangeQueueHandler.handle()); } return toProcess; } // private boolean waitMisReplicatedRangeQueue() throws IOException { // return (boolean) new LightWeightRequestHandler( // HDFSOperationType.UPDATE_MIS_REPLICATED_RANGE_QUEUE) { // @Override // public Object performTask() throws IOException { // 
HdfsStorageFactory.getConnector().writeLock(); // MisReplicatedRangeQueueDataAccess da = // (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory // .getDataAccess(MisReplicatedRangeQueueDataAccess.class); // List<Long> startIndexes= da.getAllStartIndex(); // // for(long startIndex: startIndexes){ // if(startIndex>=0){ // return true; // } // } // return false; // } // }.handle(); // } private void processMisReplicatesAsync() throws InterruptedException, IOException { final AtomicLong nrInvalid = new AtomicLong(0); final AtomicLong nrOverReplicated = new AtomicLong(0); final AtomicLong nrUnderReplicated = new AtomicLong(0); final AtomicLong nrPostponed = new AtomicLong(0); final AtomicLong nrUnderConstruction = new AtomicLong(0); long startTimeMisReplicatedScan = Time.now(); long totalBlocks = blocksMap.size(); replicationQueuesInitProgress = 0; final AtomicLong totalProcessed = new AtomicLong(0); boolean haveMore; final int filesToProcess = slicerBatchSize * processMisReplicatedNoOfBatchs; addToMisReplicatedRangeQueue(new MisReplicatedRange(namesystem.getNamenodeId(), -1)); long maxInodeId = 0; if (LOG.isInfoEnabled()) { maxInodeId = blocksMap.getMaxInodeId(); } while (namesystem.isRunning() && !Thread.currentThread().isInterrupted()) { long filesToProcessEndIndex; long filesToProcessStartIndex; do { filesToProcessEndIndex = HdfsVariables.incrementMisReplicatedIndex(filesToProcess); filesToProcessStartIndex = filesToProcessEndIndex - filesToProcess; haveMore = blocksMap.haveFilesWithIdGreaterThan(filesToProcessEndIndex); } while (!blocksMap.haveFilesWithIdBetween(filesToProcessStartIndex, filesToProcessEndIndex) && haveMore); addToMisReplicatedRangeQueue( new MisReplicatedRange(namesystem.getNamenodeId(), filesToProcessStartIndex)); processMissreplicatedInt(filesToProcessStartIndex, filesToProcessEndIndex, filesToProcess, nrInvalid, nrOverReplicated, nrUnderReplicated, nrPostponed, nrUnderConstruction, totalProcessed, maxInodeId); addToMisReplicatedRangeQueue(new MisReplicatedRange(namesystem.getNamenodeId(), -1)); // there is a possibility that if any of the blocks deleted/added during // initialisation, then progress might be different. replicationQueuesInitProgress = Math.min((double) totalProcessed.get() / totalBlocks, 1.0); if (!haveMore) { removeFromMisReplicatedRangeQueue(new MisReplicatedRange(namesystem.getNamenodeId(), -1)); if (namesystem.isLeader()) { //get the list of indexes that should have been scanned by namenode that are now dead List<MisReplicatedRange> toProcess = checkMisReplicatedRangeQueue(); //(re)scan the corresponding blocks for (MisReplicatedRange range : toProcess) { long startIndex = range.getStartIndex(); if (startIndex > 0) { processMissreplicatedInt(startIndex, startIndex + filesToProcess, filesToProcess, nrInvalid, nrOverReplicated, nrUnderReplicated, nrPostponed, nrUnderConstruction, totalProcessed, maxInodeId); } } } LOG.info("Total number of blocks = " + blocksMap.size()); LOG.info("Number of invalid blocks = " + nrInvalid.get()); LOG.info("Number of under-replicated blocks = " + nrUnderReplicated.get()); LOG.info("Number of over-replicated blocks = " + nrOverReplicated.get() + ((nrPostponed.get() > 0) ? 
(" (" + nrPostponed.get() + " postponed)") : "")); LOG.info("Number of blocks being written = " + nrUnderConstruction.get()); NameNode.stateChangeLog.info("STATE* Replication Queue initialization " + "scan for invalid, over- and under-replicated blocks " + "completed in " + (Time.now() - startTimeMisReplicatedScan) + " msec"); break; } } if (Thread.currentThread().isInterrupted()) { LOG.info("Interrupted while processing replication queues."); } } private void processMissreplicatedInt(long filesToProcessStartIndex, long filesToProcessEndIndex, int filesToProcess, final AtomicLong nrInvalid, final AtomicLong nrOverReplicated, final AtomicLong nrUnderReplicated, final AtomicLong nrPostponed, final AtomicLong nrUnderConstruction, final AtomicLong totalProcessed, long maxInodeId) throws IOException { final List<INodeIdentifier> allINodes = blocksMap.getAllINodeFiles(filesToProcessStartIndex, filesToProcessEndIndex); LOG.info("processMisReplicated read " + allINodes.size() + "/" + filesToProcess + " in the Ids range [" + filesToProcessStartIndex + " - " + filesToProcessEndIndex + "] (max inodeId when the process started: " + maxInodeId + ")"); try { Slicer.slice(allINodes.size(), slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor(), new Slicer.OperationHandler() { @Override public void handle(int startIndex, int endIndex) throws Exception { final List<INodeIdentifier> inodeIdentifiers = allINodes.subList(startIndex, endIndex); final HopsTransactionalRequestHandler processMisReplicatedBlocksHandler = new HopsTransactionalRequestHandler( HDFSOperationType.PROCESS_MIS_REPLICATED_BLOCKS_PER_INODE_BATCH) { @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getBatchedINodesLock(inodeIdentifiers)) .add(lf.getSqlBatchedBlocksLock()).add(lf.getSqlBatchedBlocksRelated( BLK.RE, BLK.IV, BLK.CR, BLK.UR, BLK.ER)); } @Override public Object performTask() throws IOException { for (INodeIdentifier inodeIdentifier : inodeIdentifiers) { INode inode = EntityManager.find(INode.Finder.ByINodeIdFTIS, inodeIdentifier.getInodeId()); for (BlockInfoContiguous block : ((INodeFile) inode).getBlocks()) { MisReplicationResult res = processMisReplicatedBlock(block); if (LOG.isTraceEnabled()) { LOG.trace("block " + block + ": " + res); } switch (res) { case UNDER_REPLICATED: nrUnderReplicated.incrementAndGet(); break; case OVER_REPLICATED: nrOverReplicated.incrementAndGet(); break; case INVALID: nrInvalid.incrementAndGet(); break; case POSTPONE: nrPostponed.incrementAndGet(); postponeBlock(block); break; case UNDER_CONSTRUCTION: nrUnderConstruction.incrementAndGet(); break; case OK: break; default: throw new AssertionError("Invalid enum value: " + res); } totalProcessed.incrementAndGet(); } } return null; } }; processMisReplicatedBlocksHandler.handle(namesystem); } }); } catch (Exception ex) { throw new IOException(ex); } } /** * Get the progress of the Replication queues initialisation * * @return Returns values between 0 and 1 for the progress. */ public double getReplicationQueuesInitProgress() { //should we store this in the DB to have update from all the NN? 
return replicationQueuesInitProgress; } private void addToMisReplicatedRangeQueue(final MisReplicatedRange range) throws IOException { new LightWeightRequestHandler(HDFSOperationType.UPDATE_MIS_REPLICATED_RANGE_QUEUE) { @Override public Object performTask() throws IOException { HdfsStorageFactory.getConnector().writeLock(); MisReplicatedRangeQueueDataAccess da = (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory .getDataAccess(MisReplicatedRangeQueueDataAccess.class); da.insert(range); return null; } }.handle(); } private void removeFromMisReplicatedRangeQueue(final MisReplicatedRange range) throws IOException { new LightWeightRequestHandler(HDFSOperationType.UPDATE_MIS_REPLICATED_RANGE_QUEUE) { @Override public Object performTask() throws IOException { HdfsStorageFactory.getConnector().writeLock(); MisReplicatedRangeQueueDataAccess da = (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory .getDataAccess(MisReplicatedRangeQueueDataAccess.class); da.remove(range); return null; } }.handle(); } private int sizeOfMisReplicatedRangeQueue() throws IOException { return (Integer) new LightWeightRequestHandler(HDFSOperationType.COUNT_ALL_MIS_REPLICATED_RANGE_QUEUE) { @Override public Object performTask() throws IOException { MisReplicatedRangeQueueDataAccess da = (MisReplicatedRangeQueueDataAccess) HdfsStorageFactory .getDataAccess(MisReplicatedRangeQueueDataAccess.class); return da.countAll(); } }.handle(); } /** * Process a single possibly misreplicated block. This adds it to the * appropriate queues if necessary, and returns a result code indicating * what happened with it. */ private MisReplicationResult processMisReplicatedBlock(BlockInfoContiguous block) throws IOException { BlockCollection bc = block.getBlockCollection(); if (bc == null) { // block does not belong to any file addToInvalidates(block); return MisReplicationResult.INVALID; } if (!block.isComplete()) { // Incomplete blocks are never considered mis-replicated -- // they'll be reached when they are completed or recovered. return MisReplicationResult.UNDER_CONSTRUCTION; } // calculate current replication short expectedReplication = bc.getBlockReplication(); NumberReplicas num = countNodes(block); int numCurrentReplica = num.liveReplicas(); // add to under-replicated queue if need to be if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { if (neededReplications.add(block, numCurrentReplica, num.decommissionedReplicas(), expectedReplication)) { return MisReplicationResult.UNDER_REPLICATED; } } if (numCurrentReplica > expectedReplication) { if (num.replicasOnStaleNodes() > 0) { // If any of the replicas of this block are on nodes that are // considered "stale", then these replicas may in fact have // already been deleted. So, we cannot safely act on the // over-replication until a later point in time, when // the "stale" nodes have block reported. return MisReplicationResult.POSTPONE; } // over-replicated block processOverReplicatedBlock(block, expectedReplication, null, null); return MisReplicationResult.OVER_REPLICATED; } return MisReplicationResult.OK; } /** * Set replication for the blocks. */ public void setReplication(final short oldRepl, final short newRepl, final String src, final Block... 
blocks) throws IOException { if (newRepl == oldRepl) { return; } // update needReplication priority queues for (Block b : blocks) { updateNeededReplications(b, 0, newRepl - oldRepl); } if (oldRepl > newRepl) { // old replication > the new one; need to remove copies LOG.info("Decreasing replication from " + oldRepl + " to " + newRepl + " for " + src); for (Block b : blocks) { processOverReplicatedBlock(b, newRepl, null, null); } } else { // replication factor is increased LOG.info("Increasing replication from " + oldRepl + " to " + newRepl + " for " + src); } } /** * Find how many of the containing nodes are "extra", if any. * If there are any extras, call chooseExcessReplicates() to * mark them in the excessReplicateMap. */ private void processOverReplicatedBlock(final Block block, final short replication, final DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint) throws IOException { if (addedNode == delNodeHint) { delNodeHint = null; } Collection<DatanodeStorageInfo> nonExcess = new ArrayList<>(); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(getBlockInfo(block)); for (DatanodeStorageInfo storage : blocksMap.storageList(block, State.NORMAL)) { final DatanodeDescriptor cur = storage.getDatanodeDescriptor(); if (storage.areBlockContentsStale()) { LOG.info("BLOCK* processOverReplicatedBlock: Postponing processing of over-replicated " + block + " since storage " + storage + " does not yet have up-to-date block information."); postponeBlock(block); return; } if (!excessReplicateMap.contains(storage, getBlockInfo(block))) { if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { // exclude corrupt replicas if (corruptNodes == null || !corruptNodes.contains(cur)) { nonExcess.add(storage); } } } } chooseExcessReplicates(nonExcess, block, replication, addedNode, delNodeHint, blockplacement); } /** * We want "replication" replicates for the block, but we now have too many. * In this method, copy enough nodes from 'srcNodes' into 'dstNodes' such * that: * <p/> * srcNodes.size() - dstNodes.size() == replication * <p/> * We pick node that make sure that replicas are spread across racks and * also try hard to pick one with least free space. * The algorithm is first to pick a node with least free space from nodes * that are on a rack holding more than one replicas of the block. * So removing such a replica won't remove a rack. 
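 * <p/>
 * Illustrative example (not from the code): with replication 3 and four
 * replicas spread as rack A = {d1, d2}, rack B = {d3}, rack C = {d4},
 * d1 and d2 fall into the "more than one replica per rack" set, so the
 * single excess replica is removed from rack A and every rack keeps a copy.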
* If no such a node is available, * then pick a node with least free space */ private void chooseExcessReplicates(final Collection<DatanodeStorageInfo> nonExcess, Block b, short replication, DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint, BlockPlacementPolicy replicator) throws StorageException, TransactionContextException, IOException { // first form a rack to datanodes map and BlockCollection bc = getBlockCollection(b); final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(bc.getStoragePolicyID()); final List<StorageType> excessTypes = storagePolicy.chooseExcess(replication, DatanodeStorageInfo.toStorageTypes(nonExcess)); final Map<String, List<DatanodeStorageInfo>> rackMap = new HashMap<String, List<DatanodeStorageInfo>>(); final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>(); final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>(); // split nodes into two sets // moreThanOne contains nodes on rack with more than one replica // exactlyOne contains the remaining nodes replicator.splitNodesWithRack(nonExcess, rackMap, moreThanOne, exactlyOne); // pick one node to delete that favors the delete hint // otherwise pick one with least space from priSet if it is not empty // otherwise one node with least space from remains boolean firstOne = true; final DatanodeStorageInfo delNodeHintStorage = DatanodeStorageInfo.getDatanodeStorageInfo(nonExcess, delNodeHint); final DatanodeStorageInfo addedNodeStorage = DatanodeStorageInfo.getDatanodeStorageInfo(nonExcess, addedNode); while (nonExcess.size() - replication > 0) { final DatanodeStorageInfo cur; if (useDelHint(firstOne, delNodeHintStorage, addedNodeStorage, moreThanOne, excessTypes)) { cur = delNodeHintStorage; } else { // regular excessive replica removal cur = replicator.chooseReplicaToDelete(bc, b, replication, moreThanOne, exactlyOne, excessTypes); } firstOne = false; // adjust rackmap, moreThanOne, and exactlyOne replicator.adjustSetsWithChosenReplica(rackMap, moreThanOne, exactlyOne, cur); nonExcess.remove(cur); addToExcessReplicate(cur, b); // // The 'excessblocks' tracks blocks until we get confirmation // that the datanode has deleted them; the only way we remove them // is when we get a "removeBlock" message. // // The 'invalidate' list is used to inform the datanode the block // should be deleted. Items are removed from the invalidate list // upon giving instructions to the namenode. 
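// So each replica chosen above is recorded in two places: excessReplicateMap
// (until the datanode confirms it has deleted the replica) and the
// per-datanode invalidate list (which drives the actual delete command).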
// addToInvalidates(b, cur.getDatanodeDescriptor()); blockLog.info("BLOCK* chooseExcessReplicates: " + "({}, {}) is added to invalidated blocks set", cur, b); } } /** Check if we can use delHint */ static boolean useDelHint(boolean isFirst, DatanodeStorageInfo delHint, DatanodeStorageInfo added, List<DatanodeStorageInfo> moreThan1Racks, List<StorageType> excessTypes) { if (!isFirst) { return false; // only consider delHint for the first case } else if (delHint == null) { return false; // no delHint } else if (!excessTypes.contains(delHint.getStorageType())) { return false; // delHint storage type is not an excess type } else { // check if removing delHint reduces the number of racks if (moreThan1Racks.contains(delHint)) { return true; // delHint and some other nodes are under the same rack } else if (added != null && !moreThan1Racks.contains(added)) { return true; // the added node adds a new rack } return false; // removing delHint reduces the number of racks; } } private void addToExcessReplicate(DatanodeStorageInfo storage, Block block) throws StorageException, TransactionContextException { BlockInfoContiguous blockInfo = getBlockInfo(block); if (excessReplicateMap.put(storage.getSid(), blockInfo)) { if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* addToExcessReplicate:" + " (" + storage + ", " + block + ") is added to excessReplicateMap"); } } } /** * Modify (block-->datanode) map. Possibly generate replication tasks, if the * removed block is still valid. */ public void removeStoredBlock(Block block, DatanodeDescriptor node) throws IOException { if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: {} from {}", block, node); } if (!blocksMap.removeNode(block, node)) { if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: {} has already been" + " removed from node {}", block, node); } return; } // // It's possible that the block was removed because of a datanode // failure. If the block is still valid, check if replication is // necessary. In that case, put block on a possibly-will- // be-replicated list. // BlockCollection bc = blocksMap.getBlockCollection(block); if (bc != null) { namesystem.decrementSafeBlockCount(getBlockInfo(block)); updateNeededReplications(block, -1, 0); } // // We've removed a block from a node, so it's definitely no longer // in "excess" there. 
// if (excessReplicateMap.remove(node, getBlockInfo(block))) { if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: " + block + " is removed from excessBlocks"); } } // Remove the replica from corruptReplicas corruptReplicas.removeFromCorruptReplicasMap(getBlockInfo(block), node); FSNamesystem fsNamesystem = (FSNamesystem) namesystem; if (fsNamesystem.isErasureCodingEnabled()) { BlockInfoContiguous blockInfo = getStoredBlock(block); EncodingStatus status = EntityManager.find(EncodingStatus.Finder.ByInodeId, blockInfo.getInodeId()); if (status != null) { NumberReplicas numberReplicas = countNodes(block); if (numberReplicas.liveReplicas() == 0) { if (status.isCorrupt() == false) { status.setStatus(EncodingStatus.Status.REPAIR_REQUESTED); status.setStatusModificationTime(System.currentTimeMillis()); } status.setLostBlocks(status.getLostBlocks() + 1); EntityManager.update(status); } } else { status = EntityManager.find(EncodingStatus.Finder.ByParityInodeId, blockInfo.getInodeId()); if (status == null) { LOG.info("removeStoredBlock returned null for " + blockInfo.getInodeId()); } else { LOG.info("removeStoredBlock found " + blockInfo.getInodeId() + " with status " + status); } if (status != null) { NumberReplicas numberReplicas = countNodes(block); if (numberReplicas.liveReplicas() == 0) { if (status.isParityCorrupt() == false) { status.setParityStatus(EncodingStatus.ParityStatus.REPAIR_REQUESTED); status.setParityStatusModificationTime(System.currentTimeMillis()); } status.setLostParityBlocks(status.getLostParityBlocks() + 1); EntityManager.update(status); LOG.info("removeStoredBlock updated parity status to repair requested"); } else { LOG.info("removeStoredBlock found replicas: " + numberReplicas.liveReplicas()); } } } } } public void removeStoredBlock(Block block, int sid) throws IOException { if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: " + block + " from " + sid); } if (!blocksMap.removeNode(block, sid)) { if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: " + block + " has already been removed from node " + sid); } return; } // // It's possible that the block was removed because of a datanode // failure. If the block is still valid, check if replication is // necessary. In that case, put block on a possibly-will- // be-replicated list. 
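// Note: unlike the DatanodeDescriptor-based overload above, this variant
// removes the replica by storage id (sid) and force-removes the
// corrupt-replica entry for that storage.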
// BlockCollection bc = blocksMap.getBlockCollection(block); if (bc != null) { namesystem.decrementSafeBlockCount(getBlockInfo(block)); updateNeededReplications(block, -1, 0); } // Remove the replica from corruptReplicas corruptReplicas.forceRemoveFromCorruptReplicasMap(getBlockInfo(block), sid); FSNamesystem fsNamesystem = (FSNamesystem) namesystem; if (fsNamesystem.isErasureCodingEnabled()) { BlockInfoContiguous blockInfo = getStoredBlock(block); EncodingStatus status = EntityManager.find(EncodingStatus.Finder.ByInodeId, blockInfo.getInodeId()); if (status != null) { NumberReplicas numberReplicas = countNodes(block); if (numberReplicas.liveReplicas() == 0) { if (status.isCorrupt() == false) { status.setStatus(EncodingStatus.Status.REPAIR_REQUESTED); status.setStatusModificationTime(System.currentTimeMillis()); } status.setLostBlocks(status.getLostBlocks() + 1); EntityManager.update(status); } } else { status = EntityManager.find(EncodingStatus.Finder.ByParityInodeId, blockInfo.getInodeId()); if (status == null) { LOG.info("removeStoredBlock returned null for " + blockInfo.getInodeId()); } else { LOG.info("removeStoredBlock found " + blockInfo.getInodeId() + " with status " + status); } if (status != null) { NumberReplicas numberReplicas = countNodes(block); if (numberReplicas.liveReplicas() == 0) { if (status.isParityCorrupt() == false) { status.setParityStatus(EncodingStatus.ParityStatus.REPAIR_REQUESTED); status.setParityStatusModificationTime(System.currentTimeMillis()); } status.setLostParityBlocks(status.getLostParityBlocks() + 1); EntityManager.update(status); LOG.info("removeStoredBlock updated parity status to repair requested"); } else { LOG.info("removeStoredBlock found replicas: " + numberReplicas.liveReplicas()); } } } } } /** * Get all valid locations of the block & add the block to results * return the length of the added block; 0 if the block is not added */ private long addBlocks(final List<Block> blocks, List<BlockWithLocations> results) throws IOException { List<Long> blockIds = new ArrayList<>(blocks.size()); final Map<Long, Block> blockIdsToBlocks = new HashMap<>(); for (Block block : blocks) { blockIds.add(block.getBlockId()); blockIdsToBlocks.put(block.getBlockId(), block); } final Map<Long, List<Long>> inodeIdsToBlockMap = INodeUtil.getINodeIdsForBlockIds(blockIds, slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor()); final List<Long> allInodeIds = new ArrayList<>(inodeIdsToBlockMap.keySet()); final Map<Block, List<DatanodeStorageInfo>> locationsMap = new ConcurrentHashMap<>(); try { Slicer.slice(allInodeIds.size(), slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor(), new Slicer.OperationHandler() { @Override public void handle(int startIndex, int endIndex) throws Exception { final List<Long> inodeIds = allInodeIds.subList(startIndex, endIndex); new HopsTransactionalRequestHandler(HDFSOperationType.GET_VALID_BLK_LOCS) { List<INodeIdentifier> inodeIdentifiers; @Override public void setUp() throws StorageException, IOException { inodeIdentifiers = INodeUtil.resolveINodesFromIds(inodeIds); } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getBatchedINodesLock(inodeIdentifiers)) .add(lf.getSqlBatchedBlocksLock()) .add(lf.getSqlBatchedBlocksRelated(BLK.RE, BLK.IV)); } @Override public Object performTask() throws IOException { for (INodeIdentifier identifier : inodeIdentifiers) { for (long blockId : 
inodeIdsToBlockMap.get(identifier.getInodeId())) { Block block = blockIdsToBlocks.get(blockId); BlockInfoContiguous temp = getBlockInfo(block); final List<DatanodeStorageInfo> ms = getValidLocations(temp); if (!ms.isEmpty()) { locationsMap.put(block, ms); } } } return null; } }.handle(namesystem); } }); } catch (Exception ex) { throw new IOException(ex); } if (locationsMap.isEmpty()) { return 0; } else { long numBytes = 0; for (Block block : locationsMap.keySet()) { List<DatanodeStorageInfo> locations = locationsMap.get(block); final String[] datanodeUuids = new String[locations.size()]; final String[] storageIDs = new String[datanodeUuids.length]; final StorageType[] storageTypes = new StorageType[datanodeUuids.length]; for (int i = 0; i < locations.size(); i++) { final DatanodeStorageInfo s = locations.get(i); datanodeUuids[i] = s.getDatanodeDescriptor().getDatanodeUuid(); storageIDs[i] = s.getStorageID(); storageTypes[i] = s.getStorageType(); } results.add(new BlockWithLocations(block, datanodeUuids, storageIDs, storageTypes)); numBytes += block.getNumBytes(); } return numBytes; } } /** * The given node is reporting that it received a certain block. */ @VisibleForTesting void addBlock(DatanodeStorageInfo storage, Block block, String delHint) throws IOException { DatanodeDescriptor node = storage.getDatanodeDescriptor(); // Decrement number of blocks scheduled to this datanode. // for a retry request (of DatanodeProtocol#blockReceivedAndDeleted with // RECEIVED_BLOCK), we currently also decrease the approximate number. node.decrementBlocksScheduled(storage.getStorageType()); // get the deletion hint node DatanodeDescriptor delHintNode = null; if (delHint != null && delHint.length() != 0) { delHintNode = datanodeManager.getDatanodeByUuid(delHint); if (delHintNode == null) { blockLog.warn("BLOCK* blockReceived: {} is expected to be removed " + "from an unrecorded node {}", block, delHint); } } // // Modify the blocks->datanode map and node's map. 
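// A received replica satisfies one outstanding pending replication for this
// block; after decrementing it, the replica goes through the same path as any
// incrementally reported FINALIZED replica (processAndHandleReportedBlock).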
// pendingReplications.decrement(getBlockInfo(block), node); processAndHandleReportedBlock(storage, block, ReplicaState.FINALIZED, delHintNode); } /** * * @param storageInfo * @param block * @param reportedState * @param delHintNode * @return true if the block was new, false otherwise * @throws IOException */ private void processAndHandleReportedBlock(DatanodeStorageInfo storageInfo, Block block, ReplicaState reportedState, DatanodeDescriptor delHintNode) throws IOException { // blockReceived reports a finalized block Collection<BlockInfoContiguous> toAdd = new LinkedList<>(); Collection<Block> toInvalidate = new LinkedList<>(); Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<>(); Collection<StatefulBlockInfo> toUC = new LinkedList<>(); final DatanodeDescriptor node = storageInfo.getDatanodeDescriptor(); processIncrementallyReportedBlock(storageInfo, block, reportedState, toAdd, toInvalidate, toCorrupt, toUC); // the block is only in one of the to-do lists // if it is in none then data-node already has it assert toUC.size() + toAdd.size() + toInvalidate.size() + toCorrupt.size() <= 1 : "The block should be only in one of the lists."; for (StatefulBlockInfo b : toUC) { addStoredBlockUnderConstruction(b, storageInfo); } long numBlocksLogged = 0; for (BlockInfoContiguous b : toAdd) { addStoredBlock(b, storageInfo, delHintNode, numBlocksLogged < maxNumBlocksToLog); numBlocksLogged++; } if (numBlocksLogged > maxNumBlocksToLog) { blockLog.info("BLOCK* addBlock: logged info for {} of {} reported.", maxNumBlocksToLog, numBlocksLogged); } for (Block b : toInvalidate) { blockLog.info("BLOCK* addBlock: block {} on node {} size {} does not " + "belong to any file", b, storageInfo, b.getNumBytes()); addToInvalidates(b, storageInfo.getDatanodeDescriptor()); } for (BlockToMarkCorrupt b : toCorrupt) { markBlockAsCorrupt(b, storageInfo, storageInfo.getDatanodeDescriptor()); } } /** * The given node is reporting incremental information about some blocks. * This includes blocks that are starting to be received, completed being * received, or deleted. */ public void processIncrementalBlockReport(DatanodeRegistration nodeID, final StorageReceivedDeletedBlocks blockInfos) throws IOException { //hack to have the variables final to pass then to the handler. final int[] received = { 0 }; final int[] deleted = { 0 }; final int[] receiving = { 0 }; final DatanodeDescriptor node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isAlive) { blockLog.warn("BLOCK* processIncrementalBlockReport" + " is received from dead or unregistered node {}", nodeID); throw new IOException("Got incremental block report from unregistered or dead node"); } // Little hack; since we can't reassign final s if s==null, we have to // declare s as a normal variable and then assign it to a statically // declared variable DatanodeStorageInfo s = node.getStorageInfo(blockInfos.getStorage().getStorageID()); if (s == null) { // The DataNode is reporting an unknown storage. Usually the NN learns // about new storages from heartbeats but during NN restart we may // receive a block report or incremental report before the heartbeat. // We must handle this for protocol compatibility. This issue was // uncovered by HDFS-6094. 
s = node.updateStorage(blockInfos.getStorage()); } final DatanodeStorageInfo storage = s; HopsTransactionalRequestHandler processIncrementalBlockReportHandler = new HopsTransactionalRequestHandler( HDFSOperationType.BLOCK_RECEIVED_AND_DELETED_INC_BLK_REPORT) { INodeIdentifier inodeIdentifier; @Override public void setUp() throws StorageException { ReceivedDeletedBlockInfo rdbi = (ReceivedDeletedBlockInfo) getParams()[0]; LOG.debug("reported block id=" + rdbi.getBlock().getBlockId()); inodeIdentifier = INodeUtil.resolveINodeFromBlock(rdbi.getBlock()); if (inodeIdentifier == null && !rdbi.isDeletedBlock()) { LOG.warn("Invalid State. deleted blk is not recognized. bid=" + rdbi.getBlock().getBlockId()); } } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); ReceivedDeletedBlockInfo rdbi = (ReceivedDeletedBlockInfo) getParams()[0]; locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier, true)) .add(lf.getIndividualBlockLock(rdbi.getBlock().getBlockId(), inodeIdentifier)) .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.UR)); if (!rdbi.isDeletedBlock()) { locks.add(lf.getBlockRelated(BLK.PE, BLK.UC, BLK.IV)); } if (((FSNamesystem) namesystem).isErasureCodingEnabled() && inodeIdentifier != null) { locks.add(lf.getIndivdualEncodingStatusLock(LockType.WRITE, inodeIdentifier.getInodeId())); } locks.add(lf.getIndividualHashBucketLock(storage.getSid(), HashBuckets.getInstance().getBucketForBlock(rdbi.getBlock()))); } @Override public Object performTask() throws IOException { ReceivedDeletedBlockInfo rdbi = (ReceivedDeletedBlockInfo) getParams()[0]; LOG.debug("BLOCK_RECEIVED_AND_DELETED_INC_BLK_REPORT " + rdbi.getStatus() + " bid=" + rdbi.getBlock().getBlockId() + " dataNode=" + node.getXferAddr() + " storage=" + storage.getStorageID() + " sid: " + storage.getSid() + " status=" + rdbi.getStatus()); HashBuckets hashBuckets = HashBuckets.getInstance(); switch (rdbi.getStatus()) { case RECEIVING_BLOCK: addSubopName(ReceivedDeletedBlockInfo.BlockStatus.RECEIVING_BLOCK.name()); processAndHandleReportedBlock(storage, rdbi.getBlock(), ReplicaState.RBW, null); received[0]++; break; case APPENDING: addSubopName(ReceivedDeletedBlockInfo.BlockStatus.APPENDING.name()); processAndHandleReportedBlock(storage, rdbi.getBlock(), ReplicaState.RBW, null); received[0]++; break; case RECOVERING_APPEND: addSubopName(ReceivedDeletedBlockInfo.BlockStatus.RECOVERING_APPEND.name()); processAndHandleReportedBlock(storage, rdbi.getBlock(), ReplicaState.RBW, null); received[0]++; break; case RECEIVED_BLOCK: addSubopName(ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK.name()); addBlock(storage, rdbi.getBlock(), rdbi.getDelHints()); hashBuckets.applyHash(storage.getSid(), ReplicaState.FINALIZED, rdbi.getBlock()); received[0]++; break; case UPDATE_RECOVERED: addSubopName(ReceivedDeletedBlockInfo.BlockStatus.UPDATE_RECOVERED.name()); addBlock(storage, rdbi.getBlock(), rdbi.getDelHints()); received[0]++; break; case DELETED_BLOCK: addSubopName(ReceivedDeletedBlockInfo.BlockStatus.DELETED_BLOCK.name()); removeStoredBlock(rdbi.getBlock(), storage.getDatanodeDescriptor()); deleted[0]++; break; default: String msg = "Unknown block status code reported by " + storage.getStorageID() + ": " + rdbi; blockLog.warn(msg); assert false : msg; // if assertions are enabled, throw. 
break; } if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* block " + (rdbi.getStatus()) + ": " + rdbi.getBlock() + " is received from " + storage.getStorageID()); } return null; } }; try { if (node == null || !node.isAlive) { blockLog.warn("BLOCK* processIncrementalBlockReport" + " is received from dead or unregistered node " + nodeID); throw new IOException("Got incremental block report from unregistered or dead node"); } for (ReceivedDeletedBlockInfo rdbi : blockInfos.getBlocks()) { processIncrementalBlockReportHandler.setParams(rdbi); processIncrementalBlockReportHandler.handle(namesystem); } } finally { if (blockLog.isDebugEnabled()) { blockLog.debug("*BLOCK* NameNode.processIncrementalBlockReport: " + "from " + nodeID + " receiving: " + receiving[0] + ", " + " received: " + received[0] + ", " + " deleted: " + deleted[0]); } } } /** * Return the number of nodes hosting a given block, grouped * by the state of those replicas. */ public NumberReplicas countNodes(Block b) throws IOException { int decommissioned = 0; int live = 0; int corrupt = 0; int excess = 0; int stale = 0; Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(getBlockInfo(b)); for (DatanodeStorageInfo storage : blocksMap.storageList(b, State.NORMAL)) { final DatanodeDescriptor node = storage.getDatanodeDescriptor(); if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) { corrupt++; } else if (node.isDecommissionInProgress() || node.isDecommissioned()) { decommissioned++; } else { LightWeightLinkedSet<Block> blocksExcess = excessReplicateMap.getExcessReplica(getBlockInfo(b), storage.getSid()); if (blocksExcess != null && blocksExcess.contains(b)) { blocksMap.storageList(b); excess++; } else { live++; } } if (storage.areBlockContentsStale()) { stale++; } } return new NumberReplicas(live, decommissioned, corrupt, excess, stale); } /** * Simpler, faster form of {@link #countNodes(Block)} that only returns the * number * of live nodes. If in startup safemode (or its 30-sec extension period), * then it gains speed by ignoring issues of excess replicas or nodes * that are decommissioned or in process of becoming decommissioned. * If not in startup, then it calls {@link #countNodes(Block)} instead. * * @param b * - the block being tested * @return count of live nodes for this block */ int countLiveNodes(BlockInfoContiguous b) throws IOException { if (!namesystem.isInStartupSafeMode()) { return countNodes(b).liveReplicas(); } // else proceed with fast case int live = 0; List<DatanodeStorageInfo> storages = blocksMap.storageList(b, State.NORMAL); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b); for (DatanodeStorageInfo storage : storages) { final DatanodeDescriptor node = storage.getDatanodeDescriptor(); if ((nodesCorrupt == null) || (!nodesCorrupt.contains(node))) { live++; } } return live; } /** * On stopping decommission, check if the node has excess replicas. * If there are any excess replicas, call processOverReplicatedBlock(). * Process over replicated blocks only when active NN is out of safe mode. 
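 * <p/>
 * The node's replicas are grouped by inode and each batch is checked in its
 * own HopsTransactionalRequestHandler, so a recommissioned node with many
 * blocks is not processed in a single transaction.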
*/ void processOverReplicatedBlocksOnReCommission(final DatanodeDescriptor srcNode) throws IOException { if (!namesystem.isPopulatingReplQueues()) { return; } final int[] numOverReplicated = { 0 }; Map<Long, Long> blocksOnNode = srcNode.getAllStorageReplicas(numBuckets, blockFetcherNBThreads, blockFetcherBucketsPerThread, ((FSNamesystem) namesystem).getFSOperationsExecutor()); final Map<Long, List<Long>> inodeIdsToBlockMap = new HashMap<>(); for (Map.Entry<Long, Long> entry : blocksOnNode.entrySet()) { List<Long> list = inodeIdsToBlockMap.get(entry.getValue()); if (list == null) { list = new ArrayList<>(); inodeIdsToBlockMap.put(entry.getValue(), list); } list.add(entry.getKey()); } final List<Long> inodeIds = new ArrayList<>(inodeIdsToBlockMap.keySet()); try { Slicer.slice(inodeIds.size(), slicerBatchSize, slicerNbThreads, ((FSNamesystem) namesystem).getFSOperationsExecutor(), new Slicer.OperationHandler() { @Override public void handle(int startIndex, int endIndex) throws Exception { final List<Long> ids = inodeIds.subList(startIndex, endIndex); new HopsTransactionalRequestHandler( HDFSOperationType.PROCESS_OVER_REPLICATED_BLOCKS_ON_RECOMMISSION) { List<INodeIdentifier> inodeIdentifiers; @Override public void setUp() throws StorageException { inodeIdentifiers = INodeUtil.resolveINodesFromIds(ids); } @Override public void acquireLock(TransactionLocks locks) throws IOException { LockFactory lf = LockFactory.getInstance(); locks.add(lf.getBatchedINodesLock(inodeIdentifiers)) .add(lf.getSqlBatchedBlocksLock()).add(lf.getSqlBatchedBlocksRelated( BLK.RE, BLK.IV, BLK.CR, BLK.UR, BLK.ER)); } @Override public Object performTask() throws IOException { for (INodeIdentifier identifier : inodeIdentifiers) { for (long blockId : inodeIdsToBlockMap.get(identifier.getInodeId())) { BlockInfoContiguous block = EntityManager .find(BlockInfoContiguous.Finder.ByBlockIdAndINodeId, blockId); BlockCollection bc = blocksMap.getBlockCollection(block); short expectedReplication = bc.getBlockReplication(); NumberReplicas num = countNodes(block); int numCurrentReplica = num.liveReplicas(); if (numCurrentReplica > expectedReplication) { // over-replicated block processOverReplicatedBlock(block, expectedReplication, null, null); numOverReplicated[0]++; } } } return null; } }.handle(); } }); } catch (Exception ex) { throw new IOException(ex); } LOG.info("Invalidated " + numOverReplicated[0] + " over-replicated blocks on " + srcNode + " during recommissioning"); } /** * Returns whether a node can be safely decommissioned based on its * liveness. Dead nodes cannot always be safely decommissioned. */ boolean isNodeHealthyForDecommission(DatanodeDescriptor node) throws IOException { if (!node.checkBlockReportReceived()) { LOG.info("Node {} hasn't sent its first block report.", node); return false; } if (node.isAlive) { return true; } updateState(); if (pendingReplicationBlocksCount == 0 && underReplicatedBlocksCount == 0) { LOG.info("Node {} is dead and there are no under-replicated" + " blocks or blocks pending replication. Safe to decommission.", node); return true; } LOG.warn("Node {} is dead " + "while decommission is in progress. Cannot be safely " + "decommissioned since there is risk of reduced " + "data durability or data loss. 
Either restart the failed node or" + " force decommissioning by removing, calling refreshNodes, " + "then re-adding to the excludes files.", node); return false; } public int getActiveBlockCount() throws IOException { return blocksMap.size(); } public DatanodeStorageInfo[] getStorages(BlockInfoContiguous block) throws TransactionContextException, StorageException { return block.getStorages(datanodeManager); } public int getTotalBlocks() throws IOException { return blocksMap.size(); } public void removeBlock(Block block) throws StorageException, TransactionContextException, IOException { // No need to ACK blocks that are being removed entirely // from the namespace, since the removal of the associated // file already removes them from the block map below. // block.setNumBytesNoPersistance(BlockCommand.NO_ACK); addToInvalidates(block); BlockInfoContiguous storedBlock = getBlockInfo(block); removeBlockFromMap(block); // Remove the block from pendingReplications and neededReplications pendingReplications.remove(storedBlock); neededReplications.remove(storedBlock); if (postponedMisreplicatedBlocks.remove(block)) { postponedMisreplicatedBlocksCount.decrementAndGet(); } } public BlockInfoContiguous getStoredBlock(Block block) throws StorageException, TransactionContextException { return blocksMap.getStoredBlock(block); } /** * updates a block in under replication queue */ private void updateNeededReplications(final Block block, final int curReplicasDelta, int expectedReplicasDelta) throws IOException { if (!namesystem.isPopulatingReplQueues()) { return; } NumberReplicas repl = countNodes(block); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, repl.liveReplicas())) { neededReplications.update(getBlockInfo(block), repl.liveReplicas(), repl.decommissionedReplicas(), curExpectedReplicas, curReplicasDelta, expectedReplicasDelta); } else { int oldReplicas = repl.liveReplicas() - curReplicasDelta; int oldExpectedReplicas = curExpectedReplicas - expectedReplicasDelta; neededReplications.remove(getBlockInfo(block), oldReplicas, repl.decommissionedReplicas(), oldExpectedReplicas); } } /** * Check replication of the blocks in the collection. * If any block is needed replication, insert it into the replication queue. * Otherwise, if the block is more than the expected replication factor, * process it as an over replicated block. */ public void checkReplication(BlockCollection bc) throws IOException { final short expected = bc.getBlockReplication(); for (Block block : bc.getBlocks()) { final NumberReplicas n = countNodes(block); if (isNeededReplication(block, expected, n.liveReplicas())) { neededReplications.add(getBlockInfo(block), n.liveReplicas(), n.decommissionedReplicas(), expected); } else if (n.liveReplicas() > expected) { processOverReplicatedBlock(block, expected, null, null); } } } /** * Check that the indicated blocks are present and * replicated. */ public boolean checkBlocksProperlyReplicated(String src, BlockInfoContiguous[] blocks) throws StorageException, TransactionContextException { for (BlockInfoContiguous b : blocks) { if (!b.isComplete()) { final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction) b; final int numNodes = b.getStorages(getDatanodeManager()).length; LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = " + uc.getBlockUCState() + ", replication# = " + numNodes + (numNodes < minReplication ? 
" < " : " >= ") + " minimum = " + minReplication + ") in file " + src); return false; } } return true; } /** * @return 0 if the block is not found; * otherwise, return the replication factor of the block. */ private int getReplication(Block block) throws StorageException, TransactionContextException { final BlockCollection bc = blocksMap.getBlockCollection(block); return bc == null ? 0 : bc.getBlockReplication(); } /** * Get blocks to invalidate for <i>nodeId</i> * in {@link #invalidateBlocks}. * * @return number of blocks scheduled for removal during this iteration. */ private int invalidateWorkForOneNode(Map.Entry<DatanodeInfo, List<Integer>> entry) throws IOException { // blocks should not be replicated or removed if safe mode is on if (namesystem.isInSafeMode()) { LOG.debug("In safemode, not computing replication work"); return 0; } // get blocks to invalidate for the nodeId DatanodeDescriptor dnDescriptor = datanodeManager.getDatanode(entry.getKey()); if (dnDescriptor == null) { LOG.warn("DataNode " + entry.getKey() + " cannot be found for sids " + Arrays.toString(entry.getValue().toArray()) + ", removing block invalidation work."); invalidateBlocks.remove(entry.getValue()); return 0; } final List<Block> toInvalidate = invalidateBlocks.invalidateWork(dnDescriptor); if (toInvalidate == null) { return 0; } if (blockLog.isInfoEnabled()) { blockLog.info("BLOCK* {}: ask {} to delete {}", getClass().getSimpleName(), entry.getKey(), toInvalidate); } return toInvalidate.size(); } boolean blockHasEnoughRacks(Block b) throws StorageException, TransactionContextException { if (!this.shouldCheckForEnoughRacks) { return true; } boolean enoughRacks = false; Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(getBlockInfo(b)); int numExpectedReplicas = getReplication(b); String rackName = null; for (DatanodeStorageInfo storage : blocksMap.storageList(b)) { final DatanodeDescriptor cur = storage.getDatanodeDescriptor(); if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if ((corruptNodes == null) || !corruptNodes.contains(cur)) { if (numExpectedReplicas == 1 || (numExpectedReplicas > 1 && !datanodeManager.hasClusterEverBeenMultiRack())) { enoughRacks = true; break; } String rackNameNew = cur.getNetworkLocation(); if (rackName == null) { rackName = rackNameNew; } else if (!rackName.equals(rackNameNew)) { enoughRacks = true; break; } } } } return enoughRacks; } /** * A block needs replication if the number of replicas is less than expected * or if it does not have enough racks. */ boolean isNeededReplication(Block b, int expected, int current) throws StorageException, TransactionContextException { return current < expected || !blockHasEnoughRacks(b); } public long getMissingBlocksCount() throws IOException { // not locking return this.neededReplications.getCorruptBlockSize(); } public long getMissingReplOneBlocksCount() throws IOException { // not locking return this.neededReplications.getCorruptReplOneBlockSize(); } public BlockInfoContiguous addBlockCollection(BlockInfoContiguous block, BlockCollection bc) throws StorageException, TransactionContextException { return blocksMap.addBlockCollection(block, bc); } public BlockCollection getBlockCollection(Block b) throws StorageException, TransactionContextException { return blocksMap.getBlockCollection(b); } /** * @return an iterator of the datanodes. 
*/ public List<DatanodeStorageInfo> storageList(final Block block) throws StorageException, TransactionContextException { return blocksMap.storageList(block); } public int numCorruptReplicas(Block block) throws StorageException, TransactionContextException { return corruptReplicas.numCorruptReplicas(getBlockInfo(block)); } public void removeBlockFromMap(Block block) throws IOException { removeFromExcessReplicateMap(getBlockInfo(block)); // If block is removed from blocksMap remove it from corruptReplicasMap corruptReplicas.removeFromCorruptReplicasMap(getBlockInfo(block)); blocksMap.removeBlock(block); } /** * If a block is removed from blocksMap, remove it from excessReplicateMap. */ private void removeFromExcessReplicateMap(Block block) throws IOException { BlockInfoContiguous blockInfo = getBlockInfo(block); for (DatanodeStorageInfo info : blocksMap.getStorages(blockInfo)) { excessReplicateMap.remove(info.getDatanodeDescriptor(), blockInfo); } } public int getCapacity() { return blocksMap.getCapacity(); } /** * Return a range of corrupt replica block ids. Up to numExpectedBlocks * blocks starting at the next block after startingBlockId are returned * (fewer if numExpectedBlocks blocks are unavailable). If startingBlockId * is null, up to numExpectedBlocks blocks are returned from the beginning. * If startingBlockId cannot be found, null is returned. * * @param numExpectedBlocks * Number of block ids to return. * 0 <= numExpectedBlocks <= 100 * @param startingBlockId * Block id from which to start. If null, start at * beginning. * @return Up to numExpectedBlocks blocks from startingBlockId if it exists */ public long[] getCorruptReplicaBlockIds(int numExpectedBlocks, Long startingBlockId) throws IOException { return corruptReplicas.getCorruptReplicaBlockIds(numExpectedBlocks, startingBlockId); } /** * Return an iterator over the set of blocks for which there are no replicas. */ public Iterator<Block> getCorruptReplicaBlockIterator() { return neededReplications.iterator(UnderReplicatedBlocks.QUEUE_WITH_CORRUPT_BLOCKS); } /** * Get the replicas which are corrupt for a given block. */ public Collection<DatanodeDescriptor> getCorruptReplicas(Block block) throws StorageException, TransactionContextException { return corruptReplicas.getNodes(getBlockInfo(block)); } /** * Get reason for certain corrupted replicas for a given block and a given dn. */ public String getCorruptReason(BlockInfoContiguous block, DatanodeDescriptor node) throws IOException { return corruptReplicas.getCorruptReason(block, node); } /** @return the size of UnderReplicatedBlocks */ public int numOfUnderReplicatedBlocks() throws IOException { return neededReplications.size(); } /** * Periodically calls computeReplicationWork(). */ private class ReplicationMonitor implements Runnable { @Override public void run() { while (namesystem.isRunning()) { try { if (namesystem.isLeader()) { LOG.debug("Running replication monitor"); // Process replication work only when active NN is out of safe mode. 
if (namesystem.isPopulatingReplQueues()) { computeDatanodeWork(); processPendingReplications(); rescanPostponedMisreplicatedBlocks(); } } else { updateState(); LOG.debug("Namesystem is not leader: will not run replication monitor"); } Thread.sleep(replicationRecheckInterval); } catch (Throwable t) { if (t instanceof TransientStorageException) { continue; } if (t instanceof StorageException) { //Storage problems should be handled by FSNameSystem.checkAvailableResources(), retry continue; } if (!namesystem.isRunning()) { LOG.info("Stopping ReplicationMonitor."); if (!(t instanceof InterruptedException)) { LOG.info("ReplicationMonitor received an exception" + " while shutting down.", t); } break; } else if (!checkNSRunning && t instanceof InterruptedException) { LOG.info("Stopping ReplicationMonitor for testing."); break; } LOG.error("ReplicationMonitor thread received Runtime exception. ", t); terminate(1, t); } } } } /** * Compute block replication and block invalidation work that can be * scheduled * on data-nodes. The datanode will be informed of this work at the next * heartbeat. * * @return number of blocks scheduled for replication or removal. * @throws IOException */ int computeDatanodeWork() throws IOException { // Blocks should not be replicated or removed if in safe mode. // It's OK to check safe mode here w/o holding lock, in the worst // case extra replications will be scheduled, and these will get // fixed up later. if (namesystem.isInSafeMode()) { return 0; } final int numlive = heartbeatManager.getLiveDatanodeCount(); final int blocksToProcess = numlive * this.blocksReplWorkMultiplier; final int nodesToProcess = (int) Math.ceil(numlive * this.blocksInvalidateWorkPct); int workFound = this.computeReplicationWork(blocksToProcess); // Update counters this.updateState(); this.scheduledReplicationBlocksCount = workFound; workFound += this.computeInvalidateWork(nodesToProcess); return workFound; } /** * Clear all queues that hold decisions previously made by * this NameNode. 
*/ public void clearQueues() throws IOException { neededReplications.clear(); pendingReplications.clear(); excessReplicateMap.clear(); invalidateBlocks.clear(); datanodeManager.clearPendingQueues(); postponedMisreplicatedBlocks.clear(); postponedMisreplicatedBlocksCount.set(0); }; private static class ReplicationWork { private final Block block; private final BlockCollection bc; private final DatanodeDescriptor srcNode; private final List<DatanodeDescriptor> containingNodes; private final List<DatanodeStorageInfo> liveReplicaStorages; private final int additionalReplRequired; private DatanodeStorageInfo targets[]; private final int priority; public ReplicationWork(Block block, BlockCollection bc, DatanodeDescriptor srcNode, List<DatanodeDescriptor> containingNodes, List<DatanodeStorageInfo> liveReplicaStorages, int additionalReplRequired, int priority) { this.block = block; this.bc = bc; this.srcNode = srcNode; this.srcNode.incrementPendingReplicationWithoutTargets(); this.containingNodes = containingNodes; this.liveReplicaStorages = liveReplicaStorages; this.additionalReplRequired = additionalReplRequired; this.priority = priority; this.targets = null; } private void chooseTargets(BlockPlacementPolicy blockplacement, BlockStoragePolicySuite storagePolicySuite, Set<Node> excludedNodes) throws TransactionContextException, StorageException { try { //HOP: [M] srcPath is not used targets = blockplacement.chooseTarget(null /*bc.getName()*/, additionalReplRequired, srcNode, liveReplicaStorages, false, excludedNodes, block.getNumBytes(), storagePolicySuite.getPolicy(bc.getStoragePolicyID())); } finally { srcNode.decrementPendingReplicationWithoutTargets(); } } } /** * A simple result enum for the result of * {@link BlockManager#processMisReplicatedBlock(BlockInfoContiguous)}. */ enum MisReplicationResult { /** * The block should be invalidated since it belongs to a deleted file. */ INVALID, /** * The block is currently under-replicated. */ UNDER_REPLICATED, /** * The block is currently over-replicated. */ OVER_REPLICATED, /** * A decision can't currently be made about this block. 
  /**
   * A simple result enum for the result of
   * {@link BlockManager#processMisReplicatedBlock(BlockInfoContiguous)}.
   */
  enum MisReplicationResult {
    /**
     * The block should be invalidated since it belongs to a deleted file.
     */
    INVALID,
    /**
     * The block is currently under-replicated.
     */
    UNDER_REPLICATED,
    /**
     * The block is currently over-replicated.
     */
    OVER_REPLICATED,
    /**
     * A decision can't currently be made about this block.
     */
    POSTPONE,
    /**
     * The block is under construction, so should be ignored.
     */
    UNDER_CONSTRUCTION,
    /**
     * The block is properly replicated.
     */
    OK
  }

  private void removeStoredBlocksTx(final List<Long> inodeIds,
      final Map<Long, List<Long>> inodeIdsToBlockMap,
      final DatanodeDescriptor node) throws IOException {
    final AtomicInteger removedBlocks = new AtomicInteger(0);
    new HopsTransactionalRequestHandler(HDFSOperationType.REMOVE_STORED_BLOCKS) {
      List<INodeIdentifier> inodeIdentifiers;

      @Override
      public void setUp() throws StorageException {
        inodeIdentifiers = INodeUtil.resolveINodesFromIds(inodeIds);
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getINodesLocks(INodeLockType.WRITE, inodeIdentifiers))
            .add(lf.getBlockLock())
            .add(lf.getBlockRelated(BLK.RE, BLK.IV, BLK.CR, BLK.UR, BLK.ER));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && inodeIdentifiers != null) {
          locks.add(lf.getBatchedEncodingStatusLock(LockType.WRITE,
              inodeIdentifiers));
        }
      }

      @Override
      public Object performTask() throws IOException {
        for (INodeIdentifier identifier : inodeIdentifiers) {
          for (long blockId : inodeIdsToBlockMap.get(identifier.getInodeId())) {
            BlockInfoContiguous block = EntityManager
                .find(BlockInfoContiguous.Finder.ByBlockIdAndINodeId, blockId);
            removeStoredBlock(block, node);
            removedBlocks.incrementAndGet();
          }
        }
        return null;
      }
    }.handle(namesystem);
    LOG.debug("removed " + removedBlocks.get() + " replicas from "
        + node.getName());
  }

  private void removeStoredBlocksTx(final List<Long> inodeIds,
      final Map<Long, List<Long>> inodeIdsToBlockMap, final int sid)
      throws IOException {
    final AtomicInteger removedBlocks = new AtomicInteger(0);
    new HopsTransactionalRequestHandler(HDFSOperationType.REMOVE_STORED_BLOCKS) {
      List<INodeIdentifier> inodeIdentifiers;

      @Override
      public void setUp() throws StorageException {
        inodeIdentifiers = INodeUtil.resolveINodesFromIds(inodeIds);
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getBatchedINodesLock(inodeIdentifiers))
            .add(lf.getSqlBatchedBlocksLock())
            .add(lf.getSqlBatchedBlocksRelated(BLK.RE, BLK.IV, BLK.CR, BLK.UR,
                BLK.ER));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && inodeIdentifiers != null) {
          locks.add(lf.getBatchedEncodingStatusLock(LockType.WRITE,
              inodeIdentifiers));
        }
      }

      @Override
      public Object performTask() throws IOException {
        for (INodeIdentifier identifier : inodeIdentifiers) {
          for (long blockId : inodeIdsToBlockMap.get(identifier.getInodeId())) {
            BlockInfoContiguous block = EntityManager
                .find(BlockInfoContiguous.Finder.ByBlockIdAndINodeId, blockId);
            removeStoredBlock(block, sid);
            removedBlocks.incrementAndGet();
          }
        }
        return null;
      }
    }.handle(namesystem);
    LOG.info("removed " + removedBlocks.get() + " replicas from " + sid);
  }
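  // Added commentary: both removeStoredBlocksTx variants follow the same HopsFS
  // transaction template -- resolve inode identifiers in setUp(), take write locks
  // on the inodes and the block-related tables in acquireLock(), and do the actual
  // removals in performTask(). A hedged caller sketch (variable names are
  // assumptions, not original code):
  //
  //   Map<Long, List<Long>> blocksPerInode = ...; // inodeId -> block ids to drop
  //   removeStoredBlocksTx(new ArrayList<>(blocksPerInode.keySet()),
  //       blocksPerInode, datanodeDescriptor);
  //
  // The int-sid overload does the same work per storage id, using the batched SQL
  // locks instead of the per-inode block locks.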
  @VisibleForTesting
  int computeReplicationWorkForBlock(final Block b, final int priority)
      throws IOException {
    return (Integer) new HopsTransactionalRequestHandler(
        HDFSOperationType.COMPUTE_REPLICATION_WORK_FOR_BLOCK) {
      INodeIdentifier inodeIdentifier;

      @Override
      public void setUp() throws StorageException {
        inodeIdentifier = INodeUtil.resolveINodeFromBlock(b);
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE,
                inodeIdentifier, true))
            .add(lf.getBlockLock(b.getBlockId(), inodeIdentifier))
            .add(lf.getVariableLock(Variable.Finder.ReplicationIndex,
                LockType.WRITE))
            .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.PE, BLK.UR,
                BLK.UC));
      }

      @Override
      public Object performTask() throws IOException {
        return computeReplicationWorkForBlockInternal(b, priority);
      }
    }.handle(namesystem);
  }

  public BlockInfoContiguous tryToCompleteBlock(final BlockCollection bc,
      final int blkIndex) throws IOException {
    if (blkIndex < 0) {
      return null;
    }
    BlockInfoContiguous curBlock = bc.getBlock(blkIndex);
    LOG.debug("tryToCompleteBlock. blkId = " + curBlock.getBlockId());
    if (curBlock.isComplete()) {
      return curBlock;
    }
    BlockInfoContiguousUnderConstruction ucBlock =
        (BlockInfoContiguousUnderConstruction) curBlock;
    int numNodes = ucBlock.numNodes(datanodeManager);
    if (numNodes < minReplication) {
      return null;
    }
    if (ucBlock.getBlockUCState() != BlockUCState.COMMITTED) {
      return null;
    }
    BlockInfoContiguous completeBlock = ucBlock.convertToCompleteBlock();
    // replace penultimate block in file
    bc.setBlock(blkIndex, completeBlock);

    // Since safe-mode only counts complete blocks, and we now have
    // one more complete block, we need to adjust the total up, and
    // also count it as safe, if we have at least the minimum replica
    // count. (We may not have the minimum replica count yet if this is
    // a "forced" completion when a file is getting closed by an
    // OP_CLOSE edit on the standby).
    namesystem.adjustSafeModeBlockTotals(null, 1);
    namesystem.incrementSafeBlockCount(Math.min(numNodes, minReplication),
        curBlock);

    return completeBlock;
  }

  @VisibleForTesting
  public void processTimedOutPendingBlock(final long timedOutItemId)
      throws IOException {
    new HopsTransactionalRequestHandler(
        HDFSOperationType.PROCESS_TIMEDOUT_PENDING_BLOCK) {
      INodeIdentifier inodeIdentifier;

      @Override
      public void setUp() throws StorageException {
        inodeIdentifier = INodeUtil.resolveINodeFromBlockID(timedOutItemId);
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier))
            .add(lf.getIndividualBlockLock(timedOutItemId, inodeIdentifier))
            .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.PE, BLK.UR));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && inodeIdentifier != null) {
          locks.add(lf.getIndivdualEncodingStatusLock(LockType.WRITE,
              inodeIdentifier.getInodeId()));
        }
      }

      @Override
      public Object performTask() throws IOException {
        BlockInfoContiguous timedOutItem = EntityManager
            .find(BlockInfoContiguous.Finder.ByBlockIdAndINodeId,
                timedOutItemId);
        if (timedOutItem == null) {
          return null;
        }
        NumberReplicas num = countNodes(timedOutItem);
        if (isNeededReplication(timedOutItem, getReplication(timedOutItem),
            num.liveReplicas())) {
          neededReplications.add(getBlockInfo(timedOutItem),
              num.liveReplicas(), num.decommissionedReplicas(),
              getReplication(timedOutItem));
        }
        pendingReplications.remove(timedOutItem);
        return null;
      }
    }.handle(namesystem);
  }
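  // Added note (hedged): processTimedOutPendingBlock above is expected to be
  // driven by the pending-replication timeout scan, conceptually:
  //
  //   for (long blockId : /* ids of pending replications that timed out */) {
  //     processTimedOutPendingBlock(blockId);
  //   }
  //
  // Each call re-evaluates the replica count inside its own transaction and, if
  // the block is still under-replicated, puts it back into neededReplications
  // before removing it from pendingReplications.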
  private BlockInfoContiguous getBlockInfo(Block b)
      throws StorageException, TransactionContextException {
    BlockInfoContiguous binfo = blocksMap.getStoredBlock(b);
    if (binfo == null) {
      LOG.error("ERROR: Dangling Block. bid=" + b.getBlockId()
          + " setting inodeId to be " + BlockInfoContiguous.NON_EXISTING_ID);
      binfo = new BlockInfoContiguous(b, BlockInfoContiguous.NON_EXISTING_ID);
    }
    return binfo;
  }

  private void addStoredBlockTx(final List<BlockInfoContiguous> blocks,
      final List<Long> blockIds, final List<Long> inodeIds,
      final DatanodeStorageInfo storage, final DatanodeDescriptor delNodeHint,
      final boolean logEveryBlock) throws IOException {
    new HopsTransactionalRequestHandler(
        HDFSOperationType.AFTER_PROCESS_REPORT_ADD_BLK) {
      List<INodeIdentifier> inodeIdentifiers = new ArrayList<>();

      @Override
      public void setUp() throws StorageException {
        Set<Long> addedInodeIds = new HashSet<>();
        for (long id : inodeIds) {
          if (!addedInodeIds.contains(id)) {
            inodeIdentifiers.add(INodeUtil.resolveINodeFromId(id));
            addedInodeIds.add(id);
          }
        }
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getINodesLocks(INodeLockType.WRITE, inodeIdentifiers))
            .add(lf.getBlockReportingLocks(Longs.toArray(blockIds),
                Longs.toArray(inodeIds), new long[0], 0))
            .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.PE, BLK.IV,
                BLK.UR));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && !inodeIdentifiers.isEmpty()) {
          locks.add(lf.getBatchedEncodingStatusLock(LockType.WRITE,
              inodeIdentifiers));
        }
      }

      @Override
      public Object performTask() throws IOException {
        for (BlockInfoContiguous block : blocks) {
          Block b = addStoredBlock(block, storage, delNodeHint, logEveryBlock);
        }
        return null;
      }
    }.handle();
  }

  private void addStoredBlockUnderConstructionTx(
      final StatefulBlockInfo ucBlock, final DatanodeStorageInfo storage)
      throws IOException {
    new HopsTransactionalRequestHandler(
        HDFSOperationType.AFTER_PROCESS_REPORT_ADD_UC_BLK) {
      INodeIdentifier inodeIdentifier;

      @Override
      public void setUp() throws StorageException {
        inodeIdentifier = INodeUtil.resolveINodeFromBlock(ucBlock.reportedBlock);
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE,
                inodeIdentifier, true))
            .add(lf.getIndividualBlockLock(ucBlock.reportedBlock.getBlockId(),
                inodeIdentifier))
            .add(lf.getBlockRelated(BLK.RE, BLK.UC, BLK.ER, BLK.CR, BLK.PE,
                BLK.UR));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && inodeIdentifier != null) {
          locks.add(lf.getIndivdualEncodingStatusLock(LockType.WRITE,
              inodeIdentifier.getInodeId()));
        }
      }

      @Override
      public Object performTask() throws IOException {
        addStoredBlockUnderConstruction(ucBlock, storage);
        return null;
      }
    }.handle();
  }

  private void addToInvalidates(final Collection<Block> blocks,
      final DatanodeStorageInfo storage) throws IOException {
    invalidateBlocks.add(blocks, storage);
  }
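  // Added commentary: the addStoredBlock*Tx helpers above batch the
  // post-block-report updates for many blocks of many inodes into a single
  // database transaction; the HashSet in setUp() simply de-duplicates inode ids
  // so each inode is resolved and locked only once. A hedged usage sketch
  // (argument names are assumptions, not original code):
  //
  //   addStoredBlockTx(reportedBlocks, reportedBlockIds, reportedInodeIds,
  //       storageInfo, /* delNodeHint */ null, /* logEveryBlock */ false);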
  private void markBlockAsCorruptTx(final BlockToMarkCorrupt b,
      final DatanodeStorageInfo storage) throws IOException {
    new HopsTransactionalRequestHandler(
        HDFSOperationType.AFTER_PROCESS_REPORT_ADD_CORRUPT_BLK) {
      INodeIdentifier inodeIdentifier;

      @Override
      public void setUp() throws StorageException {
        inodeIdentifier = INodeUtil.resolveINodeFromBlock(b.corrupted);
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE, inodeIdentifier))
            .add(lf.getIndividualBlockLock(b.corrupted.getBlockId(),
                inodeIdentifier))
            .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.UR, BLK.UC,
                BLK.IV));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && inodeIdentifier != null) {
          locks.add(lf.getIndivdualEncodingStatusLock(LockType.WRITE,
              inodeIdentifier.getInodeId()));
        }
      }

      @Override
      public Object performTask() throws IOException {
        markBlockAsCorrupt(b, storage, storage.getDatanodeDescriptor());
        return null;
      }
    }.handle();
  }

  public int getTotalCompleteBlocks() throws IOException {
    return blocksMap.sizeCompleteOnly();
  }

  private void addStoredBlockUnderConstructionImmediateTx(
      final BlockInfoContiguousUnderConstruction block,
      final DatanodeStorageInfo storage, final ReplicaState reportedState)
      throws IOException {
    new HopsTransactionalRequestHandler(
        HDFSOperationType.AFTER_PROCESS_REPORT_ADD_UC_BLK_IMMEDIATE) {
      INodeIdentifier inodeIdentifier;

      @Override
      public void setUp() throws StorageException {
        inodeIdentifier = INodeUtil.resolveINodeFromBlock(block);
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getIndividualINodeLock(INodeLockType.WRITE,
                inodeIdentifier, true))
            .add(lf.getIndividualBlockLock(block.getBlockId(), inodeIdentifier))
            .add(lf.getBlockRelated(BLK.RE, BLK.UC, BLK.CR, BLK.ER, BLK.PE,
                BLK.UR));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && inodeIdentifier != null) {
          locks.add(lf.getIndivdualEncodingStatusLock(LockType.WRITE,
              inodeIdentifier.getInodeId()));
        }
      }

      @Override
      public Object performTask() throws IOException {
        block.addReplicaIfNotPresent(storage, reportedState,
            block.getGenerationStamp());
        // and fall through to next clause
        // add replica if appropriate
        if (reportedState == ReplicaState.FINALIZED) {
          addStoredBlockImmediate(block, storage, false);
        }
        return null;
      }
    }.handle();
  }

  private void addStoredBlockImmediateTx(final List<BlockInfoContiguous> blocks,
      final List<Long> blockIds, final List<Long> inodeIds,
      final DatanodeStorageInfo storage, final boolean logEveryBlock)
      throws IOException {
    new HopsTransactionalRequestHandler(
        HDFSOperationType.AFTER_PROCESS_REPORT_ADD_BLK_IMMEDIATE) {
      List<INodeIdentifier> inodeIdentifiers = new ArrayList<>();

      @Override
      public void setUp() throws StorageException {
        Set<Long> addedInodeIds = new HashSet<>();
        for (long id : inodeIds) {
          if (!addedInodeIds.contains(id)) {
            inodeIdentifiers.add(INodeUtil.resolveINodeFromId(id));
            addedInodeIds.add(id);
          }
        }
      }

      @Override
      public void acquireLock(TransactionLocks locks) throws IOException {
        LockFactory lf = LockFactory.getInstance();
        locks.add(lf.getINodesLocks(INodeLockType.WRITE, inodeIdentifiers))
            .add(lf.getBlockLock())
            .add(lf.getBlockRelated(BLK.RE, BLK.ER, BLK.CR, BLK.PE, BLK.IV,
                BLK.UR));
        if (((FSNamesystem) namesystem).isErasureCodingEnabled()
            && !inodeIdentifiers.isEmpty()) {
          locks.add(lf.getBatchedEncodingStatusLock(LockType.WRITE,
              inodeIdentifiers));
        }
      }

      @Override
      public Object performTask() throws IOException {
        for (BlockInfoContiguous block : blocks) {
          addStoredBlockImmediate(block, storage, logEveryBlock);
        }
        return null;
      }
    }.handle();
  }

  public void shutdown() {
    if (datanodeRemover != null) {
      datanodeRemover.shutdown();
    }
    stopReplicationInitializer();
  }

  public int getNumBuckets() {
    return numBuckets;
  }

  public int getBlockFetcherNBThreads() {
    return blockFetcherNBThreads;
  }

  public int getBlockFetcherBucketsPerThread() {
    return blockFetcherBucketsPerThread;
  }

  public int getRemovalBatchSize() {
    return slicerBatchSize;
  }

  public int getRemovalNoThreads() {
    return slicerNbThreads;
  }
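  // Added note (descriptive, based on the method below): blockReportCompleted is
  // the final step of block-report handling. It clears the leader-side
  // BRTrackingService entry for the reporting datanode (guarded with null checks
  // for unit tests) so a new report can be scheduled, and only on success marks
  // each reported storage as having delivered a block report via
  // DatanodeStorageInfo.receivedBlockReport().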
  public void blockReportCompleted(final DatanodeID nodeID,
      DatanodeStorage[] storages, boolean success) throws IOException {
    // Leader should remove the information about the block report from the DB
    if (namesystem != null && namesystem.getNameNode() != null) { // for unit testing
      namesystem.getNameNode().getBRTrackingService()
          .blockReportCompleted(nodeID.getXferAddr());
    }
    if (success) {
      DatanodeDescriptor node = datanodeManager.getDatanode(nodeID);
      if (node != null) {
        for (DatanodeStorage storage : storages) {
          DatanodeStorageInfo storageInfo =
              node.getStorageInfo(storage.getStorageID());
          storageInfo.receivedBlockReport();
        }
      }
    }
  }
}