Java tutorial: the HDFS BlockManager (org.apache.hadoop.hdfs.server.blockmanagement.BlockManager)
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.blockmanagement; import static org.apache.hadoop.hdfs.protocol.BlockType.CONTIGUOUS; import static org.apache.hadoop.hdfs.protocol.BlockType.STRIPED; import static org.apache.hadoop.util.ExitUtil.terminate; import static org.apache.hadoop.util.Time.now; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collection; import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Queue; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.FutureTask; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import javax.management.ObjectName; import org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.AddBlockFlag; import org.apache.hadoop.fs.FileEncryptionInfo; import org.apache.hadoop.fs.StorageType; import org.apache.hadoop.hdfs.DFSUtilClient; import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HAUtil; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica; import org.apache.hadoop.hdfs.protocol.BlockType; import org.apache.hadoop.hdfs.protocol.DatanodeID; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.HdfsConstants.StoragePolicySatisfierMode; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock; import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException; import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier.AccessMode; import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey; import 
org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys; import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoStriped.StorageAndBlockIndex; import org.apache.hadoop.hdfs.server.blockmanagement.CorruptReplicasMap.Reason; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo.AddBlockResult; import org.apache.hadoop.hdfs.server.blockmanagement.NumberReplicas.StoredReplicaState; import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.namenode.CachedBlock; import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo; import org.apache.hadoop.hdfs.server.namenode.INodesInPath; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.namenode.ha.HAContext; import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; import org.apache.hadoop.hdfs.server.namenode.sps.StoragePolicySatisfyManager; import org.apache.hadoop.hdfs.server.protocol.BlockCommand; import org.apache.hadoop.hdfs.server.protocol.BlockReportContext; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.StripedBlockWithLocations; import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage.State; import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks; import org.apache.hadoop.hdfs.server.protocol.StorageReport; import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; import org.apache.hadoop.hdfs.util.FoldedTreeSet; import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; import org.apache.hadoop.hdfs.server.namenode.CacheManager; import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getInternalBlockLength; import org.apache.hadoop.metrics2.util.MBeans; import org.apache.hadoop.net.Node; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.LightWeightGSet; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Time; import org.apache.hadoop.util.VersionInfo; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Keeps information related to the blocks stored in the Hadoop cluster. * For block state management, it tries to maintain the safety * property of "# of live replicas == # of expected redundancy" under * any events such as decommission, namenode failover, datanode failure. * * The motivation of maintenance mode is to allow admins quickly repair nodes * without paying the cost of decommission. Thus with maintenance mode, * # of live replicas doesn't have to be equal to # of expected redundancy. 
* If any of the replicas is in maintenance mode, the safety property * is extended as follows. These properties still apply for the case of zero * maintenance replicas, so we can use these safety properties for all scenarios. * a. # of live replicas >= # of min replication for maintenance. * b. # of live replicas <= # of expected redundancy. * c. # of live replicas and maintenance replicas >= # of expected redundancy. * * For regular replication, # of min live replicas for maintenance is determined * by DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY. This number has to be <= * DFS_NAMENODE_REPLICATION_MIN_KEY. * For erasure coding, # of min live replicas for maintenance is * BlockInfoStriped#getRealDataBlockNum. * * Another safety property is to satisfy the block placement policy. While the * policy is configurable, the replicas the policy is applied to are the live * replicas + maintenance replicas. */ @InterfaceAudience.Private public class BlockManager implements BlockStatsMXBean { public static final Logger LOG = LoggerFactory.getLogger(BlockManager.class); public static final Logger blockLog = NameNode.blockStateChangeLog; private static final String QUEUE_REASON_CORRUPT_STATE = "it has the wrong state or generation stamp"; private static final String QUEUE_REASON_FUTURE_GENSTAMP = "generation stamp is in the future"; private static final long BLOCK_RECOVERY_TIMEOUT_MULTIPLIER = 30; private final Namesystem namesystem; private final BlockManagerSafeMode bmSafeMode; private final DatanodeManager datanodeManager; private final HeartbeatManager heartbeatManager; private final BlockTokenSecretManager blockTokenSecretManager; // Block pool ID used by this namenode private String blockPoolId; private final PendingDataNodeMessages pendingDNMessages = new PendingDataNodeMessages(); private volatile long pendingReconstructionBlocksCount = 0L; private volatile long corruptReplicaBlocksCount = 0L; private volatile long lowRedundancyBlocksCount = 0L; private volatile long scheduledReplicationBlocksCount = 0L; /** flag indicating whether replication queues have been initialized */ private boolean initializedReplQueues; private final long startupDelayBlockDeletionInMs; private final BlockReportLeaseManager blockReportLeaseManager; private ObjectName mxBeanName; /** Used by metrics */ public long getPendingReconstructionBlocksCount() { return pendingReconstructionBlocksCount; } /** Used by metrics */ public long getLowRedundancyBlocksCount() { return lowRedundancyBlocksCount; } /** Used by metrics */ public long getCorruptReplicaBlocksCount() { return corruptReplicaBlocksCount; } /** Used by metrics */ public long getScheduledReplicationBlocksCount() { return scheduledReplicationBlocksCount; } /** Used by metrics */ public long getPendingDeletionBlocksCount() { return invalidateBlocks.numBlocks(); } /** Used by metrics */ public long getStartupDelayBlockDeletionInMs() { return startupDelayBlockDeletionInMs; } /** Used by metrics */ public long getExcessBlocksCount() { return excessRedundancyMap.size(); } /** Used by metrics */ public long getPostponedMisreplicatedBlocksCount() { return postponedMisreplicatedBlocks.size(); } /** Used by metrics */ public int getPendingDataNodeMessageCount() { return pendingDNMessages.count(); } /** Used by metrics. */ public long getNumTimedOutPendingReconstructions() { return pendingReconstruction.getNumTimedOuts(); } /** Used by metrics. */ public long getLowRedundancyBlocks() { return neededReconstruction.getLowRedundancyBlocks(); } /** Used by metrics.
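The class javadoc above lists three safety properties that replace the simple "live replicas == expected redundancy" invariant once maintenance replicas are involved. Below is a minimal, self-contained sketch of that check; the class and method names are illustrative and are not part of BlockManager.

// Illustrative only: encodes properties (a), (b), (c) from the class javadoc above.
public class MaintenanceSafetyCheck {
  static boolean isSafe(int live, int maintenance, int expectedRedundancy,
      int minReplicationForMaintenance) {
    boolean a = live >= minReplicationForMaintenance;       // property (a)
    boolean b = live <= expectedRedundancy;                 // property (b)
    boolean c = live + maintenance >= expectedRedundancy;   // property (c)
    return a && b && c;
  }

  public static void main(String[] args) {
    // 3-way replicated block: 2 live replicas, 1 replica on a node in maintenance.
    System.out.println(isSafe(2, 1, 3, 1));   // true: 2 >= 1, 2 <= 3, 2 + 1 >= 3
    // Same block after the maintenance replica is also lost.
    System.out.println(isSafe(2, 0, 3, 1));   // false: 2 + 0 < 3
  }
}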
*/ public long getCorruptBlocks() { return corruptReplicas.getCorruptBlocks(); } /** Used by metrics. */ public long getMissingBlocks() { return neededReconstruction.getCorruptBlocks(); } /** Used by metrics. */ public long getMissingReplicationOneBlocks() { return neededReconstruction.getCorruptReplicationOneBlocks(); } /** Used by metrics. */ public long getPendingDeletionReplicatedBlocks() { return invalidateBlocks.getBlocks(); } /** Used by metrics. */ public long getTotalReplicatedBlocks() { return blocksMap.getReplicatedBlocks(); } /** Used by metrics. */ public long getLowRedundancyECBlockGroups() { return neededReconstruction.getLowRedundancyECBlockGroups(); } /** Used by metrics. */ public long getCorruptECBlockGroups() { return corruptReplicas.getCorruptECBlockGroups(); } /** Used by metrics. */ public long getMissingECBlockGroups() { return neededReconstruction.getCorruptECBlockGroups(); } /** Used by metrics. */ public long getPendingDeletionECBlocks() { return invalidateBlocks.getECBlocks(); } /** Used by metrics. */ public long getTotalECBlockGroups() { return blocksMap.getECBlockGroups(); } /** * redundancyRecheckInterval is how often namenode checks for new * reconstruction work. */ private final long redundancyRecheckIntervalMs; /** How often to check and the limit for the storageinfo efficiency. */ private final long storageInfoDefragmentInterval; private final long storageInfoDefragmentTimeout; private final double storageInfoDefragmentRatio; /** * Mapping: Block -> { BlockCollection, datanodes, self ref } * Updated only in response to client-sent information. */ final BlocksMap blocksMap; /** Redundancy thread. */ private final Daemon redundancyThread = new Daemon(new RedundancyMonitor()); /** StorageInfoDefragmenter thread. */ private final Daemon storageInfoDefragmenterThread = new Daemon(new StorageInfoDefragmenter()); /** Block report thread for handling async reports. */ private final BlockReportProcessingThread blockReportThread = new BlockReportProcessingThread(); /** Store blocks -> datanodedescriptor(s) map of corrupt replicas */ final CorruptReplicasMap corruptReplicas = new CorruptReplicasMap(); /** * Blocks to be invalidated. * For a striped block to invalidate, we should track its individual internal * blocks. */ private final InvalidateBlocks invalidateBlocks; /** * After a failover, over-replicated blocks may not be handled * until all of the replicas have done a block report to the * new active. This is to make sure that this NameNode has been * notified of all block deletions that might have been pending * when the failover happened. */ private final Set<Block> postponedMisreplicatedBlocks = new LinkedHashSet<Block>(); private final int blocksPerPostpondedRescan; private final ArrayList<Block> rescannedMisreplicatedBlocks; /** * Maps a StorageID to the set of blocks that are "extra" for this * DataNode. We'll eventually remove these extras. */ private final ExcessRedundancyMap excessRedundancyMap = new ExcessRedundancyMap(); /** * Store set of Blocks that need to be replicated 1 or more times. * We also store pending reconstruction-orders. */ public final LowRedundancyBlocks neededReconstruction = new LowRedundancyBlocks(); @VisibleForTesting final PendingReconstructionBlocks pendingReconstruction; /** Stores information about block recovery attempts. 
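The comment on invalidateBlocks notes that invalidating a striped block means tracking its individual internal blocks. In the striped-block code later in this class (setBlockToken), an internal block's ID is formed as the block group ID plus the block index; the following standalone sketch simply enumerates those IDs for a group. Names are illustrative, not the BlockManager API.

import java.util.ArrayList;
import java.util.List;

// Illustrative only: enumerate the internal block IDs of a striped block group.
public class StripedGroupIds {
  static List<Long> internalBlockIds(long blockGroupId, int dataBlocks, int parityBlocks) {
    List<Long> ids = new ArrayList<>();
    for (int i = 0; i < dataBlocks + parityBlocks; i++) {
      ids.add(blockGroupId + i);   // internal block i = group ID + block index
    }
    return ids;
  }

  public static void main(String[] args) {
    // RS(6,3): 6 data blocks + 3 parity blocks per group.
    System.out.println(internalBlockIds(1024L, 6, 3));   // [1024, 1025, ..., 1032]
  }
}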
*/ private final PendingRecoveryBlocks pendingRecoveryBlocks; /** The maximum number of replicas allowed for a block */ public final short maxReplication; /** * The maximum number of outgoing replication streams a given node should have * at one time considering all but the highest priority replications needed. */ int maxReplicationStreams; /** * The maximum number of outgoing replication streams a given node should have * at one time. */ int replicationStreamsHardLimit; /** Minimum copies needed or else write is disallowed */ public final short minReplication; /** Default number of replicas */ public final int defaultReplication; /** value returned by MAX_CORRUPT_FILES_RETURNED */ final int maxCorruptFilesReturned; final float blocksInvalidateWorkPct; final int blocksReplWorkMultiplier; // whether or not to issue block encryption keys. final boolean encryptDataTransfer; // Max number of blocks to log info about during a block report. private final long maxNumBlocksToLog; /** * When running inside a Standby node, the node may receive block reports * from datanodes before receiving the corresponding namespace edits from * the active NameNode. Thus, it will postpone them for later processing, * instead of marking the blocks as corrupt. */ private boolean shouldPostponeBlocksFromFuture = false; /** * Process reconstruction queues asynchronously to allow namenode safemode * exit and failover to be faster. HDFS-5496. */ private Daemon reconstructionQueuesInitializer = null; /** * Number of blocks to process asynchronously for reconstruction queues * initialization once the namesystem lock is acquired. Remaining blocks will be * processed after reacquiring the lock. */ private int numBlocksPerIteration; /** * Progress of the Reconstruction queues initialization. */ private double reconstructionQueuesInitProgress = 0.0; /** for block replicas placement */ private BlockPlacementPolicies placementPolicies; private final BlockStoragePolicySuite storagePolicySuite; /** Check whether name system is running before terminating */ private boolean checkNSRunning = true; /** Check whether there are any non-EC blocks using StripedID */ private boolean hasNonEcBlockUsingStripedID = false; private final BlockIdManager blockIdManager; /** * For satisfying block storage policies. Instantiated if SPS is enabled * internally or externally. */ private StoragePolicySatisfyManager spsManager; /** Minimum live replicas needed for the datanode to be transitioned * from ENTERING_MAINTENANCE to IN_MAINTENANCE. */ private final short minReplicationToBeInMaintenance; /** Storages accessible from multiple DNs.
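The constructor that follows reads these replication limits from the configuration and rejects inconsistent combinations (minReplication <= 0, maxReplication above Short.MAX_VALUE, minReplication greater than maxReplication, or a maintenance minimum outside [0, defaultReplication]). A condensed, standalone sketch of that validation, with illustrative names:

// Illustrative only: the kind of bounds check the BlockManager constructor performs.
public class ReplicationLimits {
  static void validate(int minR, int defaultR, int maxR, int minMaintenanceR) {
    if (minR <= 0) {
      throw new IllegalArgumentException("minimum replication " + minR + " <= 0");
    }
    if (maxR > Short.MAX_VALUE) {
      throw new IllegalArgumentException("maximum replication " + maxR + " > " + Short.MAX_VALUE);
    }
    if (minR > maxR) {
      throw new IllegalArgumentException("minimum replication " + minR + " > maximum " + maxR);
    }
    if (minMaintenanceR < 0 || minMaintenanceR > defaultR) {
      throw new IllegalArgumentException("maintenance minimum " + minMaintenanceR
          + " must be within [0, " + defaultR + "]");
    }
  }

  public static void main(String[] args) {
    validate(1, 3, 512, 1);   // typical defaults: passes
    try {
      validate(3, 3, 2, 0);   // rejected: min > max
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage());
    }
  }
}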
*/ private final ProvidedStorageMap providedStorageMap; public BlockManager(final Namesystem namesystem, boolean haEnabled, final Configuration conf) throws IOException { this.namesystem = namesystem; datanodeManager = new DatanodeManager(this, namesystem, conf); heartbeatManager = datanodeManager.getHeartbeatManager(); this.blockIdManager = new BlockIdManager(this); blocksPerPostpondedRescan = (int) Math.min(Integer.MAX_VALUE, datanodeManager.getBlocksPerPostponedMisreplicatedBlocksRescan()); rescannedMisreplicatedBlocks = new ArrayList<Block>(blocksPerPostpondedRescan); startupDelayBlockDeletionInMs = conf.getLong( DFSConfigKeys.DFS_NAMENODE_STARTUP_DELAY_BLOCK_DELETION_SEC_KEY, DFSConfigKeys.DFS_NAMENODE_STARTUP_DELAY_BLOCK_DELETION_SEC_DEFAULT) * 1000L; invalidateBlocks = new InvalidateBlocks(datanodeManager.getBlockInvalidateLimit(), startupDelayBlockDeletionInMs, blockIdManager); // Compute the map capacity by allocating 2% of total memory blocksMap = new BlocksMap(LightWeightGSet.computeCapacity(2.0, "BlocksMap")); placementPolicies = new BlockPlacementPolicies(conf, datanodeManager.getFSClusterStats(), datanodeManager.getNetworkTopology(), datanodeManager.getHost2DatanodeMap()); storagePolicySuite = BlockStoragePolicySuite.createDefaultSuite(); pendingReconstruction = new PendingReconstructionBlocks( conf.getInt(DFSConfigKeys.DFS_NAMENODE_RECONSTRUCTION_PENDING_TIMEOUT_SEC_KEY, DFSConfigKeys.DFS_NAMENODE_RECONSTRUCTION_PENDING_TIMEOUT_SEC_DEFAULT) * 1000L); createSPSManager(conf); blockTokenSecretManager = createBlockTokenSecretManager(conf); providedStorageMap = new ProvidedStorageMap(namesystem, this, conf); this.maxCorruptFilesReturned = conf.getInt(DFSConfigKeys.DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED_KEY, DFSConfigKeys.DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED); this.defaultReplication = conf.getInt(DFSConfigKeys.DFS_REPLICATION_KEY, DFSConfigKeys.DFS_REPLICATION_DEFAULT); final int maxR = conf.getInt(DFSConfigKeys.DFS_REPLICATION_MAX_KEY, DFSConfigKeys.DFS_REPLICATION_MAX_DEFAULT); final int minR = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY, DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT); if (minR <= 0) throw new IOException("Unexpected configuration parameters: " + DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY + " = " + minR + " <= 0"); if (maxR > Short.MAX_VALUE) throw new IOException("Unexpected configuration parameters: " + DFSConfigKeys.DFS_REPLICATION_MAX_KEY + " = " + maxR + " > " + Short.MAX_VALUE); if (minR > maxR) throw new IOException( "Unexpected configuration parameters: " + DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY + " = " + minR + " > " + DFSConfigKeys.DFS_REPLICATION_MAX_KEY + " = " + maxR); this.minReplication = (short) minR; this.maxReplication = (short) maxR; this.maxReplicationStreams = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_DEFAULT); this.replicationStreamsHardLimit = conf.getInt( DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_KEY, DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_DEFAULT); this.blocksInvalidateWorkPct = DFSUtil.getInvalidateWorkPctPerIteration(conf); this.blocksReplWorkMultiplier = DFSUtil.getReplWorkMultiplier(conf); this.redundancyRecheckIntervalMs = conf.getTimeDuration( DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_INTERVAL_SECONDS_KEY, DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_INTERVAL_SECONDS_DEFAULT, TimeUnit.SECONDS) * 1000; this.storageInfoDefragmentInterval = conf.getLong( 
DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_INTERVAL_MS_KEY, DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_INTERVAL_MS_DEFAULT); this.storageInfoDefragmentTimeout = conf.getLong( DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_TIMEOUT_MS_KEY, DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_TIMEOUT_MS_DEFAULT); this.storageInfoDefragmentRatio = conf.getDouble( DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_RATIO_KEY, DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_RATIO_DEFAULT); this.encryptDataTransfer = conf.getBoolean(DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY, DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT); this.maxNumBlocksToLog = conf.getLong(DFSConfigKeys.DFS_MAX_NUM_BLOCKS_TO_LOG_KEY, DFSConfigKeys.DFS_MAX_NUM_BLOCKS_TO_LOG_DEFAULT); this.numBlocksPerIteration = conf.getInt(DFSConfigKeys.DFS_BLOCK_MISREPLICATION_PROCESSING_LIMIT, DFSConfigKeys.DFS_BLOCK_MISREPLICATION_PROCESSING_LIMIT_DEFAULT); final int minMaintenanceR = conf.getInt(DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY, DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_DEFAULT); if (minMaintenanceR < 0) { throw new IOException("Unexpected configuration parameters: " + DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY + " = " + minMaintenanceR + " < 0"); } if (minMaintenanceR > defaultReplication) { throw new IOException("Unexpected configuration parameters: " + DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY + " = " + minMaintenanceR + " > " + DFSConfigKeys.DFS_REPLICATION_KEY + " = " + defaultReplication); } this.minReplicationToBeInMaintenance = (short) minMaintenanceR; long heartbeatIntervalSecs = conf.getTimeDuration(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT, TimeUnit.SECONDS); long blockRecoveryTimeout = getBlockRecoveryTimeout(heartbeatIntervalSecs); pendingRecoveryBlocks = new PendingRecoveryBlocks(blockRecoveryTimeout); this.blockReportLeaseManager = new BlockReportLeaseManager(conf); bmSafeMode = new BlockManagerSafeMode(this, namesystem, haEnabled, conf); LOG.info("defaultReplication = {}", defaultReplication); LOG.info("maxReplication = {}", maxReplication); LOG.info("minReplication = {}", minReplication); LOG.info("maxReplicationStreams = {}", maxReplicationStreams); LOG.info("redundancyRecheckInterval = {}ms", redundancyRecheckIntervalMs); LOG.info("encryptDataTransfer = {}", encryptDataTransfer); LOG.info("maxNumBlocksToLog = {}", maxNumBlocksToLog); } private static BlockTokenSecretManager createBlockTokenSecretManager(final Configuration conf) throws IOException { final boolean isEnabled = conf.getBoolean(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_DEFAULT); LOG.info("{} = {}", DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, isEnabled); if (!isEnabled) { if (UserGroupInformation.isSecurityEnabled()) { String errMessage = "Security is enabled but block access tokens " + "(via " + DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY + ") " + "aren't enabled. This may cause issues " + "when clients attempt to connect to a DataNode. 
Aborting NameNode"; throw new IOException(errMessage); } return null; } final long updateMin = conf.getLong(DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY, DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_DEFAULT); final long lifetimeMin = conf.getLong(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY, DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_DEFAULT); final String encryptionAlgorithm = conf.get(DFSConfigKeys.DFS_DATA_ENCRYPTION_ALGORITHM_KEY); LOG.info("{}={} min(s), {}={} min(s), {}={}", DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY, updateMin, DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY, lifetimeMin, DFSConfigKeys.DFS_DATA_ENCRYPTION_ALGORITHM_KEY, encryptionAlgorithm); String nsId = DFSUtil.getNamenodeNameServiceId(conf); boolean isHaEnabled = HAUtil.isHAEnabled(conf, nsId); boolean shouldWriteProtobufToken = conf.getBoolean(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_PROTOBUF_ENABLE, DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_PROTOBUF_ENABLE_DEFAULT); if (isHaEnabled) { // figure out which index we are of the nns Collection<String> nnIds = DFSUtilClient.getNameNodeIds(conf, nsId); String nnId = HAUtil.getNameNodeId(conf, nsId); int nnIndex = 0; for (String id : nnIds) { if (id.equals(nnId)) { break; } nnIndex++; } return new BlockTokenSecretManager(updateMin * 60 * 1000L, lifetimeMin * 60 * 1000L, nnIndex, nnIds.size(), null, encryptionAlgorithm, shouldWriteProtobufToken); } else { return new BlockTokenSecretManager(updateMin * 60 * 1000L, lifetimeMin * 60 * 1000L, 0, 1, null, encryptionAlgorithm, shouldWriteProtobufToken); } } public BlockStoragePolicy getStoragePolicy(final String policyName) { return storagePolicySuite.getPolicy(policyName); } public BlockStoragePolicy getStoragePolicy(final byte policyId) { return storagePolicySuite.getPolicy(policyId); } public BlockStoragePolicy[] getStoragePolicies() { return storagePolicySuite.getAllPolicies(); } public void setBlockPoolId(String blockPoolId) { this.blockPoolId = blockPoolId; if (isBlockTokenEnabled()) { blockTokenSecretManager.setBlockPoolId(blockPoolId); } } public String getBlockPoolId() { return blockPoolId; } public BlockStoragePolicySuite getStoragePolicySuite() { return storagePolicySuite; } /** get the BlockTokenSecretManager */ @VisibleForTesting public BlockTokenSecretManager getBlockTokenSecretManager() { return blockTokenSecretManager; } /** Allow silent termination of redundancy monitor for testing. */ @VisibleForTesting void enableRMTerminationForTesting() { checkNSRunning = false; } private boolean isBlockTokenEnabled() { return blockTokenSecretManager != null; } /** Should the access keys be updated? 
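In the HA branch above, each NameNode derives its own serial index by walking the configured NameNode IDs for the nameservice; that index and the total number of NameNodes are then passed to the BlockTokenSecretManager constructor. A minimal standalone sketch of the lookup, with illustrative names:

import java.util.Arrays;
import java.util.Collection;

// Illustrative only: find this NameNode's position among the configured NN IDs.
public class NameNodeIndex {
  static int indexOf(Collection<String> nnIds, String myNnId) {
    int index = 0;
    for (String id : nnIds) {
      if (id.equals(myNnId)) {
        break;               // found our own ID; stop counting
      }
      index++;
    }
    return index;            // equals nnIds.size() if the ID is not present
  }

  public static void main(String[] args) {
    System.out.println(indexOf(Arrays.asList("nn1", "nn2"), "nn2"));   // 1
  }
}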
*/ boolean shouldUpdateBlockKey(final long updateTime) throws IOException { return isBlockTokenEnabled() && blockTokenSecretManager.updateKeys(updateTime); } public void activate(Configuration conf, long blockTotal) { pendingReconstruction.start(); datanodeManager.activate(conf); this.redundancyThread.setName("RedundancyMonitor"); this.redundancyThread.start(); storageInfoDefragmenterThread.setName("StorageInfoMonitor"); storageInfoDefragmenterThread.start(); this.blockReportThread.start(); mxBeanName = MBeans.register("NameNode", "BlockStats", this); bmSafeMode.activate(blockTotal); } public void close() { if (getSPSManager() != null) { getSPSManager().stop(); } bmSafeMode.close(); try { redundancyThread.interrupt(); storageInfoDefragmenterThread.interrupt(); blockReportThread.interrupt(); redundancyThread.join(3000); storageInfoDefragmenterThread.join(3000); blockReportThread.join(3000); } catch (InterruptedException ie) { } datanodeManager.close(); pendingReconstruction.stop(); blocksMap.close(); } /** @return the datanodeManager */ public DatanodeManager getDatanodeManager() { return datanodeManager; } @VisibleForTesting public BlockPlacementPolicy getBlockPlacementPolicy() { return placementPolicies.getPolicy(CONTIGUOUS); } /** Dump meta data to out. */ public void metaSave(PrintWriter out) { assert namesystem.hasWriteLock(); // TODO: block manager read lock and NS write lock final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>(); final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>(); datanodeManager.fetchDatanodes(live, dead, false); out.println("Live Datanodes: " + live.size()); out.println("Dead Datanodes: " + dead.size()); // // Need to iterate over all queues from neededReplications // except for the QUEUE_WITH_CORRUPT_BLOCKS) // synchronized (neededReconstruction) { out.println("Metasave: Blocks waiting for reconstruction: " + neededReconstruction.getLowRedundancyBlockCount()); for (int i = 0; i < neededReconstruction.LEVEL; i++) { if (i != neededReconstruction.QUEUE_WITH_CORRUPT_BLOCKS) { for (Iterator<BlockInfo> it = neededReconstruction.iterator(i); it.hasNext();) { Block block = it.next(); dumpBlockMeta(block, out); } } } // // Now prints corrupt blocks separately // out.println("Metasave: Blocks currently missing: " + neededReconstruction.getCorruptBlockSize()); for (Iterator<BlockInfo> it = neededReconstruction .iterator(neededReconstruction.QUEUE_WITH_CORRUPT_BLOCKS); it.hasNext();) { Block block = it.next(); dumpBlockMeta(block, out); } } // Dump any postponed over-replicated blocks out.println("Mis-replicated blocks that have been postponed:"); for (Block block : postponedMisreplicatedBlocks) { dumpBlockMeta(block, out); } // Dump blocks from pendingReconstruction pendingReconstruction.metaSave(out); // Dump blocks that are waiting to be deleted invalidateBlocks.dump(out); //Dump corrupt blocks and their storageIDs Set<Block> corruptBlocks = corruptReplicas.getCorruptBlocksSet(); out.println("Corrupt Blocks:"); for (Block block : corruptBlocks) { Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(block); if (corruptNodes == null) { LOG.warn("{} is corrupt but has no associated node.", block.getBlockId()); continue; } int numNodesToFind = corruptNodes.size(); for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) { DatanodeDescriptor node = storage.getDatanodeDescriptor(); if (corruptNodes.contains(node)) { String storageId = storage.getStorageID(); DatanodeStorageInfo storageInfo = 
node.getStorageInfo(storageId); State state = (storageInfo == null) ? null : storageInfo.getState(); out.println("Block=" + block.toString() + "\tSize=" + block.getNumBytes() + "\tNode=" + node.getName() + "\tStorageID=" + storageId + "\tStorageState=" + state + "\tTotalReplicas=" + blocksMap.numNodes(block) + "\tReason=" + corruptReplicas.getCorruptReason(block, node)); numNodesToFind--; if (numNodesToFind == 0) { break; } } } if (numNodesToFind > 0) { String[] corruptNodesList = new String[corruptNodes.size()]; int i = 0; for (DatanodeDescriptor d : corruptNodes) { corruptNodesList[i] = d.getHostName(); i++; } out.println(block.getBlockId() + " corrupt on " + StringUtils.join(",", corruptNodesList) + " but not all nodes are" + "found in its block locations"); } } // Dump all datanodes getDatanodeManager().datanodeDump(out); } /** * Dump the metadata for the given block in a human-readable * form. */ private void dumpBlockMeta(Block block, PrintWriter out) { List<DatanodeDescriptor> containingNodes = new ArrayList<DatanodeDescriptor>(); List<DatanodeStorageInfo> containingLiveReplicasNodes = new ArrayList<DatanodeStorageInfo>(); NumberReplicas numReplicas = new NumberReplicas(); // source node returned is not used chooseSourceDatanodes(getStoredBlock(block), containingNodes, containingLiveReplicasNodes, numReplicas, new LinkedList<Byte>(), LowRedundancyBlocks.LEVEL); // containingLiveReplicasNodes can include READ_ONLY_SHARED replicas which are // not included in the numReplicas.liveReplicas() count assert containingLiveReplicasNodes.size() >= numReplicas.liveReplicas(); int usableReplicas = numReplicas.liveReplicas() + numReplicas.decommissionedAndDecommissioning(); if (block instanceof BlockInfo) { BlockCollection bc = getBlockCollection((BlockInfo) block); String fileName = (bc == null) ? "[orphaned]" : bc.getName(); out.print(fileName + ": "); } // l: == live:, d: == decommissioned c: == corrupt e: == excess out.print(block + ((usableReplicas > 0) ? 
"" : " MISSING") + " (replicas:" + " live: " + numReplicas.liveReplicas() + " decommissioning and decommissioned: " + numReplicas.decommissionedAndDecommissioning() + " corrupt: " + numReplicas.corruptReplicas() + " in excess: " + numReplicas.excessReplicas() + " maintenance mode: " + numReplicas.maintenanceReplicas() + ") "); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(block); for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) { final DatanodeDescriptor node = storage.getDatanodeDescriptor(); String state = ""; if (corruptNodes != null && corruptNodes.contains(node)) { state = "(corrupt)"; } else if (node.isDecommissioned() || node.isDecommissionInProgress()) { state = "(decommissioned)"; } else if (node.isMaintenance() || node.isInMaintenance()) { state = "(maintenance)"; } if (storage.areBlockContentsStale()) { state += " (block deletions maybe out of date)"; } out.print(" " + node + state + " : "); } out.println(""); } /** @return maxReplicationStreams */ public int getMaxReplicationStreams() { return maxReplicationStreams; } public int getDefaultStorageNum(BlockInfo block) { switch (block.getBlockType()) { case STRIPED: return ((BlockInfoStriped) block).getRealTotalBlockNum(); case CONTIGUOUS: return defaultReplication; default: throw new IllegalArgumentException( "getDefaultStorageNum called with unknown BlockType: " + block.getBlockType()); } } public short getMinReplication() { return minReplication; } public short getMinStorageNum(BlockInfo block) { switch (block.getBlockType()) { case STRIPED: return ((BlockInfoStriped) block).getRealDataBlockNum(); case CONTIGUOUS: return minReplication; default: throw new IllegalArgumentException( "getMinStorageNum called with unknown BlockType: " + block.getBlockType()); } } public short getMinReplicationToBeInMaintenance() { return minReplicationToBeInMaintenance; } private short getMinMaintenanceStorageNum(BlockInfo block) { if (block.isStriped()) { return ((BlockInfoStriped) block).getRealDataBlockNum(); } else { return (short) Math.min(minReplicationToBeInMaintenance, block.getReplication()); } } public boolean hasMinStorage(BlockInfo block) { return countNodes(block).liveReplicas() >= getMinStorageNum(block); } public boolean hasMinStorage(BlockInfo block, int liveNum) { return liveNum >= getMinStorageNum(block); } /** * Commit a block of a file * * @param block block to be committed * @param commitBlock - contains client reported block length and generation * @return true if the block is changed to committed state. * @throws IOException if the block does not have at least a minimal number * of replicas reported from data-nodes. */ private boolean commitBlock(final BlockInfo block, final Block commitBlock) throws IOException { if (block.getBlockUCState() == BlockUCState.COMMITTED) return false; assert block.getNumBytes() <= commitBlock.getNumBytes() : "commitBlock length is less than the stored one " + commitBlock.getNumBytes() + " vs. " + block.getNumBytes(); if (block.getGenerationStamp() != commitBlock.getGenerationStamp()) { throw new IOException( "Commit block with mismatching GS. 
NN has " + block + ", client submits " + commitBlock); } List<ReplicaUnderConstruction> staleReplicas = block.commitBlock(commitBlock); removeStaleReplicas(staleReplicas, block); return true; } /** * Commit the last block of the file and mark it as complete if it has * meets the minimum redundancy requirement * * @param bc block collection * @param commitBlock - contains client reported block length and generation * @param iip - INodes in path to bc * @return true if the last block is changed to committed state. * @throws IOException if the block does not have at least a minimal number * of replicas reported from data-nodes. */ public boolean commitOrCompleteLastBlock(BlockCollection bc, Block commitBlock, INodesInPath iip) throws IOException { if (commitBlock == null) return false; // not committing, this is a block allocation retry BlockInfo lastBlock = bc.getLastBlock(); if (lastBlock == null) return false; // no blocks in file yet if (lastBlock.isComplete()) return false; // already completed (e.g. by syncBlock) if (lastBlock.isUnderRecovery()) { throw new IOException("Commit or complete block " + commitBlock + ", whereas it is under recovery."); } final boolean committed = commitBlock(lastBlock, commitBlock); if (committed && lastBlock.isStriped()) { // update scheduled size for DatanodeStorages that do not store any // internal blocks lastBlock.getUnderConstructionFeature().updateStorageScheduledSize((BlockInfoStriped) lastBlock); } // Count replicas on decommissioning nodes, as these will not be // decommissioned unless recovery/completing last block has finished NumberReplicas numReplicas = countNodes(lastBlock); int numUsableReplicas = numReplicas.liveReplicas() + numReplicas.decommissioning() + numReplicas.liveEnteringMaintenanceReplicas(); if (hasMinStorage(lastBlock, numUsableReplicas)) { if (committed) { addExpectedReplicasToPending(lastBlock); } completeBlock(lastBlock, iip, false); } else if (pendingRecoveryBlocks.isUnderRecovery(lastBlock)) { // We've just finished recovery for this block, complete // the block forcibly disregarding number of replicas. // This is to ignore minReplication, the block will be closed // and then replicated out. completeBlock(lastBlock, iip, true); updateNeededReconstructions(lastBlock, 1, 0); } return committed; } /** * If IBR is not sent from expected locations yet, add the datanodes to * pendingReconstruction in order to keep RedundancyMonitor from scheduling * the block. */ public void addExpectedReplicasToPending(BlockInfo blk) { if (!blk.isStriped()) { DatanodeStorageInfo[] expectedStorages = blk.getUnderConstructionFeature() .getExpectedStorageLocations(); if (expectedStorages.length - blk.numNodes() > 0) { ArrayList<DatanodeDescriptor> pendingNodes = new ArrayList<>(); for (DatanodeStorageInfo storage : expectedStorages) { DatanodeDescriptor dnd = storage.getDatanodeDescriptor(); if (blk.findStorageInfo(dnd) == null) { pendingNodes.add(dnd); } } pendingReconstruction.increment(blk, pendingNodes.toArray(new DatanodeDescriptor[pendingNodes.size()])); } } } /** * Convert a specified block of the file to a complete block. * @param curBlock - block to be completed * @param iip - INodes in path to file containing curBlock; if null, * this will be resolved internally * @param force - force completion of the block * @throws IOException if the block does not have at least a minimal number * of replicas reported from data-nodes. 
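commitOrCompleteLastBlock above only completes a committed block once enough "usable" replicas have reported: live replicas plus replicas on decommissioning or live ENTERING_MAINTENANCE nodes are counted against the minimum storage requirement. A simplified standalone sketch of that decision, with illustrative names (not the real method):

// Illustrative only: the commit/complete decision for the last block of a file.
public class LastBlockDecision {
  enum Action { COMPLETE, WAIT_FOR_REPLICAS }

  static Action decide(int live, int decommissioning, int liveEnteringMaintenance,
      int minStorage) {
    int usable = live + decommissioning + liveEnteringMaintenance;
    return usable >= minStorage ? Action.COMPLETE : Action.WAIT_FOR_REPLICAS;
  }

  public static void main(String[] args) {
    // One live replica plus one on a decommissioning node satisfies minStorage = 1.
    System.out.println(decide(1, 1, 0, 1));   // COMPLETE
    // Nothing reported yet: keep the block committed but not complete.
    System.out.println(decide(0, 0, 0, 1));   // WAIT_FOR_REPLICAS
  }
}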
*/ private void completeBlock(BlockInfo curBlock, INodesInPath iip, boolean force) throws IOException { if (curBlock.isComplete()) { return; } int numNodes = curBlock.numNodes(); if (!force && !hasMinStorage(curBlock, numNodes)) { throw new IOException( "Cannot complete block: " + "block does not satisfy minimal replication requirement."); } if (!force && curBlock.getBlockUCState() != BlockUCState.COMMITTED) { throw new IOException("Cannot complete block: block has not been COMMITTED by the client"); } convertToCompleteBlock(curBlock, iip); // Since safe-mode only counts complete blocks, and we now have // one more complete block, we need to adjust the total up, and // also count it as safe, if we have at least the minimum replica // count. (We may not have the minimum replica count yet if this is // a "forced" completion when a file is getting closed by an // OP_CLOSE edit on the standby). bmSafeMode.adjustBlockTotals(0, 1); final int minStorage = curBlock.isStriped() ? ((BlockInfoStriped) curBlock).getRealDataBlockNum() : minReplication; bmSafeMode.incrementSafeBlockCount(Math.min(numNodes, minStorage), curBlock); } /** * Convert a specified block of the file to a complete block. * Skips validity checking and safe mode block total updates; use * {@link BlockManager#completeBlock} to include these. * @param curBlock - block to be completed * @param iip - INodes in path to file containing curBlock; if null, * this will be resolved internally * @throws IOException if the block does not have at least a minimal number * of replicas reported from data-nodes. */ private void convertToCompleteBlock(BlockInfo curBlock, INodesInPath iip) throws IOException { curBlock.convertToCompleteBlock(); namesystem.getFSDirectory().updateSpaceForCompleteBlock(curBlock, iip); } /** * Force the given block in the given file to be marked as complete, * regardless of whether enough replicas are present. This is necessary * when tailing edit logs as a Standby. */ public void forceCompleteBlock(final BlockInfo block) throws IOException { List<ReplicaUnderConstruction> staleReplicas = block.commitBlock(block); removeStaleReplicas(staleReplicas, block); completeBlock(block, null, true); } /** * Convert the last block of the file to an under construction block.<p> * The block is converted only if the file has blocks and the last one * is a partial block (its size is less than the preferred block size). * The converted block is returned to the client. * The client uses the returned block locations to form the data pipeline * for this block.<br> * The methods returns null if there is no partial block at the end. * The client is supposed to allocate a new block with the next call. * * @param bc file * @param bytesToRemove num of bytes to remove from block * @return the last block locations if the block is partial or null otherwise */ public LocatedBlock convertLastBlockToUnderConstruction(BlockCollection bc, long bytesToRemove) throws IOException { BlockInfo lastBlock = bc.getLastBlock(); if (lastBlock == null || bc.getPreferredBlockSize() == lastBlock.getNumBytes() - bytesToRemove) { return null; } assert lastBlock == getStoredBlock(lastBlock) : "last block of the file is not in blocksMap"; DatanodeStorageInfo[] targets = getStorages(lastBlock); // convert the last block to under construction. note no block replacement // is happening bc.convertLastBlockToUC(lastBlock, targets); // Remove block from reconstruction queue. 
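convertLastBlockToUnderConstruction, begun above, only reopens the last block when it will remain a partial block, i.e. when its size after removing bytesToRemove differs from the preferred block size; otherwise it returns null and the client allocates a fresh block. A tiny standalone sketch of that predicate, with illustrative names:

// Illustrative only: should the last block be reopened as under-construction?
public class LastBlockCheck {
  static boolean shouldConvert(long lastBlockBytes, long bytesToRemove, long preferredBlockSize) {
    long remaining = lastBlockBytes - bytesToRemove;
    // Only a partial block (not exactly the preferred block size) is reopened;
    // a full block stays complete and the client allocates a new block instead.
    return remaining != preferredBlockSize;
  }

  public static void main(String[] args) {
    System.out.println(shouldConvert(64L * 1024 * 1024, 0, 128L * 1024 * 1024));    // true
    System.out.println(shouldConvert(128L * 1024 * 1024, 0, 128L * 1024 * 1024));   // false
  }
}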
NumberReplicas replicas = countNodes(lastBlock); neededReconstruction.remove(lastBlock, replicas.liveReplicas(), replicas.readOnlyReplicas(), replicas.outOfServiceReplicas(), getExpectedRedundancyNum(lastBlock)); pendingReconstruction.remove(lastBlock); // remove this block from the list of pending blocks to be deleted. for (DatanodeStorageInfo storage : targets) { final Block b = getBlockOnStorage(lastBlock, storage); if (b != null) { invalidateBlocks.remove(storage.getDatanodeDescriptor(), b); } } // Adjust safe-mode totals, since under-construction blocks don't // count in safe-mode. bmSafeMode.adjustBlockTotals( // decrement safe if we had enough hasMinStorage(lastBlock, targets.length) ? -1 : 0, // always decrement total blocks -1); final long fileLength = bc.computeContentSummary(getStoragePolicySuite()).getLength(); final long pos = fileLength - lastBlock.getNumBytes(); return createLocatedBlock(null, lastBlock, pos, BlockTokenIdentifier.AccessMode.WRITE); } /** * Get all valid locations of the block */ private List<DatanodeStorageInfo> getValidLocations(BlockInfo block) { final List<DatanodeStorageInfo> locations = new ArrayList<DatanodeStorageInfo>(blocksMap.numNodes(block)); for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) { // filter invalidate replicas Block b = getBlockOnStorage(block, storage); if (b != null && !invalidateBlocks.contains(storage.getDatanodeDescriptor(), b)) { locations.add(storage); } } return locations; } private void createLocatedBlockList(LocatedBlockBuilder locatedBlocks, final BlockInfo[] blocks, final long offset, final long length, final AccessMode mode) throws IOException { int curBlk; long curPos = 0, blkSize = 0; int nrBlocks = (blocks[0].getNumBytes() == 0) ? 0 : blocks.length; for (curBlk = 0; curBlk < nrBlocks; curBlk++) { blkSize = blocks[curBlk].getNumBytes(); assert blkSize > 0 : "Block of size 0"; if (curPos + blkSize > offset) { break; } curPos += blkSize; } if (nrBlocks > 0 && curBlk == nrBlocks) // offset >= end of file return; long endOff = offset + length; do { locatedBlocks.addBlock(createLocatedBlock(locatedBlocks, blocks[curBlk], curPos, mode)); curPos += blocks[curBlk].getNumBytes(); curBlk++; } while (curPos < endOff && curBlk < blocks.length && !locatedBlocks.isBlockMax()); return; } private LocatedBlock createLocatedBlock(LocatedBlockBuilder locatedBlocks, final BlockInfo[] blocks, final long endPos, final AccessMode mode) throws IOException { int curBlk; long curPos = 0; int nrBlocks = (blocks[0].getNumBytes() == 0) ? 
0 : blocks.length; for (curBlk = 0; curBlk < nrBlocks; curBlk++) { long blkSize = blocks[curBlk].getNumBytes(); if (curPos + blkSize >= endPos) { break; } curPos += blkSize; } return createLocatedBlock(locatedBlocks, blocks[curBlk], curPos, mode); } private LocatedBlock createLocatedBlock(LocatedBlockBuilder locatedBlocks, final BlockInfo blk, final long pos, final AccessMode mode) throws IOException { final LocatedBlock lb = createLocatedBlock(locatedBlocks, blk, pos); if (mode != null) { setBlockToken(lb, mode); } return lb; } /** @return a LocatedBlock for the given block */ private LocatedBlock createLocatedBlock(LocatedBlockBuilder locatedBlocks, final BlockInfo blk, final long pos) throws IOException { if (!blk.isComplete()) { final BlockUnderConstructionFeature uc = blk.getUnderConstructionFeature(); if (blk.isStriped()) { final DatanodeStorageInfo[] storages = uc.getExpectedStorageLocations(); final ExtendedBlock eb = new ExtendedBlock(getBlockPoolId(), blk); return newLocatedStripedBlock(eb, storages, uc.getBlockIndices(), pos, false); } else { final DatanodeStorageInfo[] storages = uc.getExpectedStorageLocations(); final ExtendedBlock eb = new ExtendedBlock(getBlockPoolId(), blk); return null == locatedBlocks ? newLocatedBlock(eb, storages, pos, false) : locatedBlocks.newLocatedBlock(eb, storages, pos, false); } } // get block locations NumberReplicas numReplicas = countNodes(blk); final int numCorruptNodes = numReplicas.corruptReplicas(); final int numCorruptReplicas = corruptReplicas.numCorruptReplicas(blk); if (numCorruptNodes != numCorruptReplicas) { LOG.warn( "Inconsistent number of corrupt replicas for {}" + " blockMap has {} but corrupt replicas map has {}", blk, numCorruptNodes, numCorruptReplicas); } final int numNodes = blocksMap.numNodes(blk); final boolean isCorrupt; if (blk.isStriped()) { BlockInfoStriped sblk = (BlockInfoStriped) blk; isCorrupt = numCorruptReplicas != 0 && numReplicas.liveReplicas() < sblk.getRealDataBlockNum(); } else { isCorrupt = numCorruptReplicas != 0 && numCorruptReplicas == numNodes; } int numMachines = isCorrupt ? numNodes : numNodes - numCorruptReplicas; numMachines -= numReplicas.maintenanceNotForReadReplicas(); DatanodeStorageInfo[] machines = new DatanodeStorageInfo[numMachines]; final byte[] blockIndices = blk.isStriped() ? new byte[numMachines] : null; int j = 0, i = 0; if (numMachines > 0) { final boolean noCorrupt = (numCorruptReplicas == 0); for (DatanodeStorageInfo storage : blocksMap.getStorages(blk)) { if (storage.getState() != State.FAILED) { final DatanodeDescriptor d = storage.getDatanodeDescriptor(); // Don't pick IN_MAINTENANCE or dead ENTERING_MAINTENANCE states. if (d.isInMaintenance() || (d.isEnteringMaintenance() && !d.isAlive())) { continue; } if (noCorrupt) { machines[j++] = storage; i = setBlockIndices(blk, blockIndices, i, storage); } else { final boolean replicaCorrupt = isReplicaCorrupt(blk, d); if (isCorrupt || !replicaCorrupt) { machines[j++] = storage; i = setBlockIndices(blk, blockIndices, i, storage); } } } } } if (j < machines.length) { machines = Arrays.copyOf(machines, j); } assert j == machines.length : "isCorrupt: " + isCorrupt + " numMachines: " + numMachines + " numNodes: " + numNodes + " numCorrupt: " + numCorruptNodes + " numCorruptRepls: " + numCorruptReplicas; final ExtendedBlock eb = new ExtendedBlock(getBlockPoolId(), blk); return blockIndices == null ? null == locatedBlocks ? 
newLocatedBlock(eb, machines, pos, isCorrupt) : locatedBlocks.newLocatedBlock(eb, machines, pos, isCorrupt) : newLocatedStripedBlock(eb, machines, blockIndices, pos, isCorrupt); } /** Create a LocatedBlocks. */ public LocatedBlocks createLocatedBlocks(final BlockInfo[] blocks, final long fileSizeExcludeBlocksUnderConstruction, final boolean isFileUnderConstruction, final long offset, final long length, final boolean needBlockToken, final boolean inSnapshot, FileEncryptionInfo feInfo, ErasureCodingPolicy ecPolicy) throws IOException { assert namesystem.hasReadLock(); if (blocks == null) { return null; } else if (blocks.length == 0) { return new LocatedBlocks(0, isFileUnderConstruction, Collections.<LocatedBlock>emptyList(), null, false, feInfo, ecPolicy); } else { if (LOG.isDebugEnabled()) { LOG.debug("blocks = {}", java.util.Arrays.asList(blocks)); } final AccessMode mode = needBlockToken ? BlockTokenIdentifier.AccessMode.READ : null; LocatedBlockBuilder locatedBlocks = providedStorageMap.newLocatedBlocks(Integer.MAX_VALUE) .fileLength(fileSizeExcludeBlocksUnderConstruction).lastUC(isFileUnderConstruction) .encryption(feInfo).erasureCoding(ecPolicy); createLocatedBlockList(locatedBlocks, blocks, offset, length, mode); if (!inSnapshot) { final BlockInfo last = blocks[blocks.length - 1]; final long lastPos = last.isComplete() ? fileSizeExcludeBlocksUnderConstruction - last.getNumBytes() : fileSizeExcludeBlocksUnderConstruction; locatedBlocks.lastBlock(createLocatedBlock(locatedBlocks, last, lastPos, mode)) .lastComplete(last.isComplete()); } else { locatedBlocks.lastBlock( createLocatedBlock(locatedBlocks, blocks, fileSizeExcludeBlocksUnderConstruction, mode)) .lastComplete(true); } LocatedBlocks locations = locatedBlocks.build(); // Set caching information for the located blocks. CacheManager cm = namesystem.getCacheManager(); if (cm != null) { cm.setCachedLocations(locations); } return locations; } } /** @return current access keys. */ public ExportedBlockKeys getBlockKeys() { return isBlockTokenEnabled() ? blockTokenSecretManager.exportKeys() : ExportedBlockKeys.DUMMY_KEYS; } /** Generate a block token for the located block. */ public void setBlockToken(final LocatedBlock b, final AccessMode mode) throws IOException { if (isBlockTokenEnabled()) { // Use cached UGI if serving RPC calls. 
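createLocatedBlockList and createLocatedBlock above locate the starting block for a read by accumulating block sizes until the running position passes the requested offset. The same scan in isolation, with illustrative names:

// Illustrative only: find the index of the block containing a given file offset.
public class BlockOffsetScan {
  static int blockIndexForOffset(long[] blockSizes, long offset) {
    long curPos = 0;
    for (int i = 0; i < blockSizes.length; i++) {
      if (curPos + blockSizes[i] > offset) {
        return i;             // offset falls inside block i
      }
      curPos += blockSizes[i];
    }
    return blockSizes.length; // offset is at or beyond the end of the file
  }

  public static void main(String[] args) {
    long[] sizes = {128, 128, 64};                          // three blocks, last one partial
    System.out.println(blockIndexForOffset(sizes, 0));      // 0
    System.out.println(blockIndexForOffset(sizes, 200));    // 1
    System.out.println(blockIndexForOffset(sizes, 320));    // 3 (past end of file)
  }
}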
if (b.isStriped()) { Preconditions.checkState(b instanceof LocatedStripedBlock); LocatedStripedBlock sb = (LocatedStripedBlock) b; byte[] indices = sb.getBlockIndices(); Token<BlockTokenIdentifier>[] blockTokens = new Token[indices.length]; ExtendedBlock internalBlock = new ExtendedBlock(b.getBlock()); for (int i = 0; i < indices.length; i++) { internalBlock.setBlockId(b.getBlock().getBlockId() + indices[i]); blockTokens[i] = blockTokenSecretManager.generateToken( NameNode.getRemoteUser().getShortUserName(), internalBlock, EnumSet.of(mode), b.getStorageTypes(), b.getStorageIDs()); } sb.setBlockTokens(blockTokens); } b.setBlockToken(blockTokenSecretManager.generateToken(NameNode.getRemoteUser().getShortUserName(), b.getBlock(), EnumSet.of(mode), b.getStorageTypes(), b.getStorageIDs())); } } void addKeyUpdateCommand(final List<DatanodeCommand> cmds, final DatanodeDescriptor nodeinfo) { // check access key update if (isBlockTokenEnabled() && nodeinfo.needKeyUpdate()) { cmds.add(new KeyUpdateCommand(blockTokenSecretManager.exportKeys())); nodeinfo.setNeedKeyUpdate(false); } } public DataEncryptionKey generateDataEncryptionKey() { if (isBlockTokenEnabled() && encryptDataTransfer) { return blockTokenSecretManager.generateDataEncryptionKey(); } else { return null; } } /** * Clamp the specified replication between the minimum and the maximum * replication levels. */ public short adjustReplication(short replication) { return replication < minReplication ? minReplication : replication > maxReplication ? maxReplication : replication; } /** * Check whether the replication parameter is within the range * determined by system configuration and throw an exception if it's not. * * @param src the path to the target file * @param replication the requested replication factor * @param clientName the name of the client node making the request * @throws java.io.IOException thrown if the requested replication factor * is out of bounds */ public void verifyReplication(String src, short replication, String clientName) throws IOException { String err = null; if (replication > maxReplication) { err = " exceeds maximum of " + maxReplication; } else if (replication < minReplication) { err = " is less than the required minimum of " + minReplication; } if (err != null) { throw new IOException("Requested replication factor of " + replication + err + " for " + src + (clientName == null ? "" : ", clientName=" + clientName)); } } /** * Check if a block is replicated to at least the minimum replication. */ public boolean isSufficientlyReplicated(BlockInfo b) { // Compare against the lesser of the minReplication and number of live DNs. final int replication = Math.min(minReplication, getDatanodeManager().getNumLiveDataNodes()); return countNodes(b).liveReplicas() >= replication; } /** Get all blocks with location information from a datanode. 
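The method whose javadoc opens above, getBlocksWithLocations, gathers roughly `size` bytes worth of completed blocks from one datanode, starting the iteration at a random block index and wrapping around to the beginning if the budget is not yet met. A standalone sketch of that traversal pattern (illustrative names; the real method uses two passes rather than modular indexing, with the same effect):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;

// Illustrative only: collect blocks from a random start index until a size budget is met.
public class RandomStartScan {
  static List<Long> pickBlocks(List<Long> blockSizes, long sizeBudget) {
    List<Long> picked = new ArrayList<>();
    if (blockSizes.isEmpty()) {
      return picked;
    }
    int start = ThreadLocalRandom.current().nextInt(blockSizes.size());
    long total = 0;
    // Scan from the random start, wrapping around, until the budget is reached.
    for (int i = 0; i < blockSizes.size() && total < sizeBudget; i++) {
      long blockSize = blockSizes.get((start + i) % blockSizes.size());
      picked.add(blockSize);
      total += blockSize;
    }
    return picked;
  }

  public static void main(String[] args) {
    System.out.println(pickBlocks(List.of(128L, 128L, 64L, 256L), 300));
  }
}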
*/ public BlocksWithLocations getBlocksWithLocations(final DatanodeID datanode, final long size, final long minBlockSize) throws UnregisteredNodeException { final DatanodeDescriptor node = getDatanodeManager().getDatanode(datanode); if (node == null) { blockLog.warn("BLOCK* getBlocks: Asking for blocks from an" + " unrecorded node {}", datanode); throw new HadoopIllegalArgumentException("Datanode " + datanode + " not found."); } int numBlocks = node.numBlocks(); if (numBlocks == 0) { return new BlocksWithLocations(new BlockWithLocations[0]); } // starting from a random block int startBlock = ThreadLocalRandom.current().nextInt(numBlocks); Iterator<BlockInfo> iter = node.getBlockIterator(startBlock); List<BlockWithLocations> results = new ArrayList<BlockWithLocations>(); long totalSize = 0; BlockInfo curBlock; while (totalSize < size && iter.hasNext()) { curBlock = iter.next(); if (!curBlock.isComplete()) continue; if (curBlock.getNumBytes() < minBlockSize) { continue; } totalSize += addBlock(curBlock, results); } if (totalSize < size) { iter = node.getBlockIterator(); // start from the beginning for (int i = 0; i < startBlock && totalSize < size; i++) { curBlock = iter.next(); if (!curBlock.isComplete()) continue; if (curBlock.getNumBytes() < minBlockSize) { continue; } totalSize += addBlock(curBlock, results); } } return new BlocksWithLocations(results.toArray(new BlockWithLocations[results.size()])); } /** Remove the blocks associated to the given datanode. */ void removeBlocksAssociatedTo(final DatanodeDescriptor node) { providedStorageMap.removeDatanode(node); for (DatanodeStorageInfo storage : node.getStorageInfos()) { final Iterator<BlockInfo> it = storage.getBlockIterator(); //add the BlockInfos to a new collection as the //returned iterator is not modifiable. Collection<BlockInfo> toRemove = new ArrayList<>(); while (it.hasNext()) { toRemove.add(it.next()); } for (BlockInfo b : toRemove) { removeStoredBlock(b, node); } } // Remove all pending DN messages referencing this DN. pendingDNMessages.removeAllMessagesForDatanode(node); node.resetBlocks(); invalidateBlocks.remove(node); } /** Remove the blocks associated to the given DatanodeStorageInfo. */ void removeBlocksAssociatedTo(final DatanodeStorageInfo storageInfo) { assert namesystem.hasWriteLock(); final Iterator<BlockInfo> it = storageInfo.getBlockIterator(); DatanodeDescriptor node = storageInfo.getDatanodeDescriptor(); Collection<BlockInfo> toRemove = new ArrayList<>(); while (it.hasNext()) { toRemove.add(it.next()); } for (BlockInfo block : toRemove) { removeStoredBlock(block, node); final Block b = getBlockOnStorage(block, storageInfo); if (b != null) { invalidateBlocks.remove(node, b); } } checkSafeMode(); LOG.info("Removed blocks associated with storage {} from DataNode {}", storageInfo, node); } /** * Adds block to list of blocks which will be invalidated on specified * datanode and log the operation */ void addToInvalidates(final Block block, final DatanodeInfo datanode) { if (!isPopulatingReplQueues()) { return; } invalidateBlocks.add(block, datanode, true); } /** * Adds block to list of blocks which will be invalidated on all its * datanodes. */ private void addToInvalidates(BlockInfo storedBlock) { if (!isPopulatingReplQueues()) { return; } StringBuilder datanodes = blockLog.isDebugEnabled() ? 
new StringBuilder() : null; for (DatanodeStorageInfo storage : blocksMap.getStorages(storedBlock)) { if (storage.getState() != State.NORMAL) { continue; } final DatanodeDescriptor node = storage.getDatanodeDescriptor(); final Block b = getBlockOnStorage(storedBlock, storage); if (b != null) { invalidateBlocks.add(b, node, false); if (datanodes != null) { datanodes.append(node).append(" "); } } } if (datanodes != null && datanodes.length() != 0) { blockLog.debug("BLOCK* addToInvalidates: {} {}", storedBlock, datanodes); } } private Block getBlockOnStorage(BlockInfo storedBlock, DatanodeStorageInfo storage) { return storedBlock.isStriped() ? ((BlockInfoStriped) storedBlock).getBlockOnStorage(storage) : storedBlock; } /** * Mark the block belonging to datanode as corrupt * @param blk Block to be marked as corrupt * @param dn Datanode which holds the corrupt replica * @param storageID if known, null otherwise. * @param reason a textual reason why the block should be marked corrupt, * for logging purposes */ public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk, final DatanodeInfo dn, String storageID, String reason) throws IOException { assert namesystem.hasWriteLock(); final Block reportedBlock = blk.getLocalBlock(); final BlockInfo storedBlock = getStoredBlock(reportedBlock); if (storedBlock == null) { // Check if the replica is in the blockMap, if not // ignore the request for now. This could happen when BlockScanner // thread of Datanode reports bad block before Block reports are sent // by the Datanode on startup blockLog.debug("BLOCK* findAndMarkBlockAsCorrupt: {} not found", blk); return; } DatanodeDescriptor node = getDatanodeManager().getDatanode(dn); if (node == null) { throw new IOException("Cannot mark " + blk + " as corrupt because datanode " + dn + " (" + dn.getDatanodeUuid() + ") does not exist"); } DatanodeStorageInfo storage = null; if (storageID != null) { storage = node.getStorageInfo(storageID); } if (storage == null) { storage = storedBlock.findStorageInfo(node); } if (storage == null) { blockLog.debug("BLOCK* findAndMarkBlockAsCorrupt: {} not found on {}", blk, dn); return; } markBlockAsCorrupt(new BlockToMarkCorrupt(reportedBlock, storedBlock, blk.getGenerationStamp(), reason, Reason.CORRUPTION_REPORTED), storage, node); } /** * Mark a replica (of a contiguous block) or an internal block (of a striped * block group) as corrupt. * @param b Indicating the reported bad block and the corresponding BlockInfo * stored in blocksMap. * @param storageInfo storage that contains the block, if known. null otherwise. */ private void markBlockAsCorrupt(BlockToMarkCorrupt b, DatanodeStorageInfo storageInfo, DatanodeDescriptor node) throws IOException { if (b.getStored().isDeleted()) { blockLog.debug("BLOCK markBlockAsCorrupt: {} cannot be marked as" + " corrupt as it does not belong to any file", b); addToInvalidates(b.getCorrupted(), node); return; } short expectedRedundancies = getExpectedRedundancyNum(b.getStored()); // Add replica to the data-node if it is not already there if (storageInfo != null) { storageInfo.addBlock(b.getStored(), b.getCorrupted()); } // Add this replica to corruptReplicas Map. 
For striped blocks, we always // use the id of whole striped block group when adding to corruptReplicas Block corrupted = new Block(b.getCorrupted()); if (b.getStored().isStriped()) { corrupted.setBlockId(b.getStored().getBlockId()); } corruptReplicas.addToCorruptReplicasMap(corrupted, node, b.getReason(), b.getReasonCode(), b.getStored().isStriped()); NumberReplicas numberOfReplicas = countNodes(b.getStored()); boolean hasEnoughLiveReplicas = numberOfReplicas.liveReplicas() >= expectedRedundancies; boolean minReplicationSatisfied = hasMinStorage(b.getStored(), numberOfReplicas.liveReplicas()); boolean hasMoreCorruptReplicas = minReplicationSatisfied && (numberOfReplicas.liveReplicas() + numberOfReplicas.corruptReplicas()) > expectedRedundancies; boolean corruptedDuringWrite = minReplicationSatisfied && b.isCorruptedDuringWrite(); // case 1: have enough number of live replicas // case 2: corrupted replicas + live replicas > Replication factor // case 3: Block is marked corrupt due to failure while writing. In this // case genstamp will be different than that of valid block. // In all these cases we can delete the replica. // In case of 3, rbw block will be deleted and valid block can be replicated if (hasEnoughLiveReplicas || hasMoreCorruptReplicas || corruptedDuringWrite) { // the block is over-replicated so invalidate the replicas immediately invalidateBlock(b, node, numberOfReplicas); } else if (isPopulatingReplQueues()) { // add the block to neededReconstruction updateNeededReconstructions(b.getStored(), -1, 0); } } /** * Invalidates the given block on the given datanode. * @return true if the block was successfully invalidated and no longer * present in the BlocksMap */ private boolean invalidateBlock(BlockToMarkCorrupt b, DatanodeInfo dn, NumberReplicas nr) throws IOException { blockLog.debug("BLOCK* invalidateBlock: {} on {}", b, dn); DatanodeDescriptor node = getDatanodeManager().getDatanode(dn); if (node == null) { throw new IOException("Cannot invalidate " + b + " because datanode " + dn + " does not exist."); } // Check how many copies we have of the block if (nr.replicasOnStaleNodes() > 0) { blockLog.debug("BLOCK* invalidateBlocks: postponing " + "invalidation of {} on {} because {} replica(s) are located on " + "nodes with potentially out-of-date block reports", b, dn, nr.replicasOnStaleNodes()); postponeBlock(b.getCorrupted()); return false; } else { // we already checked the number of replicas in the caller of this // function and know there are enough live replicas, so we can delete it. addToInvalidates(b.getCorrupted(), dn); removeStoredBlock(b.getStored(), node); blockLog.debug("BLOCK* invalidateBlocks: {} on {} listed for deletion.", b, dn); return true; } } public void setPostponeBlocksFromFuture(boolean postpone) { this.shouldPostponeBlocksFromFuture = postpone; } private void postponeBlock(Block blk) { postponedMisreplicatedBlocks.add(blk); } void updateState() { pendingReconstructionBlocksCount = pendingReconstruction.size(); lowRedundancyBlocksCount = neededReconstruction.size(); corruptReplicaBlocksCount = corruptReplicas.size(); } /** Return number of low redundancy blocks but not missing blocks. 
*/ public int getUnderReplicatedNotMissingBlocks() { return neededReconstruction.getLowRedundancyBlockCount(); } /** * Schedule blocks for deletion at datanodes. * @param nodesToProcess number of datanodes to schedule deletion work for * @return total number of blocks scheduled for deletion */ int computeInvalidateWork(int nodesToProcess) { final List<DatanodeInfo> nodes = invalidateBlocks.getDatanodes(); Collections.shuffle(nodes); nodesToProcess = Math.min(nodes.size(), nodesToProcess); int blockCnt = 0; for (DatanodeInfo dnInfo : nodes) { int blocks = invalidateWorkForOneNode(dnInfo); if (blocks > 0) { blockCnt += blocks; if (--nodesToProcess == 0) { break; } } } return blockCnt; } /** * Scan blocks in {@link #neededReconstruction} and assign reconstruction * (replication or erasure coding) work to data-nodes they belong to. * * The number of blocks processed equals either twice the number of live * data-nodes or the number of low redundancy blocks, whichever is less. * * @return number of blocks scheduled for reconstruction during this * iteration. */ int computeBlockReconstructionWork(int blocksToProcess) { List<List<BlockInfo>> blocksToReconstruct = null; namesystem.writeLock(); try { // Choose the blocks to be reconstructed blocksToReconstruct = neededReconstruction.chooseLowRedundancyBlocks(blocksToProcess); } finally { namesystem.writeUnlock(); } return computeReconstructionWorkForBlocks(blocksToReconstruct); } /** * Reconstruct a set of blocks to full strength through replication or * erasure coding. * * @param blocksToReconstruct blocks to be reconstructed, for each priority * @return the number of blocks scheduled for replication */ @VisibleForTesting int computeReconstructionWorkForBlocks(List<List<BlockInfo>> blocksToReconstruct) { int scheduledWork = 0; List<BlockReconstructionWork> reconWork = new LinkedList<>(); // Step 1: categorize at-risk blocks into replication and EC tasks namesystem.writeLock(); try { synchronized (neededReconstruction) { for (int priority = 0; priority < blocksToReconstruct.size(); priority++) { for (BlockInfo block : blocksToReconstruct.get(priority)) { BlockReconstructionWork rw = scheduleReconstruction(block, priority); if (rw != null) { reconWork.add(rw); } } } } } finally { namesystem.writeUnlock(); } // Step 2: choose target nodes for each reconstruction task final Set<Node> excludedNodes = new HashSet<>(); for (BlockReconstructionWork rw : reconWork) { // Exclude all of the containing nodes from being targets. // This list includes decommissioning or corrupt nodes.
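// For example (hypothetical node names): a block with replicas on d1 (live), d2
// (decommissioning) and d3 (corrupt) puts all three into excludedNodes, so the
// chooseTargets() call below can only propose nodes that hold no copy of the block yet.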
excludedNodes.clear(); for (DatanodeDescriptor dn : rw.getContainingNodes()) { excludedNodes.add(dn); } // choose replication targets: NOT HOLDING THE GLOBAL LOCK final BlockPlacementPolicy placementPolicy = placementPolicies.getPolicy(rw.getBlock().getBlockType()); rw.chooseTargets(placementPolicy, storagePolicySuite, excludedNodes); } // Step 3: add tasks to the DN namesystem.writeLock(); try { for (BlockReconstructionWork rw : reconWork) { final DatanodeStorageInfo[] targets = rw.getTargets(); if (targets == null || targets.length == 0) { rw.resetTargets(); continue; } synchronized (neededReconstruction) { if (validateReconstructionWork(rw)) { scheduledWork++; } } } } finally { namesystem.writeUnlock(); } if (blockLog.isDebugEnabled()) { // log which blocks have been scheduled for reconstruction for (BlockReconstructionWork rw : reconWork) { DatanodeStorageInfo[] targets = rw.getTargets(); if (targets != null && targets.length != 0) { StringBuilder targetList = new StringBuilder("datanode(s)"); for (DatanodeStorageInfo target : targets) { targetList.append(' '); targetList.append(target.getDatanodeDescriptor()); } blockLog.debug("BLOCK* ask {} to replicate {} to {}", rw.getSrcNodes(), rw.getBlock(), targetList); } } blockLog.debug("BLOCK* neededReconstruction = {} pendingReconstruction = {}", neededReconstruction.size(), pendingReconstruction.size()); } return scheduledWork; } // Check if the number of live + pending replicas satisfies // the expected redundancy. boolean hasEnoughEffectiveReplicas(BlockInfo block, NumberReplicas numReplicas, int pendingReplicaNum) { int required = getExpectedLiveRedundancyNum(block, numReplicas); int numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplicaNum; return (numEffectiveReplicas >= required) && (pendingReplicaNum > 0 || isPlacementPolicySatisfied(block)); } BlockReconstructionWork scheduleReconstruction(BlockInfo block, int priority) { // skip abandoned block or block reopened for append if (block.isDeleted() || !block.isCompleteOrCommitted()) { // remove from neededReconstruction neededReconstruction.remove(block, priority); return null; } // get a source data-node List<DatanodeDescriptor> containingNodes = new ArrayList<>(); List<DatanodeStorageInfo> liveReplicaNodes = new ArrayList<>(); NumberReplicas numReplicas = new NumberReplicas(); List<Byte> liveBlockIndices = new ArrayList<>(); final DatanodeDescriptor[] srcNodes = chooseSourceDatanodes(block, containingNodes, liveReplicaNodes, numReplicas, liveBlockIndices, priority); short requiredRedundancy = getExpectedLiveRedundancyNum(block, numReplicas); if (srcNodes == null || srcNodes.length == 0) { // block can not be reconstructed from any node LOG.debug("Block {} cannot be reconstructed from any node", block); NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled(); return null; } // liveReplicaNodes can include READ_ONLY_SHARED replicas which are // not included in the numReplicas.liveReplicas() count assert liveReplicaNodes.size() >= numReplicas.liveReplicas(); int pendingNum = pendingReconstruction.getNumReplicas(block); if (hasEnoughEffectiveReplicas(block, numReplicas, pendingNum)) { neededReconstruction.remove(block, priority); blockLog.debug("BLOCK* Removing {} from neededReconstruction as" + " it has enough replicas", block); NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled(); return null; } int additionalReplRequired; if (numReplicas.liveReplicas() < requiredRedundancy) { additionalReplRequired = requiredRedundancy - 
numReplicas.liveReplicas() - pendingNum; } else { additionalReplRequired = 1; // Needed on a new rack } final BlockCollection bc = getBlockCollection(block); if (block.isStriped()) { if (pendingNum > 0) { // Wait the previous reconstruction to finish. NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled(); return null; } // should reconstruct all the internal blocks before scheduling // replication task for decommissioning node(s). if (additionalReplRequired - numReplicas.decommissioning() - numReplicas.liveEnteringMaintenanceReplicas() > 0) { additionalReplRequired = additionalReplRequired - numReplicas.decommissioning() - numReplicas.liveEnteringMaintenanceReplicas(); } byte[] indices = new byte[liveBlockIndices.size()]; for (int i = 0; i < liveBlockIndices.size(); i++) { indices[i] = liveBlockIndices.get(i); } return new ErasureCodingWork(getBlockPoolId(), block, bc, srcNodes, containingNodes, liveReplicaNodes, additionalReplRequired, priority, indices); } else { return new ReplicationWork(block, bc, srcNodes, containingNodes, liveReplicaNodes, additionalReplRequired, priority); } } private boolean isInNewRack(DatanodeDescriptor[] srcs, DatanodeDescriptor target) { LOG.debug("check if target {} increases racks, srcs={}", target, Arrays.asList(srcs)); for (DatanodeDescriptor src : srcs) { if (!src.isDecommissionInProgress() && src.getNetworkLocation().equals(target.getNetworkLocation())) { LOG.debug("the target {} is in the same rack with src {}", target, src); return false; } } return true; } private boolean validateReconstructionWork(BlockReconstructionWork rw) { BlockInfo block = rw.getBlock(); int priority = rw.getPriority(); // Recheck since global lock was released // skip abandoned block or block reopened for append if (block.isDeleted() || !block.isCompleteOrCommitted()) { neededReconstruction.remove(block, priority); rw.resetTargets(); return false; } // do not schedule more if enough replicas is already pending NumberReplicas numReplicas = countNodes(block); final short requiredRedundancy = getExpectedLiveRedundancyNum(block, numReplicas); final int pendingNum = pendingReconstruction.getNumReplicas(block); if (hasEnoughEffectiveReplicas(block, numReplicas, pendingNum)) { neededReconstruction.remove(block, priority); rw.resetTargets(); blockLog.debug("BLOCK* Removing {} from neededReconstruction as" + " it has enough replicas", block); return false; } DatanodeStorageInfo[] targets = rw.getTargets(); if ((numReplicas.liveReplicas() >= requiredRedundancy) && (!isPlacementPolicySatisfied(block))) { if (!isInNewRack(rw.getSrcNodes(), targets[0].getDatanodeDescriptor())) { // No use continuing, unless a new rack in this case return false; } // mark that the reconstruction work is to replicate internal block to a // new rack. rw.setNotEnoughRack(); } // Add block to the datanode's task list rw.addTaskToDatanode(numReplicas); DatanodeStorageInfo.incrementBlocksScheduled(targets); // Move the block-replication into a "pending" state. // The reason we use 'pending' is so we can retry // reconstructions that fail after an appropriate amount of time. 
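// (Lifecycle sketch: while the task is outstanding, the block is tracked in
// pendingReconstruction; if the work is not confirmed before the pending timeout,
// processPendingReconstructions() further below reaps the timed-out entry and, when the
// block is still needed, puts it back into neededReconstruction for another attempt.)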
pendingReconstruction.increment(block, DatanodeStorageInfo.toDatanodeDescriptors(targets)); blockLog.debug("BLOCK* block {} is moved from neededReconstruction to " + "pendingReconstruction", block); int numEffectiveReplicas = numReplicas.liveReplicas() + pendingNum; // remove from neededReconstruction if (numEffectiveReplicas + targets.length >= requiredRedundancy) { neededReconstruction.remove(block, priority); } return true; } /** Choose target for WebHDFS redirection. */ public DatanodeStorageInfo[] chooseTarget4WebHDFS(String src, DatanodeDescriptor clientnode, Set<Node> excludes, long blocksize) { return placementPolicies.getPolicy(CONTIGUOUS).chooseTarget(src, 1, clientnode, Collections.<DatanodeStorageInfo>emptyList(), false, excludes, blocksize, storagePolicySuite.getDefaultPolicy(), null); } /** Choose target for getting additional datanodes for an existing pipeline. */ public DatanodeStorageInfo[] chooseTarget4AdditionalDatanode(String src, int numAdditionalNodes, Node clientnode, List<DatanodeStorageInfo> chosen, Set<Node> excludes, long blocksize, byte storagePolicyID, BlockType blockType) { final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(storagePolicyID); final BlockPlacementPolicy blockplacement = placementPolicies.getPolicy(blockType); return blockplacement.chooseTarget(src, numAdditionalNodes, clientnode, chosen, true, excludes, blocksize, storagePolicy, null); } /** * Choose target datanodes for creating a new block. * * @throws IOException * if the number of targets < minimum replication. * @see BlockPlacementPolicy#chooseTarget(String, int, Node, * Set, long, List, BlockStoragePolicy, EnumSet) */ public DatanodeStorageInfo[] chooseTarget4NewBlock(final String src, final int numOfReplicas, final Node client, final Set<Node> excludedNodes, final long blocksize, final List<String> favoredNodes, final byte storagePolicyID, final BlockType blockType, final ErasureCodingPolicy ecPolicy, final EnumSet<AddBlockFlag> flags) throws IOException { List<DatanodeDescriptor> favoredDatanodeDescriptors = getDatanodeDescriptors(favoredNodes); final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(storagePolicyID); final BlockPlacementPolicy blockplacement = placementPolicies.getPolicy(blockType); final DatanodeStorageInfo[] targets = blockplacement.chooseTarget(src, numOfReplicas, client, excludedNodes, blocksize, favoredDatanodeDescriptors, storagePolicy, flags); final String errorMessage = "File %s could only be written to %d of " + "the %d %s. There are %d datanode(s) running and %s " + "node(s) are excluded in this operation."; if (blockType == BlockType.CONTIGUOUS && targets.length < minReplication) { throw new IOException(String.format(errorMessage, src, targets.length, minReplication, "minReplication nodes", getDatanodeManager().getNetworkTopology().getNumOfLeaves(), (excludedNodes == null ? "no" : excludedNodes.size()))); } else if (blockType == BlockType.STRIPED && targets.length < ecPolicy.getNumDataUnits()) { throw new IOException(String.format(errorMessage, src, targets.length, ecPolicy.getNumDataUnits(), String.format("required nodes for %s", ecPolicy.getName()), getDatanodeManager().getNetworkTopology().getNumOfLeaves(), (excludedNodes == null ? "no" : excludedNodes.size()))); } return targets; } /** * Get list of datanode descriptors for given list of nodes. Nodes are * hostaddress:port or just hostaddress. 
*/ List<DatanodeDescriptor> getDatanodeDescriptors(List<String> nodes) { List<DatanodeDescriptor> datanodeDescriptors = null; if (nodes != null) { datanodeDescriptors = new ArrayList<DatanodeDescriptor>(nodes.size()); for (int i = 0; i < nodes.size(); i++) { DatanodeDescriptor node = datanodeManager.getDatanodeDescriptor(nodes.get(i)); if (node != null) { datanodeDescriptors.add(node); } } } return datanodeDescriptors; } /** * Get the associated {@link DatanodeDescriptor} for the storage. * If the storage is of type PROVIDED, one of the nodes that reported * PROVIDED storage is returned. If not, this is equivalent to * {@code storage.getDatanodeDescriptor()}. * @param storage the storage to look up * @return the associated {@link DatanodeDescriptor}. */ private DatanodeDescriptor getDatanodeDescriptorFromStorage(DatanodeStorageInfo storage) { if (storage.getStorageType() == StorageType.PROVIDED) { return providedStorageMap.chooseProvidedDatanode(); } return storage.getDatanodeDescriptor(); } /** * Parse the data-nodes the block belongs to and choose a certain number * from them to be the recovery sources. * * We prefer nodes that are in DECOMMISSION_INPROGRESS state to other nodes * since the former do not have write traffic and hence are less busy. * We do not use already decommissioned nodes as a source, unless there is * no other choice. * Otherwise we randomly choose nodes among those that did not reach their * replication limits. However, if the recovery work is of the highest * priority and all nodes have reached their replication limits, we will * randomly choose the desired number of nodes despite the replication limit. * * In addition, form a list of all nodes containing the block * and calculate its replication numbers. * * @param block Block for which a replication source is needed * @param containingNodes List to be populated with nodes found to contain * the given block * @param nodesContainingLiveReplicas List to be populated with nodes found * to contain live replicas of the given * block * @param numReplicas NumberReplicas instance to be initialized with the * counts of live, corrupt, excess, and decommissioned * replicas of the given block. * @param liveBlockIndices List to be populated with indices of healthy * blocks in a striped block group * @param priority integer representing replication priority of the given * block * @return the array of DatanodeDescriptor of the chosen nodes from which to * recover the given block */ @VisibleForTesting DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, List<DatanodeDescriptor> containingNodes, List<DatanodeStorageInfo> nodesContainingLiveReplicas, NumberReplicas numReplicas, List<Byte> liveBlockIndices, int priority) { containingNodes.clear(); nodesContainingLiveReplicas.clear(); List<DatanodeDescriptor> srcNodes = new ArrayList<>(); liveBlockIndices.clear(); final boolean isStriped = block.isStriped(); DatanodeDescriptor decommissionedSrc = null; BitSet bitSet = isStriped ?
new BitSet(((BlockInfoStriped) block).getTotalBlockNum()) : null; for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) { final DatanodeDescriptor node = getDatanodeDescriptorFromStorage(storage); final StoredReplicaState state = checkReplicaOnStorage(numReplicas, block, storage, corruptReplicas.getNodes(block), false); if (state == StoredReplicaState.LIVE) { if (storage.getStorageType() == StorageType.PROVIDED) { storage = new DatanodeStorageInfo(node, storage.getStorageID(), storage.getStorageType(), storage.getState()); } nodesContainingLiveReplicas.add(storage); } containingNodes.add(node); // do not select the replica if it is corrupt or excess if (state == StoredReplicaState.CORRUPT || state == StoredReplicaState.EXCESS) { continue; } // Never use maintenance node not suitable for read // or unknown state replicas. if (state == null || state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) { continue; } // Save the live decommissioned replica in case we need it. Such replicas // are normally not used for replication, but if nothing else is // available, one can be selected as a source. if (state == StoredReplicaState.DECOMMISSIONED) { if (decommissionedSrc == null || ThreadLocalRandom.current().nextBoolean()) { decommissionedSrc = node; } continue; } if (priority != LowRedundancyBlocks.QUEUE_HIGHEST_PRIORITY && (!node.isDecommissionInProgress() && !node.isEnteringMaintenance()) && node.getNumberOfBlocksToBeReplicated() >= maxReplicationStreams) { continue; // already reached replication limit } if (node.getNumberOfBlocksToBeReplicated() >= replicationStreamsHardLimit) { continue; } if (isStriped || srcNodes.isEmpty()) { srcNodes.add(node); if (isStriped) { byte blockIndex = ((BlockInfoStriped) block).getStorageBlockIndex(storage); liveBlockIndices.add(blockIndex); if (!bitSet.get(blockIndex)) { bitSet.set(blockIndex); } else if (state == StoredReplicaState.LIVE) { numReplicas.subtract(StoredReplicaState.LIVE, 1); numReplicas.add(StoredReplicaState.REDUNDANT, 1); } } continue; } // for replicated block, switch to a different node randomly // this to prevent from deterministically selecting the same node even // if the node failed to replicate the block on previous iterations if (ThreadLocalRandom.current().nextBoolean()) { srcNodes.set(0, node); } } // Pick a live decommissioned replica, if nothing else is available. if (!isStriped && nodesContainingLiveReplicas.isEmpty() && srcNodes.isEmpty() && decommissionedSrc != null) { srcNodes.add(decommissionedSrc); } return srcNodes.toArray(new DatanodeDescriptor[srcNodes.size()]); } /** * If there were any reconstruction requests that timed out, reap them * and put them back into the neededReconstruction queue */ void processPendingReconstructions() { BlockInfo[] timedOutItems = pendingReconstruction.getTimedOutBlocks(); if (timedOutItems != null) { namesystem.writeLock(); try { for (int i = 0; i < timedOutItems.length; i++) { /* * Use the blockinfo from the blocksmap to be certain we're working * with the most up-to-date block information (e.g. genstamp). */ BlockInfo bi = blocksMap.getStoredBlock(timedOutItems[i]); if (bi == null) { continue; } NumberReplicas num = countNodes(timedOutItems[i]); if (isNeededReconstruction(bi, num)) { neededReconstruction.add(bi, num.liveReplicas(), num.readOnlyReplicas(), num.outOfServiceReplicas(), getExpectedRedundancyNum(bi)); } } } finally { namesystem.writeUnlock(); } /* If we know the target datanodes where the replication timedout, * we could invoke decBlocksScheduled() on it. 
Its ok for now. */ } } public long requestBlockReportLeaseId(DatanodeRegistration nodeReg) { assert namesystem.hasReadLock(); DatanodeDescriptor node = null; try { node = datanodeManager.getDatanode(nodeReg); } catch (UnregisteredNodeException e) { LOG.warn("Unregistered datanode {}", nodeReg); return 0; } if (node == null) { LOG.warn("Failed to find datanode {}", nodeReg); return 0; } // Request a new block report lease. The BlockReportLeaseManager has // its own internal locking. long leaseId = blockReportLeaseManager.requestLease(node); BlockManagerFaultInjector.getInstance().requestBlockReportLease(node, leaseId); return leaseId; } public void registerDatanode(DatanodeRegistration nodeReg) throws IOException { assert namesystem.hasWriteLock(); datanodeManager.registerDatanode(nodeReg); bmSafeMode.checkSafeMode(); } /** * Set the total number of blocks in the system. * If safe mode is not currently on, this is a no-op. */ public void setBlockTotal(long total) { if (bmSafeMode.isInSafeMode()) { bmSafeMode.setBlockTotal(total); bmSafeMode.checkSafeMode(); } } public boolean isInSafeMode() { return bmSafeMode.isInSafeMode(); } public String getSafeModeTip() { return bmSafeMode.getSafeModeTip(); } public boolean leaveSafeMode(boolean force) { return bmSafeMode.leaveSafeMode(force); } public void checkSafeMode() { bmSafeMode.checkSafeMode(); } public long getBytesInFuture() { return bmSafeMode.getBytesInFuture(); } public long getBytesInFutureReplicatedBlocks() { return bmSafeMode.getBytesInFutureBlocks(); } public long getBytesInFutureECBlockGroups() { return bmSafeMode.getBytesInFutureECBlockGroups(); } /** * Removes the blocks from blocksmap and updates the safemode blocks total. * @param blocks An instance of {@link BlocksMapUpdateInfo} which contains a * list of blocks that need to be removed from blocksMap */ public void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) { assert namesystem.hasWriteLock(); // In the case that we are a Standby tailing edits from the // active while in safe-mode, we need to track the total number // of blocks and safe blocks in the system. boolean trackBlockCounts = bmSafeMode.isSafeModeTrackingBlocks(); int numRemovedComplete = 0, numRemovedSafe = 0; for (BlockInfo b : blocks.getToDeleteList()) { if (trackBlockCounts) { if (b.isComplete()) { numRemovedComplete++; if (hasMinStorage(b, b.numNodes())) { numRemovedSafe++; } } } removeBlock(b); } if (trackBlockCounts) { LOG.debug("Adjusting safe-mode totals for deletion." + "decreasing safeBlocks by {}, totalBlocks by {}", numRemovedSafe, numRemovedComplete); bmSafeMode.adjustBlockTotals(-numRemovedSafe, -numRemovedComplete); } } public long getProvidedCapacity() { return providedStorageMap.getCapacity(); } public void updateHeartbeat(DatanodeDescriptor node, StorageReport[] reports, long cacheCapacity, long cacheUsed, int xceiverCount, int failedVolumes, VolumeFailureSummary volumeFailureSummary) { for (StorageReport report : reports) { providedStorageMap.updateStorage(node, report.getStorage()); } node.updateHeartbeat(reports, cacheCapacity, cacheUsed, xceiverCount, failedVolumes, volumeFailureSummary); } /** * StatefulBlockInfo is used to build the "toUC" list, which is a list of * updates to the information about under-construction blocks. * Besides the block in question, it provides the ReplicaState * reported by the datanode in the block report. 
*/ static class StatefulBlockInfo { final BlockInfo storedBlock; // should be UC block final Block reportedBlock; final ReplicaState reportedState; StatefulBlockInfo(BlockInfo storedBlock, Block reportedBlock, ReplicaState reportedState) { Preconditions.checkArgument(!storedBlock.isComplete()); this.storedBlock = storedBlock; this.reportedBlock = reportedBlock; this.reportedState = reportedState; } } private static class BlockInfoToAdd { final BlockInfo stored; final Block reported; BlockInfoToAdd(BlockInfo stored, Block reported) { this.stored = stored; this.reported = reported; } } /** * The given storage is reporting all its blocks. * Update the (storage-->block list) and (block-->storage list) maps. * * @return true if all known storages of the given DN have finished reporting. * @throws IOException */ public boolean processReport(final DatanodeID nodeID, final DatanodeStorage storage, final BlockListAsLongs newReport, BlockReportContext context) throws IOException { namesystem.writeLock(); final long startTime = Time.monotonicNow(); //after acquiring write lock final long endTime; DatanodeDescriptor node; Collection<Block> invalidatedBlocks = Collections.emptyList(); String strBlockReportId = context != null ? Long.toHexString(context.getReportId()) : ""; try { node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isRegistered()) { throw new IOException("ProcessReport from dead or unregistered node: " + nodeID); } // To minimize startup time, we discard any second (or later) block reports // that we receive while still in startup phase. // Register DN with provided storage, not with storage owned by DN // DN should still have a ref to the DNStorageInfo. DatanodeStorageInfo storageInfo = providedStorageMap.getStorage(node, storage); if (storageInfo == null) { // We handle this for backwards compatibility. storageInfo = node.updateStorage(storage); } if (namesystem.isInStartupSafeMode() && storageInfo.getBlockReportCount() > 0) { blockLog.info("BLOCK* processReport 0x{}: " + "discarded non-initial block report from {}" + " because namenode still in startup phase", strBlockReportId, nodeID); blockReportLeaseManager.removeLease(node); return !node.hasStaleStorages(); } if (context != null) { if (!blockReportLeaseManager.checkLease(node, startTime, context.getLeaseId())) { return false; } } if (storageInfo.getBlockReportCount() == 0) { // The first block report can be processed a lot more efficiently than // ordinary block reports. This shortens restart times. 
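// (A first report has nothing to diff against: processFirstBlockReport() below adds each
// valid replica directly via addStoredBlockImmediate() and silently defers invalid ones
// to the next report, instead of running the full reportDiffSorted() comparison.)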
blockLog.info( "BLOCK* processReport 0x{}: Processing first " + "storage report for {} from datanode {}", strBlockReportId, storageInfo.getStorageID(), nodeID.getDatanodeUuid()); processFirstBlockReport(storageInfo, newReport); } else { // Block reports for provided storage are not // maintained by DN heartbeats if (!StorageType.PROVIDED.equals(storageInfo.getStorageType())) { invalidatedBlocks = processReport(storageInfo, newReport, context); } } storageInfo.receivedBlockReport(); } finally { endTime = Time.monotonicNow(); namesystem.writeUnlock(); } for (Block b : invalidatedBlocks) { blockLog.debug("BLOCK* processReport 0x{}: {} on node {} size {} does not" + " belong to any file", strBlockReportId, b, node, b.getNumBytes()); } // Log the block report processing stats from Namenode perspective final NameNodeMetrics metrics = NameNode.getNameNodeMetrics(); if (metrics != null) { metrics.addStorageBlockReport((int) (endTime - startTime)); } blockLog.info( "BLOCK* processReport 0x{}: from storage {} node {}, " + "blocks: {}, hasStaleStorage: {}, processing time: {} msecs, " + "invalidatedBlocks: {}", strBlockReportId, storage.getStorageID(), nodeID, newReport.getNumberOfBlocks(), node.hasStaleStorages(), (endTime - startTime), invalidatedBlocks.size()); return !node.hasStaleStorages(); } public void removeBRLeaseIfNeeded(final DatanodeID nodeID, final BlockReportContext context) throws IOException { namesystem.writeLock(); DatanodeDescriptor node; try { node = datanodeManager.getDatanode(nodeID); if (context != null) { if (context.getTotalRpcs() == context.getCurRpc() + 1) { long leaseId = this.getBlockReportLeaseManager().removeLease(node); BlockManagerFaultInjector.getInstance().removeBlockReportLease(node, leaseId); node.setLastBlockReportTime(now()); node.setLastBlockReportMonotonic(Time.monotonicNow()); } LOG.debug("Processing RPC with index {} out of total {} RPCs in " + "processReport 0x{}", context.getCurRpc(), context.getTotalRpcs(), Long.toHexString(context.getReportId())); } } finally { namesystem.writeUnlock(); } } /** * Rescan the list of blocks which were previously postponed. */ void rescanPostponedMisreplicatedBlocks() { if (getPostponedMisreplicatedBlocksCount() == 0) { return; } namesystem.writeLock(); long startTime = Time.monotonicNow(); long startSize = postponedMisreplicatedBlocks.size(); try { Iterator<Block> it = postponedMisreplicatedBlocks.iterator(); for (int i = 0; i < blocksPerPostpondedRescan && it.hasNext(); i++) { Block b = it.next(); it.remove(); BlockInfo bi = getStoredBlock(b); if (bi == null) { LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " + "Postponed mis-replicated block {} no longer found " + "in block map.", b); continue; } MisReplicationResult res = processMisReplicatedBlock(bi); LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " + "Re-scanned block {}, result is {}", b, res); if (res == MisReplicationResult.POSTPONE) { rescannedMisreplicatedBlocks.add(b); } } } finally { postponedMisreplicatedBlocks.addAll(rescannedMisreplicatedBlocks); rescannedMisreplicatedBlocks.clear(); long endSize = postponedMisreplicatedBlocks.size(); namesystem.writeUnlock(); LOG.info( "Rescan of postponedMisreplicatedBlocks completed in {}" + " msecs. {} blocks are left. 
{} blocks were removed.", (Time.monotonicNow() - startTime), endSize, (startSize - endSize)); } } Collection<Block> processReport(final DatanodeStorageInfo storageInfo, final BlockListAsLongs report, BlockReportContext context) throws IOException { // Normal case: // Modify the (block-->datanode) map, according to the difference // between the old and new block report. // Collection<BlockInfoToAdd> toAdd = new LinkedList<>(); Collection<BlockInfo> toRemove = new TreeSet<>(); Collection<Block> toInvalidate = new LinkedList<>(); Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<>(); Collection<StatefulBlockInfo> toUC = new LinkedList<>(); boolean sorted = false; String strBlockReportId = ""; if (context != null) { sorted = context.isSorted(); strBlockReportId = Long.toHexString(context.getReportId()); } Iterable<BlockReportReplica> sortedReport; if (!sorted) { blockLog.warn( "BLOCK* processReport 0x{}: Report from the DataNode ({}) " + "is unsorted. This will cause overhead on the NameNode " + "which needs to sort the Full BR. Please update the " + "DataNode to the same version of Hadoop HDFS as the " + "NameNode ({}).", strBlockReportId, storageInfo.getDatanodeDescriptor().getDatanodeUuid(), VersionInfo.getVersion()); Set<BlockReportReplica> set = new FoldedTreeSet<>(); for (BlockReportReplica iblk : report) { set.add(new BlockReportReplica(iblk)); } sortedReport = set; } else { sortedReport = report; } reportDiffSorted(storageInfo, sortedReport, toAdd, toRemove, toInvalidate, toCorrupt, toUC); DatanodeDescriptor node = storageInfo.getDatanodeDescriptor(); // Process the blocks on each queue for (StatefulBlockInfo b : toUC) { addStoredBlockUnderConstruction(b, storageInfo); } for (BlockInfo b : toRemove) { removeStoredBlock(b, node); } int numBlocksLogged = 0; for (BlockInfoToAdd b : toAdd) { addStoredBlock(b.stored, b.reported, storageInfo, null, numBlocksLogged < maxNumBlocksToLog); numBlocksLogged++; } if (numBlocksLogged > maxNumBlocksToLog) { blockLog.info("BLOCK* processReport 0x{}: logged info for {} of {} " + "reported.", strBlockReportId, maxNumBlocksToLog, numBlocksLogged); } for (Block b : toInvalidate) { addToInvalidates(b, node); } for (BlockToMarkCorrupt b : toCorrupt) { markBlockAsCorrupt(b, storageInfo, node); } return toInvalidate; } /** * Mark block replicas as corrupt except those on the storages in * newStorages list. 
*/ public void markBlockReplicasAsCorrupt(Block oldBlock, BlockInfo block, long oldGenerationStamp, long oldNumBytes, DatanodeStorageInfo[] newStorages) throws IOException { assert namesystem.hasWriteLock(); BlockToMarkCorrupt b = null; if (block.getGenerationStamp() != oldGenerationStamp) { b = new BlockToMarkCorrupt(oldBlock, block, oldGenerationStamp, "genstamp does not match " + oldGenerationStamp + " : " + block.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } else if (block.getNumBytes() != oldNumBytes) { b = new BlockToMarkCorrupt(oldBlock, block, "length does not match " + oldNumBytes + " : " + block.getNumBytes(), Reason.SIZE_MISMATCH); } else { return; } for (DatanodeStorageInfo storage : getStorages(block)) { boolean isCorrupt = true; if (newStorages != null) { for (DatanodeStorageInfo newStorage : newStorages) { if (newStorage != null && storage.equals(newStorage)) { isCorrupt = false; break; } } } if (isCorrupt) { blockLog.debug("BLOCK* markBlockReplicasAsCorrupt: mark block replica" + " {} on {} as corrupt because the dn is not in the new committed " + "storage list.", b, storage.getDatanodeDescriptor()); markBlockAsCorrupt(b, storage, storage.getDatanodeDescriptor()); } } } /** * processFirstBlockReport is intended only for processing "initial" block * reports, the first block report received from a DN after it registers. * It just adds all the valid replicas to the datanode, without calculating * a toRemove list (since there won't be any). It also silently discards * any invalid blocks, thereby deferring their processing until * the next block report. * @param storageInfo - DatanodeStorageInfo that sent the report * @param report - the initial block report, to be processed * @throws IOException */ void processFirstBlockReport(final DatanodeStorageInfo storageInfo, final BlockListAsLongs report) throws IOException { if (report == null) return; assert (namesystem.hasWriteLock()); assert (storageInfo.getBlockReportCount() == 0); for (BlockReportReplica iblk : report) { ReplicaState reportedState = iblk.getState(); if (LOG.isDebugEnabled()) { LOG.debug("Initial report of block {} on {} size {} replicaState = {}", iblk.getBlockName(), storageInfo.getDatanodeDescriptor(), iblk.getNumBytes(), reportedState); } if (shouldPostponeBlocksFromFuture && isGenStampInFuture(iblk)) { queueReportedBlock(storageInfo, iblk, reportedState, QUEUE_REASON_FUTURE_GENSTAMP); continue; } BlockInfo storedBlock = getStoredBlock(iblk); // If block does not belong to any file, we check if it violates // an integrity assumption of Name node if (storedBlock == null) { bmSafeMode.checkBlocksWithFutureGS(iblk); continue; } // If block is corrupt, mark it and continue to next block. BlockUCState ucState = storedBlock.getBlockUCState(); BlockToMarkCorrupt c = checkReplicaCorrupt(iblk, reportedState, storedBlock, ucState, storageInfo.getDatanodeDescriptor()); if (c != null) { if (shouldPostponeBlocksFromFuture) { // In the Standby, we may receive a block report for a file that we // just have an out-of-date gen-stamp or state for, for example. 
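// (On the Standby the replica is parked in pendingDNMessages instead of being marked
// corrupt; it is re-processed by processQueuedMessagesForBlock() once the tailed edits
// catch up, or by processAllPendingDNMessages() when this NameNode becomes active.)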
queueReportedBlock(storageInfo, iblk, reportedState, QUEUE_REASON_CORRUPT_STATE); } else { markBlockAsCorrupt(c, storageInfo, storageInfo.getDatanodeDescriptor()); } continue; } // If block is under construction, add this replica to its list if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { storedBlock.getUnderConstructionFeature().addReplicaIfNotPresent(storageInfo, iblk, reportedState); // OpenFileBlocks only inside snapshots also will be added to safemode // threshold. So we need to update such blocks to safemode // refer HDFS-5283 if (namesystem.isInSnapshot(storedBlock.getBlockCollectionId())) { int numOfReplicas = storedBlock.getUnderConstructionFeature().getNumExpectedLocations(); bmSafeMode.incrementSafeBlockCount(numOfReplicas, storedBlock); } //and fall through to next clause } //add replica if appropriate if (reportedState == ReplicaState.FINALIZED) { addStoredBlockImmediate(storedBlock, iblk, storageInfo); } } } private void reportDiffSorted(DatanodeStorageInfo storageInfo, Iterable<BlockReportReplica> newReport, Collection<BlockInfoToAdd> toAdd, // add to DatanodeDescriptor Collection<BlockInfo> toRemove, // remove from DatanodeDescriptor Collection<Block> toInvalidate, // should be removed from DN Collection<BlockToMarkCorrupt> toCorrupt, // add to corrupt replicas list Collection<StatefulBlockInfo> toUC) { // add to under-construction list // The blocks must be sorted and the storagenodes blocks must be sorted Iterator<BlockInfo> storageBlocksIterator = storageInfo.getBlockIterator(); DatanodeDescriptor dn = storageInfo.getDatanodeDescriptor(); BlockInfo storageBlock = null; for (BlockReportReplica replica : newReport) { long replicaID = replica.getBlockId(); if (BlockIdManager.isStripedBlockID(replicaID) && (!hasNonEcBlockUsingStripedID || !blocksMap.containsBlock(replica))) { replicaID = BlockIdManager.convertToStripedID(replicaID); } ReplicaState reportedState = replica.getState(); LOG.debug("Reported block {} on {} size {} replicaState = {}", replica, dn, replica.getNumBytes(), reportedState); if (shouldPostponeBlocksFromFuture && isGenStampInFuture(replica)) { queueReportedBlock(storageInfo, replica, reportedState, QUEUE_REASON_FUTURE_GENSTAMP); continue; } if (storageBlock == null && storageBlocksIterator.hasNext()) { storageBlock = storageBlocksIterator.next(); } do { int cmp; if (storageBlock == null || (cmp = Long.compare(replicaID, storageBlock.getBlockId())) < 0) { // Check if block is available in NN but not yet on this storage BlockInfo nnBlock = blocksMap.getStoredBlock(new Block(replicaID)); if (nnBlock != null) { reportDiffSortedInner(storageInfo, replica, reportedState, nnBlock, toAdd, toCorrupt, toUC); } else { // Replica not found anywhere so it should be invalidated toInvalidate.add(new Block(replica)); } break; } else if (cmp == 0) { // Replica matched current storageblock reportDiffSortedInner(storageInfo, replica, reportedState, storageBlock, toAdd, toCorrupt, toUC); storageBlock = null; } else { // replica has higher ID than storedBlock // Remove all stored blocks with IDs lower than replica do { toRemove.add(storageBlock); storageBlock = storageBlocksIterator.hasNext() ? 
storageBlocksIterator.next() : null; } while (storageBlock != null && Long.compare(replicaID, storageBlock.getBlockId()) > 0); } } while (storageBlock != null); } // Iterate any remaining blocks that have not been reported and remove them while (storageBlocksIterator.hasNext()) { toRemove.add(storageBlocksIterator.next()); } } private void reportDiffSortedInner(final DatanodeStorageInfo storageInfo, final BlockReportReplica replica, final ReplicaState reportedState, final BlockInfo storedBlock, final Collection<BlockInfoToAdd> toAdd, final Collection<BlockToMarkCorrupt> toCorrupt, final Collection<StatefulBlockInfo> toUC) { assert replica != null; assert storedBlock != null; DatanodeDescriptor dn = storageInfo.getDatanodeDescriptor(); BlockUCState ucState = storedBlock.getBlockUCState(); // Block is on the NN LOG.debug("In memory blockUCState = {}", ucState); // Ignore replicas already scheduled to be removed from the DN if (invalidateBlocks.contains(dn, replica)) { return; } BlockToMarkCorrupt c = checkReplicaCorrupt(replica, reportedState, storedBlock, ucState, dn); if (c != null) { if (shouldPostponeBlocksFromFuture) { // If the block is an out-of-date generation stamp or state, // but we're the standby, we shouldn't treat it as corrupt, // but instead just queue it for later processing. // TODO: Pretty confident this should be s/storedBlock/block below, // since we should be postponing the info of the reported block, not // the stored block. See HDFS-6289 for more context. queueReportedBlock(storageInfo, storedBlock, reportedState, QUEUE_REASON_CORRUPT_STATE); } else { toCorrupt.add(c); } } else if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { toUC.add(new StatefulBlockInfo(storedBlock, new Block(replica), reportedState)); } else if (reportedState == ReplicaState.FINALIZED && (storedBlock.findStorageInfo(storageInfo) == -1 || corruptReplicas.isReplicaCorrupt(storedBlock, dn))) { // Add replica if appropriate. If the replica was previously corrupt // but now okay, it might need to be updated. toAdd.add(new BlockInfoToAdd(storedBlock, new Block(replica))); } } /** * Queue the given reported block for later processing in the * standby node. @see PendingDataNodeMessages. * @param reason a textual reason to report in the debug logs */ private void queueReportedBlock(DatanodeStorageInfo storageInfo, Block block, ReplicaState reportedState, String reason) { assert shouldPostponeBlocksFromFuture; LOG.debug("Queueing reported block {} in state {}" + " from datanode {} for later processing because {}.", block, reportedState, storageInfo.getDatanodeDescriptor(), reason); pendingDNMessages.enqueueReportedBlock(storageInfo, block, reportedState); } /** * Try to process any messages that were previously queued for the given * block. This is called from FSEditLogLoader whenever a block's state * in the namespace has changed or a new block has been created. 
*/ public void processQueuedMessagesForBlock(Block b) throws IOException { Queue<ReportedBlockInfo> queue = pendingDNMessages.takeBlockQueue(b); if (queue == null) { // Nothing to re-process return; } processQueuedMessages(queue); } private void processQueuedMessages(Iterable<ReportedBlockInfo> rbis) throws IOException { for (ReportedBlockInfo rbi : rbis) { LOG.debug("Processing previouly queued message {}", rbi); if (rbi.getReportedState() == null) { // This is a DELETE_BLOCK request DatanodeStorageInfo storageInfo = rbi.getStorageInfo(); removeStoredBlock(getStoredBlock(rbi.getBlock()), storageInfo.getDatanodeDescriptor()); } else { processAndHandleReportedBlock(rbi.getStorageInfo(), rbi.getBlock(), rbi.getReportedState(), null); } } } /** * Process any remaining queued datanode messages after entering * active state. At this point they will not be re-queued since * we are the definitive master node and thus should be up-to-date * with the namespace information. */ public void processAllPendingDNMessages() throws IOException { assert !shouldPostponeBlocksFromFuture : "processAllPendingDNMessages() should be called after disabling " + "block postponement."; int count = pendingDNMessages.count(); if (count > 0) { LOG.info("Processing {} messages from DataNodes " + "that were previously queued during standby state", count); } processQueuedMessages(pendingDNMessages.takeAll()); assert pendingDNMessages.count() == 0; } /** * The next two methods test the various cases under which we must conclude * the replica is corrupt, or under construction. These are laid out * as switch statements, on the theory that it is easier to understand * the combinatorics of reportedState and ucState that way. It should be * at least as efficient as boolean expressions. * * @return a BlockToMarkCorrupt object, or null if the replica is not corrupt */ private BlockToMarkCorrupt checkReplicaCorrupt(Block reported, ReplicaState reportedState, BlockInfo storedBlock, BlockUCState ucState, DatanodeDescriptor dn) { switch (reportedState) { case FINALIZED: switch (ucState) { case COMPLETE: case COMMITTED: if (storedBlock.getGenerationStamp() != reported.getGenerationStamp()) { final long reportedGS = reported.getGenerationStamp(); return new BlockToMarkCorrupt(new Block(reported), storedBlock, reportedGS, "block is " + ucState + " and reported genstamp " + reportedGS + " does not match genstamp in block map " + storedBlock.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } boolean wrongSize; if (storedBlock.isStriped()) { assert BlockIdManager.isStripedBlockID(reported.getBlockId()); assert storedBlock.getBlockId() == BlockIdManager.convertToStripedID(reported.getBlockId()); BlockInfoStriped stripedBlock = (BlockInfoStriped) storedBlock; int reportedBlkIdx = BlockIdManager.getBlockIndex(reported); wrongSize = reported.getNumBytes() != getInternalBlockLength(stripedBlock.getNumBytes(), stripedBlock.getCellSize(), stripedBlock.getDataBlockNum(), reportedBlkIdx); } else { wrongSize = storedBlock.getNumBytes() != reported.getNumBytes(); } if (wrongSize) { return new BlockToMarkCorrupt(new Block(reported), storedBlock, "block is " + ucState + " and reported length " + reported.getNumBytes() + " does not match " + "length in block map " + storedBlock.getNumBytes(), Reason.SIZE_MISMATCH); } else { return null; // not corrupt } case UNDER_CONSTRUCTION: if (storedBlock.getGenerationStamp() > reported.getGenerationStamp()) { final long reportedGS = reported.getGenerationStamp(); return new BlockToMarkCorrupt(new Block(reported), 
storedBlock, reportedGS, "block is " + ucState + " and reported state " + reportedState + ", but reported genstamp " + reportedGS + " does not match genstamp in block map " + storedBlock.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } return null; default: return null; } case RBW: case RWR: final long reportedGS = reported.getGenerationStamp(); if (!storedBlock.isComplete()) { // When the DN reports a lower genstamp than the storedBlock, mark the replica as corrupt, // since a valid replica will already be present. if (storedBlock.getGenerationStamp() > reported.getGenerationStamp()) { return new BlockToMarkCorrupt(new Block(reported), storedBlock, reportedGS, "reported " + reportedState + " replica with genstamp " + reportedGS + " does not match stored block's genstamp in block map " + storedBlock.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } return null; // not corrupt } else if (storedBlock.getGenerationStamp() != reported.getGenerationStamp()) { return new BlockToMarkCorrupt(new Block(reported), storedBlock, reportedGS, "reported " + reportedState + " replica with genstamp " + reportedGS + " does not match COMPLETE block's genstamp in block map " + storedBlock.getGenerationStamp(), Reason.GENSTAMP_MISMATCH); } else { // COMPLETE block, same genstamp if (reportedState == ReplicaState.RBW) { // If it's an RBW report for a COMPLETE block, it may just be that // the block report got a little bit delayed after the pipeline // closed. So, ignore this report, assuming we will get a // FINALIZED replica later. See HDFS-2791 LOG.info("Received an RBW replica for {} on {}: ignoring it, since " + "it is complete with the same genstamp", storedBlock, dn); return null; } else { return new BlockToMarkCorrupt(new Block(reported), storedBlock, "reported replica has invalid state " + reportedState, Reason.INVALID_STATE); } } case RUR: // should not be reported case TEMPORARY: // should not be reported default: String msg = "Unexpected replica state " + reportedState + " for block: " + storedBlock + " on " + dn + " size " + storedBlock.getNumBytes(); // log here at WARN level since this is really a broken HDFS invariant LOG.warn("{}", msg); return new BlockToMarkCorrupt(new Block(reported), storedBlock, msg, Reason.INVALID_STATE); } } private boolean isBlockUnderConstruction(BlockInfo storedBlock, BlockUCState ucState, ReplicaState reportedState) { switch (reportedState) { case FINALIZED: switch (ucState) { case UNDER_CONSTRUCTION: case UNDER_RECOVERY: return true; default: return false; } case RBW: case RWR: return (!storedBlock.isComplete()); case RUR: // should not be reported case TEMPORARY: // should not be reported default: return false; } } void addStoredBlockUnderConstruction(StatefulBlockInfo ucBlock, DatanodeStorageInfo storageInfo) throws IOException { BlockInfo block = ucBlock.storedBlock; block.getUnderConstructionFeature().addReplicaIfNotPresent(storageInfo, ucBlock.reportedBlock, ucBlock.reportedState); // Add replica if appropriate. If the replica was previously corrupt // but now okay, it might need to be updated. if (ucBlock.reportedState == ReplicaState.FINALIZED && (block.findStorageInfo(storageInfo) < 0) || corruptReplicas.isReplicaCorrupt(block, storageInfo.getDatanodeDescriptor())) { addStoredBlock(block, ucBlock.reportedBlock, storageInfo, null, true); } } /** * Faster version of {@link #addStoredBlock}, * intended for use with initial block report at startup. If not in startup * safe mode, will call standard addStoredBlock().
Assumes this method is * called "immediately" so there is no need to refresh the storedBlock from * blocksMap. Doesn't handle low redundancy/extra redundancy, or worry about * pendingReplications or corruptReplicas, because it's in startup safe mode. * Doesn't log every block, because there are typically millions of them. * * @throws IOException */ private void addStoredBlockImmediate(BlockInfo storedBlock, Block reported, DatanodeStorageInfo storageInfo) throws IOException { assert (storedBlock != null && namesystem.hasWriteLock()); if (!namesystem.isInStartupSafeMode() || isPopulatingReplQueues()) { addStoredBlock(storedBlock, reported, storageInfo, null, false); return; } // just add it AddBlockResult result = storageInfo.addBlockInitial(storedBlock, reported); // Now check for completion of blocks and safe block count int numCurrentReplica = countLiveNodes(storedBlock); if (storedBlock.getBlockUCState() == BlockUCState.COMMITTED && hasMinStorage(storedBlock, numCurrentReplica)) { completeBlock(storedBlock, null, false); } else if (storedBlock.isComplete() && result == AddBlockResult.ADDED) { // check whether safe replication is reached for the block // only complete blocks are counted towards that. // In the case that the block just became complete above, completeBlock() // handles the safe block count maintenance. bmSafeMode.incrementSafeBlockCount(numCurrentReplica, storedBlock); } } /** * Modify (block-->datanode) map. Remove block from set of * needed reconstruction if this takes care of the problem. * @return the block that is stored in blocksMap. */ private Block addStoredBlock(final BlockInfo block, final Block reportedBlock, DatanodeStorageInfo storageInfo, DatanodeDescriptor delNodeHint, boolean logEveryBlock) throws IOException { assert block != null && namesystem.hasWriteLock(); BlockInfo storedBlock; DatanodeDescriptor node = storageInfo.getDatanodeDescriptor(); if (!block.isComplete()) { //refresh our copy in case the block got completed in another thread storedBlock = getStoredBlock(block); } else { storedBlock = block; } if (storedBlock == null || storedBlock.isDeleted()) { // If this block does not belong to anyfile, then we are done. blockLog.debug("BLOCK* addStoredBlock: {} on {} size {} but it does not" + " belong to any file", block, node, block.getNumBytes()); // we could add this block to invalidate set of this datanode. // it will happen in next block report otherwise. return block; } // add block to the datanode AddBlockResult result = storageInfo.addBlock(storedBlock, reportedBlock); int curReplicaDelta; if (result == AddBlockResult.ADDED) { curReplicaDelta = (node.isDecommissioned()) ? 0 : 1; if (logEveryBlock) { blockLog.debug("BLOCK* addStoredBlock: {} is added to {} (size={})", node, storedBlock, storedBlock.getNumBytes()); } } else if (result == AddBlockResult.REPLACED) { curReplicaDelta = 0; blockLog.warn("BLOCK* addStoredBlock: block {} moved to storageType " + "{} on node {}", storedBlock, storageInfo.getStorageType(), node); } else { // if the same block is added again and the replica was corrupt // previously because of a wrong gen stamp, remove it from the // corrupt block list. 
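// (Hypothetical example: a replica recorded with Reason.GENSTAMP_MISMATCH during a
// pipeline recovery may later be re-reported by the same storage with the current
// genstamp; addBlock() then returns neither ADDED nor REPLACED, and the stale
// corruption entry is cleared here.)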
corruptReplicas.removeFromCorruptReplicasMap(block, node, Reason.GENSTAMP_MISMATCH); curReplicaDelta = 0; blockLog.debug("BLOCK* addStoredBlock: Redundant addStoredBlock request" + " received for {} on node {} size {}", storedBlock, node, storedBlock.getNumBytes()); } // Now check for completion of blocks and safe block count NumberReplicas num = countNodes(storedBlock); int numLiveReplicas = num.liveReplicas(); int pendingNum = pendingReconstruction.getNumReplicas(storedBlock); int numCurrentReplica = numLiveReplicas + pendingNum; if (storedBlock.getBlockUCState() == BlockUCState.COMMITTED && hasMinStorage(storedBlock, numLiveReplicas)) { addExpectedReplicasToPending(storedBlock); completeBlock(storedBlock, null, false); } else if (storedBlock.isComplete() && result == AddBlockResult.ADDED) { // check whether safe replication is reached for the block // only complete blocks are counted towards that // Is no-op if not in safe mode. // In the case that the block just became complete above, completeBlock() // handles the safe block count maintenance. bmSafeMode.incrementSafeBlockCount(numCurrentReplica, storedBlock); } // if block is still under construction, then done for now if (!storedBlock.isCompleteOrCommitted()) { return storedBlock; } // do not try to handle extra/low redundancy blocks during first safe mode if (!isPopulatingReplQueues()) { return storedBlock; } // handle low redundancy/extra redundancy short fileRedundancy = getExpectedRedundancyNum(storedBlock); if (!isNeededReconstruction(storedBlock, num, pendingNum)) { neededReconstruction.remove(storedBlock, numCurrentReplica, num.readOnlyReplicas(), num.outOfServiceReplicas(), fileRedundancy); } else { updateNeededReconstructions(storedBlock, curReplicaDelta, 0); } if (shouldProcessExtraRedundancy(num, fileRedundancy)) { processExtraRedundancyBlock(storedBlock, fileRedundancy, node, delNodeHint); } // If the file redundancy has reached desired value // we can remove any corrupt replicas the block may have int corruptReplicasCount = corruptReplicas.numCorruptReplicas(storedBlock); int numCorruptNodes = num.corruptReplicas(); if (numCorruptNodes != corruptReplicasCount) { LOG.warn( "Inconsistent number of corrupt replicas for {}" + ". blockMap has {} but corrupt replicas map has {}", storedBlock, numCorruptNodes, corruptReplicasCount); } if ((corruptReplicasCount > 0) && (numLiveReplicas >= fileRedundancy)) { invalidateCorruptReplicas(storedBlock, reportedBlock, num); } return storedBlock; } // If there is any maintenance replica, we don't have to restore // the condition of live + maintenance == expected. We allow // live + maintenance >= expected. The extra redundancy will be removed // when the maintenance node changes to live. private boolean shouldProcessExtraRedundancy(NumberReplicas num, int expectedNum) { final int numCurrent = num.liveReplicas(); return numCurrent > expectedNum || (numCurrent == expectedNum && num.redundantInternalBlocks() > 0); } /** * Invalidate corrupt replicas. * <p> * This will remove the replicas from the block's location list, * add them to {@link #invalidateBlocks} so that they could be further * deleted from the respective data-nodes, * and remove the block from corruptReplicasMap. * <p> * This method should be called when the block has sufficient * number of live replicas. 
* * @param blk Block whose corrupt replicas need to be invalidated */ private void invalidateCorruptReplicas(BlockInfo blk, Block reported, NumberReplicas numberReplicas) { Collection<DatanodeDescriptor> nodes = corruptReplicas.getNodes(blk); boolean removedFromBlocksMap = true; if (nodes == null) return; // make a copy of the array of nodes in order to avoid // ConcurrentModificationException, when the block is removed from the node DatanodeDescriptor[] nodesCopy = nodes.toArray(new DatanodeDescriptor[nodes.size()]); for (DatanodeDescriptor node : nodesCopy) { try { if (!invalidateBlock(new BlockToMarkCorrupt(reported, blk, null, Reason.ANY), node, numberReplicas)) { removedFromBlocksMap = false; } } catch (IOException e) { blockLog.debug("invalidateCorruptReplicas error in deleting bad block" + " {} on {}", blk, node, e); removedFromBlocksMap = false; } } // Remove the block from corruptReplicasMap if (removedFromBlocksMap) { corruptReplicas.removeFromCorruptReplicasMap(blk); } } /** * For each block in the name-node verify whether it belongs to any file, * extra or low redundancy. Place it into the respective queue. */ public void processMisReplicatedBlocks() { assert namesystem.hasWriteLock(); stopReconstructionInitializer(); neededReconstruction.clear(); reconstructionQueuesInitializer = new Daemon() { @Override public void run() { try { processMisReplicatesAsync(); } catch (InterruptedException ie) { LOG.info("Interrupted while processing reconstruction queues."); } catch (Exception e) { LOG.error("Error while processing reconstruction queues async", e); } } }; reconstructionQueuesInitializer.setName("Reconstruction Queue Initializer"); reconstructionQueuesInitializer.start(); } /* * Stop the ongoing initialisation of reconstruction queues */ private void stopReconstructionInitializer() { if (reconstructionQueuesInitializer != null) { reconstructionQueuesInitializer.interrupt(); try { reconstructionQueuesInitializer.join(); } catch (final InterruptedException e) { LOG.warn("Interrupted while waiting for " + "reconstructionQueueInitializer. Returning.."); return; } finally { reconstructionQueuesInitializer = null; } } } /* * Since the BlocksMapGset does not throw the ConcurrentModificationException * and supports further iteration after modification to list, there is a * chance of missing the newly added block while iterating. Since every * addition to blocksMap will check for mis-replication, missing mis-replication * check for new blocks will not be a problem. 
*/ private void processMisReplicatesAsync() throws InterruptedException { long nrInvalid = 0, nrOverReplicated = 0; long nrUnderReplicated = 0, nrPostponed = 0, nrUnderConstruction = 0; long startTimeMisReplicatedScan = Time.monotonicNow(); Iterator<BlockInfo> blocksItr = blocksMap.getBlocks().iterator(); long totalBlocks = blocksMap.size(); reconstructionQueuesInitProgress = 0; long totalProcessed = 0; long sleepDuration = Math.max(1, Math.min(numBlocksPerIteration / 1000, 10000)); while (namesystem.isRunning() && !Thread.currentThread().isInterrupted()) { int processed = 0; namesystem.writeLockInterruptibly(); try { while (processed < numBlocksPerIteration && blocksItr.hasNext()) { BlockInfo block = blocksItr.next(); MisReplicationResult res = processMisReplicatedBlock(block); switch (res) { case UNDER_REPLICATED: LOG.trace("under replicated block {}: {}", block, res); nrUnderReplicated++; break; case OVER_REPLICATED: LOG.trace("over replicated block {}: {}", block, res); nrOverReplicated++; break; case INVALID: LOG.trace("invalid block {}: {}", block, res); nrInvalid++; break; case POSTPONE: LOG.trace("postpone block {}: {}", block, res); nrPostponed++; postponeBlock(block); break; case UNDER_CONSTRUCTION: LOG.trace("under construction block {}: {}", block, res); nrUnderConstruction++; break; case OK: break; default: throw new AssertionError("Invalid enum value: " + res); } processed++; } totalProcessed += processed; // there is a possibility that if any of the blocks deleted/added during // initialisation, then progress might be different. reconstructionQueuesInitProgress = Math.min((double) totalProcessed / totalBlocks, 1.0); if (!blocksItr.hasNext()) { LOG.info("Total number of blocks = {}", blocksMap.size()); LOG.info("Number of invalid blocks = {}", nrInvalid); LOG.info("Number of under-replicated blocks = {}", nrUnderReplicated); LOG.info("Number of over-replicated blocks = {}{}", nrOverReplicated, ((nrPostponed > 0) ? (" (" + nrPostponed + " postponed)") : "")); LOG.info("Number of blocks being written = {}", nrUnderConstruction); NameNode.stateChangeLog.info("STATE* Replication Queue initialization " + "scan for invalid, over- and under-replicated blocks " + "completed in " + (Time.monotonicNow() - startTimeMisReplicatedScan) + " msec"); break; } } finally { namesystem.writeUnlock(); // Make sure it is out of the write lock for sufficiently long time. Thread.sleep(sleepDuration); } } if (Thread.currentThread().isInterrupted()) { LOG.info("Interrupted while processing replication queues."); } } /** * Get the progress of the reconstruction queues initialisation * * @return Returns values between 0 and 1 for the progress. */ public double getReconstructionQueuesInitProgress() { return reconstructionQueuesInitProgress; } /** * Get the value of whether there are any non-EC blocks using StripedID. * * @return Returns the value of whether there are any non-EC blocks using StripedID. */ public boolean hasNonEcBlockUsingStripedID() { return hasNonEcBlockUsingStripedID; } /** * Process a single possibly misreplicated block. This adds it to the * appropriate queues if necessary, and returns a result code indicating * what happened with it. */ private MisReplicationResult processMisReplicatedBlock(BlockInfo block) { if (block.isDeleted()) { // block does not belong to any file addToInvalidates(block); return MisReplicationResult.INVALID; } if (!block.isComplete()) { // Incomplete blocks are never considered mis-replicated -- // they'll be reached when they are completed or recovered. 
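Two small calculations drive the asynchronous scan above: the sleep between write-lock batches, bounded between 1 ms and 10 s, and a progress ratio clamped to 1.0 because blocks can be added or removed while the scan is running. A sketch with hypothetical helper names:

// Illustrative helpers mirroring the throttling in processMisReplicatesAsync above.
final class MisReplicationScanSketch {
  // Roughly 1 ms per 1000 blocks scanned per batch, clamped to [1 ms, 10 s].
  static long sleepMillisBetweenBatches(long blocksPerIteration) {
    return Math.max(1, Math.min(blocksPerIteration / 1000, 10000));
  }

  // Progress is clamped to 1.0 since blocks may be deleted while scanning.
  static double scanProgress(long processed, long totalBlocks) {
    if (totalBlocks <= 0) {
      return 1.0;
    }
    return Math.min((double) processed / totalBlocks, 1.0);
  }

  public static void main(String[] args) {
    System.out.println(sleepMillisBetweenBatches(2048));   // 2
    System.out.println(scanProgress(150_000, 100_000));    // 1.0, not 1.5
  }
}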
return MisReplicationResult.UNDER_CONSTRUCTION; } // calculate current redundancy short expectedRedundancy = getExpectedRedundancyNum(block); NumberReplicas num = countNodes(block); final int numCurrentReplica = num.liveReplicas(); // add to low redundancy queue if need to be if (isNeededReconstruction(block, num)) { if (neededReconstruction.add(block, numCurrentReplica, num.readOnlyReplicas(), num.outOfServiceReplicas(), expectedRedundancy)) { return MisReplicationResult.UNDER_REPLICATED; } } if (shouldProcessExtraRedundancy(num, expectedRedundancy)) { if (num.replicasOnStaleNodes() > 0) { // If any of the replicas of this block are on nodes that are // considered "stale", then these replicas may in fact have // already been deleted. So, we cannot safely act on the // over-replication until a later point in time, when // the "stale" nodes have block reported. return MisReplicationResult.POSTPONE; } // extra redundancy block processExtraRedundancyBlock(block, expectedRedundancy, null, null); return MisReplicationResult.OVER_REPLICATED; } return MisReplicationResult.OK; } /** Set replication for the blocks. */ public void setReplication(final short oldRepl, final short newRepl, final BlockInfo b) { if (newRepl == oldRepl) { return; } // update neededReconstruction priority queues b.setReplication(newRepl); NumberReplicas num = countNodes(b); updateNeededReconstructions(b, 0, newRepl - oldRepl); if (shouldProcessExtraRedundancy(num, newRepl)) { processExtraRedundancyBlock(b, newRepl, null, null); } } /** * Find how many of the containing nodes are "extra", if any. * If there are any extras, call chooseExcessRedundancies() to * mark them in the excessRedundancyMap. */ private void processExtraRedundancyBlock(final BlockInfo block, final short replication, final DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint) { assert namesystem.hasWriteLock(); if (addedNode == delNodeHint) { delNodeHint = null; } Collection<DatanodeStorageInfo> nonExcess = new ArrayList<>(); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(block); for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) { if (storage.getState() != State.NORMAL) { continue; } final DatanodeDescriptor cur = storage.getDatanodeDescriptor(); if (storage.areBlockContentsStale()) { LOG.trace("BLOCK* processExtraRedundancyBlock: Postponing {}" + " since storage {} does not yet have up-to-date information.", block, storage); postponeBlock(block); return; } if (!isExcess(cur, block)) { if (cur.isInService()) { // exclude corrupt replicas if (corruptNodes == null || !corruptNodes.contains(cur)) { nonExcess.add(storage); } } } } chooseExcessRedundancies(nonExcess, block, replication, addedNode, delNodeHint); } private void chooseExcessRedundancies(final Collection<DatanodeStorageInfo> nonExcess, BlockInfo storedBlock, short replication, DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint) { assert namesystem.hasWriteLock(); // first form a rack to datanodes map and BlockCollection bc = getBlockCollection(storedBlock); if (storedBlock.isStriped()) { chooseExcessRedundancyStriped(bc, nonExcess, storedBlock, delNodeHint); } else { final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(bc.getStoragePolicyID()); final List<StorageType> excessTypes = storagePolicy.chooseExcess(replication, DatanodeStorageInfo.toStorageTypes(nonExcess)); chooseExcessRedundancyContiguous(nonExcess, storedBlock, replication, addedNode, delNodeHint, excessTypes); } } /** * We want sufficient redundancy for the 
block, but we now have too many. * In this method, copy enough nodes from 'srcNodes' into 'dstNodes' such that: * * srcNodes.size() - dstNodes.size() == replication * * We pick node that make sure that replicas are spread across racks and * also try hard to pick one with least free space. * The algorithm is first to pick a node with least free space from nodes * that are on a rack holding more than one replicas of the block. * So removing such a replica won't remove a rack. * If no such a node is available, * then pick a node with least free space */ private void chooseExcessRedundancyContiguous(final Collection<DatanodeStorageInfo> nonExcess, BlockInfo storedBlock, short replication, DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint, List<StorageType> excessTypes) { BlockPlacementPolicy replicator = placementPolicies.getPolicy(CONTIGUOUS); List<DatanodeStorageInfo> replicasToDelete = replicator.chooseReplicasToDelete(nonExcess, nonExcess, replication, excessTypes, addedNode, delNodeHint); for (DatanodeStorageInfo chosenReplica : replicasToDelete) { processChosenExcessRedundancy(nonExcess, chosenReplica, storedBlock); } } /** * We want block group has every internal block, but we have redundant * internal blocks (which have the same index). * In this method, we delete the redundant internal blocks until only one * left for each index. * * The block placement policy will make sure that the left internal blocks are * spread across racks and also try hard to pick one with least free space. */ private void chooseExcessRedundancyStriped(BlockCollection bc, final Collection<DatanodeStorageInfo> nonExcess, BlockInfo storedBlock, DatanodeDescriptor delNodeHint) { assert storedBlock instanceof BlockInfoStriped; BlockInfoStriped sblk = (BlockInfoStriped) storedBlock; short groupSize = sblk.getTotalBlockNum(); // find all duplicated indices BitSet found = new BitSet(groupSize); //indices found BitSet duplicated = new BitSet(groupSize); //indices found more than once HashMap<DatanodeStorageInfo, Integer> storage2index = new HashMap<>(); for (DatanodeStorageInfo storage : nonExcess) { int index = sblk.getStorageBlockIndex(storage); assert index >= 0; if (found.get(index)) { duplicated.set(index); } found.set(index); storage2index.put(storage, index); } // use delHint only if delHint is duplicated final DatanodeStorageInfo delStorageHint = DatanodeStorageInfo.getDatanodeStorageInfo(nonExcess, delNodeHint); if (delStorageHint != null) { Integer index = storage2index.get(delStorageHint); if (index != null && duplicated.get(index)) { processChosenExcessRedundancy(nonExcess, delStorageHint, storedBlock); } } // cardinality of found indicates the expected number of internal blocks final int numOfTarget = found.cardinality(); final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(bc.getStoragePolicyID()); final List<StorageType> excessTypes = storagePolicy.chooseExcess((short) numOfTarget, DatanodeStorageInfo.toStorageTypes(nonExcess)); if (excessTypes.isEmpty()) { LOG.warn("excess types chosen for block {} among storages {} is empty", storedBlock, nonExcess); return; } BlockPlacementPolicy placementPolicy = placementPolicies.getPolicy(STRIPED); // for each duplicated index, delete some replicas until only one left for (int targetIndex = duplicated.nextSetBit(0); targetIndex >= 0; targetIndex = duplicated .nextSetBit(targetIndex + 1)) { List<DatanodeStorageInfo> candidates = new ArrayList<>(); for (DatanodeStorageInfo storage : nonExcess) { int index = storage2index.get(storage); if 
(index == targetIndex) { candidates.add(storage); } } if (candidates.size() > 1) { List<DatanodeStorageInfo> replicasToDelete = placementPolicy.chooseReplicasToDelete(nonExcess, candidates, (short) 1, excessTypes, null, null); for (DatanodeStorageInfo chosen : replicasToDelete) { processChosenExcessRedundancy(nonExcess, chosen, storedBlock); candidates.remove(chosen); } } duplicated.clear(targetIndex); } } private void processChosenExcessRedundancy(final Collection<DatanodeStorageInfo> nonExcess, final DatanodeStorageInfo chosen, BlockInfo storedBlock) { nonExcess.remove(chosen); excessRedundancyMap.add(chosen.getDatanodeDescriptor(), storedBlock); // // The 'excessblocks' tracks blocks until we get confirmation // that the datanode has deleted them; the only way we remove them // is when we get a "removeBlock" message. // // The 'invalidate' list is used to inform the datanode the block // should be deleted. Items are removed from the invalidate list // upon giving instructions to the datanodes. // final Block blockToInvalidate = getBlockOnStorage(storedBlock, chosen); addToInvalidates(blockToInvalidate, chosen.getDatanodeDescriptor()); blockLog.debug("BLOCK* chooseExcessRedundancies: " + "({}, {}) is added to invalidated blocks set", chosen, storedBlock); } private void removeStoredBlock(DatanodeStorageInfo storageInfo, Block block, DatanodeDescriptor node) { if (shouldPostponeBlocksFromFuture && isGenStampInFuture(block)) { queueReportedBlock(storageInfo, block, null, QUEUE_REASON_FUTURE_GENSTAMP); return; } removeStoredBlock(getStoredBlock(block), node); } /** * Modify (block-->datanode) map. Possibly generate replication tasks, if the * removed block is still valid. */ public void removeStoredBlock(BlockInfo storedBlock, DatanodeDescriptor node) { blockLog.debug("BLOCK* removeStoredBlock: {} from {}", storedBlock, node); assert (namesystem.hasWriteLock()); { if (storedBlock == null || !blocksMap.removeNode(storedBlock, node)) { blockLog.debug("BLOCK* removeStoredBlock: {} has already been" + " removed from node {}", storedBlock, node); return; } CachedBlock cblock = namesystem.getCacheManager().getCachedBlocks() .get(new CachedBlock(storedBlock.getBlockId(), (short) 0, false)); if (cblock != null) { boolean removed = false; removed |= node.getPendingCached().remove(cblock); removed |= node.getCached().remove(cblock); removed |= node.getPendingUncached().remove(cblock); if (removed) { blockLog.debug( "BLOCK* removeStoredBlock: {} removed from caching " + "related lists on node {}", storedBlock, node); } } // // It's possible that the block was removed because of a datanode // failure. If the block is still valid, check if replication is // necessary. In that case, put block on a possibly-will- // be-replicated list. // if (!storedBlock.isDeleted()) { bmSafeMode.decrementSafeBlockCount(storedBlock); updateNeededReconstructions(storedBlock, -1, 0); } excessRedundancyMap.remove(node, storedBlock); corruptReplicas.removeFromCorruptReplicasMap(storedBlock, node); } } private void removeStaleReplicas(List<ReplicaUnderConstruction> staleReplicas, BlockInfo block) { for (ReplicaUnderConstruction r : staleReplicas) { removeStoredBlock(block, r.getExpectedStorageLocation().getDatanodeDescriptor()); NameNode.blockStateChangeLog.debug("BLOCK* Removing stale replica {}" + " of {}", r, Block.toString(r)); } } /** * Get all valid locations of the block & add the block to results * @return the length of the added block; 0 if the block is not added. 
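Stepping back to processChosenExcessRedundancy above: it maintains two structures with different lifetimes. The excess map keeps an entry until the datanode confirms the deletion, while the invalidate list only holds the pending delete instruction and is drained when commands are handed to the datanode. A simplified, JDK-only model of that bookkeeping (all names here are hypothetical):

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Simplified model of the excess-map / invalidate-list split described above.
final class ExcessBookkeepingSketch {
  // node -> blocks considered excess until that node reports them deleted
  private final Map<String, Set<Long>> excessByNode = new HashMap<>();
  // node -> delete instructions not yet handed out in a heartbeat reply
  private final Map<String, Set<Long>> pendingInvalidations = new HashMap<>();

  void chooseExcess(String nodeId, long blockId) {
    excessByNode.computeIfAbsent(nodeId, n -> new HashSet<>()).add(blockId);
    pendingInvalidations.computeIfAbsent(nodeId, n -> new HashSet<>()).add(blockId);
  }

  // Heartbeat reply: hand out the delete instructions and forget them here.
  Set<Long> drainInvalidationsFor(String nodeId) {
    Set<Long> out = pendingInvalidations.remove(nodeId);
    return out == null ? Collections.<Long>emptySet() : out;
  }

  // Incremental report said the block is gone, so it is no longer excess.
  void onReportedDeleted(String nodeId, long blockId) {
    Set<Long> blocks = excessByNode.get(nodeId);
    if (blocks != null) {
      blocks.remove(blockId);
    }
  }
}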
If the * added block is a block group, return its approximate internal block size */ private long addBlock(BlockInfo block, List<BlockWithLocations> results) { final List<DatanodeStorageInfo> locations = getValidLocations(block); if (locations.size() == 0) { return 0; } else { final String[] datanodeUuids = new String[locations.size()]; final String[] storageIDs = new String[datanodeUuids.length]; final StorageType[] storageTypes = new StorageType[datanodeUuids.length]; for (int i = 0; i < locations.size(); i++) { final DatanodeStorageInfo s = locations.get(i); datanodeUuids[i] = s.getDatanodeDescriptor().getDatanodeUuid(); storageIDs[i] = s.getStorageID(); storageTypes[i] = s.getStorageType(); } BlockWithLocations blkWithLocs = new BlockWithLocations(block, datanodeUuids, storageIDs, storageTypes); if (block.isStriped()) { BlockInfoStriped blockStriped = (BlockInfoStriped) block; byte[] indices = new byte[locations.size()]; for (int i = 0; i < locations.size(); i++) { indices[i] = (byte) blockStriped.getStorageBlockIndex(locations.get(i)); } results.add(new StripedBlockWithLocations(blkWithLocs, indices, blockStriped.getDataBlockNum(), blockStriped.getCellSize())); // approximate size return block.getNumBytes() / blockStriped.getDataBlockNum(); } else { results.add(blkWithLocs); return block.getNumBytes(); } } } /** * The given node is reporting that it received a certain block. */ @VisibleForTesting public void addBlock(DatanodeStorageInfo storageInfo, Block block, String delHint) throws IOException { DatanodeDescriptor node = storageInfo.getDatanodeDescriptor(); // Decrement number of blocks scheduled to this datanode. // for a retry request (of DatanodeProtocol#blockReceivedAndDeleted with // RECEIVED_BLOCK), we currently also decrease the approximate number. node.decrementBlocksScheduled(storageInfo.getStorageType()); // get the deletion hint node DatanodeDescriptor delHintNode = null; if (delHint != null && delHint.length() != 0) { delHintNode = datanodeManager.getDatanode(delHint); if (delHintNode == null) { blockLog.warn("BLOCK* blockReceived: {} is expected to be removed " + "from an unrecorded node {}", block, delHint); } } // // Modify the blocks->datanode map and node's map. // BlockInfo storedBlock = getStoredBlock(block); if (storedBlock != null && block.getGenerationStamp() == storedBlock.getGenerationStamp()) { if (pendingReconstruction.decrement(storedBlock, node)) { NameNode.getNameNodeMetrics().incSuccessfulReReplications(); } } processAndHandleReportedBlock(storageInfo, block, ReplicaState.FINALIZED, delHintNode); } private void processAndHandleReportedBlock(DatanodeStorageInfo storageInfo, Block block, ReplicaState reportedState, DatanodeDescriptor delHintNode) throws IOException { final DatanodeDescriptor node = storageInfo.getDatanodeDescriptor(); LOG.debug("Reported block {} on {} size {} replicaState = {}", block, node, block.getNumBytes(), reportedState); if (shouldPostponeBlocksFromFuture && isGenStampInFuture(block)) { queueReportedBlock(storageInfo, block, reportedState, QUEUE_REASON_FUTURE_GENSTAMP); return; } // find block by blockId BlockInfo storedBlock = getStoredBlock(block); if (storedBlock == null) { // If blocksMap does not contain reported block id, // the replica should be removed from the data-node. 
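For a striped group, the addBlock overload above (the one that fills in BlockWithLocations) reports an approximate per-location size by dividing the group length by its number of data blocks; a contiguous block simply reports its own length. A tiny numeric sketch, with an assumed 6-data-block layout purely for illustration:

// Illustrative arithmetic only; the real sizes come from BlockInfoStriped.
final class StripedSizeSketch {
  static long approximateInternalBlockSize(long groupNumBytes, int dataBlockNum) {
    return groupNumBytes / dataBlockNum;
  }

  public static void main(String[] args) {
    long groupBytes = 6L * 128 * 1024 * 1024;  // a hypothetical full group with 6 data blocks
    // Each internal block is then roughly 128 MB.
    System.out.println(approximateInternalBlockSize(groupBytes, 6)); // 134217728
  }
}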
blockLog.debug("BLOCK* addBlock: block {} on node {} size {} does not " + "belong to any file", block, node, block.getNumBytes()); addToInvalidates(new Block(block), node); return; } BlockUCState ucState = storedBlock.getBlockUCState(); // Block is on the NN LOG.debug("In memory blockUCState = {}", ucState); // Ignore replicas already scheduled to be removed from the DN if (invalidateBlocks.contains(node, block)) { return; } BlockToMarkCorrupt c = checkReplicaCorrupt(block, reportedState, storedBlock, ucState, node); if (c != null) { if (shouldPostponeBlocksFromFuture) { // If the block is an out-of-date generation stamp or state, // but we're the standby, we shouldn't treat it as corrupt, // but instead just queue it for later processing. // TODO: Pretty confident this should be s/storedBlock/block below, // since we should be postponing the info of the reported block, not // the stored block. See HDFS-6289 for more context. queueReportedBlock(storageInfo, storedBlock, reportedState, QUEUE_REASON_CORRUPT_STATE); } else { markBlockAsCorrupt(c, storageInfo, node); } return; } if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { addStoredBlockUnderConstruction(new StatefulBlockInfo(storedBlock, new Block(block), reportedState), storageInfo); return; } // Add replica if appropriate. If the replica was previously corrupt // but now okay, it might need to be updated. if (reportedState == ReplicaState.FINALIZED && (storedBlock.findStorageInfo(storageInfo) == -1 || corruptReplicas.isReplicaCorrupt(storedBlock, node))) { addStoredBlock(storedBlock, block, storageInfo, delHintNode, true); } } /** * The given node is reporting incremental information about some blocks. * This includes blocks that are starting to be received, completed being * received, or deleted. * * This method must be called with FSNamesystem lock held. */ public void processIncrementalBlockReport(final DatanodeID nodeID, final StorageReceivedDeletedBlocks srdb) throws IOException { assert namesystem.hasWriteLock(); final DatanodeDescriptor node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isRegistered()) { blockLog.warn("BLOCK* processIncrementalBlockReport" + " is received from dead or unregistered node {}", nodeID); throw new IOException("Got incremental block report from unregistered or dead node"); } boolean successful = false; try { processIncrementalBlockReport(node, srdb); successful = true; } finally { if (!successful) { node.setForceRegistration(true); } } } private void processIncrementalBlockReport(final DatanodeDescriptor node, final StorageReceivedDeletedBlocks srdb) throws IOException { DatanodeStorageInfo storageInfo = node.getStorageInfo(srdb.getStorage().getStorageID()); if (storageInfo == null) { // The DataNode is reporting an unknown storage. Usually the NN learns // about new storages from heartbeats but during NN restart we may // receive a block report or incremental report before the heartbeat. // We must handle this for protocol compatibility. This issue was // uncovered by HDFS-6094. 
storageInfo = node.updateStorage(srdb.getStorage()); } int received = 0; int deleted = 0; int receiving = 0; for (ReceivedDeletedBlockInfo rdbi : srdb.getBlocks()) { switch (rdbi.getStatus()) { case DELETED_BLOCK: removeStoredBlock(storageInfo, rdbi.getBlock(), node); deleted++; break; case RECEIVED_BLOCK: addBlock(storageInfo, rdbi.getBlock(), rdbi.getDelHints()); received++; break; case RECEIVING_BLOCK: receiving++; processAndHandleReportedBlock(storageInfo, rdbi.getBlock(), ReplicaState.RBW, null); break; default: String msg = "Unknown block status code reported by " + node + ": " + rdbi; blockLog.warn(msg); assert false : msg; // if assertions are enabled, throw. break; } blockLog.debug("BLOCK* block {}: {} is received from {}", rdbi.getStatus(), rdbi.getBlock(), node); } blockLog.debug("*BLOCK* NameNode.processIncrementalBlockReport: from " + "{} receiving: {}, received: {}, deleted: {}", node, receiving, received, deleted); } /** * Return the number of nodes hosting a given block, grouped * by the state of those replicas. * For a striped block, this includes nodes storing blocks belonging to the * striped block group. But note we exclude duplicated internal block replicas * for calculating {@link NumberReplicas#liveReplicas}. */ public NumberReplicas countNodes(BlockInfo b) { return countNodes(b, false); } NumberReplicas countNodes(BlockInfo b, boolean inStartupSafeMode) { NumberReplicas numberReplicas = new NumberReplicas(); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b); if (b.isStriped()) { countReplicasForStripedBlock(numberReplicas, (BlockInfoStriped) b, nodesCorrupt, inStartupSafeMode); } else { for (DatanodeStorageInfo storage : blocksMap.getStorages(b)) { checkReplicaOnStorage(numberReplicas, b, storage, nodesCorrupt, inStartupSafeMode); } } return numberReplicas; } private StoredReplicaState checkReplicaOnStorage(NumberReplicas counters, BlockInfo b, DatanodeStorageInfo storage, Collection<DatanodeDescriptor> nodesCorrupt, boolean inStartupSafeMode) { final StoredReplicaState s; if (storage.getState() == State.NORMAL) { final DatanodeDescriptor node = storage.getDatanodeDescriptor(); if (nodesCorrupt != null && nodesCorrupt.contains(node)) { s = StoredReplicaState.CORRUPT; } else if (inStartupSafeMode) { s = StoredReplicaState.LIVE; counters.add(s, 1); return s; } else if (node.isDecommissionInProgress()) { s = StoredReplicaState.DECOMMISSIONING; } else if (node.isDecommissioned()) { s = StoredReplicaState.DECOMMISSIONED; } else if (node.isMaintenance()) { if (node.isInMaintenance() || !node.isAlive()) { s = StoredReplicaState.MAINTENANCE_NOT_FOR_READ; } else { s = StoredReplicaState.MAINTENANCE_FOR_READ; } } else if (isExcess(node, b)) { s = StoredReplicaState.EXCESS; } else { s = StoredReplicaState.LIVE; } counters.add(s, 1); if (storage.areBlockContentsStale()) { counters.add(StoredReplicaState.STALESTORAGE, 1); } } else if (!inStartupSafeMode && storage.getState() == State.READ_ONLY_SHARED) { s = StoredReplicaState.READONLY; counters.add(s, 1); } else { s = null; } return s; } /** * For a striped block, it is possible it contains full number of internal * blocks (i.e., 9 by default), but with duplicated replicas of the same * internal block. E.g., for the following list of internal blocks * b0, b0, b1, b2, b3, b4, b5, b6, b7 * we have 9 internal blocks but we actually miss b8. * We should use this method to detect the above scenario and schedule * necessary reconstruction. 
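The scenario just described, nine reported storages but only eight distinct internal blocks, is detected by the method implemented right below: indices seen for the first time count as LIVE, repeats are shifted to REDUNDANT. A standalone sketch of that counting using plain int indices (names are hypothetical):

import java.util.BitSet;

// Mirrors the counting idea of countReplicasForStripedBlock with primitive types.
final class StripedLiveCountSketch {
  /** Returns {live, redundant} for the reported internal-block indices of one group. */
  static int[] countLiveAndRedundant(int[] reportedIndices, int totalBlockNum) {
    BitSet seen = new BitSet(totalBlockNum);
    int live = 0;
    int redundant = 0;
    for (int index : reportedIndices) {
      if (!seen.get(index)) {
        seen.set(index);
        live++;
      } else {
        redundant++;            // a duplicate of an index already counted as live
      }
    }
    return new int[] { live, redundant };
  }

  public static void main(String[] args) {
    // b0, b0, b1, b2, b3, b4, b5, b6, b7: nine replicas, but b8 is missing.
    int[] c = countLiveAndRedundant(new int[] {0, 0, 1, 2, 3, 4, 5, 6, 7}, 9);
    System.out.println(c[0] + " live, " + c[1] + " redundant"); // 8 live, 1 redundant
  }
}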
*/ private void countReplicasForStripedBlock(NumberReplicas counters, BlockInfoStriped block, Collection<DatanodeDescriptor> nodesCorrupt, boolean inStartupSafeMode) { BitSet bitSet = new BitSet(block.getTotalBlockNum()); for (StorageAndBlockIndex si : block.getStorageAndIndexInfos()) { StoredReplicaState state = checkReplicaOnStorage(counters, block, si.getStorage(), nodesCorrupt, inStartupSafeMode); if (state == StoredReplicaState.LIVE) { if (!bitSet.get(si.getBlockIndex())) { bitSet.set(si.getBlockIndex()); } else { counters.subtract(StoredReplicaState.LIVE, 1); counters.add(StoredReplicaState.REDUNDANT, 1); } } } } @VisibleForTesting int getExcessSize4Testing(String dnUuid) { return excessRedundancyMap.getSize4Testing(dnUuid); } public boolean isExcess(DatanodeDescriptor dn, BlockInfo blk) { return excessRedundancyMap.contains(dn, blk); } /** * Simpler, faster form of {@link #countNodes} that only returns the number * of live nodes. If in startup safemode (or its 30-sec extension period), * then it gains speed by ignoring issues of excess replicas or nodes * that are decommissioned or in process of becoming decommissioned. * If not in startup, then it calls {@link #countNodes} instead. * * @param b - the block being tested * @return count of live nodes for this block */ int countLiveNodes(BlockInfo b) { final boolean inStartupSafeMode = namesystem.isInStartupSafeMode(); return countNodes(b, inStartupSafeMode).liveReplicas(); } /** * On putting the node in service, check if the node has excess replicas. * If there are any excess replicas, call processExtraRedundancyBlock(). * Process extra redundancy blocks only when active NN is out of safe mode. */ void processExtraRedundancyBlocksOnInService(final DatanodeDescriptor srcNode) { if (!isPopulatingReplQueues()) { return; } final Iterator<BlockInfo> it = srcNode.getBlockIterator(); int numExtraRedundancy = 0; while (it.hasNext()) { final BlockInfo block = it.next(); if (block.isDeleted()) { //Orphan block, will be handled eventually, skip continue; } int expectedReplication = this.getExpectedRedundancyNum(block); NumberReplicas num = countNodes(block); if (shouldProcessExtraRedundancy(num, expectedReplication)) { // extra redundancy block processExtraRedundancyBlock(block, (short) expectedReplication, null, null); numExtraRedundancy++; } } LOG.info("Invalidated {} extra redundancy blocks on {} after " + "it is in service", numExtraRedundancy, srcNode); } /** * Returns whether a node can be safely decommissioned or in maintenance * based on its liveness. Dead nodes cannot always be safely decommissioned * or in maintenance. */ boolean isNodeHealthyForDecommissionOrMaintenance(DatanodeDescriptor node) { if (!node.checkBlockReportReceived()) { LOG.info("Node {} hasn't sent its first block report.", node); return false; } if (node.isAlive()) { return true; } updateState(); if (pendingReconstructionBlocksCount == 0 && lowRedundancyBlocksCount == 0) { LOG.info( "Node {} is dead and there are no low redundancy" + " blocks or blocks pending reconstruction. Safe to decommission or" + " put in maintenance.", node); return true; } LOG.warn( "Node {} is dead " + "while in {}. Cannot be safely " + "decommissioned or be in maintenance since there is risk of reduced " + "data durability or data loss. 
Either restart the failed node or " + "force decommissioning or maintenance by removing, calling " + "refreshNodes, then re-adding to the excludes or host config files.", node, node.getAdminState()); return false; } public int getActiveBlockCount() { return blocksMap.size(); } public DatanodeStorageInfo[] getStorages(BlockInfo block) { final DatanodeStorageInfo[] storages = new DatanodeStorageInfo[block.numNodes()]; int i = 0; for (DatanodeStorageInfo s : blocksMap.getStorages(block)) { storages[i++] = s; } return storages; } /** @return an iterator of the datanodes. */ public Iterable<DatanodeStorageInfo> getStorages(final Block block) { return blocksMap.getStorages(block); } public int getTotalBlocks() { return blocksMap.size(); } public void removeBlock(BlockInfo block) { assert namesystem.hasWriteLock(); // No need to ACK blocks that are being removed entirely // from the namespace, since the removal of the associated // file already removes them from the block map below. block.setNumBytes(BlockCommand.NO_ACK); addToInvalidates(block); removeBlockFromMap(block); // Remove the block from pendingReconstruction and neededReconstruction pendingReconstruction.remove(block); neededReconstruction.remove(block, LowRedundancyBlocks.LEVEL); postponedMisreplicatedBlocks.remove(block); } public BlockInfo getStoredBlock(Block block) { if (!BlockIdManager.isStripedBlockID(block.getBlockId())) { return blocksMap.getStoredBlock(block); } if (!hasNonEcBlockUsingStripedID) { return blocksMap.getStoredBlock(new Block(BlockIdManager.convertToStripedID(block.getBlockId()))); } BlockInfo info = blocksMap.getStoredBlock(block); if (info != null) { return info; } return blocksMap.getStoredBlock(new Block(BlockIdManager.convertToStripedID(block.getBlockId()))); } public void updateLastBlock(BlockInfo lastBlock, ExtendedBlock newBlock) { lastBlock.setNumBytes(newBlock.getNumBytes()); List<ReplicaUnderConstruction> staleReplicas = lastBlock .setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp()); removeStaleReplicas(staleReplicas, lastBlock); } /** updates a block in needed reconstruction queue. */ private void updateNeededReconstructions(final BlockInfo block, final int curReplicasDelta, int expectedReplicasDelta) { namesystem.writeLock(); try { if (!isPopulatingReplQueues() || !block.isComplete()) { return; } NumberReplicas repl = countNodes(block); int pendingNum = pendingReconstruction.getNumReplicas(block); int curExpectedReplicas = getExpectedRedundancyNum(block); if (!hasEnoughEffectiveReplicas(block, repl, pendingNum)) { neededReconstruction.update(block, repl.liveReplicas() + pendingNum, repl.readOnlyReplicas(), repl.outOfServiceReplicas(), curExpectedReplicas, curReplicasDelta, expectedReplicasDelta); } else { int oldReplicas = repl.liveReplicas() + pendingNum - curReplicasDelta; int oldExpectedReplicas = curExpectedReplicas - expectedReplicasDelta; neededReconstruction.remove(block, oldReplicas, repl.readOnlyReplicas(), repl.outOfServiceReplicas(), oldExpectedReplicas); } } finally { namesystem.writeUnlock(); } } /** * Check sufficient redundancy of the blocks in the collection. If any block * is needed reconstruction, insert it into the reconstruction queue. * Otherwise, if the block is more than the expected replication factor, * process it as an extra redundancy block. 
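Both updateNeededReconstructions above and the checkRedundancy method documented here compare the same effective count, live replicas plus reconstructions already pending, against the expected redundancy. A simplified decision sketch follows; it deliberately ignores the rack-placement and read-only cases of the real checks, and all names are hypothetical.

// Simplified decision table; the real logic also considers racks, read-only
// replicas and maintenance state.
final class RedundancyDecisionSketch {
  enum Action { RECONSTRUCT, TRIM_EXCESS, NONE }

  static Action decide(int live, int pending, int redundantInternal, int expected) {
    int effective = live + pending;
    if (effective < expected) {
      return Action.RECONSTRUCT;   // would go to the needed-reconstruction queue
    }
    if (live > expected || (live == expected && redundantInternal > 0)) {
      return Action.TRIM_EXCESS;   // would be processed as an extra-redundancy block
    }
    return Action.NONE;
  }

  public static void main(String[] args) {
    System.out.println(decide(2, 0, 0, 3)); // RECONSTRUCT
    System.out.println(decide(2, 1, 0, 3)); // NONE: a reconstruction is already pending
    System.out.println(decide(4, 0, 0, 3)); // TRIM_EXCESS
  }
}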
*/ public void checkRedundancy(BlockCollection bc) { for (BlockInfo block : bc.getBlocks()) { short expected = getExpectedRedundancyNum(block); final NumberReplicas n = countNodes(block); final int pending = pendingReconstruction.getNumReplicas(block); if (!hasEnoughEffectiveReplicas(block, n, pending)) { neededReconstruction.add(block, n.liveReplicas() + pending, n.readOnlyReplicas(), n.outOfServiceReplicas(), expected); } else if (shouldProcessExtraRedundancy(n, expected)) { processExtraRedundancyBlock(block, expected, null, null); } } } /** * Get blocks to invalidate for <i>nodeId</i> * in {@link #invalidateBlocks}. * * @return number of blocks scheduled for removal during this iteration. */ private int invalidateWorkForOneNode(DatanodeInfo dn) { final List<Block> toInvalidate; namesystem.writeLock(); try { // blocks should not be replicated or removed if safe mode is on if (namesystem.isInSafeMode()) { LOG.debug("In safemode, not computing reconstruction work"); return 0; } try { DatanodeDescriptor dnDescriptor = datanodeManager.getDatanode(dn); if (dnDescriptor == null) { LOG.warn("DataNode {} cannot be found with UUID {}" + ", removing block invalidation work.", dn, dn.getDatanodeUuid()); invalidateBlocks.remove(dn); return 0; } toInvalidate = invalidateBlocks.invalidateWork(dnDescriptor); if (toInvalidate == null) { return 0; } } catch (UnregisteredNodeException une) { return 0; } } finally { namesystem.writeUnlock(); } blockLog.debug("BLOCK* {}: ask {} to delete {}", getClass().getSimpleName(), dn, toInvalidate); return toInvalidate.size(); } @VisibleForTesting public boolean containsInvalidateBlock(final DatanodeInfo dn, final Block block) { return invalidateBlocks.contains(dn, block); } boolean isPlacementPolicySatisfied(BlockInfo storedBlock) { List<DatanodeDescriptor> liveNodes = new ArrayList<>(); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(storedBlock); for (DatanodeStorageInfo storage : blocksMap.getStorages(storedBlock)) { if (storage.getStorageType() == StorageType.PROVIDED && storage.getState() == State.NORMAL) { // assume the policy is satisfied for blocks on PROVIDED storage // as long as the storage is in normal state. return true; } final DatanodeDescriptor cur = getDatanodeDescriptorFromStorage(storage); // Nodes under maintenance should be counted as valid replicas from // rack policy point of view. if (!cur.isDecommissionInProgress() && !cur.isDecommissioned() && ((corruptNodes == null) || !corruptNodes.contains(cur))) { liveNodes.add(cur); } } DatanodeInfo[] locs = liveNodes.toArray(new DatanodeInfo[liveNodes.size()]); BlockType blockType = storedBlock.getBlockType(); BlockPlacementPolicy placementPolicy = placementPolicies.getPolicy(blockType); int numReplicas = blockType == STRIPED ? ((BlockInfoStriped) storedBlock).getRealTotalBlockNum() : storedBlock.getReplication(); return placementPolicy.verifyBlockPlacement(locs, numReplicas).isPlacementPolicySatisfied(); } boolean isNeededReconstructionForMaintenance(BlockInfo storedBlock, NumberReplicas numberReplicas) { return storedBlock.isComplete() && (numberReplicas.liveReplicas() < getMinMaintenanceStorageNum(storedBlock) || !isPlacementPolicySatisfied(storedBlock)); } boolean isNeededReconstruction(BlockInfo storedBlock, NumberReplicas numberReplicas) { return isNeededReconstruction(storedBlock, numberReplicas, 0); } /** * A block needs reconstruction if the number of redundancies is less than * expected or if it does not have enough racks. 
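The maintenance handling used by isNeededReconstructionForMaintenance above and getExpectedLiveRedundancyNum just below comes down to one formula: subtract maintenance replicas from the expected redundancy, but never drop below the minimum live storages required while nodes are in maintenance. A numeric sketch (the minimum of 1 is only an example value, not a statement about the configured default):

final class MaintenanceRedundancySketch {
  static int expectedLiveRedundancy(int expected, int maintenanceReplicas,
      int minMaintenanceStorages) {
    return Math.max(expected - maintenanceReplicas, minMaintenanceStorages);
  }

  public static void main(String[] args) {
    // Replication 3, one replica on a node entering maintenance:
    // only 2 live replicas are required while that node is away.
    System.out.println(expectedLiveRedundancy(3, 1, 1)); // 2
    // Every replica in maintenance: still keep at least the minimum live.
    System.out.println(expectedLiveRedundancy(3, 3, 1)); // 1
  }
}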
*/ boolean isNeededReconstruction(BlockInfo storedBlock, NumberReplicas numberReplicas, int pending) { return storedBlock.isComplete() && !hasEnoughEffectiveReplicas(storedBlock, numberReplicas, pending); } // Exclude maintenance, but make sure it has minimal live replicas // to satisfy the maintenance requirement. public short getExpectedLiveRedundancyNum(BlockInfo block, NumberReplicas numberReplicas) { final short expectedRedundancy = getExpectedRedundancyNum(block); return (short) Math.max(expectedRedundancy - numberReplicas.maintenanceReplicas(), getMinMaintenanceStorageNum(block)); } public short getExpectedRedundancyNum(BlockInfo block) { return block.isStriped() ? ((BlockInfoStriped) block).getRealTotalBlockNum() : block.getReplication(); } public long getMissingBlocksCount() { // not locking return this.neededReconstruction.getCorruptBlockSize(); } public long getMissingReplOneBlocksCount() { // not locking return this.neededReconstruction.getCorruptReplicationOneBlockSize(); } public long getHighestPriorityReplicatedBlockCount() { return this.neededReconstruction.getHighestPriorityReplicatedBlockCount(); } public long getHighestPriorityECBlockCount() { return this.neededReconstruction.getHighestPriorityECBlockCount(); } public BlockInfo addBlockCollection(BlockInfo block, BlockCollection bc) { return blocksMap.addBlockCollection(block, bc); } /** * Do some check when adding a block to blocksmap. * For HDFS-7994 to check whether then block is a NonEcBlockUsingStripedID. * */ public BlockInfo addBlockCollectionWithCheck(BlockInfo block, BlockCollection bc) { if (!hasNonEcBlockUsingStripedID && !block.isStriped() && BlockIdManager.isStripedBlockID(block.getBlockId())) { hasNonEcBlockUsingStripedID = true; } return addBlockCollection(block, bc); } BlockCollection getBlockCollection(BlockInfo b) { return namesystem.getBlockCollection(b.getBlockCollectionId()); } public int numCorruptReplicas(Block block) { return corruptReplicas.numCorruptReplicas(block); } public void removeBlockFromMap(BlockInfo block) { for (DatanodeStorageInfo info : blocksMap.getStorages(block)) { excessRedundancyMap.remove(info.getDatanodeDescriptor(), block); } blocksMap.removeBlock(block); // If block is removed from blocksMap remove it from corruptReplicasMap corruptReplicas.removeFromCorruptReplicasMap(block); } public int getCapacity() { return blocksMap.getCapacity(); } /** * Return an iterator over the set of blocks for which there are no replicas. */ public Iterator<BlockInfo> getCorruptReplicaBlockIterator() { return neededReconstruction.iterator(LowRedundancyBlocks.QUEUE_WITH_CORRUPT_BLOCKS); } /** * Get the replicas which are corrupt for a given block. */ public Collection<DatanodeDescriptor> getCorruptReplicas(Block block) { return corruptReplicas.getNodes(block); } /** * Get reason for certain corrupted replicas for a given block and a given dn. */ public String getCorruptReason(Block block, DatanodeDescriptor node) { return corruptReplicas.getCorruptReason(block, node); } /** @return the size of UnderReplicatedBlocks */ public int numOfUnderReplicatedBlocks() { return neededReconstruction.size(); } /** * Periodically calls computeBlockRecoveryWork(). */ private class RedundancyMonitor implements Runnable { @Override public void run() { while (namesystem.isRunning()) { try { // Process recovery work only when active NN is out of safe mode. 
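RedundancyMonitor, whose loop begins here, and the StorageInfoDefragmenter a little further on share one shape: do a round of work only when replication queues are being populated, sleep for the configured interval, treat interruption during shutdown as normal, and exit on any other unexpected Throwable. A JDK-only skeleton of that shape (hypothetical names; a plain RuntimeException stands in for the fatal terminate call):

// Skeleton of the periodic-monitor loop pattern; not the Hadoop classes themselves.
final class PeriodicMonitorSketch implements Runnable {
  private final long sleepMillis;
  private final Runnable work;
  private volatile boolean running = true;

  PeriodicMonitorSketch(long sleepMillis, Runnable work) {
    this.sleepMillis = sleepMillis;
    this.work = work;
  }

  void stop() {
    running = false;   // a real shutdown would also interrupt the thread
  }

  @Override
  public void run() {
    while (running) {
      try {
        work.run();
        Thread.sleep(sleepMillis);
      } catch (Throwable t) {
        if (!running) {
          return;      // shutting down: interruption here is expected
        }
        // The real monitors call terminate(1, t); a sketch can only rethrow.
        throw new RuntimeException("monitor failed", t);
      }
    }
  }
}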
if (isPopulatingReplQueues()) { computeDatanodeWork(); processPendingReconstructions(); rescanPostponedMisreplicatedBlocks(); } TimeUnit.MILLISECONDS.sleep(redundancyRecheckIntervalMs); } catch (Throwable t) { if (!namesystem.isRunning()) { LOG.info("Stopping RedundancyMonitor."); if (!(t instanceof InterruptedException)) { LOG.info("RedundancyMonitor received an exception" + " while shutting down.", t); } break; } else if (!checkNSRunning && t instanceof InterruptedException) { LOG.info("Stopping RedundancyMonitor for testing."); break; } LOG.error("RedundancyMonitor thread received Runtime exception. ", t); terminate(1, t); } } } } /** * Runnable that monitors the fragmentation of the StorageInfo TreeSet and * compacts it when it falls under a certain threshold. */ private class StorageInfoDefragmenter implements Runnable { @Override public void run() { while (namesystem.isRunning()) { try { // Check storage efficiency only when active NN is out of safe mode. if (isPopulatingReplQueues()) { scanAndCompactStorages(); } Thread.sleep(storageInfoDefragmentInterval); } catch (Throwable t) { if (!namesystem.isRunning()) { LOG.info("Stopping thread."); if (!(t instanceof InterruptedException)) { LOG.info("Received an exception while shutting down.", t); } break; } else if (!checkNSRunning && t instanceof InterruptedException) { LOG.info("Stopping for testing."); break; } LOG.error("Thread received Runtime exception.", t); terminate(1, t); } } } private void scanAndCompactStorages() throws InterruptedException { ArrayList<String> datanodesAndStorages = new ArrayList<>(); for (DatanodeDescriptor node : datanodeManager.getDatanodeListForReport(DatanodeReportType.ALL)) { for (DatanodeStorageInfo storage : node.getStorageInfos()) { try { namesystem.readLock(); double ratio = storage.treeSetFillRatio(); if (ratio < storageInfoDefragmentRatio) { datanodesAndStorages.add(node.getDatanodeUuid()); datanodesAndStorages.add(storage.getStorageID()); } LOG.debug("StorageInfo TreeSet fill ratio {} : {}{}", storage.getStorageID(), ratio, (ratio < storageInfoDefragmentRatio) ? " (queued for defragmentation)" : ""); } finally { namesystem.readUnlock(); } } } if (!datanodesAndStorages.isEmpty()) { for (int i = 0; i < datanodesAndStorages.size(); i += 2) { namesystem.writeLock(); try { final DatanodeDescriptor dn = datanodeManager.getDatanode(datanodesAndStorages.get(i)); if (dn == null) { continue; } final DatanodeStorageInfo storage = dn.getStorageInfo(datanodesAndStorages.get(i + 1)); if (storage != null) { boolean aborted = !storage.treeSetCompact(storageInfoDefragmentTimeout); if (aborted) { // Compaction timed out, reset iterator to continue with // the same storage next iteration. i -= 2; } LOG.info("StorageInfo TreeSet defragmented {} : {}{}", storage.getStorageID(), storage.treeSetFillRatio(), aborted ? " (aborted)" : ""); } } finally { namesystem.writeUnlock(); } // Wait between each iteration Thread.sleep(1000); } } } } /** * Compute block replication and block invalidation work that can be scheduled * on data-nodes. The datanode will be informed of this work at the next * heartbeat. * * @return number of blocks scheduled for replication or removal. */ int computeDatanodeWork() { // Blocks should not be replicated or removed if in safe mode. // It's OK to check safe mode here w/o holding lock, in the worst // case extra replications will be scheduled, and these will get // fixed up later. 
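computeDatanodeWork, whose body continues just below, sizes each round by the live datanode count: reconstruction scans up to liveNodes times a work multiplier blocks, and invalidation commands are spread over the ceiling of liveNodes times an invalidate-work percentage nodes. The multiplier 2 and percentage 0.32 in this sketch are illustrative values, not values read from the configuration:

final class DatanodeWorkSizingSketch {
  static int blocksToProcess(int liveNodes, int replWorkMultiplier) {
    return liveNodes * replWorkMultiplier;
  }

  static int nodesToProcess(int liveNodes, double invalidateWorkPct) {
    return (int) Math.ceil(liveNodes * invalidateWorkPct);
  }

  public static void main(String[] args) {
    int live = 100;
    // With the illustrative values 2 and 0.32, one monitor round schedules
    // reconstruction for up to 200 blocks and invalidation work on 32 nodes.
    System.out.println(blocksToProcess(live, 2));     // 200
    System.out.println(nodesToProcess(live, 0.32));   // 32
  }
}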
if (namesystem.isInSafeMode()) { return 0; } final int numlive = heartbeatManager.getLiveDatanodeCount(); final int blocksToProcess = numlive * this.blocksReplWorkMultiplier; final int nodesToProcess = (int) Math.ceil(numlive * this.blocksInvalidateWorkPct); int workFound = this.computeBlockReconstructionWork(blocksToProcess); // Update counters namesystem.writeLock(); try { this.updateState(); this.scheduledReplicationBlocksCount = workFound; } finally { namesystem.writeUnlock(); } workFound += this.computeInvalidateWork(nodesToProcess); return workFound; } /** * Clear all queues that hold decisions previously made by * this NameNode. */ public void clearQueues() { neededReconstruction.clear(); pendingReconstruction.clear(); excessRedundancyMap.clear(); invalidateBlocks.clear(); datanodeManager.clearPendingQueues(); postponedMisreplicatedBlocks.clear(); }; public static LocatedBlock newLocatedBlock(ExtendedBlock b, DatanodeStorageInfo[] storages, long startOffset, boolean corrupt) { // startOffset is unknown return new LocatedBlock(b, DatanodeStorageInfo.toDatanodeInfos(storages), DatanodeStorageInfo.toStorageIDs(storages), DatanodeStorageInfo.toStorageTypes(storages), startOffset, corrupt, null); } public static LocatedStripedBlock newLocatedStripedBlock(ExtendedBlock b, DatanodeStorageInfo[] storages, byte[] indices, long startOffset, boolean corrupt) { // startOffset is unknown return new LocatedStripedBlock(b, DatanodeStorageInfo.toDatanodeInfos(storages), DatanodeStorageInfo.toStorageIDs(storages), DatanodeStorageInfo.toStorageTypes(storages), indices, startOffset, corrupt, null); } public static LocatedBlock newLocatedBlock(ExtendedBlock eb, BlockInfo info, DatanodeStorageInfo[] locs, long offset) throws IOException { final LocatedBlock lb; if (info.isStriped()) { lb = newLocatedStripedBlock(eb, locs, info.getUnderConstructionFeature().getBlockIndices(), offset, false); } else { lb = newLocatedBlock(eb, locs, offset, false); } return lb; } /** * A simple result enum for the result of * {@link BlockManager#processMisReplicatedBlock(BlockInfo)}. */ enum MisReplicationResult { /** The block should be invalidated since it belongs to a deleted file. */ INVALID, /** The block is currently under-replicated. */ UNDER_REPLICATED, /** The block is currently over-replicated. */ OVER_REPLICATED, /** A decision can't currently be made about this block. */ POSTPONE, /** The block is under construction, so should be ignored. */ UNDER_CONSTRUCTION, /** The block is properly replicated. */ OK } public void shutdown() { stopReconstructionInitializer(); blocksMap.close(); MBeans.unregister(mxBeanName); mxBeanName = null; } public void clear() { blockIdManager.clear(); clearQueues(); blocksMap.clear(); } public BlockReportLeaseManager getBlockReportLeaseManager() { return blockReportLeaseManager; } @Override // BlockStatsMXBean public Map<StorageType, StorageTypeStats> getStorageTypeStats() { return datanodeManager.getDatanodeStatistics().getStorageTypeStats(); } /** * Initialize replication queues. 
*/ public void initializeReplQueues() { LOG.info("initializing replication queues"); processMisReplicatedBlocks(); initializedReplQueues = true; } /** * Check if replication queues are to be populated * @return true when node is HAState.Active and not in the very first safemode */ public boolean isPopulatingReplQueues() { if (!shouldPopulateReplQueues()) { return false; } return initializedReplQueues; } public void setInitializedReplQueues(boolean v) { this.initializedReplQueues = v; } public boolean shouldPopulateReplQueues() { HAContext haContext = namesystem.getHAContext(); if (haContext == null || haContext.getState() == null) return false; return haContext.getState().shouldPopulateReplQueues(); } boolean getShouldPostponeBlocksFromFuture() { return shouldPostponeBlocksFromFuture; } // async processing of an action, used for IBRs. public void enqueueBlockOp(final Runnable action) throws IOException { try { blockReportThread.enqueue(action); } catch (InterruptedException ie) { throw new IOException(ie); } } // sync batch processing for a full BR. public <T> T runBlockOp(final Callable<T> action) throws IOException { final FutureTask<T> future = new FutureTask<T>(action); enqueueBlockOp(future); try { return future.get(); } catch (ExecutionException ee) { Throwable cause = ee.getCause(); if (cause == null) { cause = ee; } if (!(cause instanceof IOException)) { cause = new IOException(cause); } throw (IOException) cause; } catch (InterruptedException ie) { Thread.currentThread().interrupt(); throw new IOException(ie); } } /** * Notification of a successful block recovery. * @param block for which the recovery succeeded */ public void successfulBlockRecovery(BlockInfo block) { pendingRecoveryBlocks.remove(block); } /** * Checks whether a recovery attempt has been made for the given block. * If so, checks whether that attempt has timed out. * @param b block for which recovery is being attempted * @return true if no recovery attempt has been made or * the previous attempt timed out */ public boolean addBlockRecoveryAttempt(BlockInfo b) { return pendingRecoveryBlocks.add(b); } @VisibleForTesting public void flushBlockOps() throws IOException { runBlockOp(new Callable<Void>() { @Override public Void call() { return null; } }); } public int getBlockOpQueueLength() { return blockReportThread.queue.size(); } private class BlockReportProcessingThread extends Thread { private static final long MAX_LOCK_HOLD_MS = 4; private long lastFull = 0; private final BlockingQueue<Runnable> queue = new ArrayBlockingQueue<Runnable>(1024); BlockReportProcessingThread() { super("Block report processor"); setDaemon(true); } @Override public void run() { try { processQueue(); } catch (Throwable t) { ExitUtil.terminate(1, getName() + " encountered fatal exception: " + t); } } private void processQueue() { while (namesystem.isRunning()) { NameNodeMetrics metrics = NameNode.getNameNodeMetrics(); try { Runnable action = queue.take(); // batch as many operations in the write lock until the queue // runs dry, or the max lock hold is reached. int processed = 0; namesystem.writeLock(); metrics.setBlockOpsQueued(queue.size() + 1); try { long start = Time.monotonicNow(); do { processed++; action.run(); if (Time.monotonicNow() - start > MAX_LOCK_HOLD_MS) { break; } action = queue.poll(); } while (action != null); } finally { namesystem.writeUnlock(); metrics.addBlockOpsBatched(processed - 1); } } catch (InterruptedException e) { // ignore unless thread was specifically interrupted. 
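The block report processor above drains its queue in batches: after taking one action it keeps polling for more, but gives the namesystem write lock back after at most MAX_LOCK_HOLD_MS so other operations are not starved. A JDK-only sketch of that drain loop, with a ReentrantLock standing in for the namesystem lock and hypothetical names elsewhere:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.locks.ReentrantLock;

// Sketch of the time-bounded batch drain used by the report processing thread.
final class BatchedQueueDrainSketch {
  private static final long MAX_LOCK_HOLD_MS = 4;   // same bound as above
  private final BlockingQueue<Runnable> queue = new ArrayBlockingQueue<>(1024);
  private final ReentrantLock lock = new ReentrantLock();

  void submit(Runnable action) throws InterruptedException {
    queue.put(action);
  }

  void processOnce() throws InterruptedException {
    Runnable action = queue.take();                  // block until there is work
    lock.lock();
    try {
      long start = System.nanoTime();
      do {
        action.run();
        if ((System.nanoTime() - start) / 1_000_000 > MAX_LOCK_HOLD_MS) {
          break;                                     // release the lock; the rest waits
        }
        action = queue.poll();                       // keep draining while it is cheap
      } while (action != null);
    } finally {
      lock.unlock();
    }
  }
}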
if (Thread.interrupted()) { break; } } } queue.clear(); } void enqueue(Runnable action) throws InterruptedException { if (!queue.offer(action)) { if (!isAlive() && namesystem.isRunning()) { ExitUtil.terminate(1, getName() + " is not running"); } long now = Time.monotonicNow(); if (now - lastFull > 4000) { lastFull = now; LOG.info("Block report queue is full"); } queue.put(action); } } } /** * @return redundancy thread. */ @VisibleForTesting Daemon getRedundancyThread() { return redundancyThread; } public BlockIdManager getBlockIdManager() { return blockIdManager; } public long nextGenerationStamp(boolean legacyBlock) throws IOException { return blockIdManager.nextGenerationStamp(legacyBlock); } public boolean isLegacyBlock(Block block) { return blockIdManager.isLegacyBlock(block); } public long nextBlockId(BlockType blockType) { return blockIdManager.nextBlockId(blockType); } boolean isGenStampInFuture(Block block) { return blockIdManager.isGenStampInFuture(block); } boolean isReplicaCorrupt(BlockInfo blk, DatanodeDescriptor d) { return corruptReplicas.isReplicaCorrupt(blk, d); } private int setBlockIndices(BlockInfo blk, byte[] blockIndices, int i, DatanodeStorageInfo storage) { // TODO this can be more efficient if (blockIndices != null) { byte index = ((BlockInfoStriped) blk).getStorageBlockIndex(storage); assert index >= 0; blockIndices[i++] = index; } return i; } private static long getBlockRecoveryTimeout(long heartbeatIntervalSecs) { return TimeUnit.SECONDS.toMillis(heartbeatIntervalSecs * BLOCK_RECOVERY_TIMEOUT_MULTIPLIER); } @VisibleForTesting public void setBlockRecoveryTimeout(long blockRecoveryTimeout) { pendingRecoveryBlocks.setRecoveryTimeoutInterval(blockRecoveryTimeout); } @VisibleForTesting public ProvidedStorageMap getProvidedStorageMap() { return providedStorageMap; } /** * Create SPS manager instance. It manages the user invoked sps paths and does * the movement. * * @param conf * configuration * @return true if the instance is successfully created, false otherwise. */ private boolean createSPSManager(final Configuration conf) { return createSPSManager(conf, null); } /** * Create SPS manager instance. It manages the user invoked sps paths and does * the movement. * * @param conf * configuration * @param spsMode * satisfier mode * @return true if the instance is successfully created, false otherwise. */ public boolean createSPSManager(final Configuration conf, final String spsMode) { // sps manager manages the user invoked sps paths and does the movement. // StoragePolicySatisfier(SPS) configs boolean storagePolicyEnabled = conf.getBoolean(DFSConfigKeys.DFS_STORAGE_POLICY_ENABLED_KEY, DFSConfigKeys.DFS_STORAGE_POLICY_ENABLED_DEFAULT); String modeVal = spsMode; if (org.apache.commons.lang3.StringUtils.isBlank(modeVal)) { modeVal = conf.get(DFSConfigKeys.DFS_STORAGE_POLICY_SATISFIER_MODE_KEY, DFSConfigKeys.DFS_STORAGE_POLICY_SATISFIER_MODE_DEFAULT); } StoragePolicySatisfierMode mode = StoragePolicySatisfierMode.fromString(modeVal); if (!storagePolicyEnabled || mode == StoragePolicySatisfierMode.NONE) { LOG.info("Storage policy satisfier is disabled"); return false; } spsManager = new StoragePolicySatisfyManager(conf, namesystem); return true; } /** * Nullify SPS manager as this feature is disabled fully. */ public void disableSPS() { spsManager = null; } /** * @return sps manager. */ public StoragePolicySatisfyManager getSPSManager() { return spsManager; } }
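Finally, createSPSManager above enables the storage policy satisfier only when storage policies are enabled and the effective mode is not NONE, with an explicitly passed mode taking precedence over the configured one. A small sketch of just that decision; the mode strings are illustrative stand-ins for the StoragePolicySatisfierMode values used in the code above:

// Illustrative decision helper; the real code parses StoragePolicySatisfierMode
// and reads the DFSConfigKeys shown above.
final class SpsEnablementSketch {
  static boolean shouldCreateSps(boolean storagePolicyEnabled,
      String configuredMode, String explicitMode) {
    String mode = (explicitMode == null || explicitMode.trim().isEmpty())
        ? configuredMode : explicitMode;
    return storagePolicyEnabled && !"none".equalsIgnoreCase(mode);
  }

  public static void main(String[] args) {
    System.out.println(shouldCreateSps(true, "external", null));   // true
    System.out.println(shouldCreateSps(true, "none", null));       // false
    System.out.println(shouldCreateSps(false, "external", null));  // false
  }
}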