org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor.java Source code


Introduction

Here is the source code for org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor.java. This class runs inside the NameNode: it scans the namesystem at startup and at a configurable interval, and schedules blocks to be cached on (or uncached from) DataNodes according to the active cache directives.
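
For orientation, the monitor is normally created and driven by the NameNode's CacheManager rather than used directly. The sketch below is not part of the Hadoop sources; it only illustrates the lifecycle visible in the listing (constructor, start(), setNeedsRescan(), waitForRescanIfNeeded()), assuming the caller already holds live FSNamesystem and CacheManager references and shares the ReentrantLock that serves as the CRM lock. The class name CrmUsageSketch is hypothetical.

import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
import org.apache.hadoop.hdfs.server.namenode.CacheManager;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;

public class CrmUsageSketch {

    /**
     * Wires up and starts a CacheReplicationMonitor, roughly mirroring what the
     * NameNode's CacheManager does internally. The caller supplies live
     * namesystem/cacheManager references and the shared CRM lock.
     */
    public static CacheReplicationMonitor startMonitor(FSNamesystem namesystem,
            CacheManager cacheManager, ReentrantLock crmLock) {
        long intervalMs = TimeUnit.SECONDS.toMillis(30); // example rescan interval
        CacheReplicationMonitor monitor =
                new CacheReplicationMonitor(namesystem, cacheManager, intervalMs, crmLock);
        monitor.start(); // it is a Thread; run() loops until close() is called
        return monitor;
    }

    /**
     * Flags pending CacheManager changes and blocks until the next rescan
     * completes. Both calls require the CRM lock to be held.
     */
    public static void requestRescan(CacheReplicationMonitor monitor, ReentrantLock crmLock) {
        crmLock.lock();
        try {
            monitor.setNeedsRescan();
            monitor.waitForRescanIfNeeded();
        } finally {
            crmLock.unlock();
        }
    }
}

Note that setNeedsRescan() and waitForRescanIfNeeded() both check that the CRM lock is held, and waitForRescanIfNeeded() additionally requires that the FSN write lock is not held, per the Preconditions checks in the source below.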

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.blockmanagement;

import static org.apache.hadoop.util.ExitUtil.terminate;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.CacheDirective;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.namenode.CacheManager;
import org.apache.hadoop.hdfs.server.namenode.CachePool;
import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.util.ReadOnlyList;
import org.apache.hadoop.util.GSet;
import org.apache.hadoop.util.Time;

import com.google.common.base.Preconditions;

/**
 * Scans the namesystem, scheduling blocks to be cached as appropriate.
 *
 * The CacheReplicationMonitor does a full scan when the NameNode first
 * starts up, and at configurable intervals afterwards.
 */
@InterfaceAudience.LimitedPrivate({ "HDFS" })
public class CacheReplicationMonitor extends Thread implements Closeable {

    private static final Log LOG = LogFactory.getLog(CacheReplicationMonitor.class);

    private final FSNamesystem namesystem;

    private final BlockManager blockManager;

    private final CacheManager cacheManager;

    private final GSet<CachedBlock, CachedBlock> cachedBlocks;

    /**
     * Pseudorandom number source
     */
    private static final Random random = new Random();

    /**
     * The interval at which we scan the namesystem for caching changes.
     */
    private final long intervalMs;

    /**
     * The CacheReplicationMonitor (CRM) lock. Used to synchronize starting and
     * waiting for rescan operations.
     */
    private final ReentrantLock lock;

    /**
     * Notifies the scan thread that an immediate rescan is needed.
     */
    private final Condition doRescan;

    /**
     * Notifies waiting threads that a rescan has finished.
     */
    private final Condition scanFinished;

    /**
     * Whether there are pending CacheManager operations that necessitate a
     * CacheReplicationMonitor rescan. Protected by the CRM lock.
     */
    private boolean needsRescan = true;

    /**
     * Whether we are currently doing a rescan. Protected by the CRM lock.
     */
    private boolean isScanning = false;

    /**
     * The number of rescans completed. Used to wait for scans to finish.
     * Protected by the CacheReplicationMonitor lock.
     */
    private long scanCount = 0;

    /**
     * True if this monitor should terminate. Protected by the CRM lock.
     */
    private boolean shutdown = false;

    /**
     * Mark status of the current scan.
     */
    private boolean mark = false;

    /**
     * Cache directives found in the previous scan.
     */
    private int scannedDirectives;

    /**
     * Blocks found in the previous scan.
     */
    private long scannedBlocks;

    public CacheReplicationMonitor(FSNamesystem namesystem, CacheManager cacheManager, long intervalMs,
            ReentrantLock lock) {
        this.namesystem = namesystem;
        this.blockManager = namesystem.getBlockManager();
        this.cacheManager = cacheManager;
        this.cachedBlocks = cacheManager.getCachedBlocks();
        this.intervalMs = intervalMs;
        this.lock = lock;
        this.doRescan = this.lock.newCondition();
        this.scanFinished = this.lock.newCondition();
    }

    @Override
    public void run() {
        long startTimeMs = 0;
        Thread.currentThread().setName("CacheReplicationMonitor(" + System.identityHashCode(this) + ")");
        LOG.info("Starting CacheReplicationMonitor with interval " + intervalMs + " milliseconds");
        try {
            long curTimeMs = Time.monotonicNow();
            while (true) {
                lock.lock();
                try {
                    while (true) {
                        if (shutdown) {
                            LOG.info("Shutting down CacheReplicationMonitor");
                            return;
                        }
                        if (needsRescan) {
                            LOG.info("Rescanning because of pending operations");
                            break;
                        }
                        long delta = (startTimeMs + intervalMs) - curTimeMs;
                        if (delta <= 0) {
                            LOG.info("Rescanning after " + (curTimeMs - startTimeMs) + " milliseconds");
                            break;
                        }
                        doRescan.await(delta, TimeUnit.MILLISECONDS);
                        curTimeMs = Time.monotonicNow();
                    }
                    isScanning = true;
                    needsRescan = false;
                } finally {
                    lock.unlock();
                }
                startTimeMs = curTimeMs;
                mark = !mark;
                rescan();
                curTimeMs = Time.monotonicNow();
                // Update synchronization-related variables.
                lock.lock();
                try {
                    isScanning = false;
                    scanCount++;
                    scanFinished.signalAll();
                } finally {
                    lock.unlock();
                }
                LOG.info("Scanned " + scannedDirectives + " directive(s) and " + scannedBlocks + " block(s) in "
                        + (curTimeMs - startTimeMs) + " " + "millisecond(s).");
            }
        } catch (InterruptedException e) {
            LOG.info("Shutting down CacheReplicationMonitor.");
            return;
        } catch (Throwable t) {
            LOG.fatal("Thread exiting", t);
            terminate(1, t);
        }
    }

    /**
     * Waits for a rescan to complete. This doesn't guarantee consistency with
     * pending operations, only relative recency, since it will not force a new
     * rescan if a rescan is already underway.
     * <p>
     * Note that this call will release the FSN lock, so operations before and
     * after are not atomic.
     */
    public void waitForRescanIfNeeded() {
        Preconditions.checkArgument(!namesystem.hasWriteLock(),
                "Must not hold the FSN write lock when waiting for a rescan.");
        Preconditions.checkArgument(lock.isHeldByCurrentThread(),
                "Must hold the CRM lock when waiting for a rescan.");
        if (!needsRescan) {
            return;
        }
        // If no scan is already ongoing, kick off a new rescan.
        if (!isScanning) {
            doRescan.signal();
        }
        // Wait until the scan finishes and the count advances
        final long startCount = scanCount;
        while ((!shutdown) && (startCount >= scanCount)) {
            try {
                scanFinished.await();
            } catch (InterruptedException e) {
                LOG.warn("Interrupted while waiting for CacheReplicationMonitor" + " rescan", e);
                break;
            }
        }
    }

    /**
     * Indicates to the CacheReplicationMonitor that there have been CacheManager
     * changes that require a rescan.
     */
    public void setNeedsRescan() {
        Preconditions.checkArgument(lock.isHeldByCurrentThread(),
                "Must hold the CRM lock when setting the needsRescan bit.");
        this.needsRescan = true;
    }

    /**
     * Shut down the monitor thread.
     */
    @Override
    public void close() throws IOException {
        Preconditions.checkArgument(namesystem.hasWriteLock());
        lock.lock();
        try {
            if (shutdown)
                return;
            // Since we hold both the FSN write lock and the CRM lock here,
            // we know that the CRM thread cannot be currently modifying
            // the cache manager state while we're closing it.
            // Since the CRM thread checks the value of 'shutdown' after waiting
            // for a lock, we know that the thread will not modify the cache
            // manager state after this point.
            shutdown = true;
            doRescan.signalAll();
            scanFinished.signalAll();
        } finally {
            lock.unlock();
        }
    }

    private void rescan() throws InterruptedException {
        scannedDirectives = 0;
        scannedBlocks = 0;
        namesystem.writeLock();
        try {
            if (shutdown) {
                throw new InterruptedException("CacheReplicationMonitor was " + "shut down.");
            }
            resetStatistics();
            rescanCacheDirectives();
            rescanCachedBlockMap();
            blockManager.getDatanodeManager().resetLastCachingDirectiveSentTime();
        } finally {
            namesystem.writeUnlock();
        }
    }

    private void resetStatistics() {
        for (CachePool pool : cacheManager.getCachePools()) {
            pool.resetStatistics();
        }
        for (CacheDirective directive : cacheManager.getCacheDirectives()) {
            directive.resetStatistics();
        }
    }

    /**
     * Scan all CacheDirectives.  Use the information to figure out
     * what cache replication factor each block should have.
     */
    private void rescanCacheDirectives() {
        FSDirectory fsDir = namesystem.getFSDirectory();
        final long now = new Date().getTime();
        for (CacheDirective directive : cacheManager.getCacheDirectives()) {
            scannedDirectives++;
            // Skip processing this entry if it has expired
            if (directive.getExpiryTime() > 0 && directive.getExpiryTime() <= now) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Directive " + directive.getId() + ": the directive " + "expired at "
                            + directive.getExpiryTime() + " (now = " + now + ")");
                }
                continue;
            }
            String path = directive.getPath();
            INode node;
            try {
                node = fsDir.getINode(path);
            } catch (UnresolvedLinkException e) {
                // We don't cache through symlinks
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Directive " + directive.getId()
                            + ": got UnresolvedLinkException while resolving path " + path);
                }
                continue;
            }
            if (node == null) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Directive " + directive.getId() + ": No inode found at " + path);
                }
            } else if (node.isDirectory()) {
                INodeDirectory dir = node.asDirectory();
                ReadOnlyList<INode> children = dir.getChildrenList(Snapshot.CURRENT_STATE_ID);
                for (INode child : children) {
                    if (child.isFile()) {
                        rescanFile(directive, child.asFile());
                    }
                }
            } else if (node.isFile()) {
                rescanFile(directive, node.asFile());
            } else {
                if (LOG.isDebugEnabled()) {
                    LOG.debug(
                            "Directive " + directive.getId() + ": ignoring non-directive, non-file inode " + node);
                }
            }
        }
    }

    /**
     * Apply a CacheDirective to a file.
     * 
     * @param directive The CacheDirective to apply.
     * @param file The file.
     */
    private void rescanFile(CacheDirective directive, INodeFile file) {
        BlockInfo[] blockInfos = file.getBlocks();

        // Increment the "needed" statistics
        directive.addFilesNeeded(1);
        // We don't cache under-construction (UC) blocks, so don't add them to the total here
        long neededTotal = file.computeFileSizeNotIncludingLastUcBlock() * directive.getReplication();
        directive.addBytesNeeded(neededTotal);

        // The pool's bytesNeeded is incremented as we scan. If the demand
        // thus far plus the demand of this file would exceed the pool's limit,
        // do not cache this file.
        CachePool pool = directive.getPool();
        if (pool.getBytesNeeded() > pool.getLimit()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format(
                        "Directive %d: not scanning file %s because "
                                + "bytesNeeded for pool %s is %d, but the pool's limit is %d",
                        directive.getId(), file.getFullPathName(), pool.getPoolName(), pool.getBytesNeeded(),
                        pool.getLimit()));
            }
            return;
        }

        long cachedTotal = 0;
        for (BlockInfo blockInfo : blockInfos) {
            if (!blockInfo.getBlockUCState().equals(BlockUCState.COMPLETE)) {
                // We don't try to cache blocks that are under construction.
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Directive " + directive.getId() + ": can't cache " + "block " + blockInfo
                            + " because it is in state " + blockInfo.getBlockUCState() + ", not COMPLETE.");
                }
                continue;
            }
            Block block = new Block(blockInfo.getBlockId());
            CachedBlock ncblock = new CachedBlock(block.getBlockId(), directive.getReplication(), mark);
            CachedBlock ocblock = cachedBlocks.get(ncblock);
            if (ocblock == null) {
                cachedBlocks.put(ncblock);
                ocblock = ncblock;
            } else {
                // Update bytesUsed using the current replication levels.
                // Assumptions: we assume that all the blocks are the same length
                // on each datanode.  We can assume this because we're only caching
                // blocks in state COMPLETE.
                // Note that if two directives are caching the same block(s), both
                // directives will count those blocks toward their bytesCached.
                List<DatanodeDescriptor> cachedOn = ocblock.getDatanodes(Type.CACHED);
                long cachedByBlock = Math.min(cachedOn.size(), directive.getReplication())
                        * blockInfo.getNumBytes();
                cachedTotal += cachedByBlock;

                if ((mark != ocblock.getMark()) || (ocblock.getReplication() < directive.getReplication())) {
                    //
                    // Overwrite the block's replication and mark in two cases:
                    //
                    // 1. If the mark on the CachedBlock is different from the mark for
                    // this scan, that means the block hasn't been updated during this
                    // scan, and we should overwrite whatever is there, since it is no
                    // longer valid.
                    //
                    // 2. If the replication in the CachedBlock is less than what the
                    // directive asks for, we want to increase the block's replication
                    // field to what the directive asks for.
                    //
                    ocblock.setReplicationAndMark(directive.getReplication(), mark);
                }
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("Directive " + directive.getId() + ": setting replication " + "for block " + blockInfo
                        + " to " + ocblock.getReplication());
            }
        }
        // Increment the "cached" statistics
        directive.addBytesCached(cachedTotal);
        if (cachedTotal == neededTotal) {
            directive.addFilesCached(1);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Directive " + directive.getId() + ": caching " + file.getFullPathName() + ": " + cachedTotal
                    + "/" + neededTotal + " bytes");
        }
    }

    private String findReasonForNotCaching(CachedBlock cblock, BlockInfo blockInfo) {
        if (blockInfo == null) {
            // Somehow, a cache report with the block arrived, but the block
            // reports from the DataNode haven't (yet?) described such a block.
            // Alternately, the NameNode might have invalidated the block, but the
            // DataNode hasn't caught up.  In any case, we want to tell the DN
            // to uncache this.
            return "not tracked by the BlockManager";
        } else if (!blockInfo.isComplete()) {
            // When a cached block changes state from complete to some other state
            // on the DataNode (perhaps because of append), it will begin the
            // uncaching process.  However, the uncaching process is not
            // instantaneous, especially if clients have pinned the block.  So
            // there may be a period of time when incomplete blocks remain cached
            // on the DataNodes.
            return "not complete";
        } else if (cblock.getReplication() == 0) {
            // Since 0 is not a valid value for a cache directive's replication
            // field, seeing a replication of 0 on a CachedBlock means that it
            // has never been reached by any sweep.
            return "not needed by any directives";
        } else if (cblock.getMark() != mark) {
            // Although the block was needed in the past, we didn't reach it during
            // the current sweep.  Therefore, it doesn't need to be cached any more.
            // Need to set the replication to 0 so it doesn't flip back to cached
            // when the mark flips on the next scan
            cblock.setReplicationAndMark((short) 0, mark);
            return "no longer needed by any directives";
        }
        return null;
    }

    /**
     * Scan through the cached block map.
     * Any blocks which are under-replicated should be assigned new Datanodes.
     * Blocks that are over-replicated should be removed from Datanodes.
     */
    private void rescanCachedBlockMap() {
        for (Iterator<CachedBlock> cbIter = cachedBlocks.iterator(); cbIter.hasNext();) {
            scannedBlocks++;
            CachedBlock cblock = cbIter.next();
            List<DatanodeDescriptor> pendingCached = cblock.getDatanodes(Type.PENDING_CACHED);
            List<DatanodeDescriptor> cached = cblock.getDatanodes(Type.CACHED);
            List<DatanodeDescriptor> pendingUncached = cblock.getDatanodes(Type.PENDING_UNCACHED);
            // Remove nodes from PENDING_UNCACHED if they were actually uncached.
            for (Iterator<DatanodeDescriptor> iter = pendingUncached.iterator(); iter.hasNext();) {
                DatanodeDescriptor datanode = iter.next();
                if (!cblock.isInList(datanode.getCached())) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Block " + cblock.getBlockId() + ": removing from " + "PENDING_UNCACHED for node "
                                + datanode.getDatanodeUuid() + "because the DataNode uncached it.");
                    }
                    datanode.getPendingUncached().remove(cblock);
                    iter.remove();
                }
            }
            BlockInfo blockInfo = blockManager.getStoredBlock(new Block(cblock.getBlockId()));
            String reason = findReasonForNotCaching(cblock, blockInfo);
            int neededCached = 0;
            if (reason != null) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Block " + cblock.getBlockId() + ": can't cache " + "block because it is " + reason);
                }
            } else {
                neededCached = cblock.getReplication();
            }
            int numCached = cached.size();
            if (numCached >= neededCached) {
                // If we have enough replicas, drop all pending cached.
                for (Iterator<DatanodeDescriptor> iter = pendingCached.iterator(); iter.hasNext();) {
                    DatanodeDescriptor datanode = iter.next();
                    datanode.getPendingCached().remove(cblock);
                    iter.remove();
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Block " + cblock.getBlockId() + ": removing from " + "PENDING_CACHED for node "
                                + datanode.getDatanodeUuid() + "because we already have " + numCached + " cached "
                                + "replicas and we only need " + neededCached);
                    }
                }
            }
            if (numCached < neededCached) {
                // If we don't have enough replicas, drop all pending uncached.
                for (Iterator<DatanodeDescriptor> iter = pendingUncached.iterator(); iter.hasNext();) {
                    DatanodeDescriptor datanode = iter.next();
                    datanode.getPendingUncached().remove(cblock);
                    iter.remove();
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Block " + cblock.getBlockId() + ": removing from " + "PENDING_UNCACHED for node "
                                + datanode.getDatanodeUuid() + "because we only have " + numCached
                                + " cached replicas " + "and we need " + neededCached);
                    }
                }
            }
            int neededUncached = numCached - (pendingUncached.size() + neededCached);
            if (neededUncached > 0) {
                addNewPendingUncached(neededUncached, cblock, cached, pendingUncached);
            } else {
                int additionalCachedNeeded = neededCached - (numCached + pendingCached.size());
                if (additionalCachedNeeded > 0) {
                    addNewPendingCached(additionalCachedNeeded, cblock, cached, pendingCached);
                }
            }
            if ((neededCached == 0) && pendingUncached.isEmpty() && pendingCached.isEmpty()) {
                // we have nothing more to do with this block.
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Block " + cblock.getBlockId() + ": removing from "
                            + "cachedBlocks, since neededCached == 0, and "
                            + "pendingUncached and pendingCached are empty.");
                }
                cbIter.remove();
            }
        }
    }

    /**
     * Add new entries to the PendingUncached list.
     *
     * @param neededUncached   The number of replicas that need to be uncached.
     * @param cachedBlock      The block which needs to be uncached.
     * @param cached           A list of DataNodes currently caching the block.
     * @param pendingUncached  A list of DataNodes that will soon uncache the
     *                         block.
     */
    private void addNewPendingUncached(int neededUncached, CachedBlock cachedBlock, List<DatanodeDescriptor> cached,
            List<DatanodeDescriptor> pendingUncached) {
        // Figure out which replicas can be uncached.
        LinkedList<DatanodeDescriptor> possibilities = new LinkedList<DatanodeDescriptor>();
        for (DatanodeDescriptor datanode : cached) {
            if (!pendingUncached.contains(datanode)) {
                possibilities.add(datanode);
            }
        }
        while (neededUncached > 0) {
            if (possibilities.isEmpty()) {
                LOG.warn("Logic error: we're trying to uncache more replicas than " + "actually exist for "
                        + cachedBlock);
                return;
            }
            DatanodeDescriptor datanode = possibilities.remove(random.nextInt(possibilities.size()));
            pendingUncached.add(datanode);
            boolean added = datanode.getPendingUncached().add(cachedBlock);
            assert added;
            neededUncached--;
        }
    }

    /**
     * Add new entries to the PendingCached list.
     *
     * @param neededCached     The number of replicas that need to be cached.
     * @param cachedBlock      The block which needs to be cached.
     * @param cached           A list of DataNodes currently caching the block.
     * @param pendingCached    A list of DataNodes that will soon cache the
     *                         block.
     */
    private void addNewPendingCached(final int neededCached, CachedBlock cachedBlock,
            List<DatanodeDescriptor> cached, List<DatanodeDescriptor> pendingCached) {
        // To figure out which replicas can be cached, we consult the
        // blocksMap.  We don't want to try to cache a corrupt replica, though.
        BlockInfo blockInfo = blockManager.getStoredBlock(new Block(cachedBlock.getBlockId()));
        if (blockInfo == null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Block " + cachedBlock.getBlockId() + ": can't add new "
                        + "cached replicas, because there is no record of this block " + "on the NameNode.");
            }
            return;
        }
        if (!blockInfo.isComplete()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Block " + cachedBlock.getBlockId() + ": can't cache this "
                        + "block, because it is not yet complete.");
            }
            return;
        }
        // Filter the list of replicas to only the valid targets
        List<DatanodeDescriptor> possibilities = new LinkedList<DatanodeDescriptor>();
        int numReplicas = blockInfo.getCapacity();
        Collection<DatanodeDescriptor> corrupt = blockManager.getCorruptReplicas(blockInfo);
        int outOfCapacity = 0;
        for (int i = 0; i < numReplicas; i++) {
            DatanodeDescriptor datanode = blockInfo.getDatanode(i);
            if (datanode == null) {
                continue;
            }
            if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) {
                continue;
            }
            if (corrupt != null && corrupt.contains(datanode)) {
                continue;
            }
            if (pendingCached.contains(datanode) || cached.contains(datanode)) {
                continue;
            }
            long pendingCapacity = datanode.getCacheRemaining();
            // Subtract pending cached blocks from effective capacity
            Iterator<CachedBlock> it = datanode.getPendingCached().iterator();
            while (it.hasNext()) {
                CachedBlock cBlock = it.next();
                BlockInfo info = blockManager.getStoredBlock(new Block(cBlock.getBlockId()));
                if (info != null) {
                    pendingCapacity -= info.getNumBytes();
                }
            }
            it = datanode.getPendingUncached().iterator();
            // Add pending uncached blocks back to effective capacity
            while (it.hasNext()) {
                CachedBlock cBlock = it.next();
                BlockInfo info = blockManager.getStoredBlock(new Block(cBlock.getBlockId()));
                if (info != null) {
                    pendingCapacity += info.getNumBytes();
                }
            }
            if (pendingCapacity < blockInfo.getNumBytes()) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Block " + blockInfo.getBlockId() + ": DataNode " + datanode.getDatanodeUuid()
                            + " is not a valid possibility " + "because the block has size "
                            + blockInfo.getNumBytes() + ", but " + "the DataNode only has "
                            + datanode.getCacheRemaining() + " " + "bytes of cache remaining.");
                }
                outOfCapacity++;
                continue;
            }
            possibilities.add(datanode);
        }
        List<DatanodeDescriptor> chosen = chooseDatanodesForCaching(possibilities, neededCached,
                blockManager.getDatanodeManager().getStaleInterval());
        for (DatanodeDescriptor datanode : chosen) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Block " + blockInfo.getBlockId() + ": added to " + "PENDING_CACHED on DataNode "
                        + datanode.getDatanodeUuid());
            }
            pendingCached.add(datanode);
            boolean added = datanode.getPendingCached().add(cachedBlock);
            assert added;
        }
        // We were unable to satisfy the requested replication factor
        if (neededCached > chosen.size()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Block " + blockInfo.getBlockId() + ": we only have "
                        + (cachedBlock.getReplication() - neededCached + chosen.size()) + " of "
                        + cachedBlock.getReplication() + " cached replicas.  " + outOfCapacity
                        + " DataNodes have insufficient cache capacity.");
            }
        }
    }

    /**
     * Chooses datanode locations for caching from a list of valid possibilities.
     * Non-stale nodes are chosen before stale nodes.
     * 
     * @param possibilities List of candidate datanodes
     * @param neededCached Number of replicas needed
     * @param staleInterval Age of a stale datanode
     * @return A list of chosen datanodes
     */
    private static List<DatanodeDescriptor> chooseDatanodesForCaching(final List<DatanodeDescriptor> possibilities,
            final int neededCached, final long staleInterval) {
        // Make a copy that we can modify
        List<DatanodeDescriptor> targets = new ArrayList<DatanodeDescriptor>(possibilities);
        // Selected targets
        List<DatanodeDescriptor> chosen = new LinkedList<DatanodeDescriptor>();

        // Filter out stale datanodes
        List<DatanodeDescriptor> stale = new LinkedList<DatanodeDescriptor>();
        Iterator<DatanodeDescriptor> it = targets.iterator();
        while (it.hasNext()) {
            DatanodeDescriptor d = it.next();
            if (d.isStale(staleInterval)) {
                it.remove();
                stale.add(d);
            }
        }
        // Select targets
        while (chosen.size() < neededCached) {
            // Try to use stale nodes if we're out of non-stale nodes, else we're done
            if (targets.isEmpty()) {
                if (!stale.isEmpty()) {
                    targets = stale;
                } else {
                    break;
                }
            }
            // Select a random target
            DatanodeDescriptor target = chooseRandomDatanodeByRemainingCapacity(targets);
            chosen.add(target);
            targets.remove(target);
        }
        return chosen;
    }

    /**
     * Choose a single datanode from the provided list of possible
     * targets, weighted by the percentage of free space remaining on the node.
     * 
     * @return The chosen datanode
     */
    private static DatanodeDescriptor chooseRandomDatanodeByRemainingCapacity(
            final List<DatanodeDescriptor> targets) {
        // Use a weighted probability to choose the target datanode
        float total = 0;
        for (DatanodeDescriptor d : targets) {
            total += d.getCacheRemainingPercent();
        }
        // Give each datanode a portion of keyspace equal to its relative weight
        // [0, w1) selects d1, [w1, w2) selects d2, etc.
        TreeMap<Integer, DatanodeDescriptor> lottery = new TreeMap<Integer, DatanodeDescriptor>();
        int offset = 0;
        for (DatanodeDescriptor d : targets) {
            // Since we're using floats, clamp to at least 1 so no candidate rounds down to zero weight
            int weight = Math.max(1, (int) ((d.getCacheRemainingPercent() / total) * 1000000));
            offset += weight;
            lottery.put(offset, d);
        }
        // Choose a number from [0, offset), which is the total amount of weight,
        // to select the winner
        DatanodeDescriptor winner = lottery.higherEntry(random.nextInt(offset)).getValue();
        return winner;
    }
}
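
The selection logic in chooseRandomDatanodeByRemainingCapacity is a weighted lottery built on a TreeMap: each candidate receives a span of the integer keyspace proportional to its remaining cache percentage, and higherEntry() maps a uniform random draw from [0, offset) back to the owner of the span containing it. The standalone sketch below demonstrates the same technique with plain weights in place of DatanodeDescriptors; the class and method names are illustrative only.

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;

public class WeightedLotterySketch {

    private static final Random random = new Random();

    /** Picks a key with probability proportional to its (positive) weight. */
    static <T> T pickWeighted(Map<T, Double> weights) {
        double total = 0;
        for (double w : weights.values()) {
            total += w;
        }
        // Lay the candidates end to end on [0, offset); higherEntry() later
        // finds the candidate whose span contains the random draw.
        TreeMap<Integer, T> lottery = new TreeMap<Integer, T>();
        int offset = 0;
        for (Map.Entry<T, Double> e : weights.entrySet()) {
            // Clamp to at least 1 so every candidate keeps a nonzero chance.
            int weight = Math.max(1, (int) ((e.getValue() / total) * 1000000));
            offset += weight;
            lottery.put(offset, e.getKey());
        }
        return lottery.higherEntry(random.nextInt(offset)).getValue();
    }

    public static void main(String[] args) {
        Map<String, Double> cacheRemainingPercent = new LinkedHashMap<String, Double>();
        cacheRemainingPercent.put("dn1", 80.0);
        cacheRemainingPercent.put("dn2", 15.0);
        cacheRemainingPercent.put("dn3", 5.0);
        // dn1 should win roughly 80% of the time.
        System.out.println(pickWeighted(cacheRemainingPercent));
    }
}

Clamping each weight to at least 1 mirrors the Math.max(1, ...) call in the listing above, which keeps every candidate selectable even when float rounding would otherwise produce a zero-width span.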