org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyConfigurable.java Source code


Introduction

Here is the source code for org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyConfigurable.java
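
BlockPlacementPolicyConfigurable extends BlockPlacementPolicyDefault. It arranges the cluster's racks and hosts into deterministic, hash-ordered rings and then restricts replica placement to a configurable window of racks and machines on those rings. The window sizes are read in initialize() from dfs.replication.rackwindow (default 2) and dfs.replication.machineWindow (default 5). The snippet below is a minimal, illustrative sketch of setting those keys; ConfigurablePolicySettings is a hypothetical example class, not part of Hadoop, and only the key names and defaults are taken from the source.

import org.apache.hadoop.conf.Configuration;

public class ConfigurablePolicySettings {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Key names and default values mirror initialize() in the source below.
        conf.setInt("dfs.replication.rackwindow", 2);    // racks searched ahead of the local rack on the ring
        conf.setInt("dfs.replication.machineWindow", 5); // machines searched within a chosen rack
        System.out.println("rackWindow=" + conf.getInt("dfs.replication.rackwindow", 2)
                + ", machineWindow=" + conf.getInt("dfs.replication.machineWindow", 5));
    }
}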

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.net.DNSToSwitchMapping;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.HostsFileReader;

public class BlockPlacementPolicyConfigurable extends BlockPlacementPolicyDefault {

    /**
     * RackRingInfo holds a rack's position in the ring of racks and the
     * internal ring of machines that belong to that rack.
     */
    protected class RackRingInfo {
        public int index;
        public List<String> rackNodes;
        public HashMap<String, Integer> rackNodesMap;

        public Integer findNode(DatanodeDescriptor node) {
            Integer retVal = rackNodesMap.get(node.getHostName());

            if (retVal == null) {
                retVal = rackNodesMap.get(node.getName());
                if (retVal == null) {
                    retVal = rackNodesMap.get(node.getHost());
                    if (retVal == null) {
                        LOG.info("Didn't find " + node.getHostName() + " - " + node.getName() + " - "
                                + node.getHost());
                    }
                }
            }
            return retVal;
        }
    }

    public static final Log LOG = LogFactory.getLog(BlockPlacementPolicyConfigurable.class);

    // a fair rw lock for racks and racksMap.
    ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock(true);
    protected List<String> racks; // Ring of racks
    protected HashMap<String, RackRingInfo> racksMap; //RackRingInfo map

    protected int rackWindow;
    protected int machineWindow;
    Random r = null;

    HostsFileReader hostsReader;
    DNSToSwitchMapping dnsToSwitchMapping;

    BlockPlacementPolicyConfigurable(Configuration conf, FSClusterStats stats, NetworkTopology clusterMap,
            HostsFileReader hostsReader, DNSToSwitchMapping dnsToSwitchMapping) {
        initialize(conf, stats, clusterMap, hostsReader, dnsToSwitchMapping, null);
    }

    BlockPlacementPolicyConfigurable() {
    }

    BlockPlacementPolicyConfigurable(long seed) {
        r = new Random(seed);
    }

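    /*
     * Orders strings by a pseudo-random key derived from each string's hashCode(),
     * so the ring order is deterministic for a given set of rack or host names.
     */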
    private class HashComparator implements Comparator<String> {
        Random rand = new Random();

        public int compare(String o1, String o2) {
            rand.setSeed(o1.hashCode());
            int hc1 = rand.nextInt();
            rand.setSeed(o2.hashCode());
            int hc2 = rand.nextInt();

            if (hc1 < hc2)
                return -1;
            if (hc1 > hc2)
                return 1;
            return 0;
        }
    }

    private void readLock() {
        rwLock.readLock().lock();
    }

    private void readUnlock() {
        rwLock.readLock().unlock();
    }

    private void writeLock() {
        rwLock.writeLock().lock();
    }

    private void writeUnlock() {
        rwLock.writeLock().unlock();
    }

    /**
     * The two comparators below are used to hash rack and host names when
     * forming the rings. They can be overridden to implement different strategies.
     */
    protected Comparator<String> rackComparator = new HashComparator();
    protected Comparator<String> hostComparator = new HashComparator();

    /** {@inheritDoc} */
    public void initialize(Configuration conf, FSClusterStats stats, NetworkTopology clusterMap,
            HostsFileReader hostsReader, DNSToSwitchMapping dnsToSwitchMapping, FSNamesystem ns) {
        super.initialize(conf, stats, clusterMap, hostsReader, dnsToSwitchMapping, ns);
        this.rackWindow = conf.getInt("dfs.replication.rackwindow", 2);
        this.machineWindow = conf.getInt("dfs.replication.machineWindow", 5);
        this.racks = new ArrayList<String>();
        this.hostsReader = hostsReader;
        this.dnsToSwitchMapping = dnsToSwitchMapping;
        hostsUpdated(true);
        if (r == null) {
            r = new Random();
        }
        LOG.info("BlockPlacementPolicyConfigurable initialized");
    }

    /** {@inheritDoc} */
    public void hostsUpdated() {
        hostsUpdated(false);
    }

    public void hostsUpdated(boolean startup) {
        List<String> hostsIn = new ArrayList<String>(hostsReader.getHosts());
        List<String> hostsRacks = dnsToSwitchMapping.resolve(hostsIn);
        HashMap<String, RackRingInfo> tempRacksMap = new HashMap<String, RackRingInfo>();
        List<String> tempRacks = new ArrayList<String>();

        int index = hostsRacks.indexOf(NetworkTopology.DEFAULT_RACK);
        if (index != -1) {
            if (!startup) {
                throw new DefaultRackException(
                        "Could not resolve rack for : " + hostsIn.get(index) + " probably due to a DNS issue");
            } else {
                // We do not want to abort startup, just remove the bad datanode.
                for (int i = 0; i < hostsRacks.size(); i++) {
                    if (hostsRacks.get(i).equals(NetworkTopology.DEFAULT_RACK)) {
                        LOG.warn("Could not resolve rack for : " + hostsIn.get(i)
                                + " probably due to a DNS issue, removing" + " the host since we are in startup");
                        hostsRacks.remove(i);
                        hostsReader.getHosts().remove(hostsIn.get(i));
                        hostsIn.remove(i);
                        i--;
                    }
                }
            }
        }

        for (int i = 0; i < hostsIn.size(); i++) {
            String host = hostsIn.get(i);
            String rack = hostsRacks.get(i);

            RackRingInfo rackinfo = tempRacksMap.get(rack);
            if (rackinfo == null) {
                LOG.info("Adding rack:" + rack);
                tempRacks.add(rack);
                rackinfo = new RackRingInfo();
                rackinfo.rackNodes = new ArrayList<String>();
                tempRacksMap.put(rack, rackinfo);
            }
            LOG.info("Adding host:" + host);
            rackinfo.rackNodes.add(host);
        }

        Collections.sort(tempRacks, rackComparator);

        StringBuffer ringRep = new StringBuffer("\nRing Topology:\n");
        for (int i = 0; i < tempRacks.size(); i++) {
            RackRingInfo rackinfo = tempRacksMap.get(tempRacks.get(i));
            rackinfo.index = i;
            List<String> rackNodes = rackinfo.rackNodes;
            HashMap<String, Integer> nodesMap = new HashMap<String, Integer>();
            rackinfo.rackNodesMap = nodesMap;

            ringRep.append("\tRing " + i + ": " + tempRacks.get(i) + "\n");

            Collections.sort(rackNodes, hostComparator);
            for (int j = 0; j < rackNodes.size(); j++) {
                ringRep.append("\t\t" + j + ": " + rackNodes.get(j) + "\n");
                nodesMap.put(rackNodes.get(j), j);
            }
        }
        LOG.info(ringRep.toString());
        // Update both data structures together under the write lock.
        writeLock();
        racksMap = tempRacksMap;
        racks = tempRacks;
        writeUnlock();
    }

    /**
     * Returns a random index within a modular window of size windowSize that
     * starts at begin (all arithmetic is modulo n), skipping any index in
     * excludeSet. Returns -1 if no index in the window is available.
     */
    protected int randomIntInWindow(int begin, int windowSize, int n, Set<Integer> excludeSet) {
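        // Example: begin = 3, windowSize = 4, n = 10, excludeSet = {4}
        // => the window is {3, 4, 5, 6} and the result is drawn uniformly from {3, 5, 6}.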

        final int size = Math.min(windowSize, n);

        if (size <= 0) {
            return -1;
        }

        int adjustment = 0;
        for (Integer v : excludeSet) {
            int vindex = (v.intValue() - begin + n) % n;
            if (vindex < size) {
                adjustment++; // calculates excluded elements within window
            }
        }

        if (adjustment >= size) {
            return -1;
        }

        int rindex = r.nextInt(size - adjustment); // ith element is chosen

        int iterator = begin;
        for (int i = 0; i <= rindex; i++) {
            while (excludeSet.contains(iterator)) {
                iterator = (iterator + 1) % n;
            }
            if (i != rindex) {
                iterator = (iterator + 1) % n;
            }
        }

        return iterator;
    }

    /**
     * This method is currently used only for re-replication and should be used
     * only for that purpose. Using it for normal block placement would
     * completely break this placement policy.
     */
    @Override
    public DatanodeDescriptor[] chooseTarget(FSInodeInfo srcInode, int numOfReplicas, DatanodeDescriptor writer,
            List<DatanodeDescriptor> chosenNodes, List<Node> excludesNodes, long blocksize) {
        if (numOfReplicas == 0 || clusterMap.getNumOfLeaves() == 0) {
            return new DatanodeDescriptor[0];
        }

        int[] result = getActualReplicas(numOfReplicas, chosenNodes);
        numOfReplicas = result[0];
        int maxNodesPerRack = result[1];

        HashMap<Node, Node> excludedNodes = new HashMap<Node, Node>();
        List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>(chosenNodes.size() + numOfReplicas);

        updateExcludedAndChosen(null, excludedNodes, results, chosenNodes);

        if (!clusterMap.contains(writer)) {
            writer = null;
        }

        DatanodeDescriptor localNode = super.chooseTarget(numOfReplicas, writer, excludedNodes, blocksize,
                maxNodesPerRack, results, chosenNodes.isEmpty());

        return this.finalizeTargets(results, chosenNodes, writer, localNode);
    }

    /* choose <i>numOfReplicas</i> from all data nodes */
    @Override
    protected DatanodeDescriptor chooseTarget(int numOfReplicas, DatanodeDescriptor writer,
            HashMap<Node, Node> excludedNodes, long blocksize, int maxNodesPerRack,
            List<DatanodeDescriptor> results, boolean newBlock) {

        if (numOfReplicas == 0 || clusterMap.getNumOfLeaves() == 0) {
            return writer;
        }

        int numOfResults = results.size();
        if (writer == null && !newBlock) {
            writer = results.get(0);
        }

        try {
            if (numOfResults == 0) {
                writer = chooseLocalNode(writer, excludedNodes, blocksize, maxNodesPerRack, results);
                if (--numOfReplicas == 0) {
                    return writer;
                }
            }
            if (numOfResults <= 1) {
                // If we have a replication factor of 2, place both replicas on the
                // same rack.
                if (numOfReplicas == 1) {
                    chooseLocalRack(results.get(0), excludedNodes, blocksize, maxNodesPerRack, results);
                } else {
                    chooseFirstInRemoteRack(results.get(0), excludedNodes, blocksize, maxNodesPerRack, results);
                }
                if (--numOfReplicas == 0) {
                    return writer;
                }
            }
            chooseRemainingReplicas(numOfReplicas, excludedNodes, blocksize, maxNodesPerRack, results);
        } catch (NotEnoughReplicasException e) {
            LOG.warn("Not able to place enough replicas, still in need of " + numOfReplicas);
        }
        return writer;
    }

    /**
     * Chooses the node for the first replica that is stored in a remote rack.
     * @param localMachine local machine that is writing the data
     * @param excludedNodes nodes that should not be considered
     * @param blocksize size of blocks
     * @param maxReplicasPerRack maximum replicas per rack
     * @param results datanodes used for replicas
     * @throws NotEnoughReplicasException
     */
    protected void chooseFirstInRemoteRack(DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes,
            long blocksize, int maxReplicasPerRack, List<DatanodeDescriptor> results)
            throws NotEnoughReplicasException {
        readLock();
        try {
            RackRingInfo rackInfo = racksMap.get(localMachine.getNetworkLocation());
            assert (rackInfo != null);

            Integer machineId = rackInfo.findNode(localMachine);
            assert (machineId != null);

            if (!chooseRemoteRack(rackInfo.index, rackInfo.index, rackWindow + 1, machineId, machineWindow,
                    excludedNodes, blocksize, maxReplicasPerRack, results, false)) {
                LOG.info("Couldn't find a Datanode within node group. " + "Resorting to default policy.");
                super.chooseRemoteRack(1, localMachine, excludedNodes, blocksize, maxReplicasPerRack, results);
            }
        } finally {
            readUnlock();
        }
    }

    /**
     * Returns the best possible match of nodes to the first three replicas in
     * the current replication scheme:
     * result[0] = first replica or null
     * result[1] = second replica or null
     * result[2] = third replica or null
     */
    protected DatanodeDescriptor[] findBest(List<DatanodeDescriptor> listOfNodes) {

        DatanodeDescriptor[] result = new DatanodeDescriptor[3];

        result[0] = listOfNodes.isEmpty() ? null : listOfNodes.get(0);
        result[1] = null;
        result[2] = null;

        for (DatanodeDescriptor n : listOfNodes) {
            findBestWithFirst(n, listOfNodes, result);
            if (result[2] != null)
                return result;
        }

        if (result[1] == null && listOfNodes.size() > 1) {
            findBestWithoutFirst(listOfNodes, result);
        }

        return result;
    }

    /**
     * Function that finds the best partial triple including a first replica
     * @param first first replica to be used
     * @param listOfNodes nodes to choose from
     * @param result array that stores results
     */
    private void findBestWithFirst(DatanodeDescriptor first, List<DatanodeDescriptor> listOfNodes,
            DatanodeDescriptor[] result) {
        for (int in2 = 0; in2 < listOfNodes.size(); in2++) {
            DatanodeDescriptor n2 = listOfNodes.get(in2);
            if (!first.equals(n2)) {
                if (result[1] == null && inWindow(first, n2)) {
                    result[0] = first;
                    result[1] = n2;
                }

                for (int in3 = in2 + 1; in3 < listOfNodes.size(); in3++) {
                    DatanodeDescriptor n3 = listOfNodes.get(in3);
                    if (!first.equals(n3) && inWindow(first, n3, n2)) {
                        result[0] = first;
                        result[1] = n2;
                        result[2] = n3;
                        return;
                    }
                }
            }
        }
    }

    /**
     * Verifies whether the testing node is within the rack and machine windows
     * of the first node.
     * @param first first node being considered
     * @param testing node we are testing to check whether it is within the window
     * @return true if testing is within the window of first, false otherwise
     */
    private boolean inWindow(DatanodeDescriptor first, DatanodeDescriptor testing) {

        readLock();
        try {
            RackRingInfo rackInfo = racksMap.get(first.getNetworkLocation());
            assert (rackInfo != null);

            Integer machineId = rackInfo.findNode(first);
            assert (machineId != null);

            final int rackWindowStart = rackInfo.index;

            final RackRingInfo rackTest = racksMap.get(testing.getNetworkLocation());
            assert (rackTest != null);

            final int rackDist = (rackTest.index - rackWindowStart + racks.size()) % racks.size();
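            // The testing node's rack must be a different rack that lies within
            // rackWindow positions ahead of the first node's rack on the rack ring.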

            if (rackDist < rackWindow + 1 && rackTest.index != rackInfo.index) {
                // inside rack window
                final Integer idFirst = rackInfo.findNode(first);
                assert (idFirst != null);

                final int sizeFirstRack = rackInfo.rackNodes.size();
                final int sizeTestRack = rackTest.rackNodes.size();

                final int start = idFirst * sizeTestRack / sizeFirstRack;
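                // start maps the first node's position proportionally onto the testing
                // rack's ring; the test node must fall within machineWindow of that point.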
                final Integer idTest = rackTest.findNode(testing);
                assert (idTest != null);

                final int dist = (idTest - start + sizeTestRack) % sizeTestRack;

                if (dist < machineWindow) { // inside machine Window
                    return true;
                }
            }

            return false;
        } finally {
            readUnlock();
        }
    }

    /**
     * Verifies whether both testing nodes are within the rack and machine
     * windows of the first node.
     * @param first first node being considered
     * @param testing1 first node we are testing to check whether it is within the window
     * @param testing2 second node we are testing to check whether it is within the window
     * @return true if both testing nodes are within the window of first, false otherwise
     */
    private boolean inWindow(DatanodeDescriptor first, DatanodeDescriptor testing1, DatanodeDescriptor testing2) {

        readLock();
        try {
            if (!testing1.getNetworkLocation().equals(testing2.getNetworkLocation())) {
                return false;
            }

            RackRingInfo rackInfo = racksMap.get(first.getNetworkLocation());
            assert (rackInfo != null);

            Integer machineId = rackInfo.findNode(first);
            assert (machineId != null);

            final int rackWindowStart = rackInfo.index;

            final RackRingInfo rackTest = racksMap.get(testing1.getNetworkLocation());
            assert (rackTest != null);

            final int rackDist = (rackTest.index - rackWindowStart + racks.size()) % racks.size();

            if (rackDist < rackWindow + 1 && rackTest.index != rackInfo.index) {
                // inside rack window

                final int rackSize = rackTest.rackNodes.size();
                Integer idN2 = rackTest.findNode(testing1);
                assert (idN2 != null);
                Integer idN3 = rackTest.findNode(testing2);
                assert (idN3 != null);

                final Integer idFirst = rackInfo.findNode(first);
                assert (idFirst != null);

                final int sizeFirstRack = rackInfo.rackNodes.size();

                final int end = idFirst * rackSize / sizeFirstRack;

                // proportional to previous of idFirst
                final int prevIdFirst = (idFirst + sizeFirstRack - 1) % sizeFirstRack;
                int start = (prevIdFirst * rackSize / sizeFirstRack);

                int distPropWindow = (end - start + rackSize) % rackSize;

                if (distPropWindow > 0) {
                    start = (start + 1) % rackSize;
                    distPropWindow--;
                }

                int distIdN2 = (idN2 - start + rackSize) % rackSize;
                int distIdN3 = (idN3 - start + rackSize) % rackSize;

                int distN3N2 = (idN3 - idN2 + rackSize) % rackSize;
                int distN2N3 = (idN2 - idN3 + rackSize) % rackSize;

                if (distIdN2 <= distPropWindow && distN3N2 < machineWindow)
                    return true;

                if (distIdN3 <= distPropWindow && distN2N3 < machineWindow)
                    return true;

            }

            return false;
        } finally {
            readUnlock();
        }
    }

    /**
     * Finds the best pair of nodes on the same rack when no match that includes
     * the first replica was found.
     * @param listOfNodes Datanodes to choose from
     * @param result Array containing results
     */
    private void findBestWithoutFirst(List<DatanodeDescriptor> listOfNodes, DatanodeDescriptor[] result) {

        readLock();
        try {
            for (int in2 = 0; in2 < listOfNodes.size(); in2++) {
                DatanodeDescriptor n2 = listOfNodes.get(in2);

                for (int in3 = in2 + 1; in3 < listOfNodes.size(); in3++) {
                    DatanodeDescriptor n3 = listOfNodes.get(in3);
                    if (n2.getNetworkLocation().equals(n3.getNetworkLocation())) {
                        RackRingInfo rackInfo = racksMap.get(n2.getNetworkLocation());
                        assert (rackInfo != null);
                        final int rackSize = rackInfo.rackNodes.size();
                        final Integer idN2 = rackInfo.findNode(n2);
                        final Integer idN3 = rackInfo.findNode(n3);

                        if (idN2 != null && idN3 != null) {
                            int dist = (idN3 - idN2 + rackSize) % rackSize;
                            if (dist >= machineWindow) {
                                dist = rackSize - dist; // try n2 - n3
                            }

                            if (dist < machineWindow) {
                                result[0] = null;
                                result[1] = n2;
                                result[2] = n3;
                                return;
                            }
                        }
                    }
                }
            }
        } finally {
            readUnlock();
        }
    }

    /**
     * Chooses the remaining replicas after the first ones have been allocated.
     * @param numOfReplicas number of additional replicas still needed
     * @param excludedNodes Nodes that we should not consider
     * @param blocksize size of blocks
     * @param maxReplicasPerRack maximum number of replicas per rack
     * @param results list containing the replicas chosen so far
     * @throws NotEnoughReplicasException
     */
    protected void chooseRemainingReplicas(int numOfReplicas, HashMap<Node, Node> excludedNodes, long blocksize,
            int maxReplicasPerRack, List<DatanodeDescriptor> results) throws NotEnoughReplicasException {
        readLock();
        try {

            if (numOfReplicas <= 0) {
                return;
            }

            DatanodeDescriptor[] bestmatch = findBest(results);

            if (bestmatch[0] != null) { // there is a first replica: 1,X,X

                excludedNodes.put(bestmatch[0], bestmatch[0]);

                if (bestmatch[1] == null) { // there is no second replica: 1,0,0

                    chooseFirstInRemoteRack(bestmatch[0], excludedNodes, blocksize, maxReplicasPerRack, results); // pick up second
                    numOfReplicas--;

                    // now search for the rest recursively
                    chooseRemainingReplicas(numOfReplicas, excludedNodes, blocksize, maxReplicasPerRack, results);
                    return;

                } else if (bestmatch[2] == null) { // no third replica: 1,1,0

                    // find the third one
                    excludedNodes.put(bestmatch[1], bestmatch[1]);
                    RackRingInfo rack0 = racksMap.get(bestmatch[0].getNetworkLocation());
                    RackRingInfo rack1 = racksMap.get(bestmatch[1].getNetworkLocation());
                    int posR0 = rack0.findNode(bestmatch[0]);
                    int firstMachine = posR0 * rack1.rackNodes.size() / rack0.rackNodes.size();
                    if (!chooseMachine(bestmatch[1].getNetworkLocation(), firstMachine, machineWindow,
                            excludedNodes, blocksize, maxReplicasPerRack, results)) {
                        // if we can't get it in this rack, try a different one
                        LOG.info("Couldn't find 3rd Datanode on the same rack as 2nd. "
                                + "Resorting to a different rack in the same node group.");
                        chooseFirstInRemoteRack(bestmatch[0], excludedNodes, blocksize, maxReplicasPerRack,
                                results);
                    }
                    numOfReplicas--;

                }

            } else if (bestmatch[1] != null && bestmatch[2] != null) { // 0,1,1

                RackRingInfo rackInfo = racksMap.get(bestmatch[1].getNetworkLocation());
                Integer posN1 = rackInfo.findNode(bestmatch[1]);
                Integer posN2 = rackInfo.findNode(bestmatch[2]);

                if (posN1 != null && posN2 != null) {
                    int rackSize = rackInfo.rackNodes.size();
                    int diff = (posN2 - posN1 + rackSize) % rackSize;
                    if (diff >= machineWindow) {
                        Integer tmp = posN1;
                        posN1 = posN2;
                        posN2 = tmp;
                        diff = rackSize - diff;
                    }

                    int newMachineWindow = machineWindow - diff;

                    assert (newMachineWindow > 0);

                    if (rackSize - diff < machineWindow) {
                        newMachineWindow = rackSize;
                    }
                    final int firstRack = (rackInfo.index - rackWindow + racks.size()) % racks.size();
                    int machineIdx = (posN1 - newMachineWindow + 1 + rackSize) % rackSize;

                    if (chooseRemoteRack(rackInfo.index, firstRack, rackWindow, machineIdx, newMachineWindow,
                            excludedNodes, blocksize, maxReplicasPerRack, results, true)) {
                        numOfReplicas--;
                    }
                }
            }

            // get the rest randomly
            if (numOfReplicas > 0) {
                int replicas = results.size();
                if (replicas < 3) {
                    LOG.info("Picking up random replicas from default policy after " + results.size()
                            + " replicas have been chosen");
                }
                super.chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize, maxReplicasPerRack,
                        results);
            }
        } finally {
            readUnlock();
        }
    }

    /**
     * Picks a remote machine within the defined rack and machine windows.
     * @param rackIdx rack the request is coming from and that should be avoided
     * @param firstRack rack that starts the window
     * @param rackWindow rack window size
     * @param machineIdx index of the first replica within its rack
     * @param windowSize size of the machine window
     * @param excludedNodes nodes that should not be considered
     * @param blocksize size of a block
     * @param maxReplicasPerRack maximum number of replicas per rack
     * @param results List of results
     * @param reverse whether to apply the adjustment used when looking backward
     *        instead of forward on the ring
     * @return true if a machine was chosen and added to results, false otherwise
     * @throws NotEnoughReplicasException
     */
    protected boolean chooseRemoteRack(int rackIdx, int firstRack, int rackWindow, int machineIdx, int windowSize,
            HashMap<Node, Node> excludedNodes, long blocksize, int maxReplicasPerRack,
            List<DatanodeDescriptor> results, boolean reverse) throws NotEnoughReplicasException {
        // randomly choose one node from remote racks

        readLock();
        try {
            HashSet<Integer> excludedRacks = new HashSet<Integer>();
            excludedRacks.add(rackIdx);
            int n = racks.size();
            int currRackSize = racksMap.get(racks.get(rackIdx)).rackNodes.size();
            while (excludedRacks.size() < rackWindow) {

                int newRack = randomIntInWindow(firstRack, rackWindow, n, excludedRacks);
                if (newRack < 0)
                    break;

                excludedRacks.add(newRack);

                int newRackSize = racksMap.get(racks.get(newRack)).rackNodes.size();
                int firstMachine = machineIdx * newRackSize / currRackSize;

                int newWindowSize = windowSize;
                if (reverse) {
                    firstMachine = ((int) Math.ceil((double) machineIdx * newRackSize / currRackSize))
                            % newRackSize;

                    newWindowSize = Math.max(1, windowSize * newRackSize / currRackSize);
                }

                if (newWindowSize <= 0) {
                    continue;
                }

                if (chooseMachine(racks.get(newRack), firstMachine, newWindowSize, excludedNodes, blocksize,
                        maxReplicasPerRack, results)) {
                    return true;
                }
            }
            return false;
        } finally {
            readUnlock();
        }
    }

    /**
     * Chooses a machine within a window inside a rack.
     * @param rack rack to choose from
     * @param firstMachine machine that starts the window
     * @param windowSize size of the machine window
     * @param excludedNodes nodes to avoid
     * @param blocksize size of a block
     * @param maxReplicasPerRack maximum number of replicas within the same rack
     * @param results list of results
     * @return true if a machine was chosen and added to results, false otherwise
     */
    protected boolean chooseMachine(String rack, int firstMachine, int windowSize,
            HashMap<Node, Node> excludedNodes, long blocksize, int maxReplicasPerRack,
            List<DatanodeDescriptor> results) {
        readLock();
        try {

            HashSet<Integer> excludedMachines = new HashSet<Integer>();
            RackRingInfo rackInfo = racksMap.get(rack);
            assert (rackInfo != null);

            int n = rackInfo.rackNodesMap.size();

            List<Node> rackDatanodes = clusterMap.getDatanodesInRack(rack);
            if (rackDatanodes == null) {
                return false;
            }

            while (excludedMachines.size() < windowSize) {
                int newMachine = randomIntInWindow(firstMachine, windowSize, n, excludedMachines);

                if (newMachine < 0)
                    return false;

                excludedMachines.add(newMachine);

                DatanodeDescriptor chosenNode = null;
                for (Node node : rackDatanodes) {
                    DatanodeDescriptor datanode = (DatanodeDescriptor) node;
                    Integer idx = rackInfo.findNode(datanode);
                    if (idx != null && idx.intValue() == newMachine) {
                        chosenNode = datanode;
                        break;
                    }
                }

                if (chosenNode == null)
                    continue;

                Node oldNode = excludedNodes.put(chosenNode, chosenNode);
                if (oldNode == null) { // chosenNode was not in the excluded list
                    if (isGoodTarget(chosenNode, blocksize, maxReplicasPerRack, results)) {
                        results.add(chosenNode);
                        return true;
                    }
                }
            }
            return false;
        } finally {
            readUnlock();
        }
    }

    /** {@inheritDoc} */
    public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode, Block block, short replicationFactor,
            Collection<DatanodeDescriptor> first, Collection<DatanodeDescriptor> second) {

        List<DatanodeDescriptor> nodes = new ArrayList<DatanodeDescriptor>();

        if (first != null) {
            nodes.addAll(first);
        }
        if (second != null) {
            nodes.addAll(second);
        }

        DatanodeDescriptor[] best = findBest(nodes);

        boolean saved_two_racks = false;

        if (best[0] != null && best[1] != null) {
            saved_two_racks = true;
        }

        for (DatanodeDescriptor n : nodes) {
            if (saved_two_racks && !n.equals(best[0]) && !n.equals(best[1]) && !n.equals(best[2])) {
                return n;
            }

            if (!saved_two_racks && ((best[0] != null && // different from best[0]'s rack
                    !best[0].getNetworkLocation().equals(n.getNetworkLocation()))

                    ||

                    (best[1] != null && // different from best[1]'s rack
                            !best[1].getNetworkLocation().equals(n.getNetworkLocation()))

            )) {
                saved_two_racks = true; // just skipped (saved) one machine
                                        // in a different rack
            }
        }

        return super.chooseReplicaToDelete(inode, block, replicationFactor, first, second);
    }

}
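
The rack and host rings above are built in hostsUpdated() by sorting names with HashComparator, which derives a stable pseudo-random key from each name's hashCode(). The standalone sketch below reproduces that ordering outside the NameNode; RingOrderDemo is a hypothetical example class, not part of Hadoop.

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Random;

// Illustrative only: mirrors the deterministic ordering that HashComparator
// applies to rack and host names when the rings are built in hostsUpdated().
public class RingOrderDemo {
    public static void main(String[] args) {
        Comparator<String> byHash = new Comparator<String>() {
            private final Random rand = new Random();

            public int compare(String o1, String o2) {
                rand.setSeed(o1.hashCode());
                int hc1 = rand.nextInt();
                rand.setSeed(o2.hashCode());
                int hc2 = rand.nextInt();
                return hc1 < hc2 ? -1 : (hc1 > hc2 ? 1 : 0);
            }
        };

        List<String> racks = new ArrayList<String>();
        Collections.addAll(racks, "/rack-a", "/rack-b", "/rack-c", "/rack-d");
        Collections.sort(racks, byHash);

        // The same set of names always produces the same ring order, because each
        // sort key depends only on the name's hashCode().
        System.out.println("Ring order: " + racks);
    }
}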