// NOTE(review): stray text "Java tutorial" preceded the license header and is
// not valid Java; preserved here as a comment so the file compiles.
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode; import org.apache.commons.logging.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.net.DNSToSwitchMapping; import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.Node; import org.apache.hadoop.net.NodeBase; import org.apache.hadoop.util.HostsFileReader; import org.apache.hadoop.raid.Codec; import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyDefault; import java.io.IOException; import java.util.*; import org.apache.commons.lang.ArrayUtils; /** * This block placement policy tries (best effort) to the following: * * If the file is under the staging directory (a specially named directory) * then all blocks of the file is kept on the same host. Additionally, all * the raid blocks (if any) for the same file is also kept on the same host. 
* If the file is not under the staging directory then blocks are put in such
* a way that all blocks within the same stripe end up on random hosts in
* different racks. For example, the 10 data blocks and 4 parity blocks in a
* stripe should end up in different racks.
*/
public class BlockPlacementPolicyStair extends BlockPlacementPolicyRaid {

  // Maximum stripe length over all accepted codecs; set in initParityConfigs().
  private int stripeLen;
  // Path prefix of "staging" files (config key dfs.f4.staging).
  private String stagingDir;
  // Path prefix of "local" files handled by chooseLocalTarget() (dfs.f4.local).
  private String localDir;
  private FSNamesystem namesystem = null;
  // Whether isGoodTarget() should take datanode load into account
  // (dfs.replication.considerLoad).
  private boolean considerLoad;
  // Codecs accepted via dfs.f4.accepted.codecs (default "rs,xor").
  private List<Codec> acceptedCodecs = new ArrayList<Codec>();

  /* Added by RH Jan 14th 2015 begins */
  /* modified by RH Jul 27th, 2015 begins
   * de-hardcode */
  // Number of datanodes in the indexToNode table.
  // NOTE(review): no assignment to this field is visible anywhere in this
  // file, yet it is read in chooseTargetF4() (sort bounds and a modulus) —
  // confirm where it is supposed to be set; as written it stays 0.
  private int nodeCount;
  // Deterministic index -> datanode table used by the STAIR layout; built
  // lazily in chooseTargetF4().
  private ArrayList<Node> indexToNode = new ArrayList<Node>();
  // True once indexToNode has been populated and sorted.
  private boolean indexToNodeCreated = false;
  // Data blocks per stripe (totalLen - parityLen).
  private int dataLen;
  // Parity blocks per stripe, derived from the STAIR error vector.
  private int parityLen;
  // Total blocks per stripe = stairRow * stairCol.
  private int totalLen;
  // For each in-stripe block index: the rack (column) it is placed in.
  private ArrayList<Integer> stripeIdxToRack = new ArrayList<Integer>();
  // For each in-stripe block index: the node (row) within that rack.
  private ArrayList<Integer> stripeIdxToNodeInRack = new ArrayList<Integer>();
  /* hardcoded implementation (kept for reference) */
  //private int nodeCount=20;
  //private Node[] indexToNode = new Node[nodeCount];
  //private boolean indexToNodeCreated = false;
  //private int dataLen=11;
  //private int parityLen=9;
  //private int totalLen=20;
  //private int[] stripeIdxToRack = {0,1,2,3,0,1,2,3,0,1,2,4,4,3,4,0,1,2,3,4};
  //private int[] stripeIdxToNodeInRack = {0,0,0,0,1,1,1,1,2,2,2,0,1,2,2,3,3,3,3,3};
  /* modified by RH Jul 27th, 2015 begins
   * de-hardcode */
  /* Added by RH Jul 26th 2015 begins */
  // STAIR code geometry, read from hdfs.raid.stair.* configuration keys.
  private int stairRow;
  private int stairCol;
  private int stairRowParityNum;
  private int stairColParityNum;
  // Parsed STAIR error vector (one entry per column).
  private ArrayList<Integer> stairErrVec = new ArrayList<Integer>();
  /* Added by RH Jul 26th 2015 ends */
  /* Added by RH Jan 14th 2015 ends */

  // Hosts/racks to reject in isGoodTarget(); injected by unit tests via
  // setBadHostsAndRacks().
  private static Set<String> badRacks = new HashSet<String>();
  private static Set<String> badHosts = new HashSet<String>();

  BlockPlacementPolicyStair(Configuration conf, FSClusterStats stats,
      NetworkTopology clusterMap) {
    initialize(conf, stats, clusterMap, null, null, null);
  }

  BlockPlacementPolicyStair() {
  }

  /** A function to be used by unit tests only */
  public static void setBadHostsAndRacks(Set<String> racks, Set<String> hosts) {
    badRacks = racks;
    badHosts = hosts;
  }

  /**
   * {@inheritDoc}
   *
   * Reads the F4/STAIR configuration, parses the STAIR error vector, and
   * precomputes the per-stripe block -> (rack, node-in-rack) layout tables
   * (stripeIdxToRack / stripeIdxToNodeInRack).
   */
  public void initialize(Configuration conf, FSClusterStats stats,
      NetworkTopology clusterMap, HostsFileReader hostsReader,
      DNSToSwitchMapping dnsToSwitchMapping, FSNamesystem ns) {
    super.initialize(conf, stats, clusterMap, hostsReader,
        dnsToSwitchMapping, ns);
    this.namesystem = ns;
    // Default
    this.stripeLen = 0;
    this.considerLoad = conf.getBoolean("dfs.replication.considerLoad", true);
    FSNamesystem.LOG.info("F4: Block placement will consider load: "
        + this.considerLoad);
    initParityConfigs();
    this.stagingDir = conf.get("dfs.f4.staging", "/staging");
    this.localDir = conf.get("dfs.f4.local", "/local");
    /* Added by RH Jul 26th, 2015 begins
     * TODO: reading rstair code settings */
    this.stairRow = conf.getInt("hdfs.raid.stair.row", 4);
    this.stairCol = conf.getInt("hdfs.raid.stair.col", 5);
    this.stairRowParityNum = conf.getInt("hdfs.raid.stair.rowParityNum", 1);
    this.stairColParityNum = conf.getInt("hdfs.raid.stair.colParityNum", 1);
    totalLen = stairRow * stairCol;
    // Pre-size both layout tables with zeros; filled below.
    stripeIdxToRack = new ArrayList<Integer>(Collections.nCopies(totalLen, 0));
    stripeIdxToNodeInRack =
        new ArrayList<Integer>(Collections.nCopies(totalLen, 0));
    /* parsing err vec */
    // The error vector has one entry per column. The first stairColParityNum
    // columns are full parity columns (stairRow parity blocks each).
    int i;
    parityLen = 0;
    for (i = 0; i < stairColParityNum; i++) {
      stairErrVec.add(this.stairRow);
      parityLen += this.stairRow;
    }
    //LOG.info("RHDEBUG: parityLen=" + parityLen);
    // hdfs.raid.stair.errVec gives extra per-column parity counts, either a
    // comma-separated list or a single number; each is offset by the row
    // parity count.
    String errVec = conf.get("hdfs.raid.stair.errVec", "1");
    if (errVec.contains(",")) {
      for (String str : errVec.split(",")) {
        stairErrVec.add(this.stairRowParityNum + Integer.parseInt(str));
        parityLen += (this.stairRowParityNum + Integer.parseInt(str));
        i++;
      }
    } else {
      // single element array
      stairErrVec.add(this.stairRowParityNum + Integer.parseInt(errVec));
      parityLen += (this.stairRowParityNum + Integer.parseInt(errVec));
      i++;
    }
    //LOG.info("RHDEBUG: parityLen=" + parityLen);
    // Remaining columns carry only the row parity blocks.
    for (; i < this.stairCol; i++) {
      stairErrVec.add(this.stairRowParityNum);
      parityLen += (this.stairRowParityNum);
    }
    //LOG.info("RHDEBUG: parityLen=" + parityLen);
    dataLen = totalLen - parityLen;
    // For every row, count how many of its cells are parity cells: a column
    // contributes a data cell to row i only when its error-vector entry is
    // small enough (<= stairRow - 1 - i).
    int[] numberParityInRow = new int[this.stairRow];
    int currPos = this.stairCol - 1; // NOTE(review): currPos is never used.
    for (i = this.stairRow - 1; i >= 0; i--) {
      numberParityInRow[i] = stairCol;
      for (int j = 0; j < stairErrVec.size(); j++) {
        if (stairErrVec.get(j) <= stairRow - 1 - i)
          numberParityInRow[i]--;
      }
      //LOG.info("numberParityInRow[" + i + "]=" + numberParityInRow[i]);
    }
    //LOG.info("RHDEBUG: " + stairErrVec);
    // Walk the stairRow x stairCol grid: data cells fill stripe indices
    // [0, dataLen), parity cells fill [dataLen, totalLen). Column j is the
    // rack, row i is the node within the rack.
    int dataCount = 0;
    int parityCount = 0;
    for (i = 0; i < this.stairRow; i++) {
      for (int j = 0; j < this.stairCol; j++) {
        //LOG.info("i=" + i + " j=" + j + " dataCount=" + dataCount + " parityCount=" + parityCount);
        if (j < this.stairCol - numberParityInRow[i]) {
          stripeIdxToRack.set(dataCount, j);
          stripeIdxToNodeInRack.set(dataCount++, i);
        } else {
          stripeIdxToRack.set(dataLen + parityCount, j);
          stripeIdxToNodeInRack.set(dataLen + (parityCount++), i);
        }
      }
    }
    LOG.info("R-STAIR code placement initialized: dataLen: " + dataLen
        + " parityLen: " + parityLen
        + " stripeIdxToRack: " + stripeIdxToRack
        + " stripeIdxToNodeInRack: " + stripeIdxToNodeInRack);
    /* Added by RH Jul 26th, 2015 ends */
  }

  /**
   * This function initializes configuration for the supported parities.
   *
   * Currently, we support RS and XOR. Those two can have different
   * configurations individually. Respective configurations will be used when
   * placing the parity files. There is one exception. The stripe length is
   * calculated based on the maximum of the stripe lengths of the individual
   * parities.
   */
  private void initParityConfigs() {
    Set<String> acceptedCodecIds = new HashSet<String>();
    for (String s : conf.get("dfs.f4.accepted.codecs", "rs,xor").split(",")) {
      acceptedCodecIds.add(s);
    }
    for (Codec c : Codec.getCodecs()) {
      if (acceptedCodecIds.contains(c.id)) {
        FSNamesystem.LOG.info("F4: Parity info." + " Id: " + c.id
            + " Parity Length: " + c.parityLength
            + " Parity Stripe Length: " + c.stripeLength
            + " Parity directory: " + c.parityDirectory
            + " Parity temp directory: " + c.tmpParityDirectory);
        acceptedCodecs.add(c);
        if (c.stripeLength > this.stripeLen) {
          // Use the max stripe length
          this.stripeLen = c.stripeLength;
        }
      }
    }
    FSNamesystem.LOG.info("F4: Initialized stripe len to: " + this.stripeLen);
  }

  /**
   * Returns the accepted codec whose parity directory is a prefix of the
   * given path; falls back to the first accepted codec (with an error log)
   * when no directory matches.
   */
  private Codec getCodec(String fileName) {
    for (Codec c : this.acceptedCodecs) {
      // This should be "/raidrs/" or /"raid/". If any of these two is
      // is present in the file path, we will assume that is the parity type.
      String uniqueSubtringId = c.parityDirectory + "/";
      if (fileName.contains(uniqueSubtringId)) {
        return c;
      }
    }
    // NOTE(review): throws IndexOutOfBoundsException if acceptedCodecs is
    // empty — assumes at least one codec was accepted in initParityConfigs().
    Codec c = this.acceptedCodecs.get(0);
    FSNamesystem.LOG.error("F4: Could not find any valid codec for the file: "
        + fileName + ", hence returning the first one: " + c.id);
    return c;
  }

  /** Staging directory for a parity file, e.g. "/raidrs" + "/staging". */
  private String getParityStagingDir(String parityFileName) {
    Codec c = getCodec(parityFileName);
    return c.parityDirectory + this.stagingDir;
  }

  /** True if the path is under the staging dir or the parity staging dir. */
  private boolean isStaging(String fileName) {
    return fileName.startsWith(this.stagingDir)
        || fileName.startsWith(this.getParityStagingDir(fileName));
  }

  /** True if the path is under the "local" directory (dfs.f4.local). */
  private boolean isLocal(String fileName) {
    return fileName.startsWith(this.localDir);
  }

  /** {@inheritDoc} Delegates to the F4/STAIR driver with no exclusions. */
  @Override
  public DatanodeDescriptor[] chooseTarget(String srcPath, int numOfReplicas,
      DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes,
      long blocksize) {
    return chooseTargetF4(srcPath, numOfReplicas, writer, chosenNodes, null,
        blocksize);
  }

  /**
   * This function finds a node where to place a block of a file under the
   * "local" directory. The basic idea is to have as few locations (preferably
   * one, and preferably on the writer node)
   *
   * 1) Choose a node that contains one of the blocks in the blocks argument.
   * 2) If there are multiple such nodes, choose one of them (in some order).
   * 3) If this is the first block, then choose the the writer node.
   * 4) If the writer node is not good, choose a random node within the same
   *    rack as the writer node.
   * 5) If the writer node is null or if all of the above tries fail, then
   *    just choose based on the the parent class's policy.
   *
   * @param fileName The name of the file for which the block is to be
   *                 placed.
   * @param writer The writer node.
   * @param blocks The block locations that are to be used as reference
   *               for placing the current block. For a data file, it
   *               is the blocks for that file itself. For a raid file,
   *               it is the blocks of the source file.
   * @param chosenNodes @see chooseTarget
   * @param excludedNodes @see chooseTarget
   * @param blocksize @see chooseTarget
   */
  private DatanodeDescriptor[] chooseLocalTarget(String fileName,
      DatanodeDescriptor writer, LocatedBlocks blocks,
      List<Node> excludedNodes, List<DatanodeDescriptor> chosenNodes,
      long blocksize) throws IOException, NotEnoughReplicasException {
    // First try the same node as the one where other blocks reside.
    LOG.info("chooseLocalTarget()!!");
    // Key "networkLocation/name" de-duplicates datanodes across blocks.
    HashMap<String, DatanodeInfo> hostMap = new HashMap<String, DatanodeInfo>();
    for (LocatedBlock b : blocks.getLocatedBlocks()) {
      for (DatanodeInfo i : b.getLocations()) {
        hostMap.put(i.getNetworkLocation() + "/" + i.getName(), i);
      }
    }
    for (Map.Entry<String, DatanodeInfo> entry : hostMap.entrySet()) {
      DatanodeDescriptor result = null;
      DatanodeInfo i = entry.getValue();
      result = new DatanodeDescriptor(i, i.getNetworkLocation(),
          i.getHostName(), i.getCapacity(), i.getDfsUsed(), i.getRemaining(),
          i.getNamespaceUsed(), i.getXceiverCount());
      if (this.isGoodTarget(result, blocksize, Integer.MAX_VALUE,
          this.considerLoad, new ArrayList<DatanodeDescriptor>())) {
        // I dont care about per rack load.
        DatanodeDescriptor[] r = { result };
        return r;
      }
    }
    // Try something in the same rack as the writer.
    if (writer == null) {
      return super.chooseTarget(fileName, 1, writer, chosenNodes,
          excludedNodes, blocksize);
    } else if (this.isGoodTarget(writer, blocksize, Integer.MAX_VALUE,
        this.considerLoad, new ArrayList<DatanodeDescriptor>())) {
      DatanodeDescriptor[] r = { writer };
      return r;
    }
    HashMap<Node, Node> exclNodes = new HashMap<Node, Node>();
    for (Node n : excludedNodes) {
      exclNodes.put(n, n);
    }
    List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>();
    // Last resort: random node in the writer's rack.
    chooseRandom(1, writer.getNetworkLocation(), exclNodes, blocksize, 1,
        results);
    return results.toArray(new DatanodeDescriptor[results.size()]);
  }

  /// A helper function that says some hosts are bad based on test config.
@Override protected boolean isGoodTarget(DatanodeDescriptor node, long blockSize, int maxPerRack, boolean considerLoad, List<DatanodeDescriptor> results) { if (badRacks.contains(node.getNetworkLocation()) || badHosts.contains(node.getName())) { return false; } return super.isGoodTarget(node, blockSize, maxPerRack, considerLoad, results); } @Override public DatanodeDescriptor[] chooseTarget(String srcInode, int numOfReplicas, DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes, List<Node> excludesNodes, long blocksize) { return chooseTargetF4(srcInode, numOfReplicas, writer, chosenNodes, excludesNodes, blocksize); } private String getSourceFileFromParity(String fileName, FileInfo info) throws IOException { NameWithINode nameWithINode; switch (info.type) { case PARITY: // We need to support the following cases // parity = /raidrs/staging/X, source = /X // parity = /raidrs/X, source = /X nameWithINode = null; if (isStaging(fileName)) { nameWithINode = getSourceFile(fileName, getParityStagingDir(fileName)); } if (nameWithINode == null) { Codec c = getCodec(fileName); nameWithINode = getSourceFile(fileName, c.parityDirectory); } return ((nameWithINode == null) ? null : nameWithINode.name); case TEMP_PARITY: Codec c = getCodec(fileName); nameWithINode = getSourceFile(fileName, c.tmpParityDirectory); return ((nameWithINode == null) ? null : nameWithINode.name); default: FSNamesystem.LOG.error("file type bad"); return null; } } /** * This is the main driver function that dictates block placement. 
* * This function figures out the kind of file (staging or not, raid or not) * and invokes the appropriate functions */ private DatanodeDescriptor[] chooseTargetF4(String fileName, int numOfReplicas, DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes, List<Node> exclNodes, long blocksize) { FSNamesystem.LOG .info("F4: F4 policy invoked for file: " + fileName + ", with replica count: " + numOfReplicas); // If replica>1 then just default back to RAID if (numOfReplicas > 1) { return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } FileInfo info; LocatedBlocks blocks; int blockIndex = -1; try { blocks = this.namesystem.getBlockLocations(fileName, 0, Long.MAX_VALUE); info = getFileInfo(null, fileName); blockIndex = blocks.getLocatedBlocks().size(); //LOG.info("debug msg fileName: " + fileName + " blockIndex: " + blockIndex); } catch (IOException e) { FSNamesystem.LOG.error("F4: Error happened when calling getFileInfo/getBlockLocations"); return super.chooseTarget(fileName, numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } FSNamesystem.LOG.info("F4: The file: " + fileName + " has a type: " + info.type); HashMap<String, HashSet<Node>> rackToHosts = new HashMap<String, HashSet<Node>>(); try { // First handle the "localdir" case if (isLocal(fileName)) { return chooseLocalTarget(fileName, writer, blocks, exclNodes, chosenNodes, blocksize); } // For a data file, the locations of its own blocks as the reference int stripeIndex = -1; String srcFileName = null; String parityFileName = null; int parityLength = 0; int stripeLength = 0; /* Added by RH Jan 14th 2015 begins */ String[] fNSplits; List<DatanodeDescriptor> retVal = new ArrayList<DatanodeDescriptor>(); int blkIndexInStripe; int rackId; int nodeIdxInRack; int nodeIdx; /* Added by RH Jan 14th 2015 ends */ switch (info.type) { case NOT_RAID: case SOURCE: /* Added by RH Jan 13th 2015 begins */ //fNSplits = fileName.split("_"); //stripeIndex = 
Integer.parseInt(fNSplits[fNSplits.length-1]); if (!fileName.contains("raidTest")) { return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } int index = 0; if (!indexToNodeCreated) { for (String rack : this.clusterMap.getRacks()) { for (Node node : this.clusterMap.getDatanodesInRack(rack)) { indexToNode.set(index, node); LOG.info(" debug msg: indexToNode[" + index + "]: " + indexToNode.get(index)); index++; } } for (int i = 0; i < nodeCount; i++) { for (int j = i + 1; j < nodeCount; j++) { if (indexToNode.get(i).toString().compareTo(indexToNode.get(j).toString()) > 0) { Node tmpNode = indexToNode.get(i); indexToNode.set(i, indexToNode.get(j)); indexToNode.set(j, tmpNode); } } } indexToNodeCreated = true; } stripeIndex = blockIndex / dataLen % indexToNode.size(); blkIndexInStripe = blockIndex % dataLen; /* For degraded read benchmark comment when doing MapReduce test begins */ //if(fileName.endsWith("test0")){ // blkIndexInStripe = 0; //}else if(fileName.endsWith("test1")){ // blkIndexInStripe = 1; //}else if(fileName.endsWith("test2")){ // blkIndexInStripe = 2; //}else if(fileName.endsWith("test3")){ // blkIndexInStripe = 3; //}else if(fileName.endsWith("test4")){ // blkIndexInStripe = 4; //}else if(fileName.endsWith("test5")){ // blkIndexInStripe = 5; //}else if(fileName.endsWith("test6")){ // blkIndexInStripe = 6; //}else if(fileName.endsWith("test7")){ // blkIndexInStripe = 7; //}else if(fileName.endsWith("test8")){ // blkIndexInStripe = 8; //}else if(fileName.endsWith("test9")){ // blkIndexInStripe = 9; //}else if(fileName.endsWith("testa")){ // blkIndexInStripe = 10; //} /* For degraded read benchmark comment when doing MapReduce test ends */ rackId = (stripeIndex % stairCol + stripeIdxToRack.get(blkIndexInStripe)) % stairCol; nodeIdxInRack = (stripeIndex / stairCol + stripeIdxToNodeInRack.get(blkIndexInStripe)) % stairRow; nodeIdx = rackId * stairRow + nodeIdxInRack; LOG.info("debug info: blockIndex: " + blockIndex + " 
nodeIndex: " + nodeIdx + "blkIndexInStripe: " + blkIndexInStripe); retVal.add((DatanodeDescriptor) indexToNode.get(nodeIdx)); return retVal.toArray(new DatanodeDescriptor[retVal.size()]); //return super.chooseTarget( // fileName, numOfReplicas, writer, chosenNodes, exclNodes, blocksize); /* Added by RH Jan 13th 2015 ends */ /* Commented by RH Jan 13th 2015 begins */ //srcFileName = fileName; //parityFileName = null; //stripeLength = this.stripeLen; //stripeIndex = blockIndex / stripeLength; //break; /* Commented by RH Jan 13th 2015 ends */ case TEMP_PARITY: case PARITY: /* Added by RH Jan 13th 2015 begins */ fNSplits = fileName.split("_"); stripeIndex = Integer.parseInt(fNSplits[fNSplits.length - 1]) % nodeCount; blkIndexInStripe = blockIndex % parityLen + dataLen; rackId = (stripeIndex % stairCol + stripeIdxToRack.get(blkIndexInStripe)) % stairCol; nodeIdxInRack = (stripeIndex / stairCol + stripeIdxToNodeInRack.get(blkIndexInStripe)) % stairRow; nodeIdx = rackId * stairRow + nodeIdxInRack; //LOG.info("debug msg: parity file stripeIndex: " + stripeIndex); LOG.info("debug info: parity stripeIndex: " + stripeIndex + "blockIndex: " + blockIndex + " nodeIndex: " + nodeIdx + indexToNode.get(nodeIdx)); retVal.add((DatanodeDescriptor) indexToNode.get(nodeIdx)); return retVal.toArray(new DatanodeDescriptor[retVal.size()]); //return super.chooseTarget( // fileName, numOfReplicas, writer, chosenNodes, exclNodes, blocksize); /* Added by RH Jan 13th 2015 ends */ /* Commented by RH Jan 13th 2015 begins */ //srcFileName = getSourceFileFromParity(fileName, info); //parityFileName = fileName; //if (srcFileName == null || // this.namesystem.getHdfsFileInfo(srcFileName) == null) { // srcFileName = null; // FSNamesystem.LOG.error("F4: " + srcFileName + " does not exist"); //} //Codec c = getCodec(fileName); //parityLength = c.parityLength; //stripeLength = c.stripeLength; //stripeIndex = blockIndex / parityLength; //break; /* Commented by RH Jan 13th 2015 ends */ default: return 
super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } /* Commented by RH Jan 13th 2015 begins */ //rackToHosts = getRackToHostsMapForStripe(srcFileName, // parityFileName, // stripeLength, // parityLength, // stripeIndex); /* Commented by RH Jan 13th 2015 ends */ } catch (IOException e) { FSNamesystem.LOG.error("F4: Error happened when calling " + "getParityFile/getSourceFileFromParity"); return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } catch (NotEnoughReplicasException e) { FSNamesystem.LOG.error("F4: Error happend when calling " + "getCompanionSourceNodes/getSourceFile"); return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } /* Commented by RH Jan 13th 2015 begins */ //return chooseTargetOnNewFailureDomain(fileName, // writer, // chosenNodes, // exclNodes, // rackToHosts, // blocksize); /* Commented by RH Jan 13th 2015 ends */ } // Given a stripe index returns all racks in which the blocks of the stripe // reside and the hosts within those racks that host those blocks private HashMap<String, HashSet<Node>> getRackToHostsMapForStripe(String srcFileName, String parityFileName, int stripeLen, int parityLen, int stripeIndex) throws IOException { HashMap<String, HashSet<Node>> rackToHosts = new HashMap<String, HashSet<Node>>(); if (srcFileName != null) { rackToHosts = getRackToHostsMapForStripe(srcFileName, stripeIndex, stripeLen); } if (parityFileName != null) { HashMap<String, HashSet<Node>> rackToHostsForParity = getRackToHostsMapForStripe(parityFileName, stripeIndex, parityLen); for (Map.Entry<String, HashSet<Node>> e : rackToHostsForParity.entrySet()) { HashSet<Node> nodes = rackToHosts.get(e.getKey()); if (nodes == null) { nodes = new HashSet<Node>(); rackToHosts.put(e.getKey(), nodes); } for (Node n : e.getValue()) { nodes.add(n); } } } for (Map.Entry<String, HashSet<Node>> e : rackToHosts.entrySet()) { if (e.getValue().size() > 1) { FSNamesystem.LOG.warn("F4: Rack 
" + e.getKey() + " being overused for stripe: " + stripeIndex); } } return rackToHosts; } private HashMap<String, HashSet<Node>> getRackToHostsMapForStripe(String src, int stripeIndex, int stripeLen) throws IOException { int sourceStart = stripeIndex * stripeLen; int sourceEnd = sourceStart + stripeLen; LocatedBlocks blocks = this.namesystem.getBlockLocations(src, 0, Long.MAX_VALUE); List<LocatedBlock> sourceBlocks = blocks.getLocatedBlocks(); sourceEnd = Math.min(sourceEnd, sourceBlocks.size()); HashMap<String, HashSet<Node>> rackNodes = new HashMap<String, HashSet<Node>>(); if (sourceStart < sourceBlocks.size()) { for (LocatedBlock b : sourceBlocks.subList(sourceStart, sourceEnd)) { for (Node n : b.getLocations()) { String rack = n.getNetworkLocation(); FSNamesystem.LOG.info("F4: Block info for file: " + src + ", offset: " + b.getStartOffset() + ", rack: " + rack); HashSet<Node> nodes = rackNodes.get(rack); if (nodes == null) { nodes = new HashSet<Node>(); rackNodes.put(rack, nodes); } nodes.add(n); } } } return rackNodes; } /** * This function uses the rackToHosts map (that contains the rack and the * corresponding nodes in those racks that contain the relevant blocks). * * The definition of "relevant blocks" is flexible. It can be used in a * variety of contexts. In the F4 placement policy, the relevant blocks * are all the peer blocks of the block to be placed. The peer blocks would * be all blocks in the raid stripe (data and parity included). * * It gets the racks that contain the least number of blocks for the stripe. * it gets the nodes within those racks and tries one-by-one all such * hosts as potential locations for the blocks. The check is based on * the host: * 1) The host passing the isGoodTarget check. * 2) If 1) fails and the "considerLoad" is true, then the same check is * done with considerLoad = false. * 3) If 2) fails, then a node is chosen randomly while excluding any hosts * that contain a block in the same stripe as the block to be placed. 
*/ private DatanodeDescriptor[] chooseTargetOnNewFailureDomain(String fileName, DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes, List<Node> exclNodes, HashMap<String, HashSet<Node>> rackToHosts, long blockSize) { HashMap<Node, Node> excludedNodes = new HashMap<Node, Node>(); for (String rack : this.clusterMap.getAllRacks()) { if (rackToHosts.get(rack) == null) { rackToHosts.put(rack, new HashSet<Node>()); } } // Get the min occupancy in the racks. int minCount = Integer.MAX_VALUE; for (Map.Entry<String, HashSet<Node>> entry : rackToHosts.entrySet()) { if (entry.getValue().size() < minCount) { minCount = entry.getValue().size(); } // DO NOT choose a host that has already been chosen for this stripe. for (Node n : entry.getValue()) { excludedNodes.put(n, n); } } if (exclNodes != null) { for (Node node : exclNodes) { excludedNodes.put(node, node); } } HashMap<String, HashSet<Node>> candidateNodesByRacks = new HashMap<String, HashSet<Node>>(); for (Map.Entry<String, HashSet<Node>> entry : rackToHosts.entrySet()) { if (entry.getValue().size() == minCount) { for (Node n : this.clusterMap.getDatanodesInRack(entry.getKey())) { if (excludedNodes.get(n) == null) { HashSet<Node> candidateNodes = candidateNodesByRacks.get(entry.getKey()); if (candidateNodes == null) { candidateNodes = new HashSet<Node>(); candidateNodesByRacks.put(entry.getKey(), candidateNodes); } candidateNodes.add(n); } } } } List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>(); if (getGoodNode(candidateNodesByRacks, this.considerLoad, blockSize, results)) { return results.toArray(new DatanodeDescriptor[results.size()]); } if (this.considerLoad) { FSNamesystem.LOG.info("F4: Retrying without considering load for file: " + fileName); if (getGoodNode(candidateNodesByRacks, false, blockSize, results)) { return results.toArray(new DatanodeDescriptor[results.size()]); } } FSNamesystem.LOG.error("F4: No datanode in a non-overlapping rack for file:" + fileName); // Final effort to 
get something. But it will always try to get something // that is not a host that contains a peer block (block in the same stripe) // We assume that this step should succeed. In this step all nodes in the // cluster are available except for atmost 13 hosts for placement. So it is // highly unlikely that this step would fail. try { super.chooseRandom(1, NodeBase.ROOT, excludedNodes, blockSize, 1, results); return results.toArray(new DatanodeDescriptor[results.size()]); } catch (Exception e) { FSNamesystem.LOG.error("F4: Could not find a data node using " + "the normal F4 policy. Switching to default of parent"); return super.chooseTarget(fileName, 1, writer, chosenNodes, null, blockSize); } } private class RackComparator implements Comparator<Map.Entry<String, HashSet<Node>>> { public RackComparator(long blockSize) { this.blockSize = blockSize; } public int compare(Map.Entry<String, HashSet<Node>> o1, Map.Entry<String, HashSet<Node>> o2) { long ret = 0; for (Node node : o1.getValue()) { DatanodeDescriptor n = (DatanodeDescriptor) node; ret += (n.getRemaining() - (n.getBlocksScheduled() * this.blockSize)); } for (Node node : o2.getValue()) { DatanodeDescriptor n = (DatanodeDescriptor) node; ret -= (n.getRemaining() - (n.getBlocksScheduled() * this.blockSize)); } return ret == 0 ? 0 : (ret > 0) ? -1 : 1; } private long blockSize; } // Helper function to choose less occupied racks first. 
private boolean getGoodNode(HashMap<String, HashSet<Node>> candidateNodesByRacks, boolean considerLoad, long blockSize, List<DatanodeDescriptor> results) { List<Map.Entry<String, HashSet<Node>>> sorted = new ArrayList<Map.Entry<String, HashSet<Node>>>(); for (Map.Entry<String, HashSet<Node>> entry : candidateNodesByRacks.entrySet()) { sorted.add(entry); } Collections.sort(sorted, new RackComparator(blockSize)); int count = sorted.size() / 4; Collections.shuffle(sorted.subList(0, count)); for (Map.Entry<String, HashSet<Node>> e : sorted) { if (getGoodNode(e.getValue(), considerLoad, blockSize, results)) { return true; } } return false; } // Helper function to find a good node. Returns true if found. private boolean getGoodNode(Set<Node> candidateNodes, boolean considerLoad, long blockSize, List<DatanodeDescriptor> results) { List<DatanodeDescriptor> sorted = new ArrayList<DatanodeDescriptor>(); for (Node n : candidateNodes) { sorted.add((DatanodeDescriptor) n); } final long blocksize = blockSize; Collections.sort(sorted, new Comparator<DatanodeDescriptor>() { public int compare(DatanodeDescriptor n1, DatanodeDescriptor n2) { long ret = (n2.getRemaining() - (n2.getBlocksScheduled() * blocksize)) - (n1.getRemaining() - (n1.getBlocksScheduled() * blocksize)); return ret == 0 ? 0 : (ret > 0) ? 1 : -1; } }); // Also, add some randomness. We are doing so because it seems // that if there are many copies scheduled at the same time, namenode // does not have the uptodate information. So, we need to add some // randomness so that there is not a lot of copies targeted to // the same node, which will overload the hosts and may lead to // timeouts. int count = sorted.size() / 2; Collections.shuffle(sorted.subList(0, count)); for (DatanodeDescriptor n : sorted) { if (this.isGoodTarget((DatanodeDescriptor) n, blocksize, 1, // MaxTargerPerLoc (per rack) considerLoad, results)) { results.add((DatanodeDescriptor) n); return true; } } return false; } }