// NOTE(review): stray text "Java tutorial" preceded the license header and is
// not valid Java; preserved here as a comment so the file compiles.
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode; import org.apache.commons.logging.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.net.DNSToSwitchMapping; import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.Node; import org.apache.hadoop.net.NodeBase; import org.apache.hadoop.util.HostsFileReader; import org.apache.hadoop.raid.Codec; import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyDefault; import java.io.IOException; import java.util.*; import org.apache.commons.lang.ArrayUtils; /** * This block placement policy tries (best effort) to the following: * * If the file is under the staging directory (a specially named directory) * then all blocks of the file is kept on the same host. Additionally, all * the raid blocks (if any) for the same file is also kept on the same host. 
* If the file is not under the staging directory then blocks are put in such
* a way that all blocks within the same stripe end up on random hosts in
* different racks. For example, the 10 data blocks and 4 parity blocks in a
* stripe should end up in different racks.
*/
public class BlockPlacementPolicyStair extends BlockPlacementPolicyRaid {

  // Maximum stripe length over all accepted codecs; set in initParityConfigs().
  private int stripeLen;
  // Path prefix of "staging" files (config key dfs.f4.staging).
  private String stagingDir;
  // Path prefix of "local" files handled by chooseLocalTarget() (dfs.f4.local).
  private String localDir;
  private FSNamesystem namesystem = null;
  // Whether isGoodTarget() should take datanode load into account
  // (dfs.replication.considerLoad).
  private boolean considerLoad;
  // Codecs accepted via dfs.f4.accepted.codecs (default "rs,xor").
  private List<Codec> acceptedCodecs = new ArrayList<Codec>();

  /* Added by RH Jan 14th 2015 begins */
  /* modified by RH Jul 27th, 2015 begins
   * de-hardcode */
  // Number of datanodes in the indexToNode table.
  // NOTE(review): no assignment to this field is visible anywhere in this
  // file, yet it is read in chooseTargetF4() (sort bounds and a modulus) —
  // confirm where it is supposed to be set; as written it stays 0.
  private int nodeCount;
  // Deterministic index -> datanode table used by the STAIR layout; built
  // lazily in chooseTargetF4().
  private ArrayList<Node> indexToNode = new ArrayList<Node>();
  // True once indexToNode has been populated and sorted.
  private boolean indexToNodeCreated = false;
  // Data blocks per stripe (totalLen - parityLen).
  private int dataLen;
  // Parity blocks per stripe, derived from the STAIR error vector.
  private int parityLen;
  // Total blocks per stripe = stairRow * stairCol.
  private int totalLen;
  // For each in-stripe block index: the rack (column) it is placed in.
  private ArrayList<Integer> stripeIdxToRack = new ArrayList<Integer>();
  // For each in-stripe block index: the node (row) within that rack.
  private ArrayList<Integer> stripeIdxToNodeInRack = new ArrayList<Integer>();
  /* hardcoded implementation (kept for reference) */
  //private int nodeCount=20;
  //private Node[] indexToNode = new Node[nodeCount];
  //private boolean indexToNodeCreated = false;
  //private int dataLen=11;
  //private int parityLen=9;
  //private int totalLen=20;
  //private int[] stripeIdxToRack = {0,1,2,3,0,1,2,3,0,1,2,4,4,3,4,0,1,2,3,4};
  //private int[] stripeIdxToNodeInRack = {0,0,0,0,1,1,1,1,2,2,2,0,1,2,2,3,3,3,3,3};
  /* modified by RH Jul 27th, 2015 begins
   * de-hardcode */
  /* Added by RH Jul 26th 2015 begins */
  // STAIR code geometry, read from hdfs.raid.stair.* configuration keys.
  private int stairRow;
  private int stairCol;
  private int stairRowParityNum;
  private int stairColParityNum;
  // Parsed STAIR error vector (one entry per column).
  private ArrayList<Integer> stairErrVec = new ArrayList<Integer>();
  /* Added by RH Jul 26th 2015 ends */
  /* Added by RH Jan 14th 2015 ends */

  // Hosts/racks to reject in isGoodTarget(); injected by unit tests via
  // setBadHostsAndRacks().
  private static Set<String> badRacks = new HashSet<String>();
  private static Set<String> badHosts = new HashSet<String>();

  BlockPlacementPolicyStair(Configuration conf, FSClusterStats stats,
      NetworkTopology clusterMap) {
    initialize(conf, stats, clusterMap, null, null, null);
  }

  BlockPlacementPolicyStair() {
  }

  /** A function to be used by unit tests only */
  public static void setBadHostsAndRacks(Set<String> racks, Set<String> hosts) {
    badRacks = racks;
    badHosts = hosts;
  }

  /**
   * {@inheritDoc}
   *
   * Reads the F4/STAIR configuration, parses the STAIR error vector, and
   * precomputes the per-stripe block -> (rack, node-in-rack) layout tables
   * (stripeIdxToRack / stripeIdxToNodeInRack).
   */
  public void initialize(Configuration conf, FSClusterStats stats,
      NetworkTopology clusterMap, HostsFileReader hostsReader,
      DNSToSwitchMapping dnsToSwitchMapping, FSNamesystem ns) {
    super.initialize(conf, stats, clusterMap, hostsReader,
        dnsToSwitchMapping, ns);
    this.namesystem = ns;
    // Default
    this.stripeLen = 0;
    this.considerLoad = conf.getBoolean("dfs.replication.considerLoad", true);
    FSNamesystem.LOG.info("F4: Block placement will consider load: "
        + this.considerLoad);
    initParityConfigs();
    this.stagingDir = conf.get("dfs.f4.staging", "/staging");
    this.localDir = conf.get("dfs.f4.local", "/local");
    /* Added by RH Jul 26th, 2015 begins
     * TODO: reading rstair code settings */
    this.stairRow = conf.getInt("hdfs.raid.stair.row", 4);
    this.stairCol = conf.getInt("hdfs.raid.stair.col", 5);
    this.stairRowParityNum = conf.getInt("hdfs.raid.stair.rowParityNum", 1);
    this.stairColParityNum = conf.getInt("hdfs.raid.stair.colParityNum", 1);
    totalLen = stairRow * stairCol;
    // Pre-size both layout tables with zeros; filled below.
    stripeIdxToRack = new ArrayList<Integer>(Collections.nCopies(totalLen, 0));
    stripeIdxToNodeInRack =
        new ArrayList<Integer>(Collections.nCopies(totalLen, 0));
    /* parsing err vec */
    // The error vector has one entry per column. The first stairColParityNum
    // columns are full parity columns (stairRow parity blocks each).
    int i;
    parityLen = 0;
    for (i = 0; i < stairColParityNum; i++) {
      stairErrVec.add(this.stairRow);
      parityLen += this.stairRow;
    }
    //LOG.info("RHDEBUG: parityLen=" + parityLen);
    // hdfs.raid.stair.errVec gives extra per-column parity counts, either a
    // comma-separated list or a single number; each is offset by the row
    // parity count.
    String errVec = conf.get("hdfs.raid.stair.errVec", "1");
    if (errVec.contains(",")) {
      for (String str : errVec.split(",")) {
        stairErrVec.add(this.stairRowParityNum + Integer.parseInt(str));
        parityLen += (this.stairRowParityNum + Integer.parseInt(str));
        i++;
      }
    } else {
      // single element array
      stairErrVec.add(this.stairRowParityNum + Integer.parseInt(errVec));
      parityLen += (this.stairRowParityNum + Integer.parseInt(errVec));
      i++;
    }
    //LOG.info("RHDEBUG: parityLen=" + parityLen);
    // Remaining columns carry only the row parity blocks.
    for (; i < this.stairCol; i++) {
      stairErrVec.add(this.stairRowParityNum);
      parityLen += (this.stairRowParityNum);
    }
    //LOG.info("RHDEBUG: parityLen=" + parityLen);
    dataLen = totalLen - parityLen;
    // For every row, count how many of its cells are parity cells: a column
    // contributes a data cell to row i only when its error-vector entry is
    // small enough (<= stairRow - 1 - i).
    int[] numberParityInRow = new int[this.stairRow];
    int currPos = this.stairCol - 1; // NOTE(review): currPos is never used.
    for (i = this.stairRow - 1; i >= 0; i--) {
      numberParityInRow[i] = stairCol;
      for (int j = 0; j < stairErrVec.size(); j++) {
        if (stairErrVec.get(j) <= stairRow - 1 - i)
          numberParityInRow[i]--;
      }
      //LOG.info("numberParityInRow[" + i + "]=" + numberParityInRow[i]);
    }
    //LOG.info("RHDEBUG: " + stairErrVec);
    // Walk the stairRow x stairCol grid: data cells fill stripe indices
    // [0, dataLen), parity cells fill [dataLen, totalLen). Column j is the
    // rack, row i is the node within the rack.
    int dataCount = 0;
    int parityCount = 0;
    for (i = 0; i < this.stairRow; i++) {
      for (int j = 0; j < this.stairCol; j++) {
        //LOG.info("i=" + i + " j=" + j + " dataCount=" + dataCount + " parityCount=" + parityCount);
        if (j < this.stairCol - numberParityInRow[i]) {
          stripeIdxToRack.set(dataCount, j);
          stripeIdxToNodeInRack.set(dataCount++, i);
        } else {
          stripeIdxToRack.set(dataLen + parityCount, j);
          stripeIdxToNodeInRack.set(dataLen + (parityCount++), i);
        }
      }
    }
    LOG.info("R-STAIR code placement initialized: dataLen: " + dataLen
        + " parityLen: " + parityLen
        + " stripeIdxToRack: " + stripeIdxToRack
        + " stripeIdxToNodeInRack: " + stripeIdxToNodeInRack);
    /* Added by RH Jul 26th, 2015 ends */
  }

  /**
   * This function initializes configuration for the supported parities.
   *
   * Currently, we support RS and XOR. Those two can have different
   * configurations individually. Respective configurations will be used when
   * placing the parity files. There is one exception. The stripe length is
   * calculated based on the maximum of the stripe lengths of the individual
   * parities.
   */
  private void initParityConfigs() {
    Set<String> acceptedCodecIds = new HashSet<String>();
    for (String s : conf.get("dfs.f4.accepted.codecs", "rs,xor").split(",")) {
      acceptedCodecIds.add(s);
    }
    for (Codec c : Codec.getCodecs()) {
      if (acceptedCodecIds.contains(c.id)) {
        FSNamesystem.LOG.info("F4: Parity info." + " Id: " + c.id
            + " Parity Length: " + c.parityLength
            + " Parity Stripe Length: " + c.stripeLength
            + " Parity directory: " + c.parityDirectory
            + " Parity temp directory: " + c.tmpParityDirectory);
        acceptedCodecs.add(c);
        if (c.stripeLength > this.stripeLen) {
          // Use the max stripe length
          this.stripeLen = c.stripeLength;
        }
      }
    }
    FSNamesystem.LOG.info("F4: Initialized stripe len to: " + this.stripeLen);
  }

  /**
   * Returns the accepted codec whose parity directory is a prefix of the
   * given path; falls back to the first accepted codec (with an error log)
   * when no directory matches.
   */
  private Codec getCodec(String fileName) {
    for (Codec c : this.acceptedCodecs) {
      // This should be "/raidrs/" or /"raid/". If any of these two is
      // is present in the file path, we will assume that is the parity type.
      String uniqueSubtringId = c.parityDirectory + "/";
      if (fileName.contains(uniqueSubtringId)) {
        return c;
      }
    }
    // NOTE(review): throws IndexOutOfBoundsException if acceptedCodecs is
    // empty — assumes at least one codec was accepted in initParityConfigs().
    Codec c = this.acceptedCodecs.get(0);
    FSNamesystem.LOG.error("F4: Could not find any valid codec for the file: "
        + fileName + ", hence returning the first one: " + c.id);
    return c;
  }

  /** Staging directory for a parity file, e.g. "/raidrs" + "/staging". */
  private String getParityStagingDir(String parityFileName) {
    Codec c = getCodec(parityFileName);
    return c.parityDirectory + this.stagingDir;
  }

  /** True if the path is under the staging dir or the parity staging dir. */
  private boolean isStaging(String fileName) {
    return fileName.startsWith(this.stagingDir)
        || fileName.startsWith(this.getParityStagingDir(fileName));
  }

  /** True if the path is under the "local" directory (dfs.f4.local). */
  private boolean isLocal(String fileName) {
    return fileName.startsWith(this.localDir);
  }

  /** {@inheritDoc} Delegates to the F4/STAIR driver with no exclusions. */
  @Override
  public DatanodeDescriptor[] chooseTarget(String srcPath, int numOfReplicas,
      DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes,
      long blocksize) {
    return chooseTargetF4(srcPath, numOfReplicas, writer, chosenNodes, null,
        blocksize);
  }

  /**
   * This function finds a node where to place a block of a file under the
   * "local" directory. The basic idea is to have as few locations (preferably
   * one, and preferably on the writer node)
   *
   * 1) Choose a node that contains one of the blocks in the blocks argument.
   * 2) If there are multiple such nodes, choose one of them (in some order).
   * 3) If this is the first block, then choose the the writer node.
   * 4) If the writer node is not good, choose a random node within the same
   *    rack as the writer node.
   * 5) If the writer node is null or if all of the above tries fail, then
   *    just choose based on the the parent class's policy.
   *
   * @param fileName The name of the file for which the block is to be
   *                 placed.
   * @param writer The writer node.
   * @param blocks The block locations that are to be used as reference
   *               for placing the current block. For a data file, it
   *               is the blocks for that file itself. For a raid file,
   *               it is the blocks of the source file.
   * @param chosenNodes @see chooseTarget
   * @param excludedNodes @see chooseTarget
   * @param blocksize @see chooseTarget
   */
  private DatanodeDescriptor[] chooseLocalTarget(String fileName,
      DatanodeDescriptor writer, LocatedBlocks blocks,
      List<Node> excludedNodes, List<DatanodeDescriptor> chosenNodes,
      long blocksize) throws IOException, NotEnoughReplicasException {
    // First try the same node as the one where other blocks reside.
    LOG.info("chooseLocalTarget()!!");
    // Key "networkLocation/name" de-duplicates datanodes across blocks.
    HashMap<String, DatanodeInfo> hostMap = new HashMap<String, DatanodeInfo>();
    for (LocatedBlock b : blocks.getLocatedBlocks()) {
      for (DatanodeInfo i : b.getLocations()) {
        hostMap.put(i.getNetworkLocation() + "/" + i.getName(), i);
      }
    }
    for (Map.Entry<String, DatanodeInfo> entry : hostMap.entrySet()) {
      DatanodeDescriptor result = null;
      DatanodeInfo i = entry.getValue();
      result = new DatanodeDescriptor(i, i.getNetworkLocation(),
          i.getHostName(), i.getCapacity(), i.getDfsUsed(), i.getRemaining(),
          i.getNamespaceUsed(), i.getXceiverCount());
      if (this.isGoodTarget(result, blocksize, Integer.MAX_VALUE,
          this.considerLoad, new ArrayList<DatanodeDescriptor>())) {
        // I dont care about per rack load.
        DatanodeDescriptor[] r = { result };
        return r;
      }
    }
    // Try something in the same rack as the writer.
    if (writer == null) {
      return super.chooseTarget(fileName, 1, writer, chosenNodes,
          excludedNodes, blocksize);
    } else if (this.isGoodTarget(writer, blocksize, Integer.MAX_VALUE,
        this.considerLoad, new ArrayList<DatanodeDescriptor>())) {
      DatanodeDescriptor[] r = { writer };
      return r;
    }
    HashMap<Node, Node> exclNodes = new HashMap<Node, Node>();
    for (Node n : excludedNodes) {
      exclNodes.put(n, n);
    }
    List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>();
    // Last resort: random node in the writer's rack.
    chooseRandom(1, writer.getNetworkLocation(), exclNodes, blocksize, 1,
        results);
    return results.toArray(new DatanodeDescriptor[results.size()]);
  }

  /// A helper function that says some hosts are bad based on test config.
@Override protected boolean isGoodTarget(DatanodeDescriptor node, long blockSize, int maxPerRack, boolean considerLoad, List<DatanodeDescriptor> results) { if (badRacks.contains(node.getNetworkLocation()) || badHosts.contains(node.getName())) { return false; } return super.isGoodTarget(node, blockSize, maxPerRack, considerLoad, results); } @Override public DatanodeDescriptor[] chooseTarget(String srcInode, int numOfReplicas, DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes, List<Node> excludesNodes, long blocksize) { return chooseTargetF4(srcInode, numOfReplicas, writer, chosenNodes, excludesNodes, blocksize); } private String getSourceFileFromParity(String fileName, FileInfo info) throws IOException { NameWithINode nameWithINode; switch (info.type) { case PARITY: // We need to support the following cases // parity = /raidrs/staging/X, source = /X // parity = /raidrs/X, source = /X nameWithINode = null; if (isStaging(fileName)) { nameWithINode = getSourceFile(fileName, getParityStagingDir(fileName)); } if (nameWithINode == null) { Codec c = getCodec(fileName); nameWithINode = getSourceFile(fileName, c.parityDirectory); } return ((nameWithINode == null) ? null : nameWithINode.name); case TEMP_PARITY: Codec c = getCodec(fileName); nameWithINode = getSourceFile(fileName, c.tmpParityDirectory); return ((nameWithINode == null) ? null : nameWithINode.name); default: FSNamesystem.LOG.error("file type bad"); return null; } } /** * This is the main driver function that dictates block placement. 
* * This function figures out the kind of file (staging or not, raid or not) * and invokes the appropriate functions */ private DatanodeDescriptor[] chooseTargetF4(String fileName, int numOfReplicas, DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes, List<Node> exclNodes, long blocksize) { FSNamesystem.LOG .info("F4: F4 policy invoked for file: " + fileName + ", with replica count: " + numOfReplicas); // If replica>1 then just default back to RAID if (numOfReplicas > 1) { return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } FileInfo info; LocatedBlocks blocks; int blockIndex = -1; try { blocks = this.namesystem.getBlockLocations(fileName, 0, Long.MAX_VALUE); info = getFileInfo(null, fileName); blockIndex = blocks.getLocatedBlocks().size(); //LOG.info("debug msg fileName: " + fileName + " blockIndex: " + blockIndex); } catch (IOException e) { FSNamesystem.LOG.error("F4: Error happened when calling getFileInfo/getBlockLocations"); return super.chooseTarget(fileName, numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } FSNamesystem.LOG.info("F4: The file: " + fileName + " has a type: " + info.type); HashMap<String, HashSet<Node>> rackToHosts = new HashMap<String, HashSet<Node>>(); try { // First handle the "localdir" case if (isLocal(fileName)) { return chooseLocalTarget(fileName, writer, blocks, exclNodes, chosenNodes, blocksize); } // For a data file, the locations of its own blocks as the reference int stripeIndex = -1; String srcFileName = null; String parityFileName = null; int parityLength = 0; int stripeLength = 0; /* Added by RH Jan 14th 2015 begins */ String[] fNSplits; List<DatanodeDescriptor> retVal = new ArrayList<DatanodeDescriptor>(); int blkIndexInStripe; int rackId; int nodeIdxInRack; int nodeIdx; /* Added by RH Jan 14th 2015 ends */ switch (info.type) { case NOT_RAID: case SOURCE: /* Added by RH Jan 13th 2015 begins */ //fNSplits = fileName.split("_"); //stripeIndex = 
Integer.parseInt(fNSplits[fNSplits.length-1]); if (!fileName.contains("raidTest")) { return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } int index = 0; if (!indexToNodeCreated) { for (String rack : this.clusterMap.getRacks()) { for (Node node : this.clusterMap.getDatanodesInRack(rack)) { indexToNode.set(index, node); LOG.info(" debug msg: indexToNode[" + index + "]: " + indexToNode.get(index)); index++; } } for (int i = 0; i < nodeCount; i++) { for (int j = i + 1; j < nodeCount; j++) { if (indexToNode.get(i).toString().compareTo(indexToNode.get(j).toString()) > 0) { Node tmpNode = indexToNode.get(i); indexToNode.set(i, indexToNode.get(j)); indexToNode.set(j, tmpNode); } } } indexToNodeCreated = true; } stripeIndex = blockIndex / dataLen % indexToNode.size(); blkIndexInStripe = blockIndex % dataLen; /* For degraded read benchmark comment when doing MapReduce test begins */ //if(fileName.endsWith("test0")){ // blkIndexInStripe = 0; //}else if(fileName.endsWith("test1")){ // blkIndexInStripe = 1; //}else if(fileName.endsWith("test2")){ // blkIndexInStripe = 2; //}else if(fileName.endsWith("test3")){ // blkIndexInStripe = 3; //}else if(fileName.endsWith("test4")){ // blkIndexInStripe = 4; //}else if(fileName.endsWith("test5")){ // blkIndexInStripe = 5; //}else if(fileName.endsWith("test6")){ // blkIndexInStripe = 6; //}else if(fileName.endsWith("test7")){ // blkIndexInStripe = 7; //}else if(fileName.endsWith("test8")){ // blkIndexInStripe = 8; //}else if(fileName.endsWith("test9")){ // blkIndexInStripe = 9; //}else if(fileName.endsWith("testa")){ // blkIndexInStripe = 10; //} /* For degraded read benchmark comment when doing MapReduce test ends */ rackId = (stripeIndex % stairCol + stripeIdxToRack.get(blkIndexInStripe)) % stairCol; nodeIdxInRack = (stripeIndex / stairCol + stripeIdxToNodeInRack.get(blkIndexInStripe)) % stairRow; nodeIdx = rackId * stairRow + nodeIdxInRack; LOG.info("debug info: blockIndex: " + blockIndex + " 
nodeIndex: " + nodeIdx + "blkIndexInStripe: " + blkIndexInStripe); retVal.add((DatanodeDescriptor) indexToNode.get(nodeIdx)); return retVal.toArray(new DatanodeDescriptor[retVal.size()]); //return super.chooseTarget( // fileName, numOfReplicas, writer, chosenNodes, exclNodes, blocksize); /* Added by RH Jan 13th 2015 ends */ /* Commented by RH Jan 13th 2015 begins */ //srcFileName = fileName; //parityFileName = null; //stripeLength = this.stripeLen; //stripeIndex = blockIndex / stripeLength; //break; /* Commented by RH Jan 13th 2015 ends */ case TEMP_PARITY: case PARITY: /* Added by RH Jan 13th 2015 begins */ fNSplits = fileName.split("_"); stripeIndex = Integer.parseInt(fNSplits[fNSplits.length - 1]) % nodeCount; blkIndexInStripe = blockIndex % parityLen + dataLen; rackId = (stripeIndex % stairCol + stripeIdxToRack.get(blkIndexInStripe)) % stairCol; nodeIdxInRack = (stripeIndex / stairCol + stripeIdxToNodeInRack.get(blkIndexInStripe)) % stairRow; nodeIdx = rackId * stairRow + nodeIdxInRack; //LOG.info("debug msg: parity file stripeIndex: " + stripeIndex); LOG.info("debug info: parity stripeIndex: " + stripeIndex + "blockIndex: " + blockIndex + " nodeIndex: " + nodeIdx + indexToNode.get(nodeIdx)); retVal.add((DatanodeDescriptor) indexToNode.get(nodeIdx)); return retVal.toArray(new DatanodeDescriptor[retVal.size()]); //return super.chooseTarget( // fileName, numOfReplicas, writer, chosenNodes, exclNodes, blocksize); /* Added by RH Jan 13th 2015 ends */ /* Commented by RH Jan 13th 2015 begins */ //srcFileName = getSourceFileFromParity(fileName, info); //parityFileName = fileName; //if (srcFileName == null || // this.namesystem.getHdfsFileInfo(srcFileName) == null) { // srcFileName = null; // FSNamesystem.LOG.error("F4: " + srcFileName + " does not exist"); //} //Codec c = getCodec(fileName); //parityLength = c.parityLength; //stripeLength = c.stripeLength; //stripeIndex = blockIndex / parityLength; //break; /* Commented by RH Jan 13th 2015 ends */ default: return 
super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } /* Commented by RH Jan 13th 2015 begins */ //rackToHosts = getRackToHostsMapForStripe(srcFileName, // parityFileName, // stripeLength, // parityLength, // stripeIndex); /* Commented by RH Jan 13th 2015 ends */ } catch (IOException e) { FSNamesystem.LOG.error("F4: Error happened when calling " + "getParityFile/getSourceFileFromParity"); return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } catch (NotEnoughReplicasException e) { FSNamesystem.LOG.error("F4: Error happend when calling " + "getCompanionSourceNodes/getSourceFile"); return super.chooseTarget(numOfReplicas, writer, chosenNodes, exclNodes, blocksize); } /* Commented by RH Jan 13th 2015 begins */ //return chooseTargetOnNewFailureDomain(fileName, // writer, // chosenNodes, // exclNodes, // rackToHosts, // blocksize); /* Commented by RH Jan 13th 2015 ends */ } // Given a stripe index returns all racks in which the blocks of the stripe // reside and the hosts within those racks that host those blocks private HashMap<String, HashSet<Node>> getRackToHostsMapForStripe(String srcFileName, String parityFileName, int stripeLen, int parityLen, int stripeIndex) throws IOException { HashMap<String, HashSet<Node>> rackToHosts = new HashMap<String, HashSet<Node>>(); if (srcFileName != null) { rackToHosts = getRackToHostsMapForStripe(srcFileName, stripeIndex, stripeLen); } if (parityFileName != null) { HashMap<String, HashSet<Node>> rackToHostsForParity = getRackToHostsMapForStripe(parityFileName, stripeIndex, parityLen); for (Map.Entry<String, HashSet<Node>> e : rackToHostsForParity.entrySet()) { HashSet<Node> nodes = rackToHosts.get(e.getKey()); if (nodes == null) { nodes = new HashSet<Node>(); rackToHosts.put(e.getKey(), nodes); } for (Node n : e.getValue()) { nodes.add(n); } } } for (Map.Entry<String, HashSet<Node>> e : rackToHosts.entrySet()) { if (e.getValue().size() > 1) { FSNamesystem.LOG.warn("F4: Rack 
" + e.getKey() + " being overused for stripe: " + stripeIndex); } } return rackToHosts; } private HashMap<String, HashSet<Node>> getRackToHostsMapForStripe(String src, int stripeIndex, int stripeLen) throws IOException { int sourceStart = stripeIndex * stripeLen; int sourceEnd = sourceStart + stripeLen; LocatedBlocks blocks = this.namesystem.getBlockLocations(src, 0, Long.MAX_VALUE); List<LocatedBlock> sourceBlocks = blocks.getLocatedBlocks(); sourceEnd = Math.min(sourceEnd, sourceBlocks.size()); HashMap<String, HashSet<Node>> rackNodes = new HashMap<String, HashSet<Node>>(); if (sourceStart < sourceBlocks.size()) { for (LocatedBlock b : sourceBlocks.subList(sourceStart, sourceEnd)) { for (Node n : b.getLocations()) { String rack = n.getNetworkLocation(); FSNamesystem.LOG.info("F4: Block info for file: " + src + ", offset: " + b.getStartOffset() + ", rack: " + rack); HashSet<Node> nodes = rackNodes.get(rack); if (nodes == null) { nodes = new HashSet<Node>(); rackNodes.put(rack, nodes); } nodes.add(n); } } } return rackNodes; } /** * This function uses the rackToHosts map (that contains the rack and the * corresponding nodes in those racks that contain the relevant blocks). * * The definition of "relevant blocks" is flexible. It can be used in a * variety of contexts. In the F4 placement policy, the relevant blocks * are all the peer blocks of the block to be placed. The peer blocks would * be all blocks in the raid stripe (data and parity included). * * It gets the racks that contain the least number of blocks for the stripe. * it gets the nodes within those racks and tries one-by-one all such * hosts as potential locations for the blocks. The check is based on * the host: * 1) The host passing the isGoodTarget check. * 2) If 1) fails and the "considerLoad" is true, then the same check is * done with considerLoad = false. * 3) If 2) fails, then a node is chosen randomly while excluding any hosts * that contain a block in the same stripe as the block to be placed. 
*/ private DatanodeDescriptor[] chooseTargetOnNewFailureDomain(String fileName, DatanodeDescriptor writer, List<DatanodeDescriptor> chosenNodes, List<Node> exclNodes, HashMap<String, HashSet<Node>> rackToHosts, long blockSize) { HashMap<Node, Node> excludedNodes = new HashMap<Node, Node>(); for (String rack : this.clusterMap.getAllRacks()) { if (rackToHosts.get(rack) == null) { rackToHosts.put(rack, new HashSet<Node>()); } } // Get the min occupancy in the racks. int minCount = Integer.MAX_VALUE; for (Map.Entry<String, HashSet<Node>> entry : rackToHosts.entrySet()) { if (entry.getValue().size() < minCount) { minCount = entry.getValue().size(); } // DO NOT choose a host that has already been chosen for this stripe. for (Node n : entry.getValue()) { excludedNodes.put(n, n); } } if (exclNodes != null) { for (Node node : exclNodes) { excludedNodes.put(node, node); } } HashMap<String, HashSet<Node>> candidateNodesByRacks = new HashMap<String, HashSet<Node>>(); for (Map.Entry<String, HashSet<Node>> entry : rackToHosts.entrySet()) { if (entry.getValue().size() == minCount) { for (Node n : this.clusterMap.getDatanodesInRack(entry.getKey())) { if (excludedNodes.get(n) == null) { HashSet<Node> candidateNodes = candidateNodesByRacks.get(entry.getKey()); if (candidateNodes == null) { candidateNodes = new HashSet<Node>(); candidateNodesByRacks.put(entry.getKey(), candidateNodes); } candidateNodes.add(n); } } } } List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>(); if (getGoodNode(candidateNodesByRacks, this.considerLoad, blockSize, results)) { return results.toArray(new DatanodeDescriptor[results.size()]); } if (this.considerLoad) { FSNamesystem.LOG.info("F4: Retrying without considering load for file: " + fileName); if (getGoodNode(candidateNodesByRacks, false, blockSize, results)) { return results.toArray(new DatanodeDescriptor[results.size()]); } } FSNamesystem.LOG.error("F4: No datanode in a non-overlapping rack for file:" + fileName); // Final effort to 
get something. But it will always try to get something // that is not a host that contains a peer block (block in the same stripe) // We assume that this step should succeed. In this step all nodes in the // cluster are available except for atmost 13 hosts for placement. So it is // highly unlikely that this step would fail. try { super.chooseRandom(1, NodeBase.ROOT, excludedNodes, blockSize, 1, results); return results.toArray(new DatanodeDescriptor[results.size()]); } catch (Exception e) { FSNamesystem.LOG.error("F4: Could not find a data node using " + "the normal F4 policy. Switching to default of parent"); return super.chooseTarget(fileName, 1, writer, chosenNodes, null, blockSize); } } private class RackComparator implements Comparator<Map.Entry<String, HashSet<Node>>> { public RackComparator(long blockSize) { this.blockSize = blockSize; } public int compare(Map.Entry<String, HashSet<Node>> o1, Map.Entry<String, HashSet<Node>> o2) { long ret = 0; for (Node node : o1.getValue()) { DatanodeDescriptor n = (DatanodeDescriptor) node; ret += (n.getRemaining() - (n.getBlocksScheduled() * this.blockSize)); } for (Node node : o2.getValue()) { DatanodeDescriptor n = (DatanodeDescriptor) node; ret -= (n.getRemaining() - (n.getBlocksScheduled() * this.blockSize)); } return ret == 0 ? 0 : (ret > 0) ? -1 : 1; } private long blockSize; } // Helper function to choose less occupied racks first. 
private boolean getGoodNode(HashMap<String, HashSet<Node>> candidateNodesByRacks, boolean considerLoad, long blockSize, List<DatanodeDescriptor> results) { List<Map.Entry<String, HashSet<Node>>> sorted = new ArrayList<Map.Entry<String, HashSet<Node>>>(); for (Map.Entry<String, HashSet<Node>> entry : candidateNodesByRacks.entrySet()) { sorted.add(entry); } Collections.sort(sorted, new RackComparator(blockSize)); int count = sorted.size() / 4; Collections.shuffle(sorted.subList(0, count)); for (Map.Entry<String, HashSet<Node>> e : sorted) { if (getGoodNode(e.getValue(), considerLoad, blockSize, results)) { return true; } } return false; } // Helper function to find a good node. Returns true if found. private boolean getGoodNode(Set<Node> candidateNodes, boolean considerLoad, long blockSize, List<DatanodeDescriptor> results) { List<DatanodeDescriptor> sorted = new ArrayList<DatanodeDescriptor>(); for (Node n : candidateNodes) { sorted.add((DatanodeDescriptor) n); } final long blocksize = blockSize; Collections.sort(sorted, new Comparator<DatanodeDescriptor>() { public int compare(DatanodeDescriptor n1, DatanodeDescriptor n2) { long ret = (n2.getRemaining() - (n2.getBlocksScheduled() * blocksize)) - (n1.getRemaining() - (n1.getBlocksScheduled() * blocksize)); return ret == 0 ? 0 : (ret > 0) ? 1 : -1; } }); // Also, add some randomness. We are doing so because it seems // that if there are many copies scheduled at the same time, namenode // does not have the uptodate information. So, we need to add some // randomness so that there is not a lot of copies targeted to // the same node, which will overload the hosts and may lead to // timeouts. int count = sorted.size() / 2; Collections.shuffle(sorted.subList(0, count)); for (DatanodeDescriptor n : sorted) { if (this.isGoodTarget((DatanodeDescriptor) n, blocksize, 1, // MaxTargerPerLoc (per rack) considerLoad, results)) { results.add((DatanodeDescriptor) n); return true; } } return false; } }