Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.balancer; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.IOException; import java.io.OutputStream; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.Socket; import java.text.DateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Date; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Formatter; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.server.common.HdfsConstants; import org.apache.hadoop.hdfs.server.common.Util; import 
org.apache.hadoop.hdfs.DFSClient; import org.apache.hadoop.hdfs.protocol.*; import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType; import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.io.retry.RetryProxy; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; /** <p>The balancer is a tool that balances disk space usage on an HDFS cluster * when some datanodes become full or when new empty nodes join the cluster. * The tool is deployed as an application program that can be run by the * cluster administrator on a live HDFS cluster while applications * adding and deleting files. 
* * <p>SYNOPSIS * <pre> * To start: * bin/start-balancer.sh [-threshold <threshold>] * Example: bin/ start-balancer.sh * start the balancer with a default threshold of 10% * bin/ start-balancer.sh -threshold 5 * start the balancer with a threshold of 5% * To stop: * bin/ stop-balancer.sh * </pre> * * <p>DESCRIPTION * <p>The threshold parameter is a fraction in the range of (0%, 100%) with a * default value of 10%. The threshold sets a target for whether the cluster * is balanced. A cluster is balanced if for each datanode, the utilization * of the node (ratio of used space at the node to total capacity of the node) * differs from the utilization of the cluster (ratio of used space in the cluster * to total capacity of the cluster) by no more than the threshold value. * The smaller the threshold, the more balanced a cluster will become. * It takes more time to run the balancer for small threshold values. * Also for a very small threshold the cluster may not be able to reach the * balanced state when applications write and delete files concurrently. * * <p>The tool moves blocks from highly utilized datanodes to poorly * utilized datanodes iteratively. In each iteration a datanode moves or * receives no more than the lesser of 10G bytes or the threshold fraction * of its capacity. Each iteration runs no more than 20 minutes. * At the end of each iteration, the balancer obtains updated datanodes * information from the namenode. * * <p>A system property that limits the balancer's use of bandwidth is * defined in the default configuration file: * <pre> * <property> * <name>dfs.balance.bandwidthPerSec</name> * <value>1048576</value> * <description> Specifies the maximum bandwidth that each datanode * can utilize for the balancing purpose in term of the number of bytes * per second. </description> * </property> * </pre> * * <p>This property determines the maximum speed at which a block will be * moved from one datanode to another. The default value is 1MB/s. 
The higher * the bandwidth, the faster a cluster can reach the balanced state, * but with greater competition with application processes. If an * administrator changes the value of this property in the configuration * file, the change is observed when HDFS is next restarted. * * <p>MONITORING BALANCER PROGRESS * <p>After the balancer is started, an output file name where the balancer * progress will be recorded is printed on the screen. The administrator * can monitor the running of the balancer by reading the output file. * The output shows the balancer's status iteration by iteration. In each * iteration it prints the starting time, the iteration number, the total * number of bytes that have been moved in the previous iterations, * the total number of bytes that are left to move in order for the cluster * to be balanced, and the number of bytes that are being moved in this * iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left * To Move" is decreasing. * * <p>Running multiple instances of the balancer in an HDFS cluster is * prohibited by the tool. * * <p>The balancer automatically exits when any of the following five * conditions is satisfied: * <ol> * <li>The cluster is balanced; * <li>No block can be moved; * <li>No block has been moved for five consecutive iterations; * <li>An IOException occurs while communicating with the namenode; * <li>Another balancer is running. * </ol> * * <p>Upon exit, a balancer returns an exit code and prints one of the * following messages to the output file, corresponding to the above exit * reasons: * <ol> * <li>The cluster is balanced. Exiting * <li>No block can be moved. Exiting... * <li>No block has been moved for 3 iterations. Exiting... * <li>Received an IO exception: failure reason. Exiting... * <li>Another balancer is running. Exiting... 
* </ol> * * <p>The administrator can interrupt the execution of the balancer at any * time by running the command "stop-balancer.sh" on the machine where the * balancer is running. */ public class Balancer implements Tool { private static final Log LOG = LogFactory.getLog(Balancer.class.getName()); final private static long MAX_BLOCKS_SIZE_TO_FETCH = 2 * 1024 * 1024 * 1024L; //2GB /** The maximum number of concurrent blocks moves for * balancing purpose at a datanode */ public static final int MAX_NUM_CONCURRENT_MOVES = 5; private Configuration conf; private double threshold = 10D; private NamenodeProtocol namenode; private ClientProtocol client; private FileSystem fs; private boolean isBlockTokenEnabled; private boolean shouldRun; private long keyUpdaterInterval; private BlockTokenSecretManager blockTokenSecretManager; private Daemon keyupdaterthread = null; // AccessKeyUpdater thread private final static Random rnd = new Random(); // all data node lists private Collection<Source> overUtilizedDatanodes = new LinkedList<Source>(); private Collection<Source> aboveAvgUtilizedDatanodes = new LinkedList<Source>(); private Collection<BalancerDatanode> belowAvgUtilizedDatanodes = new LinkedList<BalancerDatanode>(); private Collection<BalancerDatanode> underUtilizedDatanodes = new LinkedList<BalancerDatanode>(); private Collection<Source> sources = new HashSet<Source>(); private Collection<BalancerDatanode> targets = new HashSet<BalancerDatanode>(); private Map<Block, BalancerBlock> globalBlockList = new HashMap<Block, BalancerBlock>(); private MovedBlocks movedBlocks = new MovedBlocks(); private Map<String, BalancerDatanode> datanodes = new HashMap<String, BalancerDatanode>(); private NetworkTopology cluster = new NetworkTopology(); private double avgUtilization = 0.0D; final static private int MOVER_THREAD_POOL_SIZE = 1000; final private ExecutorService moverExecutor = Executors.newFixedThreadPool(MOVER_THREAD_POOL_SIZE); final static private int 
DISPATCHER_THREAD_POOL_SIZE = 200;
  // Fixed-size pool with DISPATCHER_THREAD_POOL_SIZE threads.
  // NOTE(review): presumably runs the per-source dispatcher tasks -- confirm
  // at the submission site, which is not visible here.
  final private ExecutorService dispatcherExecutor =
      Executors.newFixedThreadPool(DISPATCHER_THREAD_POOL_SIZE);

  /* This class keeps track of a scheduled block move.
   *
   * A move is described by the block itself, the source it leaves, the
   * target that receives it, and a proxy source, i.e. one of the datanodes
   * listed in the block's locations. block and proxySource are chosen by
   * chooseBlockAndProxy(); source and target must be set before that call.
   */
  private class PendingBlockMove {
    private BalancerBlock block;
    private Source source;
    private BalancerDatanode proxySource;
    private BalancerDatanode target;

    /** constructor */
    private PendingBlockMove() {
    }

    /* choose a block & a proxy source for this pendingMove
     * whose source & target have already been chosen.
     *
     * The chosen block is removed from the source's block iterator so that
     * it is not offered again.
     *
     * Return true if a block and its proxy are chosen; false otherwise
     */
    private boolean chooseBlockAndProxy() {
      // iterate all source's blocks until find a good one
      for (Iterator<BalancerBlock> blocks = source.getBlockIterator();
          blocks.hasNext();) {
        if (markMovedIfGoodBlock(blocks.next())) {
          blocks.remove();
          return true;
        }
      }
      return false;
    }

    /* Return true if the given block is good for the tentative move;
     * If it is good, add it to the moved list to mark it as "Moved".
     * A block is good if
     * 1. it is a good candidate; see isGoodBlockCandidate
     * 2. can find a proxy source that's not busy for this move
     *
     * Locks are taken on the block first and on movedBlocks second.
     * this.block is assigned before chooseProxySource() runs because that
     * method reads the block's locations; it stays assigned even when no
     * proxy is found and false is returned.
     */
    private boolean markMovedIfGoodBlock(BalancerBlock block) {
      synchronized (block) {
        synchronized (movedBlocks) {
          if (isGoodBlockCandidate(source, target, block)) {
            this.block = block;
            if (chooseProxySource()) {
              movedBlocks.add(block);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Decided to move block " + block.getBlockId()
                    + " with a length of "
                    + StringUtils.byteDesc(block.getNumBytes())
                    + " bytes from " + source.getName() + " to "
                    + target.getName() + " using proxy source "
                    + proxySource.getName());
              }
              return true;
            }
          }
        }
      }
      return false;
    }

    /* Now we find out source, target, and block, we need to find a proxy
     *
     * A replica on the same rack as the target is preferred; otherwise the
     * first location that accepts this move into its pending queue is used.
     * On success this move has been added to the proxy's pending queue.
     *
     * @return true if a proxy is found; otherwise false
     */
    private boolean chooseProxySource() {
      // check if there is replica which is on the same rack with the target
      for (BalancerDatanode loc : block.getLocations()) {
        if (cluster.isOnSameRack(loc.getDatanode(), target.getDatanode())) {
          if (loc.addPendingBlock(this)) {
            proxySource = loc;
            return true;
          }
        }
      }
      // find out a non-busy replica
      for (BalancerDatanode loc : block.getLocations()) {
        if (loc.addPendingBlock(this)) {
          proxySource = loc;
          return true;
        }
      }
      return false;
    }

    /* Dispatch the block move task & wait for the response.
     *
     * The socket is opened to the TARGET datanode; the request sent over it
     * names the proxy source (see sendRequest). An IOException is logged as
     * a warning and not rethrown. In every case the streams and socket are
     * closed, this move is removed from the proxy's and the target's pending
     * queues, the object is reset, and threads waiting on the Balancer
     * monitor are notified.
     */
    private void dispatch() {
      Socket sock = new Socket();
      DataOutputStream out = null;
      DataInputStream in = null;
      try {
        sock.connect(NetUtils.createSocketAddr(target.datanode.getName()),
            HdfsConstants.READ_TIMEOUT);
        sock.setKeepAlive(true);
        out = new DataOutputStream(
            new BufferedOutputStream(sock.getOutputStream(), FSConstants.BUFFER_SIZE));
        sendRequest(out);
        in = new DataInputStream(new BufferedInputStream(sock.getInputStream(),
            FSConstants.BUFFER_SIZE));
        receiveResponse(in);
        // only reached when the response reported success
        bytesMoved.inc(block.getNumBytes());
        LOG.info("Moving block " + block.getBlock().getBlockId() + " from "
            + source.getName() + " to " + target.getName() + " through "
            + proxySource.getName() + " is succeeded.");
      } catch (IOException e) {
        LOG.warn("Error moving block " + block.getBlockId() + " from "
            + source.getName() + " to " + target.getName() + " through "
            + proxySource.getName() + ": " + e.getMessage());
      } finally {
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
        IOUtils.closeSocket(sock);

        proxySource.removePendingBlock(this);
        synchronized (target) {
          target.removePendingBlock(this);
        }

        synchronized (this) {
          reset();
        }
        // wake up threads blocked in Balancer.this.wait(...)
        synchronized (Balancer.this) {
          Balancer.this.notifyAll();
        }
      }
    }

    /* Send a block replace request to the output stream.
     *
     * Written in order: protocol version, OP_REPLACE_BLOCK, block id,
     * generation stamp, the source's storage id, the serialized proxy
     * source, and an access token (DUMMY_TOKEN unless block tokens are
     * enabled). The stream is flushed at the end.
     */
    private void sendRequest(DataOutputStream out) throws IOException {
      out.writeShort(DataTransferProtocol.DATA_TRANSFER_VERSION);
      out.writeByte(DataTransferProtocol.OP_REPLACE_BLOCK);
      out.writeLong(block.getBlock().getBlockId());
      out.writeLong(block.getBlock().getGenerationStamp());
      Text.writeString(out, source.getStorageID());
      proxySource.write(out);
      Token<BlockTokenIdentifier> accessToken = BlockTokenSecretManager.DUMMY_TOKEN;
      if (isBlockTokenEnabled) {
        accessToken = blockTokenSecretManager.generateToken(null, block.getBlock(),
            EnumSet.of(BlockTokenSecretManager.AccessMode.REPLACE,
                BlockTokenSecretManager.AccessMode.COPY));
      }
      accessToken.write(out);
      out.flush();
    }

    /* Receive a block copy response from the input stream.
     * Reads one short status code and throws an IOException for anything
     * other than OP_STATUS_SUCCESS, with a dedicated message for access
     * token errors.
     */
    private void receiveResponse(DataInputStream in) throws IOException {
      short status = in.readShort();
      if (status != DataTransferProtocol.OP_STATUS_SUCCESS) {
        if (status == DataTransferProtocol.OP_STATUS_ERROR_ACCESS_TOKEN)
          throw new IOException("block move failed due to access token error");
        throw new IOException("block move is failed");
      }
    }

    /* reset the object: clears block, source, proxy source and target */
    private void reset() {
      block = null;
      source = null;
      proxySource = null;
      target = null;
    }

    /* start a thread to dispatch the block move: the work is handed to
     * moverExecutor, which runs dispatch() */
    private void scheduleBlockMove() {
      moverExecutor.execute(new Runnable() {
        public void run() {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Starting moving " + block.getBlockId() + " from "
                + proxySource.getName() + " to " + target.getName());
          }
          dispatch();
        }
      });
    }
  }

  /* A class for keeping track of blocks in the Balancer.
   * Pairs a Block with the list of datanodes currently known to hold it.
   * Methods that touch the location list are synchronized on this object.
   */
  static private class BalancerBlock {
    private Block block; // the block
    private List<BalancerDatanode> locations =
        new ArrayList<BalancerDatanode>(3); // its locations

    /* Constructor */
    private BalancerBlock(Block block) {
      this.block = block;
    }

    /* clean block locations */
    private synchronized void clearLocations() {
      locations.clear();
    }

    /* add a location; a datanode already in the list is not added twice */
    private synchronized void addLocation(BalancerDatanode datanode) {
      if (!locations.contains(datanode)) {
        locations.add(datanode);
      }
    }

    /* Return if the block is located on <code>datanode</code> */
    private synchronized boolean isLocatedOnDatanode(BalancerDatanode datanode) {
      return locations.contains(datanode);
    }

    /* Return its locations.
     * The internal list itself is returned, not a copy. */
    private synchronized List<BalancerDatanode> getLocations() {
      return locations;
    }

    /* Return the block */
    private Block getBlock() {
      return block;
    }

    /* Return the block id */
    private long getBlockId() {
      return block.getBlockId();
    }

    /* Return the length of the block */
    private long getNumBytes() {
      return block.getNumBytes();
    }
  }

  /* The class represents a desired move of bytes between two nodes
   * and the target.
   * An object of this class is stored in a source node.
*/ static private class NodeTask { private BalancerDatanode datanode; //target node private long size; //bytes scheduled to move /* constructor */ private NodeTask(BalancerDatanode datanode, long size) { this.datanode = datanode; this.size = size; } /* Get the node */ private BalancerDatanode getDatanode() { return datanode; } /* Get the number of bytes that need to be moved */ private long getSize() { return size; } } /* Return the utilization of a datanode */ static private double getUtilization(DatanodeInfo datanode) { return ((double) datanode.getDfsUsed()) / datanode.getCapacity() * 100; } /* A class that keeps track of a datanode in Balancer */ private static class BalancerDatanode implements Writable { final private static long MAX_SIZE_TO_MOVE = 10 * 1024 * 1024 * 1024L; //10GB protected DatanodeInfo datanode; private double utilization; protected long maxSizeToMove; protected long scheduledSize = 0L; // blocks being moved but not confirmed yet private List<PendingBlockMove> pendingBlocks = new ArrayList<PendingBlockMove>(MAX_NUM_CONCURRENT_MOVES); /* Constructor * Depending on avgutil & threshold, calculate maximum bytes to move */ private BalancerDatanode(DatanodeInfo node, double avgUtil, double threshold) { datanode = node; utilization = Balancer.getUtilization(node); if (utilization >= avgUtil + threshold || utilization <= avgUtil - threshold) { maxSizeToMove = (long) (threshold * datanode.getCapacity() / 100); } else { maxSizeToMove = (long) (Math.abs(avgUtil - utilization) * datanode.getCapacity() / 100); } if (utilization < avgUtil) { maxSizeToMove = Math.min(datanode.getRemaining(), maxSizeToMove); } maxSizeToMove = Math.min(MAX_SIZE_TO_MOVE, maxSizeToMove); } /** Get the datanode */ protected DatanodeInfo getDatanode() { return datanode; } /** Get the name of the datanode */ protected String getName() { return datanode.getName(); } /* Get the storage id of the datanode */ protected String getStorageID() { return datanode.getStorageID(); } /** 
Decide if still need to move more bytes */ protected boolean isMoveQuotaFull() { return scheduledSize < maxSizeToMove; } /** Return the total number of bytes that need to be moved */ protected long availableSizeToMove() { return maxSizeToMove - scheduledSize; } /* increment scheduled size */ protected void incScheduledSize(long size) { scheduledSize += size; } /* Check if the node can schedule more blocks to move */ synchronized private boolean isPendingQNotFull() { if (pendingBlocks.size() < MAX_NUM_CONCURRENT_MOVES) { return true; } return false; } /* Check if all the dispatched moves are done */ synchronized private boolean isPendingQEmpty() { return pendingBlocks.isEmpty(); } /* Add a scheduled block move to the node */ private synchronized boolean addPendingBlock(PendingBlockMove pendingBlock) { if (isPendingQNotFull()) { return pendingBlocks.add(pendingBlock); } return false; } /* Remove a scheduled block move from the node */ private synchronized boolean removePendingBlock(PendingBlockMove pendingBlock) { return pendingBlocks.remove(pendingBlock); } /** The following two methods support the Writable interface */ /** Deserialize */ public void readFields(DataInput in) throws IOException { datanode.readFields(in); } /** Serialize */ public void write(DataOutput out) throws IOException { datanode.write(out); } } /** A node that can be the sources of a block move */ private class Source extends BalancerDatanode { /* A thread that initiates a block move * and waits for block move to complete */ private class BlockMoveDispatcher implements Runnable { public void run() { dispatchBlocks(); } } private ArrayList<NodeTask> nodeTasks = new ArrayList<NodeTask>(2); private long blocksToReceive = 0L; /* source blocks point to balancerBlocks in the global list because * we want to keep one copy of a block in balancer and be aware that * the locations are changing over time. 
*/ private List<BalancerBlock> srcBlockList = new ArrayList<BalancerBlock>(); /* constructor */ private Source(DatanodeInfo node, double avgUtil, double threshold) { super(node, avgUtil, threshold); } /** Add a node task */ private void addNodeTask(NodeTask task) { assert (task.datanode != this) : "Source and target are the same " + datanode.getName(); incScheduledSize(task.getSize()); nodeTasks.add(task); } /* Return an iterator to this source's blocks */ private Iterator<BalancerBlock> getBlockIterator() { return srcBlockList.iterator(); } /* fetch new blocks of this source from namenode and * update this source's block list & the global block list * Return the total size of the received blocks in the number of bytes. */ private long getBlockList() throws IOException { BlockWithLocations[] newBlocks = namenode .getBlocks(datanode, (long) Math.min(MAX_BLOCKS_SIZE_TO_FETCH, blocksToReceive)).getBlocks(); long bytesReceived = 0; for (BlockWithLocations blk : newBlocks) { bytesReceived += blk.getBlock().getNumBytes(); BalancerBlock block; synchronized (globalBlockList) { block = globalBlockList.get(blk.getBlock()); if (block == null) { block = new BalancerBlock(blk.getBlock()); globalBlockList.put(blk.getBlock(), block); } else { block.clearLocations(); } synchronized (block) { // update locations for (String location : blk.getDatanodes()) { BalancerDatanode datanode = datanodes.get(location); if (datanode != null) { // not an unknown datanode block.addLocation(datanode); } } } if (!srcBlockList.contains(block) && isGoodBlockCandidate(block)) { // filter bad candidates srcBlockList.add(block); } } } return bytesReceived; } /* Decide if the given block is a good candidate to move or not */ private boolean isGoodBlockCandidate(BalancerBlock block) { for (NodeTask nodeTask : nodeTasks) { if (Balancer.this.isGoodBlockCandidate(this, nodeTask.datanode, block)) { return true; } } return false; } /* Return a block that's good for the source thread to dispatch immediately * 
The block's source, target, and proxy source are determined too. * When choosing proxy and target, source & target throttling * has been considered. They are chosen only when they have the capacity * to support this block move. * The block should be dispatched immediately after this method is returned. */ private PendingBlockMove chooseNextBlockToMove() { for (Iterator<NodeTask> tasks = nodeTasks.iterator(); tasks.hasNext();) { NodeTask task = tasks.next(); BalancerDatanode target = task.getDatanode(); PendingBlockMove pendingBlock = new PendingBlockMove(); if (target.addPendingBlock(pendingBlock)) { // target is not busy, so do a tentative block allocation pendingBlock.source = this; pendingBlock.target = target; if (pendingBlock.chooseBlockAndProxy()) { long blockSize = pendingBlock.block.getNumBytes(); scheduledSize -= blockSize; task.size -= blockSize; if (task.size == 0) { tasks.remove(); } return pendingBlock; } else { // cancel the tentative move target.removePendingBlock(pendingBlock); } } } return null; } /* iterate all source's blocks to remove moved ones */ private void filterMovedBlocks() { for (Iterator<BalancerBlock> blocks = getBlockIterator(); blocks.hasNext();) { if (movedBlocks.contains(blocks.next())) { blocks.remove(); } } } private static final int SOURCE_BLOCK_LIST_MIN_SIZE = 5; /* Return if should fetch more blocks from namenode */ private boolean shouldFetchMoreBlocks() { return srcBlockList.size() < SOURCE_BLOCK_LIST_MIN_SIZE && blocksToReceive > 0; } /* This method iteratively does the following: * it first selects a block to move, * then sends a request to the proxy source to start the block move * when the source's block list falls below a threshold, it asks * the namenode for more blocks. * It terminates when it has dispatch enough block move tasks or * it has received enough blocks from the namenode, or * the elapsed time of the iteration has exceeded the max time limit. 
*/ private static final long MAX_ITERATION_TIME = 20 * 60 * 1000L; //20 mins private void dispatchBlocks() { long startTime = Util.now(); this.blocksToReceive = 2 * scheduledSize; boolean isTimeUp = false; while (!isTimeUp && scheduledSize > 0 && (!srcBlockList.isEmpty() || blocksToReceive > 0)) { PendingBlockMove pendingBlock = chooseNextBlockToMove(); if (pendingBlock != null) { // move the block pendingBlock.scheduleBlockMove(); continue; } /* Since we can not schedule any block to move, * filter any moved blocks from the source block list and * check if we should fetch more blocks from the namenode */ filterMovedBlocks(); // filter already moved blocks if (shouldFetchMoreBlocks()) { // fetch new blocks try { blocksToReceive -= getBlockList(); continue; } catch (IOException e) { LOG.warn(StringUtils.stringifyException(e)); return; } } // check if time is up or not if (Util.now() - startTime > MAX_ITERATION_TIME) { isTimeUp = true; continue; } /* Now we can not schedule any block to move and there are * no new blocks added to the source block list, so we wait. 
*/ try { synchronized (Balancer.this) { Balancer.this.wait(1000); // wait for targets/sources to be idle } } catch (InterruptedException ignored) { } } } } /** Default constructor */ Balancer() { } /** Construct a balancer from the given configuration */ Balancer(Configuration conf) { setConf(conf); } /** Construct a balancer from the given configuration and threshold */ Balancer(Configuration conf, double threshold) { setConf(conf); this.threshold = threshold; } /** * Run a balancer * @param args */ public static void main(String[] args) { try { System.exit(ToolRunner.run(null, new Balancer(), args)); } catch (Throwable e) { LOG.error(StringUtils.stringifyException(e)); System.exit(-1); } } private static void printUsage() { System.out.println("Usage: java Balancer"); System.out.println(" [-threshold <threshold>]\t" + "percentage of disk capacity"); } /* parse argument to get the threshold */ private double parseArgs(String[] args) { double threshold = 0; int argsLen = (args == null) ? 0 : args.length; if (argsLen == 0) { threshold = 10; } else { if (argsLen != 2 || !"-threshold".equalsIgnoreCase(args[0])) { printUsage(); throw new IllegalArgumentException(Arrays.toString(args)); } else { try { threshold = Double.parseDouble(args[1]); if (threshold < 0 || threshold > 100) { throw new NumberFormatException(); } LOG.info("Using a threshold of " + threshold); } catch (NumberFormatException e) { System.err.println("Expect a double parameter in the range of [0, 100]: " + args[1]); printUsage(); throw e; } } } return threshold; } /* Initialize balancer. It sets the value of the threshold, and * builds the communication proxies to * namenode as a client and a secondary namenode and retry proxies * when connection fails. 
*/ private void init(double threshold) throws IOException { this.threshold = threshold; this.namenode = createNamenode(conf); this.client = DFSClient.createNamenode(conf); this.fs = FileSystem.get(conf); ExportedBlockKeys keys = namenode.getBlockKeys(); this.isBlockTokenEnabled = keys.isBlockTokenEnabled(); if (isBlockTokenEnabled) { long blockKeyUpdateInterval = keys.getKeyUpdateInterval(); long blockTokenLifetime = keys.getTokenLifetime(); LOG.info( "Block token params received from NN: keyUpdateInterval=" + blockKeyUpdateInterval / (60 * 1000) + " min(s), tokenLifetime=" + blockTokenLifetime / (60 * 1000) + " min(s)"); this.blockTokenSecretManager = new BlockTokenSecretManager(false, blockKeyUpdateInterval, blockTokenLifetime); this.blockTokenSecretManager.setKeys(keys); /* * Balancer should sync its block keys with NN more frequently than NN * updates its block keys */ this.keyUpdaterInterval = blockKeyUpdateInterval / 4; LOG.info( "Balancer will update its block keys every " + keyUpdaterInterval / (60 * 1000) + " minute(s)"); this.keyupdaterthread = new Daemon(new BlockKeyUpdater()); this.shouldRun = true; this.keyupdaterthread.start(); } } /** * Periodically updates access keys. */ class BlockKeyUpdater implements Runnable { public void run() { while (shouldRun) { try { blockTokenSecretManager.setKeys(namenode.getBlockKeys()); } catch (Exception e) { LOG.error(StringUtils.stringifyException(e)); } try { Thread.sleep(keyUpdaterInterval); } catch (InterruptedException ie) { } } } } /* Build a NamenodeProtocol connection to the namenode and * set up the retry policy */ private static NamenodeProtocol createNamenode(Configuration conf) throws IOException { InetSocketAddress nameNodeAddr = NameNode.getServiceAddress(conf, true); RetryPolicy timeoutPolicy = RetryPolicies.exponentialBackoffRetry(5, 200, TimeUnit.MILLISECONDS); Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap = new HashMap<Class<? 
extends Exception>, RetryPolicy>(); RetryPolicy methodPolicy = RetryPolicies.retryByException(timeoutPolicy, exceptionToPolicyMap); Map<String, RetryPolicy> methodNameToPolicyMap = new HashMap<String, RetryPolicy>(); methodNameToPolicyMap.put("getBlocks", methodPolicy); methodNameToPolicyMap.put("getAccessKeys", methodPolicy); UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); return (NamenodeProtocol) RetryProxy.create(NamenodeProtocol.class, RPC.getProxy(NamenodeProtocol.class, NamenodeProtocol.versionID, nameNodeAddr, ugi, conf, NetUtils.getDefaultSocketFactory(conf)), methodNameToPolicyMap); } /* Shuffle datanode array */ static private void shuffleArray(DatanodeInfo[] datanodes) { for (int i = datanodes.length; i > 1; i--) { int randomIndex = rnd.nextInt(i); DatanodeInfo tmp = datanodes[randomIndex]; datanodes[randomIndex] = datanodes[i - 1]; datanodes[i - 1] = tmp; } } /* get all live datanodes of a cluster and their disk usage * decide the number of bytes need to be moved */ private long initNodes() throws IOException { return initNodes(client.getDatanodeReport(DatanodeReportType.LIVE)); } /* Given a data node set, build a network topology and decide * over-utilized datanodes, above average utilized datanodes, * below average utilized datanodes, and underutilized datanodes. * The input data node set is shuffled before the datanodes * are put into the over-utilized datanodes, above average utilized * datanodes, below average utilized datanodes, and * underutilized datanodes lists. This will add some randomness * to the node matching later on. * * @return the total number of bytes that are * needed to move to make the cluster balanced. 
* @param datanodes a set of datanodes */ private long initNodes(DatanodeInfo[] datanodes) { // compute average utilization long totalCapacity = 0L, totalUsedSpace = 0L; for (DatanodeInfo datanode : datanodes) { if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) { continue; // ignore decommissioning or decommissioned nodes } totalCapacity += datanode.getCapacity(); totalUsedSpace += datanode.getDfsUsed(); } this.avgUtilization = ((double) totalUsedSpace) / totalCapacity * 100; /*create network topology and all data node lists: * overloaded, above-average, below-average, and underloaded * we alternates the accessing of the given datanodes array either by * an increasing order or a decreasing order. */ long overLoadedBytes = 0L, underLoadedBytes = 0L; shuffleArray(datanodes); for (DatanodeInfo datanode : datanodes) { if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) { continue; // ignore decommissioning or decommissioned nodes } cluster.add(datanode); BalancerDatanode datanodeS; if (getUtilization(datanode) > avgUtilization) { datanodeS = new Source(datanode, avgUtilization, threshold); if (isAboveAvgUtilized(datanodeS)) { this.aboveAvgUtilizedDatanodes.add((Source) datanodeS); } else { assert (isOverUtilized(datanodeS)) : datanodeS.getName() + "is not an overUtilized node"; this.overUtilizedDatanodes.add((Source) datanodeS); overLoadedBytes += (long) ((datanodeS.utilization - avgUtilization - threshold) * datanodeS.datanode.getCapacity() / 100.0); } } else { datanodeS = new BalancerDatanode(datanode, avgUtilization, threshold); if (isBelowAvgUtilized(datanodeS)) { this.belowAvgUtilizedDatanodes.add(datanodeS); } else { assert (isUnderUtilized(datanodeS)) : datanodeS.getName() + "is not an underUtilized node"; this.underUtilizedDatanodes.add(datanodeS); underLoadedBytes += (long) ((avgUtilization - threshold - datanodeS.utilization) * datanodeS.datanode.getCapacity() / 100.0); } } this.datanodes.put(datanode.getStorageID(), 
datanodeS); } //logging logImbalancedNodes(); assert (this.datanodes.size() == overUtilizedDatanodes.size() + underUtilizedDatanodes.size() + aboveAvgUtilizedDatanodes.size() + belowAvgUtilizedDatanodes.size()) : "Mismatched number of datanodes"; // return number of bytes to be moved in order to make the cluster balanced return Math.max(overLoadedBytes, underLoadedBytes); } /* log the over utilized & under utilized nodes */ private void logImbalancedNodes() { StringBuilder msg = new StringBuilder(); msg.append(overUtilizedDatanodes.size()); msg.append(" over utilized nodes:"); for (Source node : overUtilizedDatanodes) { msg.append(" "); msg.append(node.getName()); } LOG.info(msg); msg = new StringBuilder(); msg.append(underUtilizedDatanodes.size()); msg.append(" under utilized nodes: "); for (BalancerDatanode node : underUtilizedDatanodes) { msg.append(" "); msg.append(node.getName()); } LOG.info(msg); } /* Decide all <source, target> pairs and * the number of bytes to move from a source to a target * Maximum bytes to be moved per node is * Min(1 Band worth of bytes, MAX_SIZE_TO_MOVE). 
* Return total number of bytes to move in this iteration */ private long chooseNodes() { // Match nodes on the same rack first chooseNodes(true); // Then match nodes on different racks chooseNodes(false); assert (datanodes.size() == overUtilizedDatanodes.size() + underUtilizedDatanodes.size() + aboveAvgUtilizedDatanodes.size() + belowAvgUtilizedDatanodes.size() + sources.size() + targets.size()) : "Mismatched number of datanodes"; long bytesToMove = 0L; for (Source src : sources) { bytesToMove += src.scheduledSize; } return bytesToMove; } /* if onRack is true, decide all <source, target> pairs * where source and target are on the same rack; Otherwise * decide all <source, target> pairs where source and target are * on different racks */ private void chooseNodes(boolean onRack) { /* first step: match each overUtilized datanode (source) to * one or more underUtilized datanodes (targets). */ chooseTargets(underUtilizedDatanodes.iterator(), onRack); /* match each remaining overutilized datanode (source) to * below average utilized datanodes (targets). * Note only overutilized datanodes that haven't had that max bytes to move * satisfied in step 1 are selected */ chooseTargets(belowAvgUtilizedDatanodes.iterator(), onRack); /* match each remaining underutilized datanode to * above average utilized datanodes. * Note only underutilized datanodes that have not had that max bytes to * move satisfied in step 1 are selected. */ chooseSources(aboveAvgUtilizedDatanodes.iterator(), onRack); } /* choose targets from the target candidate list for each over utilized * source datanode. 
OnRackTarget determines if the chosen target * should be on the same rack as the source */ private void chooseTargets(Iterator<BalancerDatanode> targetCandidates, boolean onRackTarget) { for (Iterator<Source> srcIterator = overUtilizedDatanodes.iterator(); srcIterator.hasNext();) { Source source = srcIterator.next(); while (chooseTarget(source, targetCandidates, onRackTarget)) { } if (!source.isMoveQuotaFull()) { srcIterator.remove(); } } return; } /* choose sources from the source candidate list for each under utilized * target datanode. onRackSource determines if the chosen source * should be on the same rack as the target */ private void chooseSources(Iterator<Source> sourceCandidates, boolean onRackSource) { for (Iterator<BalancerDatanode> targetIterator = underUtilizedDatanodes.iterator(); targetIterator .hasNext();) { BalancerDatanode target = targetIterator.next(); while (chooseSource(target, sourceCandidates, onRackSource)) { } if (!target.isMoveQuotaFull()) { targetIterator.remove(); } } return; } /* For the given source, choose targets from the target candidate list. 
* OnRackTarget determines if the chosen target * should be on the same rack as the source */ private boolean chooseTarget(Source source, Iterator<BalancerDatanode> targetCandidates, boolean onRackTarget) { if (!source.isMoveQuotaFull()) { return false; } boolean foundTarget = false; BalancerDatanode target = null; while (!foundTarget && targetCandidates.hasNext()) { target = targetCandidates.next(); if (!target.isMoveQuotaFull()) { targetCandidates.remove(); continue; } if (onRackTarget) { // choose from on-rack nodes if (cluster.isOnSameRack(source.datanode, target.datanode)) { foundTarget = true; } } else { // choose from off-rack nodes if (!cluster.isOnSameRack(source.datanode, target.datanode)) { foundTarget = true; } } } if (foundTarget) { assert (target != null) : "Choose a null target"; long size = Math.min(source.availableSizeToMove(), target.availableSizeToMove()); NodeTask nodeTask = new NodeTask(target, size); source.addNodeTask(nodeTask); target.incScheduledSize(nodeTask.getSize()); sources.add(source); targets.add(target); if (!target.isMoveQuotaFull()) { targetCandidates.remove(); } LOG.info("Decided to move " + StringUtils.byteDesc(size) + " bytes from " + source.datanode.getName() + " to " + target.datanode.getName()); return true; } return false; } /* For the given target, choose sources from the source candidate list. 
* OnRackSource determines if the chosen source * should be on the same rack as the target */ private boolean chooseSource(BalancerDatanode target, Iterator<Source> sourceCandidates, boolean onRackSource) { if (!target.isMoveQuotaFull()) { return false; } boolean foundSource = false; Source source = null; while (!foundSource && sourceCandidates.hasNext()) { source = sourceCandidates.next(); if (!source.isMoveQuotaFull()) { sourceCandidates.remove(); continue; } if (onRackSource) { // choose from on-rack nodes if (cluster.isOnSameRack(source.getDatanode(), target.getDatanode())) { foundSource = true; } } else { // choose from off-rack nodes if (!cluster.isOnSameRack(source.datanode, target.datanode)) { foundSource = true; } } } if (foundSource) { assert (source != null) : "Choose a null source"; long size = Math.min(source.availableSizeToMove(), target.availableSizeToMove()); NodeTask nodeTask = new NodeTask(target, size); source.addNodeTask(nodeTask); target.incScheduledSize(nodeTask.getSize()); sources.add(source); targets.add(target); if (!source.isMoveQuotaFull()) { sourceCandidates.remove(); } LOG.info("Decided to move " + StringUtils.byteDesc(size) + " bytes from " + source.datanode.getName() + " to " + target.datanode.getName()); return true; } return false; } private static class BytesMoved { private long bytesMoved = 0L;; private synchronized void inc(long bytes) { bytesMoved += bytes; } private long get() { return bytesMoved; } }; private BytesMoved bytesMoved = new BytesMoved(); private int notChangedIterations = 0; /* Start a thread to dispatch block moves for each source. * The thread selects blocks to move & sends request to proxy source to * initiate block move. The process is flow controlled. Block selection is * blocked if there are too many un-confirmed block moves. * Return the total number of bytes successfully moved in this iteration. 
*/ private long dispatchBlockMoves() throws InterruptedException { long bytesLastMoved = bytesMoved.get(); Future<?>[] futures = new Future<?>[sources.size()]; int i = 0; for (Source source : sources) { futures[i++] = dispatcherExecutor.submit(source.new BlockMoveDispatcher()); } // wait for all dispatcher threads to finish for (Future<?> future : futures) { try { future.get(); } catch (ExecutionException e) { LOG.warn("Dispatcher thread failed", e.getCause()); } } // wait for all block moving to be done waitForMoveCompletion(); return bytesMoved.get() - bytesLastMoved; } // The sleeping period before checking if block move is completed again static private long blockMoveWaitTime = 30000L; /** set the sleeping period for block move completion check */ static void setBlockMoveWaitTime(long time) { blockMoveWaitTime = time; } /* wait for all block move confirmations * by checking each target's pendingMove queue */ private void waitForMoveCompletion() { boolean shouldWait; do { shouldWait = false; for (BalancerDatanode target : targets) { if (!target.isPendingQEmpty()) { shouldWait = true; } } if (shouldWait) { try { Thread.sleep(blockMoveWaitTime); } catch (InterruptedException ignored) { } } } while (shouldWait); } /** This window makes sure to keep blocks that have been moved within 1.5 hour. * Old window has blocks that are older; * Current window has blocks that are more recent; * Cleanup method triggers the check if blocks in the old window are * more than 1.5 hour old. If yes, purge the old window and then * move blocks in current window to old window. 
*/ private static class MovedBlocks { private long lastCleanupTime = System.currentTimeMillis(); private static long winWidth = 5400 * 1000L; // 1.5 hour final private static int CUR_WIN = 0; final private static int OLD_WIN = 1; final private static int NUM_WINS = 2; final private List<HashMap<Block, BalancerBlock>> movedBlocks = new ArrayList<HashMap<Block, BalancerBlock>>( NUM_WINS); /* initialize the moved blocks collection */ private MovedBlocks() { movedBlocks.add(new HashMap<Block, BalancerBlock>()); movedBlocks.add(new HashMap<Block, BalancerBlock>()); } /* set the win width */ private void setWinWidth(Configuration conf) { winWidth = conf.getLong("dfs.balancer.movedWinWidth", 5400 * 1000L); } /* add a block thus marking a block to be moved */ synchronized private void add(BalancerBlock block) { movedBlocks.get(CUR_WIN).put(block.getBlock(), block); } /* check if a block is marked as moved */ synchronized private boolean contains(BalancerBlock block) { return contains(block.getBlock()); } /* check if a block is marked as moved */ synchronized private boolean contains(Block block) { return movedBlocks.get(CUR_WIN).containsKey(block) || movedBlocks.get(OLD_WIN).containsKey(block); } /* remove old blocks */ synchronized private void cleanup() { long curTime = System.currentTimeMillis(); // check if old win is older than winWidth if (lastCleanupTime + winWidth <= curTime) { // purge the old window movedBlocks.set(OLD_WIN, movedBlocks.get(CUR_WIN)); movedBlocks.set(CUR_WIN, new HashMap<Block, BalancerBlock>()); lastCleanupTime = curTime; } } } /* Decide if it is OK to move the given block from source to target * A block is a good candidate if * 1. the block is not in the process of being moved/has not been moved; * 2. the block does not have a replica on the target; * 3. 
doing the move does not reduce the number of racks that the block has */ private boolean isGoodBlockCandidate(Source source, BalancerDatanode target, BalancerBlock block) { // check if the block is moved or not if (movedBlocks.contains(block)) { return false; } if (block.isLocatedOnDatanode(target)) { return false; } boolean goodBlock = false; if (cluster.isOnSameRack(source.getDatanode(), target.getDatanode())) { // good if source and target are on the same rack goodBlock = true; } else { boolean notOnSameRack = true; synchronized (block) { for (BalancerDatanode loc : block.locations) { if (cluster.isOnSameRack(loc.datanode, target.datanode)) { notOnSameRack = false; break; } } } if (notOnSameRack) { // good if target is target is not on the same rack as any replica goodBlock = true; } else { // good if source is on the same rack as on of the replicas for (BalancerDatanode loc : block.locations) { if (loc != source && cluster.isOnSameRack(loc.datanode, source.datanode)) { goodBlock = true; break; } } } } return goodBlock; } /* reset all fields in a balancer preparing for the next iteration */ private void resetData() { this.cluster = new NetworkTopology(); this.overUtilizedDatanodes.clear(); this.aboveAvgUtilizedDatanodes.clear(); this.belowAvgUtilizedDatanodes.clear(); this.underUtilizedDatanodes.clear(); this.datanodes.clear(); this.sources.clear(); this.targets.clear(); this.avgUtilization = 0.0D; cleanGlobalBlockList(); this.movedBlocks.cleanup(); } /* Remove all blocks from the global block list except for the ones in the * moved list. 
*/ private void cleanGlobalBlockList() { for (Iterator<Block> globalBlockListIterator = globalBlockList.keySet().iterator(); globalBlockListIterator .hasNext();) { Block block = globalBlockListIterator.next(); if (!movedBlocks.contains(block)) { globalBlockListIterator.remove(); } } } /* Return true if the given datanode is overUtilized */ private boolean isOverUtilized(BalancerDatanode datanode) { return datanode.utilization > (avgUtilization + threshold); } /* Return true if the given datanode is above average utilized * but not overUtilized */ private boolean isAboveAvgUtilized(BalancerDatanode datanode) { return (datanode.utilization <= (avgUtilization + threshold)) && (datanode.utilization > avgUtilization); } /* Return true if the given datanode is underUtilized */ private boolean isUnderUtilized(BalancerDatanode datanode) { return datanode.utilization < (avgUtilization - threshold); } /* Return true if the given datanode is below average utilized * but not underUtilized */ private boolean isBelowAvgUtilized(BalancerDatanode datanode) { return (datanode.utilization >= (avgUtilization - threshold)) && (datanode.utilization < avgUtilization); } // Exit status final public static int SUCCESS = 1; final public static int ALREADY_RUNNING = -1; final public static int NO_MOVE_BLOCK = -2; final public static int NO_MOVE_PROGRESS = -3; final public static int IO_EXCEPTION = -4; final public static int ILLEGAL_ARGS = -5; /** main method of Balancer * @param args arguments to a Balancer * @exception any exception occurs during datanode balancing */ public int run(String[] args) throws Exception { long startTime = Util.now(); OutputStream out = null; try { // initialize a balancer init(parseArgs(args)); /* Check if there is another balancer running. * Exit if there is another one running. */ out = checkAndMarkRunningBalancer(); if (out == null) { System.out.println("Another balancer is running. 
Exiting..."); return ALREADY_RUNNING; } Formatter formatter = new Formatter(System.out); System.out.println( "Time Stamp Iteration# Bytes Already Moved Bytes Left To Move Bytes Being Moved"); int iterations = 0; while (true) { /* get all live datanodes of a cluster and their disk usage * decide the number of bytes need to be moved */ long bytesLeftToMove = initNodes(); if (bytesLeftToMove == 0) { System.out.println("The cluster is balanced. Exiting..."); return SUCCESS; } else { LOG.info("Need to move " + StringUtils.byteDesc(bytesLeftToMove) + " bytes to make the cluster balanced."); } /* Decide all the nodes that will participate in the block move and * the number of bytes that need to be moved from one node to another * in this iteration. Maximum bytes to be moved per node is * Min(1 Band worth of bytes, MAX_SIZE_TO_MOVE). */ long bytesToMove = chooseNodes(); if (bytesToMove == 0) { System.out.println("No block can be moved. Exiting..."); return NO_MOVE_BLOCK; } else { LOG.info("Will move " + StringUtils.byteDesc(bytesToMove) + "bytes in this iteration"); } formatter.format("%-24s %10d %19s %18s %17s\n", DateFormat.getDateTimeInstance().format(new Date()), iterations, StringUtils.byteDesc(bytesMoved.get()), StringUtils.byteDesc(bytesLeftToMove), StringUtils.byteDesc(bytesToMove)); /* For each pair of <source, target>, start a thread that repeatedly * decide a block to be moved and its proxy source, * then initiates the move until all bytes are moved or no more block * available to move. * Exit no byte has been moved for 5 consecutive iterations. */ if (dispatchBlockMoves() > 0) { notChangedIterations = 0; } else { notChangedIterations++; if (notChangedIterations >= 5) { System.out.println("No block has been moved for 5 iterations. 
Exiting..."); return NO_MOVE_PROGRESS; } } // clean all lists resetData(); try { Thread.sleep(2 * conf.getLong("dfs.heartbeat.interval", 3)); } catch (InterruptedException ignored) { } iterations++; } } catch (IllegalArgumentException ae) { return ILLEGAL_ARGS; } catch (IOException e) { System.out.println("Received an IO exception: " + e.getMessage() + " . Exiting..."); return IO_EXCEPTION; } finally { // shutdown thread pools dispatcherExecutor.shutdownNow(); moverExecutor.shutdownNow(); shouldRun = false; try { if (keyupdaterthread != null) keyupdaterthread.interrupt(); } catch (Exception e) { LOG.warn("Exception shutting down access key updater thread", e); } // close the output file IOUtils.closeStream(out); if (fs != null) { try { fs.delete(BALANCER_ID_PATH, true); } catch (IOException ignored) { } } System.out.println("Balancing took " + time2Str(Util.now() - startTime)); } } private Path BALANCER_ID_PATH = new Path("/system/balancer.id"); /* The idea for making sure that there is no more than one balancer * running in an HDFS is to create a file in the HDFS, writes the IP address * of the machine on which the balancer is running to the file, but did not * close the file until the balancer exits. * This prevents the second balancer from running because it can not * creates the file while the first one is running. * * This method checks if there is any running balancer and * if no, mark yes if no. * Note that this is an atomic operation. * * Return null if there is a running balancer; otherwise the output stream * to the newly created file. 
*/ private OutputStream checkAndMarkRunningBalancer() throws IOException { try { DataOutputStream out = fs.create(BALANCER_ID_PATH); out.writeBytes(InetAddress.getLocalHost().getHostName()); out.flush(); return out; } catch (RemoteException e) { if (AlreadyBeingCreatedException.class.getName().equals(e.getClassName())) { return null; } else { throw e; } } } /* Given elaspedTime in ms, return a printable string */ private static String time2Str(long elapsedTime) { String unit; double time = elapsedTime; if (elapsedTime < 1000) { unit = "milliseconds"; } else if (elapsedTime < 60 * 1000) { unit = "seconds"; time = time / 1000; } else if (elapsedTime < 3600 * 1000) { unit = "minutes"; time = time / (60 * 1000); } else { unit = "hours"; time = time / (3600 * 1000); } return time + " " + unit; } /** return this balancer's configuration */ public Configuration getConf() { return conf; } /** set this balancer's configuration */ public void setConf(Configuration conf) { this.conf = conf; movedBlocks.setWinWidth(conf); } }