org.apache.hadoop.hdfs.server.balancer.LatencyBalancer.java Source code

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.balancer.LatencyBalancer.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.balancer;

import static com.google.common.base.Preconditions.checkArgument;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.text.DateFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.HashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Source;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Task;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Util;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;
import org.apache.hadoop.hdfs.server.namenode.UnsupportedActionException;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.base.Preconditions;

/** <p>The balancer is a tool that balances disk space usage on an HDFS cluster
 * when some datanodes become full or when new empty nodes join the cluster.
 * The tool is deployed as an application program that can be run by the 
 * cluster administrator on a live HDFS cluster while applications are
 * adding and deleting files.
 * 
 * <p>SYNOPSIS
 * <pre>
 * To start:
 *      bin/start-balancer.sh [-threshold <threshold>]
 *      Example: bin/start-balancer.sh
 *                     start the balancer with a default threshold of 10%
 *               bin/start-balancer.sh -threshold 5
 *                     start the balancer with a threshold of 5%
 *               bin/start-balancer.sh -idleiterations 20
 *                     start the balancer with a maximum of 20 consecutive idle iterations
 *               bin/start-balancer.sh -idleiterations -1
 *                     run the balancer with the default threshold indefinitely
 * To stop:
 *      bin/stop-balancer.sh
 * </pre>
 * 
 * <p>DESCRIPTION
 * <p>The threshold parameter is a percentage in the range of (1%, 100%) with a 
 * default value of 10%. The threshold sets a target for whether the cluster 
 * is balanced. A cluster is balanced if, for each datanode, the utilization 
 * of the node (ratio of used space at the node to total capacity of the node) 
 * differs from the utilization of the cluster (ratio of used space in the 
 * cluster to total capacity of the cluster) by no more than the threshold 
 * value. The smaller the threshold, the more balanced a cluster will become. 
 * It takes more time to run the balancer for small threshold values. 
 * Also, for a very small threshold, the cluster may not be able to reach the 
 * balanced state when applications write and delete files concurrently.
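 *
 * <p>For example (illustrative numbers), with the default threshold of 10%
 * and a cluster-wide utilization of 50%, every datanode must fall within
 * [40%, 60%] for the cluster to count as balanced; a node at 65% exceeds
 * the threshold by 5 percentage points and will be treated as over-utilized.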
 * 
 * <p>The tool iteratively moves blocks from highly utilized datanodes to 
 * poorly utilized datanodes. In each iteration a datanode moves or 
 * receives no more than the lesser of 10 GB or the threshold fraction 
 * of its capacity, and each iteration runs for no more than 20 minutes.
 * At the end of each iteration, the balancer obtains updated datanode
 * information from the namenode.
 * 
 * <p>A configuration property that limits the balancer's use of bandwidth 
 * is defined in the default configuration file:
 * <pre>
 * <property>
 *   <name>dfs.balance.bandwidthPerSec</name>
 *   <value>1048576</value>
 * <description>  Specifies the maximum bandwidth that each datanode 
 * can utilize for the balancing purpose in term of the number of bytes 
 * per second. </description>
 * </property>
 * </pre>
 * 
 * <p>This property determines the maximum speed at which a block will be 
 * moved from one datanode to another. The default value is 1MB/s. The higher 
 * the bandwidth, the faster a cluster can reach the balanced state, 
 * but with greater competition with application processes. If an 
 * administrator changes the value of this property in the configuration 
 * file, the change is observed when HDFS is next restarted.
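 *
 * <p>In recent Hadoop releases the limit can also be adjusted at runtime,
 * without a restart, using the dfsadmin command, for example:
 * <pre>
 * hdfs dfsadmin -setBalancerBandwidth 10485760
 * </pre>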
 * 
 * <p>MONITORING BALANCER PROGRESS
 * <p>After the balancer is started, the name of an output file where the 
 * balancer's progress will be recorded is printed on the screen.  The 
 * administrator can monitor the running of the balancer by reading this file. 
 * The output shows the balancer's status iteration by iteration. In each 
 * iteration it prints the starting time, the iteration number, the total 
 * number of bytes that have been moved in the previous iterations, 
 * the total number of bytes that are left to move in order for the cluster 
 * to be balanced, and the number of bytes that are being moved in this 
 * iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left 
 * To Move" is decreasing.
 * 
 * <p>Running multiple instances of the balancer in an HDFS cluster is 
 * prohibited by the tool.
 * 
 * <p>The balancer automatically exits when any of the following five 
 * conditions is satisfied:
 * <ol>
 * <li>The cluster is balanced;
 * <li>No block can be moved;
 * <li>No block has been moved for the specified number of consecutive iterations (5 by default);
 * <li>An IOException occurs while communicating with the namenode;
 * <li>Another balancer is running.
 * </ol>
 * 
 * <p>Upon exit, a balancer returns an exit code and prints one of the 
 * following messages to the output file, corresponding to the above exit 
 * reasons:
 * <ol>
 * <li>The cluster is balanced. Exiting...
 * <li>No block can be moved. Exiting...
 * <li>No block has been moved for the specified number of iterations (5 by default). Exiting...
 * <li>Received an IO exception: failure reason. Exiting...
 * <li>Another balancer is running. Exiting...
 * </ol>
 * 
 * <p>The administrator can interrupt the execution of the balancer at any 
 * time by running the command "stop-balancer.sh" on the machine where the 
 * balancer is running.
 */

@InterfaceAudience.Private
public class LatencyBalancer {
    static final Log LOG = LogFactory.getLog(LatencyBalancer.class);

    static final Path BALANCER_ID_PATH = new Path("/system/balancer.id");

    private static final long GB = 1L << 30; //1GB
    private static final long MAX_SIZE_TO_MOVE = 10 * GB;

    private static final String USAGE = "Usage: hdfs balancer" + "\n\t[-policy <policy>]\tthe balancing policy: "
            + BalancingPolicy.Node.INSTANCE.getName() + " or " + BalancingPolicy.Pool.INSTANCE.getName()
            + "\n\t[-threshold <threshold>]\tPercentage of disk capacity"
            + "\n\t[-exclude [-f <hosts-file> | <comma-separated list of hosts>]]"
            + "\tExcludes the specified datanodes."
            + "\n\t[-include [-f <hosts-file> | <comma-separated list of hosts>]]"
            + "\tIncludes only the specified datanodes." + "\n\t[-idleiterations <idleiterations>]"
            + "\tNumber of consecutive idle iterations (-1 for Infinite) before exit.";

    private final Dispatcher dispatcher;
    private final BalancingPolicy policy;
    private final double threshold;

    // all data node lists
    private final Collection<Source> high = new LinkedList<Source>();
    private final Collection<Source> aboveAvg = new LinkedList<Source>();
    private final Collection<StorageGroup> belowAvg = new LinkedList<StorageGroup>();
    private final Collection<StorageGroup> low = new LinkedList<StorageGroup>();

    // per-host latency weights (IP address -> latency), loaded by getLatencies()
    HashMap<String, Double> latencies = new HashMap<String, Double>();

    /* Check that this Balancer is compatible with the Block Placement Policy
     * used by the Namenode.
     */
    private static void checkReplicationPolicyCompatibility(Configuration conf) throws UnsupportedActionException {
        if (!(BlockPlacementPolicy.getInstance(conf, null, null, null) instanceof BlockPlacementPolicyDefault)) {
            throw new UnsupportedActionException("Balancer without BlockPlacementPolicyDefault");
        }
    }

    /**
     * Load per-host latency weights from a whitespace-separated file of
     * (host, latency) pairs, e.g.:
     *   10.6.9.240 0.166
     *   10.6.9.241 0.333
     * Tokens are read in pairs, so the file must contain an even number of
     * tokens.
     */
    void getLatencies(String path) {
        // try-with-resources ensures the scanner (and the file) is closed
        try (Scanner sc = new Scanner(new File(path))) {
            while (sc.hasNext()) {
                latencies.put(sc.next(), Double.parseDouble(sc.next()));
            }
            for (String lt : latencies.keySet()) {
                System.out.println(lt + " " + latencies.get(lt));
            }
        } catch (IOException e) {
            System.err.println(e);
        }
    }

    /**
     * Construct a balancer.
     * Initializes the balancer: it sets the value of the threshold and 
     * builds the communication proxies to the namenode, as a client and 
     * as a secondary namenode, with retry proxies for when a connection 
     * fails.
     */
    LatencyBalancer(NameNodeConnector theblockpool, Parameters p, Configuration conf) {
        final long movedWinWidth = conf.getLong(DFSConfigKeys.DFS_BALANCER_MOVEDWINWIDTH_KEY,
                DFSConfigKeys.DFS_BALANCER_MOVEDWINWIDTH_DEFAULT);
        final int moverThreads = conf.getInt(DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_KEY,
                DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_DEFAULT);
        final int dispatcherThreads = conf.getInt(DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_KEY,
                DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_DEFAULT);
        final int maxConcurrentMovesPerNode = conf.getInt(
                DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_KEY,
                DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_DEFAULT);

        this.dispatcher = new Dispatcher(theblockpool, p.nodesToBeIncluded, p.nodesToBeExcluded, movedWinWidth,
                moverThreads, dispatcherThreads, maxConcurrentMovesPerNode, conf);
        this.threshold = p.threshold;
        this.policy = p.policy;

        /// latency-specific additions; the commented entries illustrate the
        /// expected (host, latency) contents of the file read below:
        //latencies.put("10.6.9.240", .166);
        //latencies.put("10.6.9.241", .333);
        //latencies.put("10.6.9.242", .5);
        getLatencies("/home/hadoopnew/latencies.txt"); // NOTE: hardcoded path
        ///
    }

    private static long getCapacity(DatanodeStorageReport report, StorageType t) {
        long capacity = 0L;
        for (StorageReport r : report.getStorageReports()) {
            if (r.getStorage().getStorageType() == t) {
                capacity += r.getCapacity();
            }
        }
        return capacity;
    }

    private static long getRemaining(DatanodeStorageReport report, StorageType t) {
        long remaining = 0L;
        for (StorageReport r : report.getStorageReports()) {
            if (r.getStorage().getStorageType() == t) {
                remaining += r.getRemaining();
            }
        }
        return remaining;
    }

    /**
     * Given a datanode storage set, build a network topology and decide
     * over-utilized storages, above average utilized storages, 
     * below average utilized storages, and low storages. 
     * The input datanode storage set is shuffled in order to randomize
     * the storage matching later on.
     *
     * @return the number of bytes needed to move in order to balance the cluster.
     */
    private long init(List<DatanodeStorageReport> reports) {
        // compute average utilization
        for (DatanodeStorageReport r : reports) {
            ///
            System.out.println(r.getDatanodeInfo().getDatanodeUuid());
            ///
            policy.accumulateSpaces(r);
        }
        policy.initAvgUtilization();
        /// latency-aware additions: compute the average latency and the
        /// cluster-wide average of the (latency x utilization) product
        policy.initAvgLatency(latencies);
        policy.initAvgOfLatencyUtilizationProduct(reports, latencies);
        ///
        // create network topology and classify utilization collections: 
        //   over-utilized, above-average, below-average and under-utilized.
        long overLoadedBytes = 0L, underLoadedBytes = 0L;
        for (DatanodeStorageReport r : reports) { // for each Datanode
            final DDatanode dn = dispatcher.newDatanode(r.getDatanodeInfo());
            for (StorageType t : StorageType.getMovableTypes()) { //for each storage type in this Datanode
                Double utilization = policy.getUtilization(r, t); // gets the disk utilization only
                if (utilization == null) { // datanode does not have such storage type 
                    continue;
                }
                ///
                // weight disk utilization by the node's measured latency to get
                // the overall product (assumes every datanode IP appears in the
                // latency file; a missing entry would throw a NullPointerException)
                utilization = utilization * latencies.get(r.getDatanodeInfo().getIpAddr());
                ///
                final long capacity = getCapacity(r, t);
                //threshold = 10
                //final double utilizationDiff = utilization - policy.getAvgUtilization(t); //ud > 0 (70 - 50), ud > 0 (55 - 50)
                final double utilizationDiff = utilization - policy.getAvgLatencyUtilizationProduct(t); //ud > 0 (70 - 50), ud > 0 (55 - 50)
                final double thresholdDiff = Math.abs(utilizationDiff) - threshold; // td > 0 (20 - 10), td < 0 (5 - 10)
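                // Worked example (illustrative numbers): with threshold = 10, a
                // node at 70% disk utilization and latency weight 0.5 has a
                // product of 35; if the cluster-wide average product is 25, then
                // utilizationDiff = 10 and thresholdDiff = 0, i.e. the node is
                // above average but still within the threshold.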

                final long maxSize2Move = computeMaxSize2Move(capacity, getRemaining(r, t), utilizationDiff,
                        threshold); //IMPORTANT getting the max size to move from a datanode

                final StorageGroup g;
                if (utilizationDiff > 0) {
                    final Source s = dn.addSource(t, maxSize2Move, dispatcher);
                    if (thresholdDiff <= 0) { // within threshold
                        aboveAvg.add(s);
                    } else {
                        overLoadedBytes += percentage2bytes(thresholdDiff, capacity);
                        high.add(s);
                    }
                    g = s;
                } else {
                    g = dn.addTarget(t, maxSize2Move);
                    if (thresholdDiff <= 0) { // within threshold
                        belowAvg.add(g);
                    } else {
                        underLoadedBytes += percentage2bytes(thresholdDiff, capacity);
                        low.add(g);
                    }
                }

                dispatcher.getStorageGroupMap().put(g);
            }
        }

        logUtilizationCollections();

        Preconditions.checkState(dispatcher.getStorageGroupMap().size() == high.size() + low.size()
                + aboveAvg.size() + belowAvg.size(), "Mismatched number of storage groups");

        // return number of bytes to be moved in order to make the cluster balanced
        return Math.max(overLoadedBytes, underLoadedBytes);
    }

    private static long computeMaxSize2Move(final long capacity, final long remaining, final double utilizationDiff,
            final double threshold) {
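        // Example (illustrative numbers): capacity = 100 GB, threshold = 10,
        // utilizationDiff = 25 -> diff = 10, so at most 10% of 100 GB = 10 GB
        // may be scheduled, further capped below by MAX_SIZE_TO_MOVE.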
        final double diff = Math.min(threshold, Math.abs(utilizationDiff));
        long maxSizeToMove = percentage2bytes(diff, capacity);
        if (utilizationDiff < 0) {
            maxSizeToMove = Math.min(remaining, maxSizeToMove);
        }
        return Math.min(MAX_SIZE_TO_MOVE, maxSizeToMove);
    }

    private static long percentage2bytes(double percentage, long capacity) {
        Preconditions.checkArgument(percentage >= 0, "percentage = " + percentage + " < 0");
        return (long) (percentage * capacity / 100.0);
    }

    /* log the over utilized & under utilized nodes */
    private void logUtilizationCollections() {
        logUtilizationCollection("high", high);
        if (LOG.isTraceEnabled()) {
            logUtilizationCollection("above-average", aboveAvg);
            logUtilizationCollection("below-average", belowAvg);
        }
        logUtilizationCollection("low", low);
    }

    private static <T extends StorageGroup> void logUtilizationCollection(String name, Collection<T> items) {
        LOG.info(items.size() + " " + name + ": " + items);
    }

    /**
     * Decide all <source, target> pairs and
     * the number of bytes to move from a source to a target.
     * Maximum bytes to be moved per storage group is
     * min(1 bandwidth's worth of bytes, MAX_SIZE_TO_MOVE).
     * @return total number of bytes to move in this iteration
     */
    private long chooseStorageGroups() {
        // First, match nodes on the same node group if cluster is node group aware
        if (dispatcher.getCluster().isNodeGroupAware()) {
            chooseStorageGroups(Matcher.SAME_NODE_GROUP);
        }

        // Then, match nodes on the same rack
        chooseStorageGroups(Matcher.SAME_RACK);
        // At last, match all remaining nodes
        chooseStorageGroups(Matcher.ANY_OTHER);

        return dispatcher.bytesToMove();
    }

    /** Decide all <source, target> pairs according to the matcher. */
    private void chooseStorageGroups(final Matcher matcher) {
        /* first step: match each high datanode (source) to
         * one or more low datanodes (targets).
         */
        chooseStorageGroups(high, low, matcher);

        /* match each remaining high datanode (source) to 
         * below-average utilized datanodes (targets).
         * Note that only high datanodes whose max bytes to move were not
         * satisfied in step 1 are selected.
         */
        chooseStorageGroups(high, belowAvg, matcher);

        /* match each remaining low datanode (target) to 
         * above-average utilized datanodes (sources).
         * Note that only low datanodes whose max bytes to move were not
         * satisfied in step 1 are selected.
         */
        chooseStorageGroups(low, aboveAvg, matcher);
    }

    /**
     * For each datanode, choose matching nodes from the candidates. Either the
     * datanodes or the candidates are source nodes with (utilization > Avg), and
     * the others are target nodes with (utilization < Avg).
     */
    private <G extends StorageGroup, C extends StorageGroup> void chooseStorageGroups(Collection<G> groups,
            Collection<C> candidates, Matcher matcher) {
        for (final Iterator<G> i = groups.iterator(); i.hasNext();) {
            final G g = i.next();
            // keep choosing candidates until no more bytes can be scheduled from g
            while (choose4One(g, candidates, matcher)) {
            }
            if (!g.hasSpaceForScheduling()) { // no space left: maxSize2Move - scheduledSize <= 0
                i.remove();
            }
        }
    }

    /**
     * For the given datanode, choose a candidate and then schedule it.
     * @return true if a candidate is chosen; false if no candidate is chosen.
     */
    private <C extends StorageGroup> boolean choose4One(StorageGroup g, //IMPORTANT choose correct matcher
            Collection<C> candidates, Matcher matcher) {
        final Iterator<C> i = candidates.iterator();
        final C chosen = chooseCandidate(g, i, matcher);

        if (chosen == null) {
            return false;
        }
        if (g instanceof Source) {
            matchSourceWithTargetToMove((Source) g, chosen);
        } else {
            matchSourceWithTargetToMove((Source) chosen, g);
        }
        if (!chosen.hasSpaceForScheduling()) {
            i.remove();
        }
        return true;
    }

    private void matchSourceWithTargetToMove(Source source, StorageGroup target) { //IMPORTANT: here I have to make changes
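        // The scheduled task size is capped by both ends: e.g. if the source
        // can still move 4 GB but the target can only receive 1 GB, a 1 GB
        // task is created (illustrative numbers).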
        long size = Math.min(source.availableSizeToMove(), target.availableSizeToMove());
        final Task task = new Task(target, size);
        source.addTask(task); //IMPORTANT
        target.incScheduledSize(task.getSize());
        dispatcher.add(source, target); //IMPORTANT
        LOG.info("Decided to move " + StringUtils.byteDesc(size) + " bytes from " + source.getDisplayName() + " to "
                + target.getDisplayName());
    }

    /** Choose a candidate for the given datanode. */
    private <G extends StorageGroup, C extends StorageGroup> C chooseCandidate(G g, Iterator<C> candidates,
            Matcher matcher) {
        if (g.hasSpaceForScheduling()) {
            while (candidates.hasNext()) {
                final C c = candidates.next();
                if (!c.hasSpaceForScheduling()) {
                    candidates.remove();
                } else if (matcher.match(dispatcher.getCluster(), g.getDatanodeInfo(), c.getDatanodeInfo())) {
                    return c;
                }
            }
        }
        return null;
    }

    /* reset all fields in a balancer preparing for the next iteration */
    void resetData(Configuration conf) {
        this.high.clear();
        this.aboveAvg.clear();
        this.belowAvg.clear();
        this.low.clear();
        this.policy.reset();
        dispatcher.reset(conf);
    }

    static class Result {
        final ExitStatus exitStatus;
        final long bytesLeftToMove;
        final long bytesBeingMoved;
        final long bytesAlreadyMoved;

        Result(ExitStatus exitStatus, long bytesLeftToMove, long bytesBeingMoved, long bytesAlreadyMoved) {
            this.exitStatus = exitStatus;
            this.bytesLeftToMove = bytesLeftToMove;
            this.bytesBeingMoved = bytesBeingMoved;
            this.bytesAlreadyMoved = bytesAlreadyMoved;
        }

        void print(int iteration, PrintStream out) {
            out.printf("%-24s %10d  %19s  %18s  %17s%n", DateFormat.getDateTimeInstance().format(new Date()),
                    iteration, StringUtils.byteDesc(bytesAlreadyMoved), StringUtils.byteDesc(bytesLeftToMove),
                    StringUtils.byteDesc(bytesBeingMoved));
        }
    }

    Result newResult(ExitStatus exitStatus, long bytesLeftToMove, long bytesBeingMoved) {
        return new Result(exitStatus, bytesLeftToMove, bytesBeingMoved, dispatcher.getBytesMoved());
    }

    Result newResult(ExitStatus exitStatus) {
        return new Result(exitStatus, -1, -1, dispatcher.getBytesMoved());
    }

    /** Run an iteration for all datanodes. */
    Result runOneIteration() {
        try {
            final List<DatanodeStorageReport> reports = dispatcher.init();
            final long bytesLeftToMove = init(reports);
            if (bytesLeftToMove == 0) {
                System.out.println("The cluster is balanced. Exiting...");
                return newResult(ExitStatus.SUCCESS, bytesLeftToMove, -1);
            } else {
                LOG.info(
                        "Need to move " + StringUtils.byteDesc(bytesLeftToMove) + " to make the cluster balanced.");
            }

            /* Decide all the nodes that will participate in the block move and
             * the number of bytes that need to be moved from one node to another
             * in this iteration. Maximum bytes to be moved per node is
             * min(1 bandwidth's worth of bytes, MAX_SIZE_TO_MOVE).
             */
            final long bytesBeingMoved = chooseStorageGroups();
            if (bytesBeingMoved == 0) {
                System.out.println("No block can be moved. Exiting...");
                return newResult(ExitStatus.NO_MOVE_BLOCK, bytesLeftToMove, bytesBeingMoved);
            } else {
                LOG.info("Will move " + StringUtils.byteDesc(bytesBeingMoved) + " in this iteration");
            }

            /* For each pair of <source, target>, start a thread that repeatedly 
             * decides a block to be moved and its proxy source, 
             * then initiates the move, until all bytes are moved or no more 
             * blocks are available to move.
             * Exit if no bytes have been moved for 5 consecutive iterations.
             */
            if (!dispatcher.dispatchAndCheckContinue()) { //IMPORTANT
                return newResult(ExitStatus.NO_MOVE_PROGRESS, bytesLeftToMove, bytesBeingMoved);
            }

            return newResult(ExitStatus.IN_PROGRESS, bytesLeftToMove, bytesBeingMoved);
        } catch (IllegalArgumentException e) {
            System.out.println(e + ".  Exiting ...");
            return newResult(ExitStatus.ILLEGAL_ARGUMENTS);
        } catch (IOException e) {
            System.out.println(e + ".  Exiting ...");
            return newResult(ExitStatus.IO_EXCEPTION);
        } catch (InterruptedException e) {
            System.out.println(e + ".  Exiting ...");
            return newResult(ExitStatus.INTERRUPTED);
        } finally {
            dispatcher.shutdownNow();
        }
    }

    /**
     * Balance all namenodes.
     * For each iteration,
     * for each namenode,
     * execute a {@link LatencyBalancer} to work through all datanodes once.  
     */
    static int run(Collection<URI> namenodes, final Parameters p, Configuration conf)
            throws IOException, InterruptedException {
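        // Sleep long enough between iterations for completed moves to be
        // reflected at the namenode: two heartbeat intervals (the interval is
        // configured in seconds, hence * 2000 ms) plus one replication-monitor
        // interval (* 1000 ms).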
        final long sleeptime = conf.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
                DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 2000
                + conf.getLong(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
                        DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_DEFAULT) * 1000;
        LOG.info("namenodes  = " + namenodes);
        LOG.info("parameters = " + p);

        System.out.println(
                "Time Stamp               Iteration#  Bytes Already Moved  Bytes Left To Move  Bytes Being Moved");

        List<NameNodeConnector> connectors = Collections.emptyList();
        try {
            connectors = NameNodeConnector.newNameNodeConnectors(namenodes, LatencyBalancer.class.getSimpleName(),
                    BALANCER_ID_PATH, conf, p.maxIdleIteration);

            boolean done = false;
            for (int iteration = 0; !done; iteration++) {
                done = true;
                Collections.shuffle(connectors);
                for (NameNodeConnector nnc : connectors) {
                    final LatencyBalancer b = new LatencyBalancer(nnc, p, conf);
                    final Result r = b.runOneIteration();
                    r.print(iteration, System.out);

                    // clean all lists
                    b.resetData(conf);
                    if (r.exitStatus == ExitStatus.IN_PROGRESS) {
                        done = false;
                    } else if (r.exitStatus != ExitStatus.SUCCESS) {
                        //must be an error status, return.
                        return r.exitStatus.getExitCode();
                    }
                }

                if (!done) {
                    Thread.sleep(sleeptime);
                }
            }
        } finally {
            for (NameNodeConnector nnc : connectors) {
                IOUtils.cleanup(LOG, nnc);
            }
        }
        return ExitStatus.SUCCESS.getExitCode();
    }

    /* Given elapsedTime in ms, return a printable string */
    private static String time2Str(long elapsedTime) {
        String unit;
        double time = elapsedTime;
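        // e.g. 90000 ms -> "1.5 minutes"; 500 ms -> "500.0 milliseconds"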
        if (elapsedTime < 1000) {
            unit = "milliseconds";
        } else if (elapsedTime < 60 * 1000) {
            unit = "seconds";
            time = time / 1000;
        } else if (elapsedTime < 3600 * 1000) {
            unit = "minutes";
            time = time / (60 * 1000);
        } else {
            unit = "hours";
            time = time / (3600 * 1000);
        }

        return time + " " + unit;
    }

    static class Parameters {
        static final Parameters DEFAULT = new Parameters(BalancingPolicy.Node.INSTANCE, 10.0,
                NameNodeConnector.DEFAULT_MAX_IDLE_ITERATIONS, Collections.<String>emptySet(),
                Collections.<String>emptySet());

        final BalancingPolicy policy;
        final double threshold;
        final int maxIdleIteration;
        // exclude the nodes in this set from balancing operations
        Set<String> nodesToBeExcluded;
        //include only these nodes in balancing operations
        Set<String> nodesToBeIncluded;

        Parameters(BalancingPolicy policy, double threshold, int maxIdleIteration, Set<String> nodesToBeExcluded,
                Set<String> nodesToBeIncluded) {
            this.policy = policy;
            this.threshold = threshold;
            this.maxIdleIteration = maxIdleIteration;
            this.nodesToBeExcluded = nodesToBeExcluded;
            this.nodesToBeIncluded = nodesToBeIncluded;
        }

        @Override
        public String toString() {
            return LatencyBalancer.class.getSimpleName() + "." + getClass().getSimpleName() + "[" + policy
                    + ", threshold=" + threshold + ", max idle iteration = " + maxIdleIteration
                    + ", number of nodes to be excluded = " + nodesToBeExcluded.size()
                    + ", number of nodes to be included = " + nodesToBeIncluded.size() + "]";
        }
    }

    static class Cli extends Configured implements Tool {
        /**
         * Parse arguments and then run Balancer.
         * 
         * @param args command specific arguments.
         * @return exit code. 0 indicates success, non-zero indicates failure.
         */
        @Override
        public int run(String[] args) {
            final long startTime = Time.monotonicNow();
            final Configuration conf = getConf();

            try {
                checkReplicationPolicyCompatibility(conf);

                final Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
                return LatencyBalancer.run(namenodes, parse(args), conf); //IMPORTANT balancer is started here.. change
            } catch (IOException e) {
                System.out.println(e + ".  Exiting ...");
                return ExitStatus.IO_EXCEPTION.getExitCode();
            } catch (InterruptedException e) {
                System.out.println(e + ".  Exiting ...");
                return ExitStatus.INTERRUPTED.getExitCode();
            } finally {
                System.out.format("%-24s ", DateFormat.getDateTimeInstance().format(new Date()));
                System.out.println("Balancing took " + time2Str(Time.monotonicNow() - startTime));
            }
        }

        /** parse command line arguments */
        static Parameters parse(String[] args) {
            BalancingPolicy policy = Parameters.DEFAULT.policy;
            double threshold = Parameters.DEFAULT.threshold;
            int maxIdleIteration = Parameters.DEFAULT.maxIdleIteration;
            Set<String> nodesTobeExcluded = Parameters.DEFAULT.nodesToBeExcluded;
            Set<String> nodesTobeIncluded = Parameters.DEFAULT.nodesToBeIncluded;

            if (args != null) {
                try {
                    for (int i = 0; i < args.length; i++) {
                        if ("-threshold".equalsIgnoreCase(args[i])) {
                            checkArgument(++i < args.length,
                                    "Threshold value is missing: args = " + Arrays.toString(args));
                            try {
                                threshold = Double.parseDouble(args[i]);
                                if (threshold < 1 || threshold > 100) {
                                    throw new IllegalArgumentException(
                                            "Number out of range: threshold = " + threshold);
                                }
                                LOG.info("Using a threshold of " + threshold);
                            } catch (IllegalArgumentException e) {
                                System.err.println("Expecting a number in the range of [1.0, 100.0]: " + args[i]);
                                throw e;
                            }
                        } else if ("-policy".equalsIgnoreCase(args[i])) {
                            checkArgument(++i < args.length,
                                    "Policy value is missing: args = " + Arrays.toString(args));
                            try {
                                policy = BalancingPolicy.parse(args[i]);
                            } catch (IllegalArgumentException e) {
                                System.err.println("Illegal policy name: " + args[i]);
                                throw e;
                            }
                        } else if ("-exclude".equalsIgnoreCase(args[i])) {
                            checkArgument(++i < args.length,
                                    "List of nodes to exclude | -f <filename> is missing: args = "
                                            + Arrays.toString(args));
                            if ("-f".equalsIgnoreCase(args[i])) {
                                checkArgument(++i < args.length,
                                        "File containing nodes to exclude is not specified: args = "
                                                + Arrays.toString(args));
                                nodesTobeExcluded = Util.getHostListFromFile(args[i], "exclude");
                            } else {
                                nodesTobeExcluded = Util.parseHostList(args[i]);
                            }
                        } else if ("-include".equalsIgnoreCase(args[i])) {
                            checkArgument(++i < args.length,
                                    "List of nodes to include | -f <filename> is missing: args = "
                                            + Arrays.toString(args));
                            if ("-f".equalsIgnoreCase(args[i])) {
                                checkArgument(++i < args.length,
                                        "File containing nodes to include is not specified: args = "
                                                + Arrays.toString(args));
                                nodesTobeIncluded = Util.getHostListFromFile(args[i], "include");
                            } else {
                                nodesTobeIncluded = Util.parseHostList(args[i]);
                            }
                        } else if ("-idleiterations".equalsIgnoreCase(args[i])) {
                            checkArgument(++i < args.length,
                                    "idleiterations value is missing: args = " + Arrays.toString(args));
                            maxIdleIteration = Integer.parseInt(args[i]);
                            LOG.info("Using a idleiterations of " + maxIdleIteration);
                        } else {
                            throw new IllegalArgumentException("args = " + Arrays.toString(args));
                        }
                    }
                    checkArgument(nodesTobeExcluded.isEmpty() || nodesTobeIncluded.isEmpty(),
                            "-exclude and -include options cannot be specified together.");
                } catch (RuntimeException e) {
                    printUsage(System.err);
                    throw e;
                }
            }

            return new Parameters(policy, threshold, maxIdleIteration, nodesTobeExcluded, nodesTobeIncluded);
        }

        private static void printUsage(PrintStream out) {
            out.println(USAGE + "\n");
        }
    }

    /**
     * Run a balancer
     * @param args Command line arguments
     */
    public static void main(String[] args) {
        if (DFSUtil.parseHelpArgument(args, USAGE, System.out, true)) {
            System.exit(0);
        }

        try {
            System.exit(ToolRunner.run(new HdfsConfiguration(), new Cli(), args));
        } catch (Throwable e) {
            LOG.error("Exiting balancer due an exception", e);
            System.exit(-1);
        }
    }
}