Example usage for java.util NavigableMap entrySet

Introduction

This page collects example usages of java.util.NavigableMap.entrySet() drawn from open-source projects.

Prototype

Set<Map.Entry<K, V>> entrySet();

Document

Returns a Set view of the mappings contained in this map.
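Before the real-world examples below, here is a minimal, self-contained sketch of the behavior (the class name and sample data are illustrative): entrySet() on a NavigableMap returns a Set view whose iterator visits the mappings in ascending key order.

import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

public class EntrySetDemo {
    public static void main(String[] args) {
        NavigableMap<String, Integer> map = new TreeMap<>();
        map.put("banana", 2);
        map.put("apple", 1);
        map.put("cherry", 3);

        // The Set view iterates in ascending key order because the map is sorted.
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " = " + entry.getValue());
        }
        // Prints: apple = 1, banana = 2, cherry = 3 (one per line)
    }
}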

Usage

From source file:org.apache.hadoop.hbase.master.handler.ServerShutdownHandler.java

@Override
public void process() throws IOException {
    boolean hasLogReplayWork = false;
    final ServerName serverName = this.serverName;
    try {

        // We don't want a worker thread in the MetaServerShutdownHandler
        // executor pool to block while waiting for hbase:meta to become available.
        // Otherwise, it could run into the following issue:
        // 1. The current MetaServerShutdownHandler instance for RS1 waits for hbase:meta
        //    to come online.
        // 2. The newly assigned hbase:meta region server RS2 is shut down right after
        //    it opens the hbase:meta region, so the MetaServerShutdownHandler
        //    instance for RS1 will still be blocked.
        // 3. A new instance of MetaServerShutdownHandler for RS2 is queued.
        // 4. The newly assigned hbase:meta region server RS3 is shut down right after
        //    it opens the hbase:meta region, so the MetaServerShutdownHandler
        //    instances for RS1 and RS2 will still be blocked.
        // 5. A new instance of MetaServerShutdownHandler for RS3 is queued.
        // 6. Repeat until we run out of MetaServerShutdownHandler worker threads.
        // The solution here is to resubmit a ServerShutdownHandler request to process
        // user regions on that server so that MetaServerShutdownHandler
        // executor pool is always available.
        //
        // If AssignmentManager hasn't finished rebuilding user regions,
        // we are not ready to assign dead regions either. So we re-queue up
        // the dead server for further processing too.
        AssignmentManager am = services.getAssignmentManager();
        if (isCarryingMeta() // hbase:meta
                || !am.isFailoverCleanupDone()) {
            this.services.getServerManager().processDeadServer(serverName, this.shouldSplitHlog);
            return;
        }

        // Wait on meta to come online; we need it to progress.
        // TODO: Best way to hold strictly here?  We should build this retry logic
        // into the MetaReader operations themselves.
        // TODO: Is the reading of hbase:meta necessary when the Master has state of
        // cluster in its head?  It should be possible to do without reading hbase:meta
        // in all but one case. On split, the RS updates the hbase:meta
        // table and THEN informs the master of the split via zk nodes in
        // 'unassigned' dir.  Currently the RS puts ephemeral nodes into zk so if
        // the regionserver dies, these nodes do not stick around and this server
        // shutdown processing does fixup (see the fixupDaughters method below).
        // If we wanted to skip the hbase:meta scan, we'd have to change at least the
        // final SPLIT message to be permanent in zk so in here we'd know a SPLIT
        // completed (zk is updated after edits to hbase:meta have gone in).  See
        // {@link SplitTransaction}.  We'd also have to figure out another way of
        // doing the hbase:meta daughters fixup below.
        NavigableMap<HRegionInfo, Result> hris = null;
        while (!this.server.isStopped()) {
            try {
                this.server.getCatalogTracker().waitForMeta();
                // Skip getting user regions if the server is stopped.
                if (!this.server.isStopped()) {
                    hris = MetaReader.getServerUserRegions(this.server.getCatalogTracker(), this.serverName);
                }
                break;
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw (InterruptedIOException) new InterruptedIOException().initCause(e);
            } catch (IOException ioe) {
                LOG.info("Received exception accessing hbase:meta during server shutdown of " + serverName
                        + ", retrying hbase:meta read", ioe);
            }
        }
        if (this.server.isStopped()) {
            throw new IOException("Server is stopped");
        }

        try {
            if (this.shouldSplitHlog) {
                LOG.info("Splitting logs for " + serverName + " before assignment.");
                if (this.distributedLogReplay) {
                    LOG.info("Mark regions in recovery before assignment.");
                    Set<ServerName> serverNames = new HashSet<ServerName>();
                    serverNames.add(serverName);
                    this.services.getMasterFileSystem().prepareLogReplay(serverNames);
                } else {
                    this.services.getMasterFileSystem().splitLog(serverName);
                }
                am.getRegionStates().logSplit(serverName);
            } else {
                LOG.info("Skipping log splitting for " + serverName);
            }
        } catch (IOException ioe) {
            resubmit(serverName, ioe);
        }

        // Clean out anything in regions in transition.  Being conservative and
        // doing this after log splitting.  Could do some states before -- OPENING?
        // OFFLINE? -- and then others after like CLOSING that depend on log
        // splitting.
        List<HRegionInfo> regionsInTransition = am.processServerShutdown(serverName);
        LOG.info("Reassigning " + ((hris == null) ? 0 : hris.size()) + " region(s) that "
                + (serverName == null ? "null" : serverName) + " was carrying (and "
                + regionsInTransition.size() + " region(s) that were opening on this server)");

        List<HRegionInfo> toAssignRegions = new ArrayList<HRegionInfo>();
        toAssignRegions.addAll(regionsInTransition);

        // Iterate regions that were on this server and assign them
        if (hris != null) {
            RegionStates regionStates = am.getRegionStates();
            for (Map.Entry<HRegionInfo, Result> e : hris.entrySet()) {
                HRegionInfo hri = e.getKey();
                if (regionsInTransition.contains(hri)) {
                    continue;
                }
                String encodedName = hri.getEncodedName();
                Lock lock = am.acquireRegionLock(encodedName);
                try {
                    RegionState rit = regionStates.getRegionTransitionState(hri);
                    if (processDeadRegion(hri, e.getValue(), am, server.getCatalogTracker())) {
                        ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
                        if (addressFromAM != null && !addressFromAM.equals(this.serverName)) {
                            // If this region is in transition on the dead server, it must be
                            // opening or pending_open, which should have been covered by AM#processServerShutdown
                            LOG.info("Skip assigning region " + hri.getRegionNameAsString()
                                    + " because it has been opened in " + addressFromAM.getServerName());
                            continue;
                        }
                        if (rit != null) {
                            if (rit.getServerName() != null && !rit.isOnServer(serverName)) {
                                // Skip regions that are in transition on other server
                                LOG.info("Skip assigning region in transition on other server " + rit);
                                continue;
                            }
                            try {
                                //clean zk node
                                LOG.info("Reassigning region with rs = " + rit
                                        + " and deleting zk node if exists");
                                ZKAssign.deleteNodeFailSilent(services.getZooKeeper(), hri);
                                regionStates.updateRegionState(hri, State.OFFLINE);
                            } catch (KeeperException ke) {
                                this.server.abort("Unexpected ZK exception deleting unassigned node " + hri,
                                        ke);
                                return;
                            }
                        } else if (regionStates.isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
                            regionStates.regionOffline(hri);
                        }
                        toAssignRegions.add(hri);
                    } else if (rit != null) {
                        if (rit.isPendingCloseOrClosing() && am.getTableStateManager().isTableState(
                                hri.getTable(), ZooKeeperProtos.Table.State.DISABLED,
                                ZooKeeperProtos.Table.State.DISABLING)) {
                            // If the table was partially disabled and the RS went down, we should clear the RIT
                            // and remove the node for the region.
                            // The rit we use may be stale if the table was in DISABLING state:
                            // even though we did assign, we will not clear the znode in CLOSING state.
                            // Doing this does no harm. See HBASE-5927
                            regionStates.updateRegionState(hri, State.OFFLINE);
                            am.deleteClosingOrClosedNode(hri, rit.getServerName());
                            am.offlineDisabledRegion(hri);
                        } else {
                            LOG.warn("THIS SHOULD NOT HAPPEN: unexpected region in transition " + rit
                                    + " not to be assigned by SSH of server " + serverName);
                        }
                    }
                } finally {
                    lock.unlock();
                }
            }
        }

        try {
            am.assign(toAssignRegions);
        } catch (InterruptedException ie) {
            LOG.error("Caught " + ie + " during round-robin assignment");
            throw (InterruptedIOException) new InterruptedIOException().initCause(ie);
        }

        if (this.shouldSplitHlog && this.distributedLogReplay) {
            // wait for region assignment to complete
            for (HRegionInfo hri : toAssignRegions) {
                try {
                    if (!am.waitOnRegionToClearRegionsInTransition(hri, regionAssignmentWaitTimeout)) {
                        // The wait here avoids log replay hitting the current dead server and
                        // incurring an RPC timeout when replay happens before region assignment completes.
                        LOG.warn("Region " + hri.getEncodedName() + " didn't complete assignment in time");
                    }
                } catch (InterruptedException ie) {
                    throw new InterruptedIOException(
                            "Caught " + ie + " during waitOnRegionToClearRegionsInTransition");
                }
            }
            // submit logReplay work
            this.services.getExecutorService().submit(
                    new LogReplayHandler(this.server, this.services, this.deadServers, this.serverName));
            hasLogReplayWork = true;
        }
    } finally {
        this.deadServers.finish(serverName);
    }

    if (!hasLogReplayWork) {
        LOG.info("Finished processing of shutdown of " + serverName);
    }
}

From source file:com.google.gwt.emultest.java.util.TreeMapTest.java

@SuppressWarnings("SuspiciousMethodCalls")
public void testEntrySet_contains() {
    K[] keys = getSortedKeys();
    V[] values = getSortedValues();
    NavigableMap<K, V> master = createNavigableMap();
    NavigableMap<K, V> testMap = createNavigableMap();

    master.put(keys[0], null);
    Object[] entry = master.entrySet().toArray();
    assertFalse(testMap.entrySet().contains(entry[0]));

    Map<K, V> submap = testMap.subMap(keys[2], keys[3]);
    entry = master.entrySet().toArray();
    assertFalse(submap.entrySet().contains(entry[0]));

    testMap.put(keys[0], null);
    assertTrue(testMap.entrySet().containsAll(master.entrySet()));

    master.clear();
    master.put(keys[0], values[0]);
    entry = master.entrySet().toArray();
    assertFalse(testMap.entrySet().contains(entry[0]));
}
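The test above depends on Map.Entry equality comparing both key and value: an entry from master is contained in testMap's entry set only when testMap maps the same key to an equal value. A minimal sketch of that rule (class name and data are hypothetical, not from the test suite):

import java.util.AbstractMap;
import java.util.NavigableMap;
import java.util.TreeMap;

public class EntryContainsDemo {
    public static void main(String[] args) {
        NavigableMap<String, Integer> map = new TreeMap<>();
        map.put("a", 1);

        // contains() matches only when key AND value are equal.
        System.out.println(map.entrySet().contains(new AbstractMap.SimpleEntry<>("a", 1))); // true
        System.out.println(map.entrySet().contains(new AbstractMap.SimpleEntry<>("a", 2))); // false
    }
}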

From source file:com.google.gwt.emultest.java.util.TreeMapTest.java

public void testEntrySet() {
    K[] keys = getSortedKeys();
    V[] values = getSortedValues();
    NavigableMap<K, V> map = createNavigableMap();
    map.put(keys[0], values[0]);
    map.put(keys[1], values[1]);
    map.put(keys[2], values[2]);

    Set<Map.Entry<K, V>> entries = map.entrySet();
    Iterator<Map.Entry<K, V>> entrySetIterator = entries.iterator();
    assertEquals(3, entries.size());
    assertEquals(keys[0] + "=" + values[0], entrySetIterator.next().toString());
    while (entrySetIterator.hasNext()) {
        Map.Entry<K, V> entry = entrySetIterator.next();
        assertTrue(map.get(entry.getKey()) == entry.getValue());
    }

    assertEquals(map.size(), entries.size());
    _assertEquals(entries, map.entrySet());
    map.clear();
    assertEquals(map.size(), entries.size());
    _assertEquals(entries, map.entrySet());
    map.put(keys[0], values[0]);
    assertEquals(map.size(), entries.size());
    _assertEquals(entries, map.entrySet());
    entries.clear();
    assertEquals(map.size(), entries.size());
    _assertEquals(entries, map.entrySet());

    map.put(keys[1], values[1]);
    map.put(keys[2], values[2]);
    Iterator<Entry<K, V>> it = entries.iterator();
    while (it.hasNext()) {
        Map.Entry<K, V> entry = it.next();
        map.containsKey(entry.getKey());
        map.containsValue(entry.getValue());
        it.remove();
    }
    try {
        it.next();
        fail("should throw NoSuchElementException");
    } catch (NoSuchElementException expected) {
    }
    _assertEmpty(map);
}
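The assertions above work because entrySet() returns a live view backed by the map: mutating the map changes the set, and clearing the set (or removing through its iterator) mutates the map. A minimal standalone sketch of that coupling (names and data are illustrative):

import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;

public class EntrySetViewDemo {
    public static void main(String[] args) {
        NavigableMap<String, Integer> map = new TreeMap<>();
        map.put("a", 1);
        map.put("b", 2);

        Set<Map.Entry<String, Integer>> entries = map.entrySet();
        map.remove("a");
        System.out.println(entries.size()); // 1 -- the view tracks the map

        entries.clear();
        System.out.println(map.isEmpty()); // true -- clearing the view clears the map
    }
}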

From source file:org.commonvox.hbase_column_manager.TestRepositoryAdmin.java

private void verifyColumnData(Configuration configuration, boolean useDetailedScan) throws IOException {
    try (Connection connection = MConnectionFactory.createConnection(configuration)) {
        for (TableName tableName : testTableNamesAndDescriptors.keySet()) {
            List<Result> rows;
            Scan scan = new Scan().setMaxVersions();
            if (useDetailedScan) {
                //          scan.addFamily(CF01);
                //          scan.addFamily(CF02);
                scan.addColumn(CF01, COLQUALIFIER01);
                scan.addColumn(CF01, COLQUALIFIER02);
                scan.addColumn(CF01, COLQUALIFIER03);
                scan.addColumn(CF02, COLQUALIFIER04);
                scan.addColumn(CF01, QUALIFIER_IN_EXCLUDED_TABLE);
                scan.addColumn(CF02, QUALIFIER_IN_EXCLUDED_TABLE);
            }
            rows = getUserTableRows(connection, tableName, scan);
            System.out.println("CONTENTS of user Table: " + tableName.getNameAsString() + " retrieved with "
                    + (useDetailedScan ? "DETAILED" : "EMPTY") + " Scan params");
            for (Result row : rows) {
                System.out.println("  **ROW-ID**: " + Bytes.toString(row.getRow()));
                NavigableMap<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> contentMap = row
                        .getMap();
                for (Entry<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> familyMap : contentMap
                        .entrySet()) {
                    System.out.println("  -- Column Family: " + Bytes.toString(familyMap.getKey()));
                    for (Entry<byte[], NavigableMap<Long, byte[]>> columnMap : familyMap.getValue()
                            .entrySet()) {
                        if (Repository.isPrintable(columnMap.getKey())) {
                            System.out.println("    -- Column: " + Bytes.toString(columnMap.getKey()));
                        } else {
                            try {
                                System.out.println("    -- Column (ALIAS): " + Bytes.toInt(columnMap.getKey()));
                            } catch (IllegalArgumentException e) {
                                System.out.println("    -- Column name UNPRINTABLE (neither String nor int)!!");
                            }
                        }
                        for (Entry<Long, byte[]> cellMap : columnMap.getValue().entrySet()) {
                            // System.out.println("      -- Cell Timestamp: " + cellMap.getKey().toString());
                            System.out.println("      -- Cell Value: " + Bytes.toString(cellMap.getValue()));
                        }
                    }
                }
            }
        }
    }
}
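The triple-nested loop above is the usual way to walk the family-to-qualifier-to-timestamp structure that HBase's Result.getMap() returns; the same entrySet() pattern applies to any nested NavigableMap. A simplified two-level sketch with hypothetical data:

import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.TreeMap;

public class NestedMapDemo {
    public static void main(String[] args) {
        // family -> (qualifier -> value): a simplified analogue of Result.getMap()
        NavigableMap<String, NavigableMap<String, String>> table = new TreeMap<>();
        table.computeIfAbsent("cf1", k -> new TreeMap<>()).put("q1", "v1");
        table.computeIfAbsent("cf1", k -> new TreeMap<>()).put("q2", "v2");
        table.computeIfAbsent("cf2", k -> new TreeMap<>()).put("q3", "v3");

        for (Entry<String, NavigableMap<String, String>> family : table.entrySet()) {
            System.out.println("-- Column Family: " + family.getKey());
            for (Entry<String, String> column : family.getValue().entrySet()) {
                System.out.println("   -- Column: " + column.getKey() + " = " + column.getValue());
            }
        }
    }
}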

From source file:com.google.gwt.emultest.java.util.TreeMapTest.java

public void testDescendingMap() {
    K[] keys = getSortedKeys();
    V[] values = getSortedValues();
    NavigableMap<K, V> map = createNavigableMap();
    map.put(keys[0], values[0]);

    NavigableMap<K, V> descendingMap = map.descendingMap();
    _assertEquals(descendingMap, map.descendingMap());

    map.put(keys[1], values[1]);
    _assertEquals(map, descendingMap.descendingMap());
    _assertEquals(reverseCollection(map.entrySet()), descendingMap.entrySet());

    descendingMap.put(keys[2], values[2]);
    _assertEquals(reverseCollection(map.entrySet()), descendingMap.entrySet());
    _assertEquals(map.entrySet(), descendingMap.descendingMap().entrySet());

    descendingMap.remove(keys[1]);
    _assertEquals(reverseCollection(map.entrySet()), descendingMap.entrySet());

    descendingMap.clear();
    assertEquals(0, descendingMap.size());
    assertEquals(0, map.size());

    map.put(keys[0], values[0]);
    map.put(keys[1], values[1]);
    map.put(keys[2], values[2]);
    assertEquals(3, descendingMap.size());

    NavigableMap<K, V> headMap = descendingMap.headMap(keys[1], false);
    assertEquals(1, headMap.size());
    assertTrue(headMap.containsKey(keys[2]));

    NavigableMap<K, V> subMap = descendingMap.subMap(keys[2], true, keys[1], true);
    assertEquals(2, subMap.size());
    assertTrue(subMap.containsKey(keys[1]));
    assertTrue(subMap.containsKey(keys[2]));

    NavigableMap<K, V> tailMap = descendingMap.tailMap(keys[1], false);
    assertEquals(1, tailMap.size());
    assertTrue(tailMap.containsKey(keys[0]));
}
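For reference, descendingMap() is itself a view, so its entrySet() iterates in the reverse key order of the backing map, and writes through it land in the original. A minimal sketch (names and data are illustrative):

import java.util.NavigableMap;
import java.util.TreeMap;

public class DescendingMapDemo {
    public static void main(String[] args) {
        NavigableMap<Integer, String> map = new TreeMap<>();
        map.put(1, "one");
        map.put(2, "two");
        map.put(3, "three");

        // The descending view's entrySet() iterates 3, 2, 1.
        System.out.println(map.descendingMap().entrySet()); // [3=three, 2=two, 1=one]

        // Writes through the view reach the backing map.
        map.descendingMap().put(4, "four");
        System.out.println(map.lastKey()); // 4
    }
}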

From source file:org.apache.hadoop.hbase.master.balancer.DefaultLoadBalancer.java

/**
 * Generate a global load balancing plan according to the specified map of
 * server information to the most loaded regions of each server.
 *
 * The load balancing invariant is that all servers are within 1 region of the
 * average number of regions per server.  If the average is an integer number,
 * all servers will be balanced to the average.  Otherwise, all servers will
 * have either floor(average) or ceiling(average) regions.
 *
 * HBASE-3609 Modeled regionsToMove using Guava's MinMaxPriorityQueue so that
 *   we can fetch from both ends of the queue. 
 * At the beginning, we check whether there is an empty region server
 *   just discovered by the Master. If so, we alternately choose new / old
 *   regions from head / tail of regionsToMove, respectively. This alternation
 *   avoids clustering young regions on the newly discovered region server.
 *   Otherwise, we choose new regions from head of regionsToMove.
 *
 * Another improvement from HBASE-3609 is that we assign regions from
 *   regionsToMove to underloaded servers in round-robin fashion.
 *   Previously one underloaded server would be filled before we move onto
 *   the next underloaded server, leading to clustering of young regions.
 *   
 * Finally, we randomly shuffle underloaded servers so that they receive
 *   offloaded regions relatively evenly across calls to balanceCluster().
 *         
 * The algorithm is currently implemented as such:
 *
 * <ol>
 * <li>Determine the two valid numbers of regions each server should have,
 *     <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average).
 *
 * <li>Iterate down the most loaded servers, shedding regions from each so
 *     each server hosts exactly <b>MAX</b> regions.  Stop once you reach a
 *     server that already has &lt;= <b>MAX</b> regions.
 *     <p>
 *     Order the regions to move from most recent to least.
 *
 * <li>Iterate down the least loaded servers, assigning regions so each server
 *     has exactly <b>MIN</b> regions.  Stop once you reach a server that
 *     already has &gt;= <b>MIN</b> regions.
 *
 *     Regions being assigned to underloaded servers are those that were shed
 *     in the previous step.  It is possible that there were not enough
 *     regions shed to fill each underloaded server to <b>MIN</b>.  If so we
 *     end up with a number of regions required to do so, <b>neededRegions</b>.
 *
 *     It is also possible that we were able to fill each underloaded server but ended
 *     up with regions that were unassigned from overloaded servers but that
 *     still do not have assignment.
 *
 *     If neither of these conditions hold (no regions needed to fill the
 *     underloaded servers, no regions leftover from overloaded servers),
 *     we are done and return.  Otherwise we handle these cases below.
 *
 * <li>If <b>neededRegions</b> is non-zero (still have underloaded servers),
 *     we iterate the most loaded servers again, shedding a single region from
 *     each (this brings them from having <b>MAX</b> regions to having
 *     <b>MIN</b> regions).
 *
 * <li>We now definitely have more regions that need assignment, either from
 *     the previous step or from the original shedding from overloaded servers.
 *     Iterate the least loaded servers filling each to <b>MIN</b>.
 *
 * <li>If we still have more regions that need assignment, again iterate the
 *     least loaded servers, this time giving each one (filling them to
 *     <b>MAX</b>) until we run out.
 *
 * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> regions.
 *
 *     In addition, any server hosting &gt;= <b>MAX</b> regions is guaranteed
 *     to end up with <b>MAX</b> regions at the end of the balancing.  This
 *     ensures the minimal number of regions possible are moved.
 * </ol>
 *
 * TODO: We can at most reassign the number of regions away from a particular
 *       server to be how many they report as most loaded.
 *       Should we just keep all assignment in memory?  Any objections?
 *       Does this mean we need HeapSize on HMaster?  Or just careful monitor?
 *       (current thinking is we will hold all assignments in memory)
 *
 * @param clusterMap Map of regionservers and their load/region information to
 *                   a list of their most loaded regions
 * @return a list of regions to be moved, including source and destination,
 *         or null if cluster is already balanced
 */
public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterMap) {
    boolean emptyRegionServerPresent = false;
    long startTime = System.currentTimeMillis();

    ClusterLoadState cs = new ClusterLoadState(clusterMap);

    if (!this.needsBalance(cs))
        return null;

    int numServers = cs.getNumServers();
    NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
    int numRegions = cs.getNumRegions();
    int min = numRegions / numServers;
    int max = numRegions % numServers == 0 ? min : min + 1;

    // Used to check the balance result.
    StringBuilder strBalanceParam = new StringBuilder();
    strBalanceParam.append("Balance parameter: numRegions=").append(numRegions).append(", numServers=")
            .append(numServers).append(", max=").append(max).append(", min=").append(min);
    LOG.debug(strBalanceParam.toString());

    // Balance the cluster
    // TODO: Look at data block locality or a more complex load to do this
    MinMaxPriorityQueue<RegionPlan> regionsToMove = MinMaxPriorityQueue.orderedBy(rpComparator).create();
    List<RegionPlan> regionsToReturn = new ArrayList<RegionPlan>();

    // Walk down most loaded, pruning each to the max
    int serversOverloaded = 0;
    // flag used to fetch regions from head and tail of list, alternately
    boolean fetchFromTail = false;
    Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>();
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) {
        ServerAndLoad sal = server.getKey();
        int regionCount = sal.getLoad();
        if (regionCount <= max) {
            serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0));
            break;
        }
        serversOverloaded++;
        List<HRegionInfo> regions = server.getValue();
        int numToOffload = Math.min(regionCount - max, regions.size());
        // account for the out-of-band regions which were assigned to this server
        // after some other region server crashed 
        Collections.sort(regions, riComparator);
        int numTaken = 0;
        for (int i = 0; i <= numToOffload;) {
            HRegionInfo hri = regions.get(i); // fetch from head
            if (fetchFromTail) {
                hri = regions.get(regions.size() - 1 - i);
            }
            i++;
            // Don't rebalance meta regions.
            if (hri.isMetaRegion())
                continue;
            regionsToMove.add(new RegionPlan(hri, sal.getServerName(), null));
            numTaken++;
            if (numTaken >= numToOffload)
                break;
            // fetch in alternate order if there is new region server
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
        serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1) * numTaken));
    }
    int totalNumMoved = regionsToMove.size();

    // Walk down least loaded, filling each to the min
    int neededRegions = 0; // number of regions needed to bring all up to min
    fetchFromTail = false;

    Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>();
    float average = (float) numRegions / numServers; // average regions per server
    int maxToTake = numRegions - (int) average;
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
        if (maxToTake == 0)
            break; // no more to take
        int regionCount = server.getKey().getLoad();
        if (regionCount >= min && regionCount > 0) {
            continue; // look for other servers which haven't reached min
        }
        int regionsToPut = min - regionCount;
        if (regionsToPut == 0) {
            regionsToPut = 1;
            maxToTake--;
        }
        underloadedServers.put(server.getKey().getServerName(), regionsToPut);
    }
    // number of servers that get new regions
    int serversUnderloaded = underloadedServers.size();
    int incr = 1;
    List<ServerName> sns = Arrays
            .asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded]));
    Collections.shuffle(sns, RANDOM);
    while (regionsToMove.size() > 0) {
        int cnt = 0;
        int i = incr > 0 ? 0 : underloadedServers.size() - 1;
        for (; i >= 0 && i < underloadedServers.size(); i += incr) {
            if (regionsToMove.isEmpty())
                break;
            ServerName si = sns.get(i);
            int numToTake = underloadedServers.get(si);
            if (numToTake == 0)
                continue;

            addRegionPlan(regionsToMove, fetchFromTail, si, regionsToReturn);
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }

            underloadedServers.put(si, numToTake - 1);
            cnt++;
            BalanceInfo bi = serverBalanceInfo.get(si);
            if (bi == null) {
                bi = new BalanceInfo(0, 0);
                serverBalanceInfo.put(si, bi);
            }
            bi.setNumRegionsAdded(bi.getNumRegionsAdded() + 1);
        }
        if (cnt == 0)
            break;
        // iterates underloadedServers in the other direction
        incr = -incr;
    }
    for (Integer i : underloadedServers.values()) {
        // If we still want to take some, increment needed
        neededRegions += i;
    }

    // If none needed to fill all to min and none left to drain all to max,
    // we are done
    if (neededRegions == 0 && regionsToMove.isEmpty()) {
        long endTime = System.currentTimeMillis();
        LOG.info("Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
                + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
                + " less loaded servers");
        return regionsToReturn;
    }

    // Need to do a second pass.
    // Either more regions to assign out or servers that are still underloaded

    // If we need more to fill min, grab one from each most loaded until enough
    if (neededRegions != 0) {
        // Walk down most loaded, grabbing one from each until we get enough
        for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) {
            BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
            int idx = balanceInfo == null ? 0 : balanceInfo.getNextRegionForUnload();
            if (idx >= server.getValue().size())
                break;
            HRegionInfo region = server.getValue().get(idx);
            if (region.isMetaRegion())
                continue; // Don't move meta regions.
            regionsToMove.add(new RegionPlan(region, server.getKey().getServerName(), null));
            totalNumMoved++;
            if (--neededRegions == 0) {
                // No more regions needed, done shedding
                break;
            }
        }
    }

    // Now we have a set of regions that must be all assigned out
    // Assign each underloaded up to the min, then if leftovers, assign to max

    // Walk down least loaded, assigning to each to fill up to min
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
        int regionCount = server.getKey().getLoad();
        if (regionCount >= min)
            break;
        BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
        if (balanceInfo != null) {
            regionCount += balanceInfo.getNumRegionsAdded();
        }
        if (regionCount >= min) {
            continue;
        }
        int numToTake = min - regionCount;
        int numTaken = 0;
        while (numTaken < numToTake && 0 < regionsToMove.size()) {
            addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn);
            numTaken++;
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
    }

    // If we still have regions to dish out, assign underloaded to max
    if (0 < regionsToMove.size()) {
        for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
            int regionCount = server.getKey().getLoad();
            if (regionCount >= max) {
                break;
            }
            addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn);
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
            if (regionsToMove.isEmpty()) {
                break;
            }
        }
    }

    long endTime = System.currentTimeMillis();

    if (!regionsToMove.isEmpty() || neededRegions != 0) {
        // Emit data so we can diagnose how the balancer went astray.
        LOG.warn("regionsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded="
                + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<ServerName, List<HRegionInfo>> e : clusterMap.entrySet()) {
            if (sb.length() > 0)
                sb.append(", ");
            sb.append(e.getKey().toString());
            sb.append(" ");
            sb.append(e.getValue().size());
        }
        LOG.warn("Input " + sb.toString());
    }

    // All done!
    LOG.info("Done. Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
            + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
            + " less loaded servers");

    return regionsToReturn;
}
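The balancer's two passes rely on NavigableMap giving cheap traversal from both ends: serversByLoad.descendingMap().entrySet() walks the most loaded servers first, while serversByLoad.entrySet() walks up from the least loaded. A stripped-down sketch of that idiom, keyed here by a plain integer load with hypothetical server names:

import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

public class LoadWalkDemo {
    public static void main(String[] args) {
        // load -> server name; a TreeMap keeps entries sorted by load
        NavigableMap<Integer, String> serversByLoad = new TreeMap<>();
        serversByLoad.put(12, "rs1");
        serversByLoad.put(3, "rs2");
        serversByLoad.put(7, "rs3");

        // Shed from the most loaded servers first.
        for (Map.Entry<Integer, String> e : serversByLoad.descendingMap().entrySet()) {
            System.out.println("overloaded walk: " + e.getValue() + " load=" + e.getKey());
        }
        // Fill the least loaded servers first.
        for (Map.Entry<Integer, String> e : serversByLoad.entrySet()) {
            System.out.println("underloaded walk: " + e.getValue() + " load=" + e.getKey());
        }
    }
}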

From source file:org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer.java

/**
 * Generate a global load balancing plan according to the specified map of
 * server information to the most loaded regions of each server.
 *
 * The load balancing invariant is that all servers are within 1 region of the
 * average number of regions per server.  If the average is an integer number,
 * all servers will be balanced to the average.  Otherwise, all servers will
 * have either floor(average) or ceiling(average) regions.
 *
 * HBASE-3609 Modeled regionsToMove using Guava's MinMaxPriorityQueue so that
 *   we can fetch from both ends of the queue. 
 * At the beginning, we check whether there is an empty region server
 *   just discovered by the Master. If so, we alternately choose new / old
 *   regions from head / tail of regionsToMove, respectively. This alternation
 *   avoids clustering young regions on the newly discovered region server.
 *   Otherwise, we choose new regions from head of regionsToMove.
 *
 * Another improvement from HBASE-3609 is that we assign regions from
 *   regionsToMove to underloaded servers in round-robin fashion.
 *   Previously one underloaded server would be filled before we move onto
 *   the next underloaded server, leading to clustering of young regions.
 *   
 * Finally, we randomly shuffle underloaded servers so that they receive
 *   offloaded regions relatively evenly across calls to balanceCluster().
 *         
 * The algorithm is currently implemented as such:
 *
 * <ol>
 * <li>Determine the two valid numbers of regions each server should have,
 *     <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average).
 *
 * <li>Iterate down the most loaded servers, shedding regions from each so
 *     each server hosts exactly <b>MAX</b> regions.  Stop once you reach a
 *     server that already has &lt;= <b>MAX</b> regions.
 *     <p>
 *     Order the regions to move from most recent to least.
 *
 * <li>Iterate down the least loaded servers, assigning regions so each server
 *     has exactly <b>MIN</b> regions.  Stop once you reach a server that
 *     already has &gt;= <b>MIN</b> regions.
 *
 *     Regions being assigned to underloaded servers are those that were shed
 *     in the previous step.  It is possible that there were not enough
 *     regions shed to fill each underloaded server to <b>MIN</b>.  If so we
 *     end up with a number of regions required to do so, <b>neededRegions</b>.
 *
 *     It is also possible that we were able to fill each underloaded server but ended
 *     up with regions that were unassigned from overloaded servers but that
 *     still do not have assignment.
 *
 *     If neither of these conditions hold (no regions needed to fill the
 *     underloaded servers, no regions leftover from overloaded servers),
 *     we are done and return.  Otherwise we handle these cases below.
 *
 * <li>If <b>neededRegions</b> is non-zero (still have underloaded servers),
 *     we iterate the most loaded servers again, shedding a single region from
 *     each (this brings them from having <b>MAX</b> regions to having
 *     <b>MIN</b> regions).
 *
 * <li>We now definitely have more regions that need assignment, either from
 *     the previous step or from the original shedding from overloaded servers.
 *     Iterate the least loaded servers filling each to <b>MIN</b>.
 *
 * <li>If we still have more regions that need assignment, again iterate the
 *     least loaded servers, this time giving each one (filling them to
 *     <b>MAX</b>) until we run out.
 *
 * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> regions.
 *
 *     In addition, any server hosting &gt;= <b>MAX</b> regions is guaranteed
 *     to end up with <b>MAX</b> regions at the end of the balancing.  This
 *     ensures the minimal number of regions possible are moved.
 * </ol>
 *
 * TODO: We can at most reassign the number of regions away from a particular
 *       server to be how many they report as most loaded.
 *       Should we just keep all assignment in memory?  Any objections?
 *       Does this mean we need HeapSize on HMaster?  Or just careful monitor?
 *       (current thinking is we will hold all assignments in memory)
 *
 * @param clusterMap Map of regionservers and their load/region information to
 *                   a list of their most loaded regions
 * @return a list of regions to be moved, including source and destination,
 *         or null if cluster is already balanced
 */
public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterMap) {
    List<RegionPlan> regionsToReturn = balanceMasterRegions(clusterMap);
    if (regionsToReturn != null) {
        return regionsToReturn;
    }
    filterExcludedServers(clusterMap);
    boolean emptyRegionServerPresent = false;
    long startTime = System.currentTimeMillis();

    Collection<ServerName> backupMasters = getBackupMasters();
    ClusterLoadState cs = new ClusterLoadState(masterServerName, backupMasters, backupMasterWeight, clusterMap);

    if (!this.needsBalance(cs))
        return null;

    int numServers = cs.getNumServers();
    NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
    int numRegions = cs.getNumRegions();
    float average = cs.getLoadAverage();
    int max = (int) Math.ceil(average);
    int min = (int) average;

    // Used to check the balance result.
    StringBuilder strBalanceParam = new StringBuilder();
    strBalanceParam.append("Balance parameter: numRegions=").append(numRegions).append(", numServers=")
            .append(numServers).append(", numBackupMasters=").append(cs.getNumBackupMasters())
            .append(", backupMasterWeight=").append(backupMasterWeight).append(", max=").append(max)
            .append(", min=").append(min);
    LOG.debug(strBalanceParam.toString());

    // Balance the cluster
    // TODO: Look at data block locality or a more complex load to do this
    MinMaxPriorityQueue<RegionPlan> regionsToMove = MinMaxPriorityQueue.orderedBy(rpComparator).create();
    regionsToReturn = new ArrayList<RegionPlan>();

    // Walk down most loaded, pruning each to the max
    int serversOverloaded = 0;
    // flag used to fetch regions from head and tail of list, alternately
    boolean fetchFromTail = false;
    Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>();
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) {
        ServerAndLoad sal = server.getKey();
        int load = sal.getLoad();
        if (load <= max) {
            serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0));
            break;
        }
        serversOverloaded++;
        List<HRegionInfo> regions = server.getValue();
        int w = 1; // Normal region server has weight 1
        if (backupMasters != null && backupMasters.contains(sal.getServerName())) {
            w = backupMasterWeight; // Backup master has heavier weight
        }
        int numToOffload = Math.min((load - max) / w, regions.size());
        // account for the out-of-band regions which were assigned to this server
        // after some other region server crashed 
        Collections.sort(regions, riComparator);
        int numTaken = 0;
        for (int i = 0; i <= numToOffload;) {
            HRegionInfo hri = regions.get(i); // fetch from head
            if (fetchFromTail) {
                hri = regions.get(regions.size() - 1 - i);
            }
            i++;
            // Don't rebalance special regions.
            if (shouldBeOnMaster(hri) && masterServerName.equals(sal.getServerName()))
                continue;
            regionsToMove.add(new RegionPlan(hri, sal.getServerName(), null));
            numTaken++;
            if (numTaken >= numToOffload)
                break;
            // fetch in alternate order if there is new region server
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
        serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1) * numTaken));
    }
    int totalNumMoved = regionsToMove.size();

    // Walk down least loaded, filling each to the min
    int neededRegions = 0; // number of regions needed to bring all up to min
    fetchFromTail = false;

    Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>();
    int maxToTake = numRegions - min;
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
        if (maxToTake == 0)
            break; // no more to take
        int load = server.getKey().getLoad();
        if (load >= min && load > 0) {
            continue; // look for other servers which haven't reached min
        }
        int w = 1; // Normal region server has weight 1
        if (backupMasters != null && backupMasters.contains(server.getKey().getServerName())) {
            w = backupMasterWeight; // Backup master has heavier weight
        }
        int regionsToPut = (min - load) / w;
        if (regionsToPut == 0) {
            regionsToPut = 1;
        }
        maxToTake -= regionsToPut;
        underloadedServers.put(server.getKey().getServerName(), regionsToPut);
    }
    // number of servers that get new regions
    int serversUnderloaded = underloadedServers.size();
    int incr = 1;
    List<ServerName> sns = Arrays
            .asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded]));
    Collections.shuffle(sns, RANDOM);
    while (regionsToMove.size() > 0) {
        int cnt = 0;
        int i = incr > 0 ? 0 : underloadedServers.size() - 1;
        for (; i >= 0 && i < underloadedServers.size(); i += incr) {
            if (regionsToMove.isEmpty())
                break;
            ServerName si = sns.get(i);
            int numToTake = underloadedServers.get(si);
            if (numToTake == 0)
                continue;

            addRegionPlan(regionsToMove, fetchFromTail, si, regionsToReturn);
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }

            underloadedServers.put(si, numToTake - 1);
            cnt++;
            BalanceInfo bi = serverBalanceInfo.get(si);
            if (bi == null) {
                bi = new BalanceInfo(0, 0);
                serverBalanceInfo.put(si, bi);
            }
            bi.setNumRegionsAdded(bi.getNumRegionsAdded() + 1);
        }
        if (cnt == 0)
            break;
        // iterates underloadedServers in the other direction
        incr = -incr;
    }
    for (Integer i : underloadedServers.values()) {
        // If we still want to take some, increment needed
        neededRegions += i;
    }

    // If none needed to fill all to min and none left to drain all to max,
    // we are done
    if (neededRegions == 0 && regionsToMove.isEmpty()) {
        long endTime = System.currentTimeMillis();
        LOG.info("Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
                + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
                + " less loaded servers");
        return regionsToReturn;
    }

    // Need to do a second pass.
    // Either more regions to assign out or servers that are still underloaded

    // If we need more to fill min, grab one from each most loaded until enough
    if (neededRegions != 0) {
        // Walk down most loaded, grabbing one from each until we get enough
        for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) {
            BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
            int idx = balanceInfo == null ? 0 : balanceInfo.getNextRegionForUnload();
            if (idx >= server.getValue().size())
                break;
            HRegionInfo region = server.getValue().get(idx);
            if (region.isMetaRegion())
                continue; // Don't move meta regions.
            regionsToMove.add(new RegionPlan(region, server.getKey().getServerName(), null));
            totalNumMoved++;
            if (--neededRegions == 0) {
                // No more regions needed, done shedding
                break;
            }
        }
    }

    // Now we have a set of regions that must be all assigned out
    // Assign each underloaded up to the min, then if leftovers, assign to max

    // Walk down least loaded, assigning to each to fill up to min
    for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
        int regionCount = server.getKey().getLoad();
        if (regionCount >= min)
            break;
        BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
        if (balanceInfo != null) {
            regionCount += balanceInfo.getNumRegionsAdded();
        }
        if (regionCount >= min) {
            continue;
        }
        int numToTake = min - regionCount;
        int numTaken = 0;
        while (numTaken < numToTake && 0 < regionsToMove.size()) {
            addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn);
            numTaken++;
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
    }

    // If we still have regions to dish out, assign underloaded to max
    if (0 < regionsToMove.size()) {
        for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) {
            int regionCount = server.getKey().getLoad();
            BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
            if (balanceInfo != null) {
                regionCount += balanceInfo.getNumRegionsAdded();
            }
            if (regionCount >= max) {
                break;
            }
            addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn);
            if (emptyRegionServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
            if (regionsToMove.isEmpty()) {
                break;
            }
        }
    }

    long endTime = System.currentTimeMillis();

    if (!regionsToMove.isEmpty() || neededRegions != 0) {
        // Emit data so we can diagnose how the balancer went astray.
        LOG.warn("regionsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded="
                + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<ServerName, List<HRegionInfo>> e : clusterMap.entrySet()) {
            if (sb.length() > 0)
                sb.append(", ");
            sb.append(e.getKey().toString());
            sb.append(" ");
            sb.append(e.getValue().size());
        }
        LOG.warn("Input " + sb.toString());
    }

    // All done!
    LOG.info("Done. Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
            + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
            + " less loaded servers");

    return regionsToReturn;
}

From source file:com.alibaba.wasp.master.balancer.DefaultLoadBalancer.java

/**
 * Generate a global load balancing plan according to the specified map of
 * server information to the most loaded entityGroups of each server.
 *
 * The load balancing invariant is that all servers are within 1 entityGroup of the
 * average number of entityGroups per server. If the average is an integer number,
 * all servers will be balanced to the average. Otherwise, all servers will
 * have either floor(average) or ceiling(average) entityGroups.
 * 
 * HBASE-3609 Modeled entityGroupsToMove using Guava's MinMaxPriorityQueue so that
 * we can fetch from both ends of the queue. At the beginning, we check
 * whether there is an empty entityGroup server just discovered by the Master. If so, we
 * alternately choose new / old entityGroups from head / tail of entityGroupsToMove,
 * respectively. This alternation avoids clustering young entityGroups on the newly
 * discovered entityGroup server. Otherwise, we choose new entityGroups from head of
 * entityGroupsToMove.
 * 
 * Another improvement from HBASE-3609 is that we assign entityGroups from
 * entityGroupsToMove to underloaded servers in round-robin fashion. Previously one
 * underloaded server would be filled before we move onto the next underloaded
 * server, leading to clustering of young entityGroups.
 * 
 * Finally, we randomly shuffle underloaded servers so that they receive
 * offloaded entityGroups relatively evenly across calls to balanceCluster().
 * 
 * The algorithm is currently implemented as such:
 * 
 * <ol>
 * <li>Determine the two valid numbers of entityGroups each server should have,
 * <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average).
 * 
 * <li>Iterate down the most loaded servers, shedding entityGroups from each so
 * each server hosts exactly <b>MAX</b> entityGroups. Stop once you reach a server
 * that already has &lt;= <b>MAX</b> entityGroups.
 * <p>
 * Order the entityGroups to move from most recent to least.
 * 
 * <li>Iterate down the least loaded servers, assigning entityGroups so each server
 * has exactly <b>MIN</b> entityGroups. Stop once you reach a server that already
 * has &gt;= <b>MIN</b> entityGroups.
 * 
 * EntityGroups being assigned to underloaded servers are those that were shed in
 * the previous step. It is possible that there were not enough entityGroups shed
 * to fill each underloaded server to <b>MIN</b>. If so we end up with a
 * number of entityGroups required to do so, <b>neededEntityGroups</b>.
 * 
 * It is also possible that we were able to fill each underloaded server but ended up
 * with entityGroups that were unassigned from overloaded servers but that still do
 * not have assignment.
 * 
 * If neither of these conditions hold (no entityGroups needed to fill the
 * underloaded servers, no entityGroups leftover from overloaded servers), we are
 * done and return. Otherwise we handle these cases below.
 * 
 * <li>If <b>neededEntityGroups</b> is non-zero (still have underloaded servers),
 * we iterate the most loaded servers again, shedding a single entityGroup from
 * each (this brings them from having <b>MAX</b> entityGroups to having <b>MIN</b>
 * entityGroups).
 * 
 * <li>We now definitely have more entityGroups that need assignment, either from
 * the previous step or from the original shedding from overloaded servers.
 * Iterate the least loaded servers filling each to <b>MIN</b>.
 * 
 * <li>If we still have more entityGroups that need assignment, again iterate the
 * least loaded servers, this time giving each one (filling them to
 * <b>MAX</b>) until we run out.
 * 
 * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> entityGroups.
 * 
 * In addition, any server hosting &gt;= <b>MAX</b> entityGroups is guaranteed to
 * end up with <b>MAX</b> entityGroups at the end of the balancing. This ensures
 * the minimal number of entityGroups possible are moved.
 * </ol>
 * 
 * TODO: We can at most reassign the number of entityGroups away from a particular
 * server to be how many they report as most loaded. Should we just keep all
 * assignment in memory? Any objections? Does this mean we need HeapSize on
 * HMaster? Or just careful monitor? (current thinking is we will hold all
 * assignments in memory)
 * 
 * @param clusterState Map of entityGroupservers and their load/entityGroup information
 *          to a list of their most loaded entityGroups
 * @return a list of entityGroups to be moved, including source and destination, or
 *         null if cluster is already balanced
 */
public List<EntityGroupPlan> balanceCluster(Map<ServerName, List<EntityGroupInfo>> clusterMap) {
    boolean emptyFServerPresent = false;
    long startTime = System.currentTimeMillis();

    ClusterLoadState cs = new ClusterLoadState(clusterMap);

    int numServers = cs.getNumServers();
    if (numServers == 0) {
        LOG.debug("numServers=0 so skipping load balancing");
        return null;
    }
    NavigableMap<ServerAndLoad, List<EntityGroupInfo>> serversByLoad = cs.getServersByLoad();

    int numEntityGroups = cs.getNumEntityGroups();

    if (!this.needsBalance(cs)) {
        // Skipped because no server outside (min,max) range
        float average = cs.getLoadAverage(); // for logging
        LOG.info("Skipping load balancing because balanced cluster; " + "servers=" + numServers + " "
                + "entityGroups=" + numEntityGroups + " average=" + average + " " + "mostloaded="
                + serversByLoad.lastKey().getLoad() + " leastloaded=" + serversByLoad.firstKey().getLoad());
        return null;
    }

    int min = numEntityGroups / numServers;
    int max = numEntityGroups % numServers == 0 ? min : min + 1;

    // Used to check the balance result.
    StringBuilder strBalanceParam = new StringBuilder();
    strBalanceParam.append("Balance parameter: numEntityGroups=").append(numEntityGroups)
            .append(", numServers=").append(numServers).append(", max=").append(max).append(", min=")
            .append(min);
    LOG.debug(strBalanceParam.toString());

    // Balance the cluster
    // TODO: Look at data block locality or a more complex load to do this
    MinMaxPriorityQueue<EntityGroupPlan> entityGroupsToMove = MinMaxPriorityQueue.orderedBy(rpComparator)
            .create();
    List<EntityGroupPlan> entityGroupsToReturn = new ArrayList<EntityGroupPlan>();

    // Walk down most loaded, pruning each to the max
    int serversOverloaded = 0;
    // flag used to fetch entityGroups from head and tail of list, alternately
    boolean fetchFromTail = false;
    Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>();
    for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.descendingMap().entrySet()) {
        ServerAndLoad sal = server.getKey();
        int entityGroupCount = sal.getLoad();
        if (entityGroupCount <= max) {
            serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0));
            break;
        }
        serversOverloaded++;
        List<EntityGroupInfo> entityGroups = server.getValue();
        int numToOffload = Math.min(entityGroupCount - max, entityGroups.size());
        // account for the out-of-band entityGroups which were assigned to this server
        // after some other entityGroup server crashed
        Collections.sort(entityGroups, riComparator);
        int numTaken = 0;
        for (int i = 0; i <= numToOffload;) {
            EntityGroupInfo egInfo = entityGroups.get(i); // fetch from head
            if (fetchFromTail) {
                egInfo = entityGroups.get(entityGroups.size() - 1 - i);
            }
            i++;
            entityGroupsToMove.add(new EntityGroupPlan(egInfo, sal.getServerName(), null));
            numTaken++;
            if (numTaken >= numToOffload)
                break;
            // fetch in alternate order if there is new entityGroup server
            if (emptyFServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
        serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1) * numTaken));
    }
    int totalNumMoved = entityGroupsToMove.size();

    // Walk down least loaded, filling each to the min
    int neededEntityGroups = 0; // number of entityGroups needed to bring all up to min
    fetchFromTail = false;

    Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>();
    for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.entrySet()) {
        int entityGroupCount = server.getKey().getLoad();
        if (entityGroupCount >= min) {
            break;
        }
        underloadedServers.put(server.getKey().getServerName(), min - entityGroupCount);
    }
    // number of servers that get new entityGroups
    int serversUnderloaded = underloadedServers.size();
    int incr = 1;
    List<ServerName> sns = Arrays
            .asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded]));
    Collections.shuffle(sns, RANDOM);
    while (entityGroupsToMove.size() > 0) {
        int cnt = 0;
        int i = incr > 0 ? 0 : underloadedServers.size() - 1;
        for (; i >= 0 && i < underloadedServers.size(); i += incr) {
            if (entityGroupsToMove.isEmpty())
                break;
            ServerName si = sns.get(i);
            int numToTake = underloadedServers.get(si);
            if (numToTake == 0)
                continue;

            addEntityGroupPlan(entityGroupsToMove, fetchFromTail, si, entityGroupsToReturn);
            if (emptyFServerPresent) {
                fetchFromTail = !fetchFromTail;
            }

            underloadedServers.put(si, numToTake - 1);
            cnt++;
            BalanceInfo bi = serverBalanceInfo.get(si);
            if (bi == null) {
                bi = new BalanceInfo(0, 0);
                serverBalanceInfo.put(si, bi);
            }
            bi.setNumEntityGroupsAdded(bi.getNumEntityGroupsAdded() + 1);
        }
        if (cnt == 0)
            break;
        // iterates underloadedServers in the other direction
        incr = -incr;
    }
    for (Integer i : underloadedServers.values()) {
        // If we still want to take some, increment needed
        neededEntityGroups += i;
    }

    // If none needed to fill all to min and none left to drain all to max,
    // we are done
    if (neededEntityGroups == 0 && entityGroupsToMove.isEmpty()) {
        long endTime = System.currentTimeMillis();
        LOG.info("Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
                + " entityGroups off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
                + " less loaded servers");
        return entityGroupsToReturn;
    }

    // Need to do a second pass.
    // Either more entityGroups to assign out or servers that are still underloaded

    // If we need more to fill min, grab one from each most loaded until enough
    if (neededEntityGroups != 0) {
        // Walk down most loaded, grabbing one from each until we get enough
        for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.descendingMap()
                .entrySet()) {
            BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
            int idx = balanceInfo == null ? 0 : balanceInfo.getNextEntityGroupForUnload();
            if (idx >= server.getValue().size())
                break;
            EntityGroupInfo entityGroup = server.getValue().get(idx);
            entityGroupsToMove.add(new EntityGroupPlan(entityGroup, server.getKey().getServerName(), null));
            totalNumMoved++;
            if (--neededEntityGroups == 0) {
                // No more entityGroups needed, done shedding
                break;
            }
        }
    }

    // Now we have a set of entityGroups that must be all assigned out
    // Assign each underloaded up to the min, then if leftovers, assign to max

    // Walk down least loaded, assigning to each to fill up to min
    for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.entrySet()) {
        int entityGroupCount = server.getKey().getLoad();
        if (entityGroupCount >= min)
            break;
        BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
        if (balanceInfo != null) {
            entityGroupCount += balanceInfo.getNumEntityGroupsAdded();
        }
        if (entityGroupCount >= min) {
            continue;
        }
        int numToTake = min - entityGroupCount;
        int numTaken = 0;
        while (numTaken < numToTake && 0 < entityGroupsToMove.size()) {
            addEntityGroupPlan(entityGroupsToMove, fetchFromTail, server.getKey().getServerName(),
                    entityGroupsToReturn);
            numTaken++;
            if (emptyFServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
        }
    }

    // If we still have entityGroups to dish out, assign underloaded to max
    if (0 < entityGroupsToMove.size()) {
        for (Map.Entry<ServerAndLoad, List<EntityGroupInfo>> server : serversByLoad.entrySet()) {
            int entityGroupCount = server.getKey().getLoad();
            if (entityGroupCount >= max) {
                break;
            }
            addEntityGroupPlan(entityGroupsToMove, fetchFromTail, server.getKey().getServerName(),
                    entityGroupsToReturn);
            if (emptyFServerPresent) {
                fetchFromTail = !fetchFromTail;
            }
            if (entityGroupsToMove.isEmpty()) {
                break;
            }
        }
    }

    long endTime = System.currentTimeMillis();

    if (!entityGroupsToMove.isEmpty() || neededEntityGroups != 0) {
        // Emit data so we can diagnose how the balancer went astray.
        LOG.warn("entityGroupsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded="
                + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<ServerName, List<EntityGroupInfo>> e : clusterMap.entrySet()) {
            if (sb.length() > 0)
                sb.append(", ");
            sb.append(e.getKey().toString());
            sb.append(" ");
            sb.append(e.getValue().size());
        }
        LOG.warn("Input " + sb.toString());
    }

    // All done!
    LOG.info("Done. Calculated a load balance in " + (endTime - startTime) + "ms. " + "Moving " + totalNumMoved
            + " entityGroups off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded
            + " less loaded servers");

    return entityGroupsToReturn;
}
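
The balancer above leans on a paired walk over a load-sorted NavigableMap: the plain entrySet() visits servers from least to most loaded, while descendingMap().entrySet() visits the same entries in reverse, so the overloaded tail can be drained and the underloaded head filled without re-sorting anything. Below is a minimal, self-contained sketch of that two-direction walk; the Integer loads, server-name strings, and min/max thresholds are stand-ins for illustration, not the ServerAndLoad type used in the source above.

import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

public class TwoWayLoadWalk {
    public static void main(String[] args) {
        // Load -> servers carrying that load; a TreeMap keeps keys sorted ascending.
        NavigableMap<Integer, List<String>> serversByLoad = new TreeMap<>();
        serversByLoad.put(2, List.of("rs1"));
        serversByLoad.put(5, List.of("rs2", "rs3"));
        serversByLoad.put(9, List.of("rs4"));

        int min = 4, max = 6; // illustrative bounds

        // Walk most loaded first: descendingMap() is a reversed view,
        // so its entrySet() iterates from the highest load down.
        for (Map.Entry<Integer, List<String>> e : serversByLoad.descendingMap().entrySet()) {
            if (e.getKey() <= max) break; // everything further is within bounds
            System.out.println("overloaded: " + e.getValue() + " load=" + e.getKey());
        }

        // Walk least loaded first: the plain entrySet() iterates ascending.
        for (Map.Entry<Integer, List<String>> e : serversByLoad.entrySet()) {
            if (e.getKey() >= min) break; // everything further is within bounds
            System.out.println("underloaded: " + e.getValue() + " load=" + e.getKey());
        }
    }
}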

From source file:org.apache.hadoop.hbase.coprocessor.transactional.SsccRegionEndpoint.java

/**
 * Add a write to the transaction.
 * @param transactionId the transaction id
 * @param startId the start id assigned to the transaction
 * @param put the Put to process
 * @param stateless whether this is a stateless put
 * @return an update status code
 * @throws IOException
 */

public int put(final long transactionId, final long startId, final Put put, boolean stateless)
        throws IOException {
    if (LOG.isTraceEnabled())
        LOG.trace("Enter SsccRegionEndpoint coprocessor: put, txid " + transactionId + ", startId " + startId
                + ", stateless: " + stateless);
    SsccTransactionState state = this.beginTransIfNotExist(transactionId, startId);

    // check whether has del before
    state.removeDelBeforePut(put, stateless);

    /* We need to change the timestamp, but the HBase API does not support
       modifying it on an existing Put. The solution at this point is to create
       a new Put object: use getFamilyCellMap to get all the data out of the
       original Put and regenerate it with startId as the timestamp.
       An alternative would be to map the put row's current timestamp to startId:
       //mapStartIdFromTs(put.getTimeStamp(), startId);
    */
    byte[] rowkey = put.getRow();
    Put newPut = new Put(rowkey, startId);
    byte[] mergedCols = null;
    byte[] mergedColsV = null;
    byte[] cv = null;
    NavigableMap<byte[], List<Cell>> familyCellMap = put.getFamilyCellMap();
    for (Entry<byte[], List<Cell>> entry : familyCellMap.entrySet()) {
        for (Iterator<Cell> iterator = entry.getValue().iterator(); iterator.hasNext();) {
            Cell cell = iterator.next();
            byte[] family = CellUtil.cloneFamily(cell);
            byte[] qualifier = CellUtil.cloneQualifier(cell);
            mergedCols = byteMerger("|".getBytes(), qualifier);
            mergedCols = byteMerger(mergedCols, "|".getBytes());
            byte[] value = CellUtil.cloneValue(cell);
            newPut.add(family, qualifier, startId, value);
            byte[] currentCollist = state.getColList(rowkey);
            if (indexOf(currentCollist, mergedCols) != -1) //already in this list
            {
                mergedColsV = byteMerger(currentCollist, null);
                continue;
            }
            mergedColsV = byteMerger(mergedCols, currentCollist);
            state.addToColList(rowkey, mergedColsV);
        }
    }

    //get the statusList
    Get statusGet = new Get(rowkey);
    //statusGet.setTimeStamp(startId);
    statusGet.addColumn(DtmConst.TRANSACTION_META_FAMILY, SsccConst.STATUS_COL);
    //statusGet.setTimeRange(0, startId + 1);  //only get data updated before me
    //statusGet.setMaxVersions(DtmConst.MAX_VERSION);
    statusGet.setMaxVersions();

    Result statusResult = m_Region.get(statusGet);

    List<Cell> sl = null;
    List<Cell> vl = null;

    //get the versionList
    //  If this is a stateless put we don't need the version list
    if (!stateless) {
        Get verGet = new Get(rowkey);

        //verGet.setTimeStamp(startId);
        verGet.addColumn(DtmConst.TRANSACTION_META_FAMILY, SsccConst.VERSION_COL);
        verGet.setMaxVersions(DtmConst.MAX_VERSION);

        Result verResult = m_Region.get(verGet);
        if (verResult != null)
            vl = verResult.listCells();
    }

    if (statusResult != null)
        sl = statusResult.listCells();
    if (LOG.isTraceEnabled())
        LOG.trace("SsccRegionEndpoint coprocessor: put stateless: " + stateless);
    if (!state.hasConflict(sl, vl, stateless, startId, transactionId)) {
        state.addToPutList(rowkey);
        //update status metadata
        byte[] statusValue;
        if (stateless) {
            statusValue = SsccConst.generateStatusValue(SsccConst.S_STATELESS_BYTE, transactionId); //stateless update
        } else {
            statusValue = SsccConst.generateStatusValue(SsccConst.S_STATEFUL_BYTE, transactionId); //stateful update
        }
        newPut.add(DtmConst.TRANSACTION_META_FAMILY, SsccConst.STATUS_COL, startId, statusValue);
        newPut.add(DtmConst.TRANSACTION_META_FAMILY, SsccConst.COLUMNS_COL, startId, mergedColsV);
        //perform the put operation, persistently save the data now.
        //        LOG.info("UNIQUE: put ok "   );
        m_Region.put(newPut);
        return stateless ? STATELESS_UPDATE_OK : STATEFUL_UPDATE_OK;

    } else { //conflict
             // Return conflict, but don't trigger an abort. That needs to be triggered from the client, if desired.
        if (LOG.isTraceEnabled())
            LOG.trace("UNIQUE: put STATEFUL_UPDATE_CONFLICT ");
        return stateless ? STATELESS_UPDATE_CONFLICT : STATEFUL_UPDATE_CONFLICT;
    }
}
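
The NavigableMap work in this endpoint is the walk over put.getFamilyCellMap().entrySet() that copies every cell into a new Put stamped with the transaction's startId. Below is a dependency-free sketch of that copy-with-new-timestamp pattern; the Cell record is a simplified stand-in for HBase's Cell, and Arrays::compareUnsigned stands in for the byte[] comparator HBase provides (Bytes.BYTES_COMPARATOR).

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

public class RewriteTimestamps {
    // Simplified stand-in for an HBase Cell.
    record Cell(byte[] qualifier, long timestamp, byte[] value) {}

    static NavigableMap<byte[], List<Cell>> newFamilyMap() {
        // byte[] has no natural ordering, so the map needs an explicit
        // unsigned lexicographic comparator.
        return new TreeMap<>(Arrays::compareUnsigned);
    }

    static NavigableMap<byte[], List<Cell>> withTimestamp(NavigableMap<byte[], List<Cell>> familyCellMap,
            long startId) {
        NavigableMap<byte[], List<Cell>> copy = newFamilyMap();
        // entrySet() yields one entry per column family, families in key order.
        for (Map.Entry<byte[], List<Cell>> entry : familyCellMap.entrySet()) {
            List<Cell> rewritten = new ArrayList<>();
            for (Cell cell : entry.getValue()) {
                rewritten.add(new Cell(cell.qualifier(), startId, cell.value()));
            }
            copy.put(entry.getKey(), rewritten);
        }
        return copy;
    }

    public static void main(String[] args) {
        NavigableMap<byte[], List<Cell>> fam = newFamilyMap();
        fam.put("cf".getBytes(), List.of(new Cell("q1".getBytes(), 1L, "v1".getBytes())));
        System.out.println(withTimestamp(fam, 42L).firstEntry().getValue().get(0).timestamp()); // prints 42
    }
}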

From source file:org.apache.tajo.storage.hbase.HBaseScanner.java

private Datum getDatum(Result result, int fieldId) throws IOException {
    byte[] value = null;
    if (isRowKeyMappings[fieldId]) {
        value = result.getRow();
        if (!isBinaryColumns[fieldId] && rowKeyFieldIndexes[fieldId] >= 0) {
            int rowKeyFieldIndex = rowKeyFieldIndexes[fieldId];

            byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens(value, rowKeyDelimiter,
                    columnMapping.getNumColumns());

            if (rowKeyFields.length < rowKeyFieldIndex) {
                return NullDatum.get();
            } else {
                value = rowKeyFields[rowKeyFieldIndex];
            }
        }
    } else {
        if (isColumnKeys[fieldId]) {
            NavigableMap<byte[], byte[]> cfMap = result.getFamilyMap(mappingColumnFamilies[fieldId][0]);
            if (cfMap != null) {
                Set<byte[]> keySet = cfMap.keySet();
                if (keySet.size() == 1) {
                    try {
                        return HBaseTextSerializerDeserializer.deserialize(schemaColumns[fieldId],
                                keySet.iterator().next());
                    } catch (Exception e) {
                        LOG.error(e.getMessage(), e);
                        throw new RuntimeException(e.getMessage(), e);
                    }
                } else {
                    StringBuilder sb = new StringBuilder();
                    sb.append("[");
                    int count = 0;
                    for (byte[] eachKey : keySet) {
                        if (count > 0) {
                            sb.append(", ");
                        }
                        Datum datum = HBaseTextSerializerDeserializer.deserialize(schemaColumns[fieldId],
                                eachKey);
                        sb.append("\"").append(datum.asChars()).append("\"");
                        count++;
                        if (count > MAX_LIST_SIZE) {
                            break;
                        }
                    }
                    sb.append("]");
                    return new TextDatum(sb.toString());
                }
            }
        } else if (isColumnValues[fieldId]) {
            NavigableMap<byte[], byte[]> cfMap = result.getFamilyMap(mappingColumnFamilies[fieldId][0]);
            if (cfMap != null) {
                Collection<byte[]> valueList = cfMap.values();
                if (valueList.size() == 1) {
                    try {
                        return HBaseTextSerializerDeserializer.deserialize(schemaColumns[fieldId],
                                valueList.iterator().next());
                    } catch (Exception e) {
                        LOG.error(e.getMessage(), e);
                        throw new RuntimeException(e.getMessage(), e);
                    }
                } else {
                    StringBuilder sb = new StringBuilder();
                    sb.append("[");
                    int count = 0;
                    for (byte[] eachValue : valueList) {
                        if (count > 0) {
                            sb.append(", ");
                        }
                        Datum datum = HBaseTextSerializerDeserializer.deserialize(schemaColumns[fieldId],
                                eachValue);
                        sb.append("\"").append(datum.asChars()).append("\"");
                        count++;
                        if (count > MAX_LIST_SIZE) {
                            break;
                        }
                    }
                    sb.append("]");
                    return new TextDatum(sb.toString());
                }
            }
        } else {
            if (mappingColumnFamilies[fieldId][1] == null) {
                NavigableMap<byte[], byte[]> cfMap = result.getFamilyMap(mappingColumnFamilies[fieldId][0]);
                if (cfMap != null && !cfMap.isEmpty()) {
                    int count = 0;
                    String delim = "";

                    if (cfMap.size() == 0) {
                        return NullDatum.get();
                    } else if (cfMap.size() == 1) {
                        // If a column family is mapped without a column name, like "cf1:", and the number of
                        // cells is one, the return value is in flat format, not JSON format.
                        NavigableMap.Entry<byte[], byte[]> entry = cfMap.entrySet().iterator().next();
                        byte[] entryKey = entry.getKey();
                        byte[] entryValue = entry.getValue();
                        if (entryKey == null || entryKey.length == 0) {
                            try {
                                if (isBinaryColumns[fieldId]) {
                                    return HBaseBinarySerializerDeserializer.deserialize(schemaColumns[fieldId],
                                            entryValue);
                                } else {
                                    return HBaseTextSerializerDeserializer.deserialize(schemaColumns[fieldId],
                                            entryValue);
                                }
                            } catch (Exception e) {
                                LOG.error(e.getMessage(), e);
                                throw new RuntimeException(e.getMessage(), e);
                            }
                        }
                    }
                    StringBuilder sb = new StringBuilder();
                    sb.append("{");
                    for (NavigableMap.Entry<byte[], byte[]> entry : cfMap.entrySet()) {
                        byte[] entryKey = entry.getKey();
                        byte[] entryValue = entry.getValue();

                        String keyText = new String(entryKey);
                        String valueText = null;
                        if (entryValue != null) {
                            try {
                                if (isBinaryColumns[fieldId]) {
                                    valueText = HBaseBinarySerializerDeserializer
                                            .deserialize(schemaColumns[fieldId], entryValue).asChars();
                                } else {
                                    valueText = HBaseTextSerializerDeserializer
                                            .deserialize(schemaColumns[fieldId], entryValue).asChars();
                                }
                            } catch (Exception e) {
                                LOG.error(e.getMessage(), e);
                                throw new RuntimeException(e.getMessage(), e);
                            }
                        }
                        sb.append(delim).append("\"").append(keyText).append("\":\"").append(valueText)
                                .append("\"");
                        delim = ", ";
                        count++;
                        if (count > MAX_LIST_SIZE) {
                            break;
                        }
                    } //end of for
                    sb.append("}");
                    return new TextDatum(sb.toString());
                } else {
                    value = null;
                }
            } else {
                value = result.getValue(mappingColumnFamilies[fieldId][0], mappingColumnFamilies[fieldId][1]);
            }
        }
    }

    if (value == null) {
        return NullDatum.get();
    } else {
        try {
            if (isBinaryColumns[fieldId]) {
                return HBaseBinarySerializerDeserializer.deserialize(schemaColumns[fieldId], value);
            } else {
                return HBaseTextSerializerDeserializer.deserialize(schemaColumns[fieldId], value);
            }
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
            throw new RuntimeException(e.getMessage(), e);
        }
    }
}
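
A last entrySet() idiom in this scanner is the JSON-style rendering used when a column family is mapped without a qualifier: every key/value pair in the family map is appended, comma-delimited, until the size cap is hit. Below is a stand-alone sketch of that loop with String keys in place of the byte[] handling and deserializers above; the MAX_LIST_SIZE value is an arbitrary stand-in.

import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

public class FamilyMapToJson {
    private static final int MAX_LIST_SIZE = 100; // stand-in for the scanner's cap

    static String toJson(NavigableMap<String, String> cfMap) {
        StringBuilder sb = new StringBuilder("{");
        String delim = "";
        int count = 0;
        // entrySet() iterates qualifiers in sorted (comparator) order,
        // matching the ordering of HBase's family map.
        for (Map.Entry<String, String> entry : cfMap.entrySet()) {
            sb.append(delim).append('"').append(entry.getKey()).append("\":\"").append(entry.getValue())
                    .append('"');
            delim = ", ";
            if (++count > MAX_LIST_SIZE) {
                break;
            }
        }
        return sb.append("}").toString();
    }

    public static void main(String[] args) {
        NavigableMap<String, String> cfMap = new TreeMap<>();
        cfMap.put("q2", "b");
        cfMap.put("q1", "a");
        System.out.println(toJson(cfMap)); // {"q1":"a", "q2":"b"}
    }
}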