se.sics.caracaldb.global.DefaultPolicy.java Source code

Introduction

Here is the source code for se.sics.caracaldb.global.DefaultPolicy.java, the default MaintenancePolicy of CaracalDB: on each maintenance round it folds host statistics into moving averages, assigns ids to joining hosts, replaces failed hosts in replication sets, and applies schema create/drop requests.

Source

/*
 * This file is part of the CaracalDB distributed storage system.
 *
 * Copyright (C) 2009 Swedish Institute of Computer Science (SICS) 
 * Copyright (C) 2009 Royal Institute of Technology (KTH)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package se.sics.caracaldb.global;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Maps;
import com.google.common.collect.SortedMapDifference;
import com.google.common.collect.TreeMultimap;
import com.google.common.primitives.UnsignedBytes;
import com.google.common.primitives.UnsignedInteger;
import com.google.common.primitives.UnsignedLong;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeSet;
import org.javatuples.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import se.sics.caracaldb.Key;
import se.sics.caracaldb.system.Stats;
import se.sics.caracaldb.utils.ExtremeKMap;
import se.sics.caracaldb.utils.HashIdGenerator;
import se.sics.kompics.address.Address;

/**
 *
 * @author lkroll
 */
public class DefaultPolicy implements MaintenancePolicy {

    private static final Logger LOG = LoggerFactory.getLogger(DefaultPolicy.class);

    private static final double ALPHA = 0.16;
    private static final double MINUS_ALPHA = 1 - ALPHA;
    private static final int K = 5;
    private static final double THRESHOLD = 0.35;
    private static final int MAX_ACTIONS = 5;

    private HashIdGenerator idGen;
    private double memoryAvg = 0.0;
    private double cpuAvg = 0.0;
    private long averageHostSize = 0;

    @Override
    public void init(HashIdGenerator idGen) {
        this.idGen = idGen;
    }

    @Override
    public void rebalance(LUTWorkingBuffer lut, ImmutableSet<Address> joins, ImmutableSet<Address> fails,
            ImmutableMap<Address, Stats.Report> stats, ImmutableSet<Schema.Req> schemaChanges) {
        Pair<ExtremeKMap<Double, Address>, ExtremeKMap<Double, Address>> xKs = updateAverages(stats);
        ExtremeKMap<Double, Address> xKMemory = xKs.getValue0();
        ExtremeKMap<Double, Address> xKCpu = xKs.getValue1();

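        // hosts outside avg +/- avg * THRESHOLD count as unbalanced for the
        // (currently disabled) balancer at the end of this method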
        double upperCpuLimit = cpuAvg + cpuAvg * THRESHOLD;
        double lowerCpuLimit = cpuAvg - cpuAvg * THRESHOLD;
        double upperMemoryLimit = memoryAvg + memoryAvg * THRESHOLD;
        double lowerMemoryLimit = memoryAvg - memoryAvg * THRESHOLD;

        ImmutableSortedMap<Integer, Address> failIds = getIdsForFails(lut, fails);
        ImmutableSortedMap<Integer, Address> joinIds = getIdsForJoins(lut, joins, failIds);

        getHostActions(lut, failIds, joinIds);

        replaceFailedNodes(lut, failIds, joinIds, xKMemory, xKCpu, stats);

        getSchemaActions(lut, schemaChanges);

        if (lut.numberOfActions() > MAX_ACTIONS) {
            // Don't try any more balancing...there'll already be enough data movement
            return;
        }

        // TODO don't load balance for now...this needs to be tested
        return;
        /*
        // VERY conservative balancer
        // TODO make better^^
        Entry<Double, Address> topMemory = xKMemory.top().ceilingEntry();
        Entry<Double, Address> bottomMemory = xKMemory.bottom().ceilingEntry();
        if ((topMemory.getKey() > upperMemoryLimit) && (bottomMemory.getKey() < lowerMemoryLimit)) {
            switchSizeBalance(lut, topMemory.getValue(), bottomMemory.getValue(), stats);
        }
        if (lut.numberOfActions() > MAX_ACTIONS) {
            // Don't try any more balancing...there'll already be enough data movement
            return;
        }
        Entry<Double, Address> topCpu = xKCpu.top().ceilingEntry();
        Entry<Double, Address> bottomCpu = xKCpu.bottom().ceilingEntry();
        if ((topCpu.getKey() > upperCpuLimit) && (bottomCpu.getKey() < lowerCpuLimit)) {
            switchOperationBalance(lut, topCpu.getValue(), bottomCpu.getValue(), stats);
        }

        return;
        */
    }

    private Pair<ExtremeKMap<Double, Address>, ExtremeKMap<Double, Address>> updateAverages(
            ImmutableMap<Address, Stats.Report> stats) {
        ExtremeKMap<Double, Address> xKMemory = new ExtremeKMap<Double, Address>(K);
        ExtremeKMap<Double, Address> xKCpu = new ExtremeKMap<Double, Address>(K);

        double totalCpu = 0.0;
        double totalMemory = 0.0;
        long totalClusterSize = 0;
        for (Stats.Report report : stats.values()) {
            totalCpu += report.cpuUsage;
            totalMemory += report.memoryUsage;
            totalClusterSize += report.averageSize * report.numberOfVNodes;
            xKMemory.put(report.memoryUsage, report.atHost);
            xKCpu.put(report.cpuUsage, report.atHost);
        }
        double newMemAvg = totalMemory / ((double) stats.size());
        double newCpuAvg = totalCpu / ((double) stats.size());
        averageHostSize = Stats.floorDiv(totalClusterSize, stats.size());
        // Exponential moving average with coefficient ALPHA
        memoryAvg = ALPHA * newMemAvg + MINUS_ALPHA * memoryAvg;
        cpuAvg = ALPHA * newCpuAvg + MINUS_ALPHA * cpuAvg;
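        // e.g. with ALPHA = 0.16, a previous memoryAvg of 50.0 and a new sample
        // of 60.0 give 0.16 * 60.0 + 0.84 * 50.0 = 51.6, so spikes are damped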
        LOG.info("Current cluster stats: Memory: {}%, CPU: {}% -- Moving: Memory: {}%, CPU: {}%", newMemAvg,
                newCpuAvg, memoryAvg, cpuAvg);
        return Pair.with(xKMemory, xKCpu);
    }

    private ImmutableSortedMap<Integer, Address> getIdsForJoins(LUTWorkingBuffer lut, ImmutableSet<Address> joins,
            ImmutableSortedMap<Integer, Address> failIds) {
        TreeSet<Address> remaining = new TreeSet<Address>(joins);
        ImmutableSortedMap.Builder<Integer, Address> ids = ImmutableSortedMap.naturalOrder();
        TreeSet<Integer> usedIds = new TreeSet<Integer>();
        // If nodes fail and rejoin, try to assign them the same id
        for (Entry<Integer, Address> e : failIds.entrySet()) {
            if (remaining.isEmpty()) {
                return ids.build();
            }
            if (remaining.contains(e.getValue())) {
                ids.put(e);
                remaining.remove(e.getValue());
                usedIds.add(e.getKey());
            }
        }
        // Assign all the other failed ids to new nodes
        for (Entry<Integer, Address> e : failIds.entrySet()) {
            if (remaining.isEmpty()) {
                return ids.build();
            }
            if (!usedIds.contains(e.getKey())) {
                Address j = remaining.pollFirst();
                ids.put(e.getKey(), j);
                usedIds.add(e.getKey());
            }
        }
        // Look for other empty slots in the LUT
        int index = 0;
        for (Address addr : lut.hosts()) {
            if (remaining.isEmpty()) {
                return ids.build();
            }
            if (addr == null) {
                if (usedIds.contains(index)) {
                    LOG.warn("The node at index {} apparently failed twice. This is weird -.-", index);
                    index++;
                    continue;
                }
                Address j = remaining.pollFirst();
                ids.put(index, j);
                usedIds.add(index);
            }
            index++;
        }
        return ids.build();
    }

    private ImmutableSortedMap<Integer, Address> getIdsForFails(LUTWorkingBuffer lut, ImmutableSet<Address> fails) {
        Set<Address> remaining = new HashSet<Address>(fails);
        ImmutableSortedMap.Builder<Integer, Address> ids = ImmutableSortedMap.naturalOrder();
        if (remaining.isEmpty()) {
            return ids.build();
        }
        int index = 0;
        for (Address addr : lut.hosts()) {
            if (remaining.remove(addr)) {
                ids.put(index, addr);
                if (remaining.isEmpty()) {
                    break;
                }
            }
            index++;
        }
        return ids.build();
    }

    private void getHostActions(LUTWorkingBuffer lut, ImmutableSortedMap<Integer, Address> failIds,
            ImmutableSortedMap<Integer, Address> joinIds) {
        SortedMapDifference<Integer, Address> idDiff = Maps.difference(failIds, joinIds);
        SortedMap<Integer, Address> nullableIds = idDiff.entriesOnlyOnLeft();
        for (Entry<Integer, Address> e : nullableIds.entrySet()) {
            lut.removeHost(e.getKey());
        }
        for (Entry<Integer, Address> e : joinIds.entrySet()) {
            lut.putHost(e.getKey(), e.getValue());
        }
    }

    private void replaceFailedNodes(LUTWorkingBuffer lut, ImmutableSortedMap<Integer, Address> failIds,
            ImmutableSortedMap<Integer, Address> joinIds, ExtremeKMap<Double, Address> xKMemory,
            ExtremeKMap<Double, Address> xKCpu, ImmutableMap<Address, Stats.Report> stats) {

        TreeSet<Integer> idsToDealWith = new TreeSet<Integer>(failIds.keySet());
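        // candidates maps estimated host size -> host id; TreeMultimap iterates
        // keys in ascending order, so the emptiest hosts are tried first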
        TreeMultimap<Long, Integer> candidates = TreeMultimap.create();
        // If a node fails and rejoins immediately, assign the same id and don't touch 
        // its replicationSets, since it may still have data from before the failure
        for (Entry<Integer, Address> e : joinIds.entrySet()) {
            Address curNode = lut.lut.getHost(e.getKey());
            if (curNode.equals(e.getValue())) {
                idsToDealWith.remove(e.getKey());
                candidates.put(0L, e.getKey());
            }
        }
        // Add nodes with lowest resource usage to candidates
        ImmutableSet.Builder<Address> candidateAddrs = ImmutableSet.builder(); // still need to look up their ids
        candidateAddrs.addAll(xKMemory.bottom().values()).addAll(xKCpu.bottom().values());
        Map<Address, Integer> candidateIds = lut.lut.getIdsForAddresses(candidateAddrs.build());
        for (Entry<Address, Integer> e : candidateIds.entrySet()) {
            Address addr = e.getKey();
            Integer id = e.getValue();
            Stats.Report rep = stats.get(addr);
            long curSize = rep.averageSize * rep.numberOfVNodes;
            candidates.put(curSize, id);
        }
        // Replace nodes in affected sets
        int index = 0;
        for (Integer[] member : lut.replicationSets()) {
            Integer[] newRepSet = Arrays.copyOf(member, member.length);
            for (int pos = 0; pos < member.length; pos++) {
                if (idsToDealWith.contains(member[pos])) {
                    long lowestSize = candidates.keySet().first();
                    if (lowestSize > averageHostSize) {
                        addMoreCandidates(lut, candidates, stats);
                    }
                    // pick the first (lowestSize) host not in the replicationSet
                    long curSize = -1;
                    long addedSize = -1;
                    for (Entry<Long, Integer> e : candidates.entries()) {
                        if (LookupTable.positionInSet(newRepSet, e.getValue()) < 0) {
                            newRepSet[pos] = e.getValue();
                            curSize = e.getKey();
                            addedSize = guessAddedSize(lut, member, stats);
                            break;
                        }
                    }
                    if ((curSize < 0) || (addedSize < 0)) {
                        LOG.error("Could not find any candidate for replacing {} in replicationSet {}!",
                                member[pos], index);
                        continue;
                    }
                    // Update candidates
                    candidates.remove(curSize, newRepSet[pos]);
                    candidates.put(curSize + addedSize, newRepSet[pos]);
                }
            }
            if (!Arrays.equals(member, newRepSet)) {
                lut.putRepSet(index, newRepSet);
            }
            index++;
        }
    }

    private void addMoreCandidates(LUTWorkingBuffer lut, TreeMultimap<Long, Integer> candidates,
            ImmutableMap<Address, Stats.Report> stats) {
        if (candidates.values().size() >= stats.size()) {
            return; // everyone is already a candidate...nothing to do
        }
        int index = 0;
        for (Address addr : lut.hosts()) {
            if (!candidates.containsValue(index)) {
                Stats.Report rep = stats.get(addr);
                if (rep != null) {
                    long curSize = rep.averageSize * rep.numberOfVNodes;
                    candidates.put(curSize, index);
                }
            }
            index++;
        }
    }

    private long guessAddedSize(LUTWorkingBuffer lut, Integer[] member, ImmutableMap<Address, Stats.Report> stats) {
        // Use the average of the average node size of all live members to guess the size for this group
        ArrayList<Long> avgSizes = new ArrayList<Long>(member.length);
        for (Integer id : member) {
            Address addr = lut.getHost(id);
            Stats.Report rep = stats.get(addr);
            if (rep != null) {
                avgSizes.add(rep.averageSize);
            }
        }
        long sum = 0;
        for (Long l : avgSizes) {
            sum += l;
        }
        return Stats.floorDiv(sum, avgSizes.size());
    }

    private void switchSizeBalance(LUTWorkingBuffer lut, Address topAddr, Address bottomAddr,
            ImmutableMap<Address, Stats.Report> stats) {
        Stats.Report topRep = stats.get(topAddr);
        Stats.Report bottomRep = stats.get(bottomAddr);
        Map<Address, Integer> ids = lut.lut.getIdsForAddresses(ImmutableSet.of(topAddr, bottomAddr));
        for (Key k : topRep.topKSize) {
            Integer repGroupId = lut.getRepGroup(k);
            if (repGroupId == null) {
                continue;
            }
            Integer[] repGroup = lut.getRepSet(repGroupId);
            int topPos = LookupTable.positionInSet(repGroup, ids.get(topAddr));
            int bottomPos = LookupTable.positionInSet(repGroup, ids.get(bottomAddr));
            if (bottomPos < 0) { // new address is not already part of the replication group
                Integer[] newRepGroup = Arrays.copyOf(repGroup, repGroup.length);
                newRepGroup[topPos] = ids.get(bottomAddr);
                lut.findGroupOrAddNew(k, newRepGroup);
                return;
            }
        }
        // if all of the topKSize vNodes already share a group with the bottomAddr there's nothing we can do
    }

    private void switchOperationBalance(LUTWorkingBuffer lut, Address topAddr, Address bottomAddr,
            ImmutableMap<Address, Stats.Report> stats) {
        Stats.Report topRep = stats.get(topAddr);
        //Stats.Report bottomRep = stats.get(bottomAddr);
        Map<Address, Integer> ids = lut.lut.getIdsForAddresses(ImmutableSet.of(topAddr, bottomAddr));
        for (Key k : topRep.topKOps) {
            Integer repGroupId = lut.getRepGroup(k);
            if (repGroupId == null) {
                continue;
            }
            Integer[] repGroup = lut.getRepSet(repGroupId);
            int topPos = LookupTable.positionInSet(repGroup, ids.get(topAddr));
            int bottomPos = LookupTable.positionInSet(repGroup, ids.get(bottomAddr));
            if (bottomPos < 0) { // new address is not already part of the replication group
                Integer[] newRepGroup = Arrays.copyOf(repGroup, repGroup.length);
                newRepGroup[topPos] = ids.get(bottomAddr);
                lut.findGroupOrAddNew(k, newRepGroup);
                return;
            }
        }
        // if all of the topKOps vNodes already share a group with the bottomAddr there's nothing we can do
    }

    private void getSchemaActions(LUTWorkingBuffer lut, ImmutableSet<Schema.Req> schemaChanges) {
        for (Schema.Req req : schemaChanges) {
            if (req instanceof Schema.CreateReq) {
                Schema.CreateReq creq = (Schema.CreateReq) req;
                byte[] schemaId = idGen.idForNameDontStartWith(creq.name, LookupTable.RESERVED_PREFIX.getArray());
                SchemaData.SingleSchema schema = new SchemaData.SingleSchema(ByteBuffer.wrap(schemaId), creq.name,
                        creq.metaData);
                lut.addSchema(schema);
                // also assign initial vnodes
                int vnodes = 1; //default
                int rfactor = 3; //default
                String vnodeS = creq.metaData.get("vnodes");
                if (vnodeS != null) {
                    vnodes = Integer.parseInt(vnodeS);
                }
                String rfactorS = creq.metaData.get("rfactor");
                if (rfactorS != null) {
                    rfactor = Integer.parseInt(rfactorS);
                }
                // find right size repsets
                ArrayList<Integer> replicationSets = new ArrayList<Integer>();
                int index = 0;
                for (Integer[] repset : lut.replicationSets()) {
                    if (repset.length == rfactor) {
                        replicationSets.add(index);
                    }
                    index++;
                }
                if (replicationSets.isEmpty()) {
                    replicationSets = lut.createRepSets(rfactor);
                }
                Set<Key> keys = generateVNodes(schemaId, vnodes);
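                // distribute the vnode keys round-robin over the chosen replication sets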
                index = 0;
                for (Key k : keys) {
                    lut.putRepGroup(k, replicationSets.get(index));
                    index++;
                    index = index % replicationSets.size();
                }
            } else if (req instanceof Schema.DropReq) {
                Schema.DropReq dreq = (Schema.DropReq) req;
                lut.removeSchema(dreq.name);
            } else {
                LOG.error("Unkown type of schema request: {}", req);
            }
        }
    }

    private Set<Key> generateVNodes(byte[] schemaId, int num) {
        Set<Key> keys = new TreeSet<Key>();
        // boundary nodes
        Key start = new Key(schemaId);
        keys.add(start);
        //virtualHostsPut(end, rset); // this might end up being overridden by the next schema, but that's ok since we only need it if there is no next schema
        if (num == 1) { // single vnode needed
            return keys;
        }
        num--; // account for the initial vnode already created (the end node doesn't count)
        // UnsignedBytes.MAX_VALUE is (byte) 0xFF, i.e. -1 as a signed byte,
        // so it has to go through toInt() rather than a plain (int) cast
        if (num <= UnsignedBytes.toInt(UnsignedBytes.MAX_VALUE)) { // another byte for subkeys needed
            int incr = UnsignedBytes.toInt(UnsignedBytes.MAX_VALUE) / num;
            int last = 0;
            int ceiling = UnsignedBytes.toInt(UnsignedBytes.MAX_VALUE) - incr;
            while (last < ceiling) {
                last = last + incr;
                Key k = start.append(new byte[] { UnsignedBytes.saturatedCast(last) }).get();
                keys.add(k);
            }
        } else if (num <= UnsignedInteger.MAX_VALUE.longValue()) { // another 4 bytes for subkeys needed
            long incr = UnsignedInteger.MAX_VALUE.longValue() / num;
            long last = 0;
            long ceiling = UnsignedInteger.MAX_VALUE.longValue() - incr;
            while (last < ceiling) {
                last = last + incr;
                Key k = start.append(new Key(UnsignedInteger.valueOf(last).intValue())).get();
                keys.add(k);
            }
        } else { // another 8 bytes for subkeys needed (don't support more!)
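            // note: with an int num this branch is unreachable (any int fits the
            // previous case), and Key(last.intValue()) keeps only 32 bits anyway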
            UnsignedLong incr = UnsignedLong.MAX_VALUE.dividedBy(UnsignedLong.valueOf(num));
            UnsignedLong last = UnsignedLong.ZERO;
            UnsignedLong ceiling = UnsignedLong.MAX_VALUE.minus(incr);
            while (last.compareTo(ceiling) <= 0) {
                last = last.plus(incr);
                Key k = start.append(new Key(last.intValue())).get();
                keys.add(k);
            }
        }
        return keys;
    }

}
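
Example

The failure-handling pass in replaceFailedNodes() picks replacement hosts by walking a Guava TreeMultimap keyed by estimated host size, smallest first, and taking the first host that is not already in the replication set. The sketch below isolates that selection step; it is illustrative only: the class name CandidateDemo and the positionInSet() helper (a stand-in for LookupTable.positionInSet) are not part of CaracalDB.

import com.google.common.collect.TreeMultimap;
import java.util.Map.Entry;

public class CandidateDemo {

    public static void main(String[] args) {
        // estimated size -> host id; TreeMultimap iterates keys in ascending order
        TreeMultimap<Long, Integer> candidates = TreeMultimap.create();
        candidates.put(100L, 3);
        candidates.put(100L, 7);
        candidates.put(250L, 5);

        Integer[] repSet = {3, 9}; // hosts already in the replication set
        for (Entry<Long, Integer> e : candidates.entries()) {
            if (positionInSet(repSet, e.getValue()) < 0) {
                // host 3 is skipped (already a member); host 7 wins with size 100
                System.out.println("replacement host: " + e.getValue()
                        + " (estimated size " + e.getKey() + ")");
                break;
            }
        }
    }

    // stand-in for LookupTable.positionInSet: index of id in set, or -1 if absent
    private static int positionInSet(Integer[] set, Integer id) {
        for (int i = 0; i < set.length; i++) {
            if (set[i].equals(id)) {
                return i;
            }
        }
        return -1;
    }
}

generateVNodes() spreads a schema's vnode keys evenly over the subkey space: one vnode sits at the schema prefix itself and the rest at equal increments of the one-byte (or four-/eight-byte) suffix range. Here is a minimal sketch of the one-byte case, assuming only Guava on the classpath (the class name VNodeSplitDemo is hypothetical):

import com.google.common.primitives.UnsignedBytes;

public class VNodeSplitDemo {

    public static void main(String[] args) {
        int num = 4; // vnodes requested for the schema
        int max = UnsignedBytes.toInt(UnsignedBytes.MAX_VALUE); // 255
        System.out.println("vnode 0 at the schema prefix itself");
        num--; // the prefix vnode is already placed
        int incr = max / num; // evenly spaced one-byte suffixes
        int last = 0;
        int ceiling = max - incr;
        while (last < ceiling) {
            last = last + incr;
            System.out.printf("vnode split at suffix 0x%02X%n", last);
        }
    }
}

With num = 4 this prints split points 0x55 and 0xAA in addition to the prefix key; note that, as in the original loop, the strict last < ceiling bound can yield one vnode fewer than requested.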