se.sics.caracaldb.global.CatHerder.java Source code

Introduction

Here is the source code for se.sics.caracaldb.global.CatHerder.java. CatHerder is the maintenance component of the CaracalDB master: it periodically collects heartbeat reports, lets the configured MaintenancePolicy rebalance the lookup table (LUT) when hosts join, fail, or schemas change, and broadcasts accepted LUTUpdate events to all hosts in the system.

Source

/*
 * This file is part of the CaracalDB distributed storage system.
 *
 * Copyright (C) 2009 Swedish Institute of Computer Science (SICS) 
 * Copyright (C) 2009 Royal Institute of Technology (KTH)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package se.sics.caracaldb.global;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Longs;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.locks.ReadWriteLock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import se.sics.caracaldb.Key;
import se.sics.caracaldb.KeyRange;
import se.sics.caracaldb.bootstrap.BootUp;
import se.sics.caracaldb.bootstrap.BootstrapRequest;
import se.sics.caracaldb.bootstrap.BootstrapResponse;
import se.sics.caracaldb.bootstrap.Ready;
import se.sics.caracaldb.operations.CaracalMsg;
import se.sics.caracaldb.operations.MultiOpRequest;
import se.sics.caracaldb.operations.MultiOpResponse;
import se.sics.caracaldb.operations.OpUtil;
import se.sics.caracaldb.operations.ResponseCode;
import se.sics.caracaldb.store.ActionFactory;
import se.sics.caracaldb.store.Limit;
import se.sics.caracaldb.store.RangeReq;
import se.sics.caracaldb.store.RangeResp;
import se.sics.caracaldb.store.Store;
import se.sics.caracaldb.store.TFFactory;
import se.sics.caracaldb.system.Stats;
import se.sics.caracaldb.utils.TimestampIdFactory;
import se.sics.kompics.ComponentDefinition;
import se.sics.kompics.Handler;
import se.sics.kompics.Negative;
import se.sics.kompics.Positive;
import se.sics.kompics.Start;
import se.sics.kompics.Stop;
import se.sics.kompics.address.Address;
import se.sics.kompics.network.Network;
import se.sics.kompics.timer.CancelPeriodicTimeout;
import se.sics.kompics.timer.SchedulePeriodicTimeout;
import se.sics.kompics.timer.Timeout;
import se.sics.kompics.timer.Timer;

/**
 *
 * @author lkroll
 */
public class CatHerder extends ComponentDefinition {

    // statics
    private static final Logger LOG = LoggerFactory.getLogger(CatHerder.class);
    private static final Random RAND = new Random();
    // ports
    Negative<HerderService> herder = provides(HerderService.class);
    Positive<Network> net = requires(Network.class);
    Positive<Timer> timer = requires(Timer.class);
    Positive<Store> store = requires(Store.class);
    // finals
    private final long heartbeatInterval;
    private final long heartbeatTimeout;
    private final ReadWriteLock lutLock;
    // instance
    private LookupTable lut; // READ-ONLY!
    private Address self;
    private UUID checkHeartbeatsId = null;

    // master
    private HashSet<Address> outstandingJoins = new HashSet<Address>();
    private HashSet<Schema.Req> outstandingSchemaChanges = new HashSet<Schema.Req>();
    private TreeMap<UUID, LUTUpdate> pendingUpdates = new TreeMap<UUID, LUTUpdate>();
    //private Component paxos;
    //private Positive<ReplicatedLog> rlog = requires(ReplicatedLog.class);
    private final MaintenancePolicy policy;

    public CatHerder(GlobalInit init) {
        heartbeatInterval = init.conf.getMilliseconds("caracal.heartbeatInterval");
        heartbeatTimeout = 2 * heartbeatInterval;
        policy = init.conf.getMaintenancePolicy();
        lutLock = init.lock;
        lut = init.bootEvent.lut;
        self = init.self;
        policy.init(init.conf.getIdGenerator());

        // Subscriptions
        subscribe(startHandler, control);
        subscribe(stopHandler, control);
        subscribe(checkHBHandler, timer);
        subscribe(rangeHandler, store);
        subscribe(multiOpHandler, net);
        subscribe(readyHandler, net);
        subscribe(createSchemaHandler, net);
        subscribe(dropSchemaHandler, net);
        subscribe(bootstrapHandler, net);
        subscribe(updateHandler, herder);

        // The periodic heartbeat check is scheduled once in the Start handler;
        // scheduling it here as well would create a second, uncancellable timer.
    }

    Handler<Start> startHandler = new Handler<Start>() {

        @Override
        public void handle(Start event) {
            scheduleTimeout();
        }
    };
    Handler<Stop> stopHandler = new Handler<Stop>() {
        @Override
        public void handle(Stop event) {
            /*
             * Cleanup
             */
            if (checkHeartbeatsId != null) {
                trigger(new CancelPeriodicTimeout(checkHeartbeatsId), timer);
            }
        }
    };

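    // Periodic heartbeat check: read the reserved heartbeat range directly from the
    // local store and clear it in the same pass (fullDelete), so that the next round
    // only sees reports written since this one.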
    Handler<CheckHeartbeats> checkHBHandler = new Handler<CheckHeartbeats>() {

        @Override
        public void handle(CheckHeartbeats event) {
            RangeReq rr = new RangeReq(KeyRange.prefix(LookupTable.RESERVED_HEARTBEATS), Limit.noLimit(),
                    TFFactory.noTF(), ActionFactory.fullDelete(), -1);
            /*
             * I suppose one can argue whether or not it's correct to do this
             * directly on the storage, bypassing the replication. But consider
             * this: If you involve the replication, then there is really no
             * benefit to collocating the reserved range with the master group.
             */
            trigger(rr, store);
        }
    };
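    // Result of the heartbeat scan: deserialise the per-host Stats.Report entries,
    // treat hosts without a report as failed and, unless more than half of all hosts
    // appear to be gone, let the MaintenancePolicy rebalance a working copy of the
    // LUT. If the policy produced a new version, propose it via putIfAbsent into the
    // reserved LUT-update range and remember it in pendingUpdates.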
    Handler<RangeResp> rangeHandler = new Handler<RangeResp>() {

        @Override
        public void handle(RangeResp event) {
            LOG.info("{}: Got range response with {} items!", self, event.result.size());
            ImmutableMap.Builder<Address, Stats.Report> statsB = ImmutableMap.builder();
            for (Map.Entry<Key, byte[]> e : event.result.entrySet()) {
                Key k = e.getKey();
                try {
                    Stats.Report r = Stats.Report.deserialise(e.getValue());
                    statsB.put(r.atHost, r);
                } catch (IOException ex) {
                    LOG.error("Could not deserialise Statistics Report for key " + k, ex);
                }
            }
            lutLock.readLock().lock();
            try {
                ImmutableMap<Address, Stats.Report> stats = statsB.build();
                ImmutableSet.Builder<Address> failsB = ImmutableSet.builder();
                for (Iterator<Address> it = lut.hostIterator(); it.hasNext();) {
                    Address host = it.next();
                    if ((host != null) && !stats.containsKey(host)) {
                        failsB.add(host);
                        LOG.info("{}: It appears that host {} has failed.", self, host);
                    }
                }
                ImmutableSet<Address> fails = failsB.build();
                if (fails.size() > (lut.hosts().size() / 2)) {
                    LOG.warn("{}: More than 50% of all system nodes seem to have failed at the same time. \n"
                            + "Under the assumption that this is a partition Caracal will not rebalance, but wait for the partition to be resolved.",
                            self);
                    return;
                }
                LUTWorkingBuffer buffer = new LUTWorkingBuffer(lut);
                LOG.debug("Running rebalancer with {} outstanding joins, {} fails and {} schema changes.",
                        new Object[] { outstandingJoins.size(), fails.size(), outstandingSchemaChanges.size() });
                policy.rebalance(buffer, fails, ImmutableSet.copyOf(outstandingJoins), stats,
                        ImmutableSet.copyOf(outstandingSchemaChanges));
                LUTUpdate update = buffer.assembleUpdate();
                if (update == null) {
                    LOG.debug("{}: No new LUT version created by policy.", self);
                    return; // nothing to do
                }
                LOG.info("{}: Created new LUTUpdate: {}", self, update);
                try {
                    Key updateKey = LookupTable.RESERVED_LUTUPDATES
                            .append(new Key(Longs.toByteArray(update.version))).get();
                    UUID id = TimestampIdFactory.get().newId();
                    MultiOpRequest op = OpUtil.putIfAbsent(id, updateKey, update.serialise());
                    Address dest = lut.findDest(updateKey, self, RAND);
                    CaracalMsg msg = new CaracalMsg(self, dest, op);
                    trigger(msg, net);
                    pendingUpdates.put(id, update);
                } catch (LookupTable.NoResponsibleForKeyException ex) {
                    LOG.error("{}: Apparently no-one is responsible for the reserved range oO: {}", self, ex);
                } catch (LookupTable.NoSuchSchemaException ex) {
                    LOG.error("{}: Apparently there is not schema for the reserved range oO: {}", self, ex);
                    return;
                }
                outstandingJoins.clear(); // clear occasionally to keep only active nodes in there
            } finally {
                lutLock.readLock().unlock();
            }
        }
    };
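    // Outcome of the putIfAbsent proposal above: if our new LUT version was accepted,
    // broadcast the corresponding update to all hosts.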
    Handler<CaracalMsg> multiOpHandler = new Handler<CaracalMsg>() {

        @Override
        public void handle(CaracalMsg event) {
            if (event.op instanceof MultiOpResponse) {
                MultiOpResponse mor = (MultiOpResponse) event.op;
                LUTUpdate update = pendingUpdates.remove(mor.id);
                if (update == null) {
                    LOG.info("{}: Can't find pending update with id {}", self, mor.id);
                    return;
                }
                if (mor.code == ResponseCode.SUCCESS && mor.success) {
                    LOG.info("{}: My new LUT version {} got accepted. Broadcasting!", self, update.version);
                    broadcast(update);
                }
                return;
            }
            //LOG.info("{}: Got CaracalMsg {}, but am only expecting MultiOpResponse", self, event);
        }

    };
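    // A joining node signals that it is ready; tell it to boot up right away.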
    Handler<Ready> readyHandler = new Handler<Ready>() {

        @Override
        public void handle(Ready event) {
            BootUp bu = new BootUp(self, event.getOrigin());
            trigger(bu, net); // no need to wait for anyone
        }
    };
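    // Forwarded schema creation requests (orig != src) are recorded as outstanding
    // changes for the next maintenance round; direct ones are ignored because they
    // will be forwarded here again anyway.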
    Handler<Schema.CreateReq> createSchemaHandler = new Handler<Schema.CreateReq>() {

        @Override
        public void handle(Schema.CreateReq event) {
            if (!event.orig.equals(event.src)) {
                LOG.info("{}: Taking note of pending Schema.CreateReq: {}", self, event);
                // TODO check whether the existence of the nodes in the LUT needs to be verified here
                outstandingSchemaChanges.add(event);
            }
            // Ignore, since it will be forwarded again (just to avoid duplication)
        }
    };
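    // Same as above, but for schema drops.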
    Handler<Schema.DropReq> dropSchemaHandler = new Handler<Schema.DropReq>() {

        @Override
        public void handle(Schema.DropReq event) {
            if (!event.orig.equals(event.src)) {
                LOG.info("{}: Taking note of pending Schema.DropReq: {}", self, event);
                // TODO check whether the existence of the nodes in the LUT needs to be verified here
                outstandingSchemaChanges.add(event);
            }
            // Ignore, since it will be forwarded again (just to avoid duplication)
        }
    };
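    // Forwarded bootstrap requests register the joining node as an outstanding join.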
    Handler<BootstrapRequest> bootstrapHandler = new Handler<BootstrapRequest>() {

        @Override
        public void handle(BootstrapRequest event) {
            if (!event.origin.equals(event.src)) { // hasn't been forwarded to masters, yet
                // TODO check whether the existence of the nodes in the LUT needs to be verified here
                outstandingJoins.add(event.origin);
            }
            // Ignore, since it will be forwarded again (just to avoid duplication)
        }
    };
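    // An LUTUpdate has been applied locally: answer the schema requests that the new
    // LUT now reflects (only if this node issued the update), keep the rest as
    // outstanding, and ship the LUT to any joiners included in the update.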
    Handler<AppliedUpdate> updateHandler = new Handler<AppliedUpdate>() {

        @Override
        public void handle(AppliedUpdate event) {
            LOG.debug("{}: Got AppliedUpdate event...cleaning up local state...", self);
            lutLock.readLock().lock();
            HashSet<Schema.Req> addback = new HashSet<Schema.Req>();
            try {
                for (Schema.Req req : outstandingSchemaChanges) {
                    if (req instanceof Schema.CreateReq) {
                        Schema.CreateReq creq = (Schema.CreateReq) req;
                        ByteBuffer schemaId = lut.schemas().schemaIDs.get(creq.name);
                        if (schemaId == null) {
                            addback.add(req);
                            continue;
                        }
                        if (event.isOwnUpdate) {
                            Schema.Response res = creq.reply(self, schemaId.array());
                            trigger(res, net);
                        }
                    } else if (req instanceof Schema.DropReq) {
                        Schema.DropReq dreq = (Schema.DropReq) req;
                        ByteBuffer schemaId = lut.schemas().schemaIDs.get(dreq.name);
                        if (schemaId != null) {
                            addback.add(req);
                            continue;
                        }
                        if (event.isOwnUpdate) {
                            Schema.Response res = dreq.reply(self, null);
                            trigger(res, net);
                        }
                    }
                }
                outstandingSchemaChanges.clear();
                outstandingSchemaChanges.addAll(addback);
            } finally {
                lutLock.readLock().unlock();
            }
            bootstrapJoiners(event.update);
        }
    };

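    // Send a maintenance operation to every host currently listed in the LUT.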
    private void broadcast(Maintenance op) {
        for (Address addr : lut.hosts()) {
            MaintenanceMsg msg = new MaintenanceMsg(self, addr, op);
            trigger(msg, net);
        }
    }

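    // Ship the serialised LUT to every joiner contained in the update and remove
    // them from the set of outstanding joins.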
    private void bootstrapJoiners(LUTUpdate update) {
        ImmutableSet<Address> joiners = update.joiners();
        if (joiners.isEmpty()) {
            return;
        }
        byte[] lutS = lut.serialise();
        for (Address addr : joiners) {
            BootstrapResponse br = new BootstrapResponse(self, addr, lutS);
            trigger(br, net);
            outstandingJoins.remove(addr);
        }
    }

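    // Schedule the periodic CheckHeartbeats timeout; the period is the heartbeat
    // timeout, i.e. twice the configured heartbeat interval.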
    private void scheduleTimeout() {
        SchedulePeriodicTimeout sptC = new SchedulePeriodicTimeout(heartbeatTimeout, heartbeatTimeout);
        LOG.info("{}: Scheduling Heartbeats: {}ms < {}ms", self, heartbeatInterval, heartbeatTimeout);
        CheckHeartbeats chs = new CheckHeartbeats(sptC);
        checkHeartbeatsId = chs.getTimeoutId();
        sptC.setTimeoutEvent(chs);
        trigger(sptC, timer);
    }

    public static class CheckHeartbeats extends Timeout {

        public CheckHeartbeats(SchedulePeriodicTimeout spt) {
            super(spt);
        }
    }
}