co.paralleluniverse.galaxy.netty.UDPComm.java Source code

Introduction

Here is the source code for co.paralleluniverse.galaxy.netty.UDPComm.java, Galaxy's Netty-based UDP communication component for node-to-node and broadcast messaging.
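
Before the full listing, a minimal configuration sketch may help orient the reader. It is illustrative only and is not taken from Galaxy's documentation: in practice the component is created and configured through Galaxy's Spring wiring (the constructor is package-private), the cluster, serverComm and receiver variables stand for already-constructed collaborators, the port numbers and multicast address are placeholders, and the lifecycle calls (init, postInit, start) are normally made by the container.

// Illustrative sketch only - not the documented way to set up Galaxy.
// 'cluster', 'serverComm' and 'receiver' are hypothetical, pre-built collaborators.
UDPComm comm = new UDPComm("comm", cluster, serverComm, 7050);        // name, cluster, serverComm, port
comm.setMulticastGroup(new InetSocketAddress("225.0.0.1", 7051));     // required unless sending to the server instead of multicasting
comm.setMaxPacketSize(4096);
comm.setResendPeriodMillisecs(20);
comm.setJitter(true);
comm.setReceiver(receiver);                                           // a co.paralleluniverse.galaxy.core.MessageReceiver
comm.init();                                                          // lifecycle calls are normally made by the container
comm.postInit();
comm.start(true);                                                     // 'true' = this node starts as master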

Source

/*
 * Galaxy
 * Copyright (c) 2012-2014, Parallel Universe Software Co. All rights reserved.
 * 
 * This program and the accompanying materials are dual-licensed under
 * either the terms of the Eclipse Public License v1.0 as published by
 * the Eclipse Foundation
 *  
 *   or (per the licensee's choosing)
 *  
 * under the terms of the GNU Lesser General Public License version 3.0
 * as published by the Free Software Foundation.
 */
package co.paralleluniverse.galaxy.netty;

import co.paralleluniverse.common.monitoring.ThreadPoolExecutorMonitor;
import co.paralleluniverse.galaxy.Cluster;
import co.paralleluniverse.galaxy.cluster.ReaderWriters;
import co.paralleluniverse.galaxy.core.AbstractComm;
import co.paralleluniverse.galaxy.core.Comm;
import co.paralleluniverse.galaxy.core.CommThread;
import co.paralleluniverse.galaxy.core.Message;
import co.paralleluniverse.galaxy.core.Message.LineMessage;
import co.paralleluniverse.galaxy.core.MessageReceiver;
import co.paralleluniverse.galaxy.core.NodeNotFoundException;
import co.paralleluniverse.galaxy.core.ServerComm;
import static co.paralleluniverse.galaxy.netty.IpConstants.*;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.shorts.ShortIterator;
import it.unimi.dsi.fastutil.shorts.ShortOpenHashSet;
import it.unimi.dsi.fastutil.shorts.ShortSet;
import java.beans.ConstructorProperties;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.net.SocketAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.ThreadPoolExecutor;
import static java.util.concurrent.TimeUnit.*;
import org.jboss.netty.bootstrap.ConnectionlessBootstrap;
import org.jboss.netty.channel.ChannelHandlerContext;
import org.jboss.netty.channel.ChannelPipeline;
import org.jboss.netty.channel.ExceptionEvent;
import org.jboss.netty.channel.FixedReceiveBufferSizePredictorFactory;
import org.jboss.netty.channel.MessageEvent;
import org.jboss.netty.channel.SimpleChannelHandler;
import org.jboss.netty.channel.socket.DatagramChannel;
import org.jboss.netty.channel.socket.DatagramChannelFactory;
import org.jboss.netty.channel.socket.nio.NioDatagramChannelFactory;
import org.jboss.netty.channel.socket.oio.OioDatagramChannelFactory;
import org.jboss.netty.handler.execution.OrderedMemoryAwareThreadPoolExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.jmx.export.annotation.ManagedAttribute;

/**
 * This crucial class could use a good refactoring.
 *
 * @author pron
 */
public class UDPComm extends AbstractComm<InetSocketAddress> {
    // Note: class must be public for Spring's auto generated javax.management.modelmbean.RequiredModelMBean to expose @ManagedAttribute
    private static final Logger LOG = LoggerFactory.getLogger(UDPComm.class);
    //
    private final int port;
    private InetSocketAddress multicastGroup;
    private NetworkInterface multicastNetworkInterface;
    private int maxQueueSize = 50;
    private int maxPacketSize = 4096;
    private int maxRequestOnlyPacketSize = maxPacketSize / 2;
    private long minDelayNanos = NANOSECONDS.convert(1, MILLISECONDS);
    private long maxDelayNanos = NANOSECONDS.convert(10, MILLISECONDS);
    private long resendPeriodNanos = NANOSECONDS.convert(20, MILLISECONDS);
    private boolean jitter = false;
    private boolean exponentialBackoff = true;
    private int minimumNodesToMulticast = 3;
    private ThreadPoolExecutor workerExecutor;
    private OrderedMemoryAwareThreadPoolExecutor receiveExecutor;
    //
    private final Comm serverComm;
    private DatagramChannelFactory channelFactory;
    private ConnectionlessBootstrap bootstrap;
    private DatagramChannel channel;
    private DatagramChannel multicastChannel;
    private BroadcastPeer broadcastPeer = new BroadcastPeer();
    private SocketAddress myAddress;
    private final ConcurrentMap<Short, NodePeer> peers = new ConcurrentHashMap<Short, NodePeer>();
    private final ScheduledExecutorService executor = Executors.newScheduledThreadPool(1,
            new ThreadFactoryBuilder().setNameFormat("uspCommScheduled-%d").setDaemon(true).build());
    private final UDPCommMonitor monitor;

    @ConstructorProperties({ "name", "cluster", "serverComm", "port" })
    UDPComm(String name, Cluster cluster, ServerComm serverComm, int port) throws Exception {
        super(name, cluster, new SocketNodeAddressResolver(cluster, IP_COMM_PORT));
        this.serverComm = serverComm;
        this.port = port;

        cluster.addNodeProperty(IP_ADDRESS, true, true, INET_ADDRESS_READER_WRITER);
        cluster.setNodeProperty(IP_ADDRESS, InetAddress.getLocalHost());
        cluster.addNodeProperty(IP_COMM_PORT, true, false, ReaderWriters.INTEGER);
        cluster.setNodeProperty(IP_COMM_PORT, port);

        this.monitor = new UDPCommMonitor(name, this);
    }

    @ManagedAttribute
    public int getPort() {
        return port;
    }

    public void setReceiveBufferSize(int size) {
        assertDuringInitialization();
        bootstrap.setOption("receiveBufferSize", size);
    }

    public void setMulticastGroup(InetSocketAddress group) {
        assertDuringInitialization();
        this.multicastGroup = group;
    }

    @ManagedAttribute
    public String getMulticastGroupName() {
        return multicastGroup.toString();
    }

    public void setMulticastNetworkInterface(NetworkInterface multicastNetworkInterface) {
        assertDuringInitialization();
        this.multicastNetworkInterface = multicastNetworkInterface;
    }

    @ManagedAttribute
    public String getMulticastNetworkInterfaceName() {
        return multicastNetworkInterface.toString();
    }

    public void setMaxQueueSize(int maxQueueSize) {
        assertDuringInitialization();
        this.maxQueueSize = maxQueueSize;
    }

    @ManagedAttribute
    public int getMaxQueueSize() {
        return maxQueueSize;
    }

    public void setMaxPacketSize(int maxPacketSize) {
        assertDuringInitialization();
        this.maxPacketSize = maxPacketSize;
    }

    @ManagedAttribute
    public int getMaxPacketSize() {
        return maxPacketSize;
    }

    public void setMaxRequestOnlyPacketSize(int maxRequestOnlyPacketSize) {
        assertDuringInitialization();
        this.maxRequestOnlyPacketSize = maxRequestOnlyPacketSize;
    }

    @ManagedAttribute
    public int getMaxRequestOnlyPacketSize() {
        return maxRequestOnlyPacketSize;
    }

    public void setMaxDelayMicrosecs(int maxDelayMicrosecs) {
        assertDuringInitialization();
        this.maxDelayNanos = NANOSECONDS.convert(maxDelayMicrosecs, MICROSECONDS);
    }

    @ManagedAttribute
    public int getMaxDelayMicrosecs() {
        return (int) MICROSECONDS.convert(maxDelayNanos, NANOSECONDS);
    }

    public void setMinDelayMicrosecs(int minDelayMicrosecs) {
        assertDuringInitialization();
        this.minDelayNanos = NANOSECONDS.convert(minDelayMicrosecs, MICROSECONDS);
    }

    @ManagedAttribute
    public int getMinDelayMicrosecs() {
        return (int) MICROSECONDS.convert(minDelayNanos, NANOSECONDS);
    }

    public void setResendPeriodMillisecs(int resendPeriodMillisecs) {
        assertDuringInitialization();
        this.resendPeriodNanos = NANOSECONDS.convert(resendPeriodMillisecs, MILLISECONDS);
    }

    @ManagedAttribute
    public int getResendPeriodMillisecs() {
        return (int) MILLISECONDS.convert(resendPeriodNanos, NANOSECONDS);
    }

    public void setMinimumNodesToMulticast(int minimumNodesToMulticast) {
        assertDuringInitialization();
        this.minimumNodesToMulticast = minimumNodesToMulticast;
    }

    @ManagedAttribute
    public int getMinimumNodesToMulticast() {
        return minimumNodesToMulticast;
    }

    public void setWorkerExecutor(ThreadPoolExecutor executor) {
        assertDuringInitialization();
        this.workerExecutor = executor;
    }

    @ManagedAttribute
    public String getWorkerExecutorName() {
        return "udpCommWorkerExecutor";
    }

    public void setReceiveExecutor(OrderedMemoryAwareThreadPoolExecutor executor) {
        assertDuringInitialization();
        this.receiveExecutor = executor;
    }

    @ManagedAttribute
    public String getReceiveExecutorName() {
        return "udpCommReceiveExecutor";
    }

    public void setJitter(boolean value) {
        // see http://highscalability.com/blog/2012/4/17/youtube-strategy-adding-jitter-isnt-a-bug.html and http://news.ycombinator.com/item?id=3757456 
        assertDuringInitialization();
        this.jitter = value;
    }

    @ManagedAttribute
    public boolean isJitter() {
        return jitter;
    }

    public void setExponentialBackoff(boolean value) {
        assertDuringInitialization();
        this.exponentialBackoff = value;
    }

    @ManagedAttribute
    public boolean isExponentialBackoff() {
        return exponentialBackoff;
    }

    @Override
    public void setReceiver(MessageReceiver receiver) {
        super.setReceiver(receiver);
        if (serverComm != null)
            serverComm.setReceiver(receiver);
    }

    @Override
    public void init() throws Exception {
        super.init();

        if (!isSendToServerInsteadOfMulticast() && multicastGroup == null) {
            LOG.error("If sendToServerInsteadOfBroadcast, multicastGroup must be set!");
            throw new RuntimeException("multicastGroup not set.");
        }

        this.myAddress = new InetSocketAddress(InetAddress.getLocalHost(), port);
        if (workerExecutor == null)
            workerExecutor = (ThreadPoolExecutor) Executors.newCachedThreadPool();

        // Netty ignores the executor's thread-naming strategy because of its worker-renaming policy; see
        // org.jboss.netty.channel.socket.nio.AbstractNioWorker.newThreadRenamingRunnable().
        // Unfortunately, NioDatagramChannelFactory doesn't let us pass our own ThreadNameDeterminer.
        configureThreadPool(getWorkerExecutorName(), workerExecutor);

        if (receiveExecutor != null)
            configureThreadPool(getReceiveExecutorName(), receiveExecutor);

        this.channelFactory = isSendToServerInsteadOfMulticast()
                ? new NioDatagramChannelFactory(workerExecutor, NettyUtils.getWorkerCount(workerExecutor))
                : new OioDatagramChannelFactory(workerExecutor);
        this.bootstrap = new ConnectionlessBootstrap(channelFactory);
        this.bootstrap.setOption("receiveBufferSizePredictorFactory",
                new FixedReceiveBufferSizePredictorFactory(4096));

        bootstrap.setPipelineFactory(new UdpMessagePipelineFactory(LOG,
                new ChannelNodeAddressResolver(addressResolver), receiveExecutor) {
            @Override
            public ChannelPipeline getPipeline() throws Exception {
                final ChannelPipeline pipeline = super.getPipeline();
                pipeline.addLast("router", new SimpleChannelHandler() {
                    @Override
                    public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) {
                        if (ctx.getChannel() == multicastChannel) {
                            if (e.getRemoteAddress().equals(myAddress))
                                return; // this is our own multicast
                            ((MessagePacket) e.getMessage()).setMulticast();
                        }
                        UDPComm.this.messageReceived((MessagePacket) e.getMessage());
                    }

                    @Override
                    public void exceptionCaught(ChannelHandlerContext ctx, ExceptionEvent e) {
                        LOG.info("Channel exception: {} {}", e.getCause().getClass().getName(),
                                e.getCause().getMessage());
                        LOG.debug("Channel exception", e.getCause());
                    }
                });
                return pipeline;
            }
        });

        bootstrap.setOption("localAddress", new InetSocketAddress(InetAddress.getLocalHost(), port));
        bootstrap.setOption("tcpNoDelay", true);

        monitor.registerMBean();
    }

    private void configureThreadPool(String name, ThreadPoolExecutor executor) {
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.DiscardPolicy());
        executor.setThreadFactory(new ThreadFactoryBuilder().setNameFormat(name + "-%d").setDaemon(true)
                .setThreadFactory(new ThreadFactory() {
                    @Override
                    public Thread newThread(Runnable r) {
                        return new CommThread(r);
                    }
                }).build());
        ThreadPoolExecutorMonitor.register(name, executor);
    }

    @Override
    public void postInit() throws Exception {
        if (!sendToServerInsteadOfMulticast)
            this.broadcastPeer = new BroadcastPeer();
        super.postInit();
    }

    @Override
    public void start(boolean master) {
        this.channel = (DatagramChannel) bootstrap.bind();
        LOG.info("Channel {} listening on port {}", channel, port);
        if (!isSendToServerInsteadOfMulticast()) {

            final int multicastPort = multicastGroup.getPort();
            this.multicastChannel = (DatagramChannel) bootstrap.bind(new InetSocketAddress(multicastPort));
            if (multicastNetworkInterface != null) {
                LOG.info("Channel {} joining multicast group {} on network interface {}",
                        new Object[] { multicastChannel, multicastGroup, multicastNetworkInterface });
                multicastChannel.joinGroup(multicastGroup, multicastNetworkInterface);
            } else {
                LOG.info("Channel {} joining multicast group {} ", multicastChannel, multicastGroup);
                multicastChannel.joinGroup(multicastGroup.getAddress());
            }
        } else
            this.multicastChannel = null;
        setReady(true);
    }

    @Override
    public void shutdown() {
        super.shutdown();
        LOG.info("Shutting down.");
        monitor.unregisterMBean();
        if (channel != null)
            channel.close();
        if (multicastChannel != null)
            multicastChannel.close();
        channelFactory.releaseExternalResources();
    }

    // for testing only
    void setChannel(DatagramChannel channel) {
        this.channel = channel;
    }

    ExecutorService getExecutor() {
        return executor;
    }

    @Override
    protected void sendToServer(Message message) {
        super.sendToServer(message);
        try {
            serverComm.send(message);
        } catch (NodeNotFoundException e) {
            throw new RuntimeException("Server not found!", e);
        }
    }

    /**
     * Can block if buffer is full
     */
    @Override
    protected void sendToNode(Message message, short node, InetSocketAddress address) {
        try {
            if (LOG.isDebugEnabled())
                LOG.debug("Sending to node {} ({}): {}", new Object[] { node, address, message });
            message.cloneDataBuffers(); // important, as we're going to be doing actual sending on another thread

            final NodePeer peer = peers.get(node);
            if (peer == null)
                throw new NodeNotFoundException(node);

            peer.sendMessage(message);
            executor.submit(peer);
        } catch (InterruptedException ex) {
            LOG.error("InterruptedException", ex);
            throw new RuntimeException(ex);
        } catch (Exception ex) {
            LOG.error("Error while sending message " + message + " to node " + node, ex);
        }
    }

    /**
     * Can block
     */
    @Override
    protected synchronized void broadcast(Message message) { // synchronized for message ID ordering
        try {
            assert message.isBroadcast() && !message.isResponse();

            assignMessageId(message);
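            // when there are fewer than minimumNodesToMulticast nodes, the broadcast is sent as
            // individual unicasts to each peer instead of being multicast (see the 'unicast' flag below)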
            final boolean unicast = getNumPeerNodes() < minimumNodesToMulticast;
            final ShortSet nodes = new ShortOpenHashSet();
            for (NodePeer peer : peers.values()) {
                nodes.add(peer.node);
                peer.sendMessage(message, unicast);
                executor.submit(peer);
            }
            if (nodes.isEmpty()) {
                if (message instanceof LineMessage) {
                    LOG.debug("No other nodes in cluster. Responding with NOT_FOUND to message {}", message);
                    receive(Message.NOT_FOUND((LineMessage) message).setIncoming());
                }
                return;
            }
            broadcastPeer.sendMessage(message, nodes, unicast);
            if (!unicast)
                executor.submit(broadcastPeer);
        } catch (InterruptedException ex) {
            LOG.error("InterruptedException", ex);
            throw new RuntimeException(ex);
        }
    }

    // visible for testing
    void messageReceived(MessagePacket packet) {
        if (!getCluster().isMaster())
            return;
        LOG.debug("Received packet {}", packet);

        final long now = System.nanoTime();
        packet.setTimestamp(now);

        final short node = packet.getNode();
        final NodePeer peer = peers.get(node);
        if (peer == null)
            throw new RuntimeException("Message received from unhandled node " + node);

        try {
            peer.receivePacket(packet); // we're now running in the executor we installed in the netty pipeline.
        } catch (InterruptedException ex) {
            LOG.error("InterruptedException", ex);
            throw new RuntimeException(ex);
        }
    }

    @Override
    public synchronized void nodeAdded(short id) {
        super.nodeAdded(id);
        if (id == 0)
            return;
        if (peers.get(id) != null)
            return;
        final NodePeer peer = new NodePeer(id);
        LOG.info("Adding peer {} for node {}", peer, id);
        peer.setAddress(getNodeAddress(id));
        peers.put(id, peer);
    }

    @Override
    public synchronized void nodeSwitched(short id) {
        super.nodeSwitched(id);
        final NodePeer peer = peers.get(id);
        LOG.info("Node switched. Fixing peer {}", peer);
        peer.setAddress(getNodeAddress(id));
        executor.submit(peer); // resend
        executor.submit(broadcastPeer); // resend
    }

    @Override
    public synchronized void nodeRemoved(short id) {
        super.nodeRemoved(id);
        final NodePeer peer = peers.get(id);
        if (peer != null)
            peer.removed();
        peers.remove(id);
        broadcastPeer.removeNode(id);
    }

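    // Set while the current thread is delivering received messages to the cache (see NodePeer.receivePacket/call).
    // Peer.sendMessage uses it to fail fast instead of blocking on a full queue - presumably to avoid
    // deadlocking a receive thread.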
    private static final ThreadLocal<Boolean> recursive = new ThreadLocal<Boolean>();

    abstract class Peer implements Callable<Void> {
        protected final ArrayBlockingQueue<Message> queue = new ArrayBlockingQueue<Message>(maxQueueSize);
        protected Message overflow;         // a message polled from the queue that didn't fit into the current packet; retried on the next round
        protected MessagePacket sentPacket; // the packet currently being sent (and resent) while responses are awaited
        private int delayMultiplier = 1;    // left-shift applied to resendPeriodNanos for exponential backoff
        private long lastSent;              // nanoTime of the last transmission
        private long nextSend;              // nanoTime of the next scheduled (re)transmission
        private final Set<Message> timeouts = Collections.newSetFromMap(new ConcurrentHashMap<Message, Boolean>()); // requests that timed out locally but may still get a late response
        private long lastTimeoutsCleanup;   // nanoTime of the last purge of the timeouts set

        /**
         * This can block!
         */
        public void sendMessage(Message message) throws InterruptedException {
            if (!queue.offer(message)) {
                LOG.info("Adding message {} to full queue. Waiting for available space.", message);
                LOG.warn("no space in Peer {}", this);
                if (recursive.get() == Boolean.TRUE) {
                    LOG.error("Queue is too small");
                    throw new RuntimeException("Queue full");
                }
                queue.put(message);
            }
        }

        public int getQueueLength() {
            return queue.size();
        }

        protected void forceResend() {
            this.lastSent = 0;
            this.nextSend = 0;
            this.delayMultiplier = 0;
        }

        protected boolean isTimeToResend(long now) {
            if (now > nextSend) {
                nextSend = Long.MAX_VALUE;
                lastSent = now;
                return true;
            } else
                return false;
        }

        protected void resendIn(long now, long delay) {
            if (LOG.isDebugEnabled())
                LOG.debug("Peer {} rescheduling in {}", this, delay);
            nextSend = now + delay;
            executor.schedule(this, delay, NANOSECONDS);
        }

        protected void resend(long now) {
            long delay = resendPeriodNanos << delayMultiplier;
            if (exponentialBackoff)
                delayMultiplier++;
            if (jitter)
                delay = randInterval(delay);
            resendIn(now, delay);
        }

        protected long getLastSent() {
            return lastSent;
        }

        protected void addTimeout(Message message) {
            timeouts.add(message);
        }

        protected boolean isTimeout(Message response) {
            return timeouts.remove(response);
        }

        protected synchronized void cleanupTimeouts(long now) {
            if (now - lastTimeoutsCleanup >= NANOSECONDS.convert(10, SECONDS)) {
                for (Iterator<Message> it = timeouts.iterator(); it.hasNext();) {
                    if (now - it.next().getTimestamp() >= NANOSECONDS.convert(10, SECONDS))
                        it.remove();
                }
                lastTimeoutsCleanup = now;
            }
        }
    }

    class NodePeer extends Peer {
        public final short node;
        private volatile boolean removed = false;
        private InetSocketAddress nodeAddress;
        private boolean hasRequests = false; // true if not all messages in the sent packet are responses
        private boolean requestsOnly = true; // true if none of the messages in the sent packet are responses
        private volatile boolean broadcast; // true if the sent packet contains a (single) broadcast (and only that)
        private final LongSet pendingRequests = new LongOpenHashSet(); // ids of requests received from this peer and handed to the receiver but not yet answered (prevents duplicate delivery)
        private final Set<Message> unicastBroadcasts = Collections
                .newSetFromMap(new ConcurrentHashMap<Message, Boolean>()); // broadcast messages to be sent to this peer as plain unicasts (cluster too small to multicast)
        private long lastReceivedBroadcastId; // the highest broadcast message id seen from this peer; used to detect stale out-of-band multicasts

        public NodePeer(short node) {
            this.node = node;
        }

        public synchronized void setAddress(InetSocketAddress nodeAddress) {
            LOG.info("Node peer {} set address to {}", this, nodeAddress);
            this.nodeAddress = nodeAddress;
            lastReceivedBroadcastId = 0;
            if (sentPacket != null) {
                for (Iterator<Message> it = sentPacket.iterator(); it.hasNext();) {
                    final Message message = it.next();
                    if (message.isResponse()) {
                        LOG.debug("Peer {} removing response {} because of node switch.", this, message);
                        it.remove(); // if our peer hasn't requested again then it must have received our response
                    }
                }
            }
            forceResend();
        }

        @Override
        public synchronized String toString() {
            return "NodePeer{" + "node=" + node + ", nodeAddress=" + nodeAddress + ", lastSent=" + getLastSent()
                    + ", sentPacket=" + sentPacket + ", pendingRequests=" + pendingRequests + ", next=" + overflow
                    + ", queue=" + queue + ", broadcast=" + broadcast + '}';
        }

        public boolean isBroadcast() {
            return broadcast;
        }

        public void unicastBroadcast() {
            assert broadcast;
            LOG.debug("Node peer {} is asked to unicast broadcast.", this);
            broadcast = false;
        }

        public void removed() {
            removed = true;
        }

        @Override
        public void sendMessage(Message message) throws InterruptedException {
            synchronized (queue) { // synchronization ensures message ids are assigned in the order messages are put in the queue
                assignMessageId(message);
                super.sendMessage(message);
            }
        }

        public void sendMessage(Message message, boolean unicastBroadcast) throws InterruptedException {
            if (unicastBroadcast && message.isBroadcast())
                unicastBroadcasts.add(message);
            sendMessage(message);
        }

        void receivePacket(MessagePacket packet) throws InterruptedException {
            final List<Message> received = new ArrayList<Message>(packet.numMessages());
            final List<Message> broadcastResponses = new ArrayList<Message>(packet.numMessages());

            synchronized (this) {
                handleReceived(packet, received, broadcastResponses);
            }

            for (Message message : broadcastResponses)
                broadcastPeer.receivedResponse(message, received);

            recursive.set(Boolean.TRUE);
            try {
                for (Message message : received) {
                    LOG.debug("Passing received message {} to cache", message);
                    receive(message); // XXXX
                }
            } finally {
                recursive.remove();
            }

            call();
        }

        @Override
        public Void call() throws InterruptedException {
            if (recursive.get() == Boolean.TRUE)
                return null;
            recursive.set(Boolean.TRUE);
            try {
                if (removed || getCluster().getMaster(node) == null) {
                    LOG.debug("Node removed from the cluster so returning from peer {}", this);
                    return null; // don't reschedule
                }

                final List<Message> received = new ArrayList<Message>();
                synchronized (this) {
                    LOG.trace("Peer {} CALL", this);

                    final long now = System.nanoTime();

                    handleTimeout(now, received);
                    handleQueue(now);

                    if (sentPacket != null && sentPacket.isEmpty())
                        sentPacket = null;
                    if (sentPacket != null && !broadcast) {
                        if (isTimeToResend(now)) { // if messages have been added to sentPacket, handleQueue has already reset lastSent to 0 (via forceResend)
                            LOG.debug("Peer {} sending packet {}", this, sentPacket);
                            channel.write(sentPacket, nodeAddress);
                            if (hasRequests)
                                resend(now);
                        }
                    }
                }

                for (Message message : received)
                    receive(message);
                LOG.trace("Peer {} CALL DONE", this);
                return null;
            } finally {
                recursive.remove();
            }
        }

        private void handleReceived(MessagePacket receivedPacket, List<Message> received,
                List<Message> broadcastResponses) {
            if (receivedPacket == null)
                return;
            LOG.debug("Peer {} has received packet {}", this, receivedPacket);

            boolean oobMulticast = false;
            if (receivedPacket.isMulticast()) { // multicast messages may overlap with unicast ones if the original broadcast was sent as a unicast, say if the peer's sentPacket wasn't empty
                long maxIdInPacket = -1;
                for (Iterator<Message> it = receivedPacket.iterator(); it.hasNext();) {
                    final Message message = it.next();
                    //                    if (message.getMessageId() < lastReceivedBroadcastId) {
                    //                        LOG.trace("Peer {} received a multicast message {} which has already been seen.", this, message);
                    //                        it.remove();
                    //                    }
                    maxIdInPacket = Math.max(maxIdInPacket, message.getMessageId());
                }
                if (maxIdInPacket < lastReceivedBroadcastId) {
                    LOG.debug("Peer {} received an out-of-band multicast packet {} which has already been seen.",
                            this, receivedPacket);
                    oobMulticast = true;
                }
            }
            if (receivedPacket.isEmpty())
                return;

            if (!oobMulticast && sentPacket != null) {
                for (Iterator<Message> it = sentPacket.iterator(); it.hasNext();) {
                    final Message message = it.next();
                    // here we rely on Message.equals() to match request/response
                    if (message.isResponse() && !receivedPacket.contains(message)) {
                        LOG.debug(
                                "Peer {} removing response {} from sent packet because it was no longer asked for.",
                                this, message);
                        it.remove(); // if our peer hasn't requested again then it must have received our response
                    }
                }
            }
            for (Message message : receivedPacket) {
                message.setTimestamp(receivedPacket.getTimestamp());
                if (message.isBroadcast()) {
                    if (message.getMessageId() > lastReceivedBroadcastId)
                        lastReceivedBroadcastId = message.getMessageId();
                }
                // here we rely on Message.equals() to match request/response
                if (message.isResponse()) {
                    final Message request = (sentPacket != null ? sentPacket.getMessage(message) : null);
                    if (request == null
                            && !(isTimeout(message) || (broadcast && broadcastPeer.isTimeout(message)))) {
                        LOG.debug("Peer {} ignoring repeat response {}", this, message);
                        continue; // we may be re-receiving the response, so the request may be gone. in this case we don't need to pass the message again to the receiver
                    }
                    if (LOG.isDebugEnabled())
                        LOG.debug("Peer {} received response {} for request ({})",
                                new Object[] { this, message, request != null ? request : "TIMEOUT" });
                    if (request != null) {
                        if (request.isBroadcast())
                            broadcastResponses.add(message);

                        //                        if(message.getType() == Message.Type.CHNGD_OWNR && ((Message.CHNGD_OWNR)message).getNewOwner() == message.getNode()) {
                        //                            // this is a quickReplyToBroadcast
                        //                            // TODO
                        //                        }
                        sentPacket.removeMessage(message);
                    }
                } else {
                    if (sentPacket != null && sentPacket.contains(message)) {
                        LOG.debug("Peer {} already has a response for message {}", this, message);
                        continue; // no need to re-generate a response we already have
                    }
                    if (pendingRequests.contains(message.getMessageId())) {
                        LOG.debug("Peer {} already has a request pending for message {}", this, message);
                        continue; // we don't pass on requests to the receiver more than once
                    } else
                        pendingRequests.add(message.getMessageId());
                }

                if (message.getType() == Message.Type.ACK)
                    continue; // we do not pass ACKs on to the receiver

                received.add(message); // getReceiver().receive(message);

                if (!message.isResponse() && !message.isReplyRequired()) {
                    if (!queue.offer(Message.ACK(message))) {
                        LOG.error("Queue capacity for perr {} exceeded", this);
                        throw new RuntimeException("Peer queue full!");
                    }
                }
            }
            //receivedPacket = null;
            if (sentPacket != null) {
                forceResend();
                if (sentPacket.isEmpty()) {
                    sentPacket = null;
                    broadcast = false;
                    hasRequests = false;
                    requestsOnly = true;
                } else {
                    // update hasRequests, requestsOnly and broadcast
                    boolean _hasRequests = false;
                    boolean _requestsOnly = true;
                    boolean _broadcast = true;
                    for (Message message : sentPacket) {
                        if (message.isResponse())
                            _requestsOnly = false;
                        else
                            _hasRequests = true;
                        if (!message.isBroadcast())
                            _broadcast = false;
                    }
                    hasRequests = _hasRequests;
                    requestsOnly = _requestsOnly;
                    if (!broadcast && _broadcast) {
                        LOG.trace("Peer {} notifying broadcast.", this);
                        executor.submit(broadcastPeer);
                    }
                    broadcast = _broadcast;
                }
            }
        }

        private void handleTimeout(long now, List<Message> received) {
            if (broadcast || sentPacket == null || sentPacket.isEmpty())
                return;

            final long timeoutNanos = NANOSECONDS.convert(getTimeout(), MILLISECONDS);
            for (Iterator<Message> it = sentPacket.reverseIterator(); it.hasNext();) {
                final Message message = it.next();
                if (message.getType() != Message.Type.INV && now - message.getTimestamp() > timeoutNanos) {
                    if (message.isResponse() || message.isBroadcast())
                        continue;
                    if (message instanceof LineMessage) {
                        LOG.debug("Timeout on message {}", message);
                        received.add(Message.TIMEOUT((LineMessage) message).setIncoming());
                    }
                    it.remove();
                    addTimeout(message);
                } else
                    break;
            }
            if (sentPacket.isEmpty()) {
                sentPacket = null;
                broadcast = false;
                hasRequests = false;
                requestsOnly = true;
            }

            cleanupTimeouts(now);
        }

        /**
         * Specifies that a message should not be resent, but a response is still possible
         *
         * @param message
         */
        public synchronized void markAsTimeout(Message message) {
            if (sentPacket.removeMessage(message.getMessageId()))
                addTimeout(message);
        }

        private synchronized void handleQueue(long start) throws InterruptedException {
            // Problem:
            // Suppose we send a full packet containing only requests, and our peer sends us a full packet containing only requests.
            // We cannot add our responses to the sentPacket because it's full, so we must wait for our peer to respond so that we can empty
            // the packet - only it can't, because its sentPacket is also full: a deadlock.
            // As I see it, the only way to truly resolve this is multi-part packets, but we don't want to do that.
            // Instead, we never allow a requests-only packet to be completely full - we always leave room for a response.

            // assumes hasRequests and requestsOnly are up to date.
            Message next = overflow;
            overflow = null;
            if (next == null)
                next = queue.poll();
            for (;;) {
                LOG.trace("handleQueue loop");
                if (next == null) {
                    LOG.trace("handleQueue loop: next == null");
                    break;
                }
                overflow = next; // stash the next message in overflow; if we don't break out of the loop and actually use the message, overflow is nulled out below

                final boolean unicastBroadcast = next.isBroadcast() && unicastBroadcasts.remove(next);

                if (broadcast && (!next.isBroadcast() || unicastBroadcast)) {
                    LOG.trace("Node peer {} not taking non-broadcast message {} during broadcast", this, next);
                    break; // we're not taking any non-broadcast messages during broadcast
                }

                if (!broadcast && next.isBroadcast() && !unicastBroadcast) {
                    if (sentPacket == null || sentPacket.isEmpty()) {
                        LOG.debug("Node peer {} going into broadcast mode for message {}.", this, next);
                        broadcast = true;
                    }
                    // else, we add message to packet, and continue transmitting.
                    // if the packet had responses only, the new broadcast request would force a re-send and expedite matters
                    // if a response for the broadcast is received before we get a chance to multicast, that's ok because we simply remove the node
                    // from the BroadcastEntry
                }

                if (next.size() > maxPacketSize) {
                    LOG.error("Message {} is larger than the maximum packet size {}", next, maxPacketSize);
                    throw new RuntimeException("Message is larger than maxPacketSize");
                }

                if (next.size() + sentPacketSizeInBytes() > maxPacketSize) {
                    if (next.isResponse() && requestsOnly)
                        LOG.warn(
                                "IMPORTANT: Response message {} does not fit in packet {} which contains only requests. THIS MAY CAUSE A DEADLOCK!",
                                next, sentPacket);
                    LOG.debug("Message {} cannot be added to packet now; packet full (size = {})", next,
                            next.size());
                    break;
                }

                if (!next.isResponse()) {
                    if (requestsOnly && next.size() + sentPacketSizeInBytes() > maxRequestOnlyPacketSize
                            && sentPacketSizeInBytes() > 0) {
                        // a requests-only packet may not grow beyond maxRequestOnlyPacketSize, unless it would contain just this single message
                        LOG.debug("Not sending request-only message {}; can't add it to a packet already {} bytes long.", next,
                                sentPacketSizeInBytes());
                        break;
                    }
                    hasRequests = true;
                } else
                    requestsOnly = false;

                if (next.isResponse())
                    pendingRequests.remove(next.getMessageId());

                LOG.debug("Adding message {} to sent-packet", next);
                if (sentPacket == null)
                    sentPacket = new MessagePacket();
                sentPacket.addMessage(next);
                forceResend();
                overflow = null;

                if (broadcast) {
                    LOG.trace("Peer {} notifying broadcast.", this);
                    executor.submit(broadcastPeer);
                }

                final long now = System.nanoTime();
                if ((now - start + minDelayNanos) > maxDelayNanos)
                    break;
                next = queue.poll(minDelayNanos, NANOSECONDS);
            }
        }

        private int sentPacketSizeInBytes() {
            return sentPacket != null ? sentPacket.sizeInBytes() : 0;
        }
    }

    class BroadcastPeer extends Peer {
        private final ConcurrentMap<Long, BroadcastEntry> broadcasts = new ConcurrentHashMap<Long, BroadcastEntry>();

        @Override
        public String toString() {
            return "BroadcastPeer{" + "multicastAddress=" + multicastGroup + ", lastSent=" + getLastSent()
                    + ", sentPacket=" + sentPacket + ", next=" + overflow + ", queue=" + queue + '}';
        }

        public void sendMessage(Message message, ShortSet nodes, boolean unicast) throws InterruptedException {
            broadcasts.put(message.getMessageId(), new BroadcastEntry(message, nodes));
            if (!unicast)
                sendMessage(message);
        }

        @Override
        public Void call() throws InterruptedException {
            final List<Message> received = new ArrayList<Message>();
            synchronized (this) {
                LOG.trace("BroadcastPeer CALL");
                final long now = System.nanoTime();

                handleTimeout(now, received);
                handleQueue(now);

                if (sentPacket != null && sentPacket.isEmpty())
                    sentPacket = null;
                if (isTimeToResend(now)) {
                    if (sentPacket != null) { // if messages have been added to sentPacket, handleQueue has already reset lastSent to 0 (via forceResend)
                        assert !sendToServerInsteadOfMulticast;
                        LOG.debug("BroadcastPeer {} multicasting packet {}", this, sentPacket);
                        channel.write(sentPacket, multicastGroup);
                        resend(now);
                    } else if (!broadcasts.isEmpty()) {
                        executor.schedule(this, getTimeout(), MILLISECONDS);
                    }
                }
            }
            for (Message message : received)
                receive(message);
            LOG.trace("BroadcastPeer CALL DONE");
            return null;
        }

        private void handleQueue(long start) throws InterruptedException {
            Message next = overflow;
            overflow = null;
            if (next == null)
                next = queue.poll();
            loop: for (;;) {
                if (next == null)
                    break;

                overflow = next; // stash the next message in overflow; if we don't break out of the loop and actually use the message, overflow is nulled out below

                if (next.size() > maxPacketSize) {
                    LOG.error("Message {} is larger than the maximum packet size {}", next, maxPacketSize);
                    throw new RuntimeException("Message is larger than maxPacketSize");
                }

                if (sentPacket != null && next.size() + sentPacket.sizeInBytes() > maxPacketSize)
                    break;

                LOG.debug("Waiting for peers to enter broadcast mode for message {}", next);
                BroadcastEntry entry = broadcasts.get(next.getMessageId());

                if (entry != null) {
                    if (entry.nodes.isEmpty()) {
                        broadcasts.remove(next.getMessageId());
                        if (next instanceof LineMessage) {
                            LOG.debug("No other nodes in cluster. Responding with NOT_FOUND to message {}", next);
                            receive(Message.NOT_FOUND((LineMessage) next).setIncoming());
                        }
                        entry = null;
                    }
                }

                if (entry != null) {
                    for (ShortIterator it = entry.nodes.iterator(); it.hasNext();) {
                        final short node = it.next();
                        final NodePeer peer = peers.get(node);
                        synchronized (peer) {
                            if (!(peer.isBroadcast() && peer.sentPacket.contains(next.getMessageId()))) {
                                LOG.trace("Waiting for peer {}.", peer);
                                break loop;
                            }
                            LOG.trace("Peer {} ok (broadcast {})", peer, next);
                        }
                    }

                    LOG.debug("Adding message {} to sent-packet", next);
                    if (sentPacket == null)
                        sentPacket = new MessagePacket();
                    sentPacket.addMessage(next);
                    forceResend();
                }

                overflow = null;

                final long now = System.nanoTime();
                if ((now - start + minDelayNanos) > maxDelayNanos)
                    break;
                next = queue.poll(minDelayNanos, NANOSECONDS);
            }
        }

        private void handleTimeout(long now, List<Message> received) {
            if (broadcasts.isEmpty())
                return;
            final long timeoutNanos = NANOSECONDS.convert(getTimeout(), MILLISECONDS);

            for (Iterator<BroadcastEntry> it = broadcasts.values().iterator(); it.hasNext();) {
                final BroadcastEntry entry = it.next();
                final Message message = entry.message;
                if (message.getType() != Message.Type.INV && now - message.getTimestamp() > timeoutNanos) {
                    if (message instanceof LineMessage) {
                        LOG.debug("Timeout on message {}", message);
                        received.add(Message.TIMEOUT((LineMessage) message).setIncoming());
                    }
                    it.remove();
                    releasePeers(entry, (short) -1);
                    addTimeout(message);
                    if (sentPacket != null)
                        sentPacket.removeMessage(message.getMessageId());
                }
            }
            if (sentPacket != null && sentPacket.isEmpty())
                sentPacket = null;

            cleanupTimeouts(now);
        }

        public void receivedResponse(Message message, List<Message> received) {
            final BroadcastEntry entry = broadcasts.get(message.getMessageId());
            if (entry == null)
                return;
            synchronized (this) {
                boolean done = entry.removeNode(message.getNode());
                if (message.getType() != Message.Type.ACK) { // this is a response - no need to wait for further ACKs
                    LOG.debug("Message {} is a reply to a broadcast! (discarding pending)", message);
                    if (!done)
                        releasePeers(entry, message.getNode());
                    done = true;
                } else {
                    if (LOG.isDebugEnabled())
                        LOG.debug("Got ACK from {} to message {}", message.getNode(), entry.message);
                    final int numNodes = entry.nodes.size();
                    if (done) {
                        if (entry.message instanceof LineMessage) {
                            LOG.debug("Got all ACKs for message {}, but no response - sending NOT_FOUND to cache!",
                                    entry.message);
                            received.add(Message.NOT_FOUND((LineMessage) entry.message).setIncoming());
                        }
                    } else if (numNodes < minimumNodesToMulticast && (numNodes + 1) >= minimumNodesToMulticast) { // the node count has just dropped below the multicast threshold - switch the remaining peers to unicast
                        if (sentPacket != null)
                            sentPacket.removeMessage(message.getMessageId()); // don't multicast...

                        // unicast:
                        final long now = System.nanoTime();
                        final long sinceLastSent = now - getLastSent();
                        long delay = resendPeriodNanos - sinceLastSent;
                        delay = (delay >= 0 ? delay : 0);
                        for (ShortIterator it = entry.nodes.iterator(); it.hasNext();) {
                            final NodePeer peer = peers.get(it.next());
                            if (peer.isBroadcast()) {
                                peer.unicastBroadcast();
                                peer.forceResend();
                                peer.resendIn(now, delay);
                                executor.submit(peer);
                            }
                        }
                    }
                }

                if (done) {
                    if (sentPacket != null)
                        sentPacket.removeMessage(message.getMessageId());
                    broadcasts.remove(message.getMessageId());
                }
                if (sentPacket != null && sentPacket.isEmpty())
                    sentPacket = null;
            }
        }

        private void releasePeers(BroadcastEntry entry, short node) {
            final Message message = entry.message;
            for (ShortIterator it = entry.nodes.iterator(); it.hasNext();) {
                final NodePeer peer = peers.get(it.next());
                if (peer.isBroadcast()) {
                    LOG.debug("Broadcast releasing peer {} for message {}", peer, message);
                    if (peer.node != node) {
                        LOG.debug("Broadcast marking message {} as timeout for peer {}", message, peer);
                        peer.markAsTimeout(message);
                    }
                    peer.unicastBroadcast();
                    executor.submit(peer);
                }
            }
        }

        public void removeNode(short node) {
            synchronized (this) {
                for (Iterator<Map.Entry<Long, BroadcastEntry>> it = broadcasts.entrySet().iterator(); it
                        .hasNext();) {
                    BroadcastEntry entry = it.next().getValue();
                    if (entry.removeNode(node) && entry.message instanceof LineMessage) {
                        LOG.debug("Got all ACKs for message {}, but no response - sending NOT_FOUND to cache!",
                                entry.message);
                        receive(Message.NOT_FOUND((LineMessage) entry.message).setIncoming());
                        it.remove();
                    }
                }
            }
        }
    }

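    // A broadcast in flight: the message plus the set of peer nodes that have not yet ACKed or answered it.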
    private static class BroadcastEntry {
        final Message message;
        final ShortSet nodes;

        public BroadcastEntry(Message message, ShortSet nodes) {
            this.message = message;
            this.nodes = nodes;
            this.nodes.remove(Comm.SERVER); // NOT TO SERVER
            LOG.debug("Awaiting ACKS for message {} from nodes {}", message, this.nodes);
        }

        public synchronized void addNode(short node) {
            nodes.add(node);
        }

        public synchronized boolean removeNode(short node) {
            nodes.remove(node);
            return nodes.isEmpty();
        }
    }

    private int getNumPeerNodes() {
        return getCluster().getNodes().size() - (getCluster().getNodes().contains(Comm.SERVER) ? 1 : 0) + 1;
    }

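    // Draws a random delay from an exponential distribution whose mean is the expected (nominal) delay;
    // used to add jitter to resend intervals when 'jitter' is enabled.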
    private static long randInterval(long expected) {
        return (long) randExp(1.0 / expected);
    }

    /**
     * Return a real number from an exponential distribution with rate lambda. Based on
     * http://en.wikipedia.org/wiki/Inverse_transform_sampling
     */
    private static double randExp(double lambda) {
        return -Math.log(1 - ThreadLocalRandom.current().nextDouble()) / lambda;
    }

    BroadcastPeer getBroadcastPeer() {
        return broadcastPeer;
    }

    ConcurrentMap<Short, NodePeer> getPeers() {
        return peers;
    }
}