org.diqube.cluster.ClusterManager.java Source code

Introduction

Here is the source code for org.diqube.cluster.ClusterManager.java
Source

/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.cluster;

import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import javax.inject.Inject;

import org.apache.thrift.TException;
import org.diqube.cluster.ClusterLayoutStateMachine.RemoveNode;
import org.diqube.cluster.ClusterLayoutStateMachine.SetTablesOfNode;
import org.diqube.config.Config;
import org.diqube.config.ConfigKey;
import org.diqube.connection.ClusterNodeStatusDetailListener;
import org.diqube.connection.Connection;
import org.diqube.connection.ConnectionException;
import org.diqube.connection.ConnectionPool;
import org.diqube.connection.NodeAddress;
import org.diqube.connection.OurNodeAddressProvider;
import org.diqube.consensus.ConsensusClient;
import org.diqube.consensus.ConsensusClient.ClosableProvider;
import org.diqube.consensus.ConsensusClient.ConsensusClusterUnavailableException;
import org.diqube.consensus.ConsensusClusterNodeAddressProvider;
import org.diqube.consensus.ConsensusIsLeaderProvider;
import org.diqube.consensus.ConsensusServer;
import org.diqube.consensus.ConsensusStateMachineClientInterruptedException;
import org.diqube.context.AutoInstatiate;
import org.diqube.context.InjectOptional;
import org.diqube.context.shutdown.ContextShutdownListener;
import org.diqube.context.shutdown.ShutdownBefore;
import org.diqube.listeners.ClusterManagerListener;
import org.diqube.listeners.ServingListener;
import org.diqube.listeners.TableLoadListener;
import org.diqube.listeners.providers.LoadedTablesProvider;
import org.diqube.listeners.providers.OurNodeAddressStringProvider;
import org.diqube.remote.cluster.thrift.ClusterManagementService;
import org.diqube.threads.ExecutorManager;
import org.diqube.thrift.base.thrift.RNodeAddress;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;

/**
 * Manages the state of the diqube-server cluster and this node's state as known to other cluster nodes, and shares
 * information about that.
 * 
 * <p>
 * This class ensures that the information other nodes have about this node is correct, and it manages our node's address.
 *
 * @author Bastian Gloeckle
 */
@AutoInstatiate
public class ClusterManager implements ServingListener, TableLoadListener, OurNodeAddressStringProvider,
        ClusterNodeStatusDetailListener, OurNodeAddressProvider, ConsensusClusterNodeAddressProvider,
        ContextShutdownListener {
    private static final Logger logger = LoggerFactory.getLogger(ClusterManager.class);

    private static final String OUR_HOST_AUTOMATIC = "*";

    @Config(ConfigKey.OUR_HOST)
    private String ourHost;

    @Config(ConfigKey.PORT)
    private int ourPort;

    private NodeAddress ourHostAddr;

    @Config(ConfigKey.CLUSTER_NODES)
    private String clusterNodesConfigString;

    @Inject
    private ConnectionPool connectionPool;

    @InjectOptional
    private List<ClusterManagerListener> clusterManagerListeners;

    /** will contain "this", too! */
    @InjectOptional
    private List<ClusterNodeStatusDetailListener> clusterNodeDiedListeners;

    private List<NodeAddress> consensusClusterNodes = new ArrayList<>();

    @Inject
    private ClusterLayout clusterLayout;

    @Inject
    private ConsensusClient consensusClient;

    @Inject
    private ConsensusIsLeaderProvider consensusIsLeaderProvider;

    @Inject
    private LoadedTablesProvider loadedTablesProvider;

    @Inject
    private ExecutorManager executorManager;

    /**
     * Disable the methods of {@link ClusterNodeStatusDetailListener} on startup, until we have initially found some
     * cluster nodes.
     */
    private boolean clusterNodeStatusDetailListenerDisabled = true;

    private ExecutorService executorService;

    /**
     * Determines the address under which this node is reachable and creates the thread pool used for asynchronous
     * work of this class.
     *
     * <p>
     * If the configured host equals {@link #OUR_HOST_AUTOMATIC}, the address of the local host is looked up and used
     * instead; otherwise the configured value is taken as-is.
     *
     * @throws RuntimeException
     *             if automatic detection was requested but the local host could not be resolved.
     */
    @PostConstruct
    public void initialize() {
        boolean detectHostAutomatically = ourHost.equals(OUR_HOST_AUTOMATIC);

        if (!detectHostAutomatically) {
            logger.info(
                    "Using {} as our host address. We expect that other cluster nodes will be able to reach this node "
                            + "under that address!",
                    ourHost);
        } else {
            try {
                ourHost = InetAddress.getLocalHost().getHostAddress();
                logger.info(
                        "Using {} as our host address. We expect that other cluster nodes will be able to reach this "
                                + "node under that address. If not, define a different host in the configuration!",
                        ourHost);
            } catch (UnknownHostException e) {
                String failureMessage = "Configuration said to identify our host automatically, "
                        + "but was not able to inspect network interfaces.";
                logger.error(failureMessage, e);
                throw new RuntimeException(failureMessage, e);
            }
        }

        ourHostAddr = new NodeAddress(ourHost, (short) ourPort);

        // Only log failures of asynchronous tasks; we'd like to continue as well as possible.
        UncaughtExceptionHandler asyncErrorHandler = (thread, error) -> logger
                .error("Error while executing asynchronous ClusterManager task", error);
        executorService = executorManager.newCachedThreadPool("clustermanager-%d", asyncErrorHandler);
    }

    /**
     * Stops the thread pool of this class, if one was created.
     */
    @PreDestroy
    public void cleanup() {
        if (executorService == null) {
            // nothing was started, nothing to stop.
            return;
        }
        executorService.shutdownNow();
    }

    /**
     * Tries to gracefully remove this node from the cluster layout before the context shuts down.
     *
     * <p>
     * The removal is executed asynchronously and awaited for at most one second. If it does not succeed in time, it is
     * cancelled and skipped: the other nodes might then try to submit stuff to our node, but will soon discover that
     * we're down and remove us from the ClusterLayout themselves.
     *
     * <p>
     * If the calling thread is interrupted while waiting, the interrupt flag is restored before returning.
     */
    @Override
    @ShutdownBefore({ ConsensusClient.class, ConsensusServer.class })
    public void contextAboutToShutdown() {
        if (executorService == null) {
            // No thread pool available (same guard as in cleanup()), so there is nothing we could run the removal on.
            return;
        }

        final String couldNotDeregisterMsg = "Could not deregister from cluster layout gracefully. The other cluster nodes might show exceptions "
                + "about this soon, but the cluster should recover.";

        logger.debug("Trying to remove ourselves from the cluster layout, as we're shutting down...");
        Future<?> removal = executorService.submit(() -> {
            try (ClosableProvider<ClusterLayoutStateMachine> p = consensusClient
                    .getStateMachineClient(ClusterLayoutStateMachine.class)) {
                p.getClient().removeNode(RemoveNode.local(ourHostAddr));
            } catch (ConsensusClusterUnavailableException e) {
                logger.warn("Could not access consensus cluster to remove ourselves from cluster layout.");
            }
        });

        try {
            removal.get(1, TimeUnit.SECONDS);
        } catch (TimeoutException | ExecutionException e) {
            // Do not leave the task running in the background after we gave up waiting for it.
            removal.cancel(true);
            logger.warn(couldNotDeregisterMsg, e);
        } catch (InterruptedException e) {
            removal.cancel(true);
            // Preserve the interruption for whoever drives the shutdown.
            Thread.currentThread().interrupt();
            logger.warn(couldNotDeregisterMsg, e);
        }
    }

    /**
     * Parses a comma-separated list of <code>host:port</code> entries into node addresses.
     *
     * <p>
     * Whitespace around each entry, host and port is ignored, and blank entries (e.g. caused by a trailing comma) are
     * skipped silently. Entries without a host, without a port or with an unparsable port are skipped with a warning.
     * The last colon of an entry separates host from port.
     *
     * @param clusterNodes
     *            the configured string, must not be <code>null</code>.
     * @return the parsed addresses, or <code>null</code> if no valid address was found.
     */
    private List<NodeAddress> parseClusterNodes(String clusterNodes) {
        List<NodeAddress> res = new ArrayList<>();

        for (String rawEntry : clusterNodes.split(",")) {
            String clusterNodeString = rawEntry.trim();
            if (clusterNodeString.isEmpty())
                // stray or trailing comma, nothing to parse.
                continue;

            int lastColon = clusterNodeString.lastIndexOf(':');
            if (lastColon == -1) {
                logger.warn("No port specified in '{}'. Ignoring.", clusterNodeString);
                continue;
            }
            if (lastColon == 0) {
                logger.warn("No host specified in '{}'. Ignoring.", clusterNodeString);
                continue;
            }
            short port;
            try {
                // NOTE(review): ports above Short.MAX_VALUE are rejected here, whereas initialize() narrows our own
                // port with a plain cast - confirm which representation NodeAddress expects for such ports.
                port = Short.parseShort(clusterNodeString.substring(lastColon + 1).trim());
            } catch (NumberFormatException e) {
                logger.warn("Could not parse port in '{}'. Ignoring.", clusterNodeString);
                continue;
            }
            String host = clusterNodeString.substring(0, lastColon).trim();

            res.add(new NodeAddress(host, port));
        }

        if (res.isEmpty())
            return null;

        return res;
    }

    /**
     * Called when the local server started serving: contacts the configured cluster nodes and collects the addresses
     * of all cluster nodes they know about.
     *
     * <p>
     * If no nodes are configured, none can be parsed, or none answers, the listeners are informed that the cluster is
     * initialized and nothing else happens. Otherwise the gathered addresses are stored for the consensus node, the
     * {@link ClusterNodeStatusDetailListener} methods of this class are enabled and the listeners are informed.
     *
     * <p>
     * If the thread is interrupted while reserving connections, the interrupt flag is restored and the method returns
     * without informing the listeners.
     */
    @Override
    public void localServerStartedServing() {
        if (clusterNodesConfigString == null || clusterNodesConfigString.isEmpty()) {
            logger.info("There are no cluster nodes configured, will therefore not connect anywhere.");
            notifyClusterInitialized();
            return;
        }

        List<NodeAddress> initialClusterNodes = parseClusterNodes(this.clusterNodesConfigString);
        if (initialClusterNodes == null) {
            logger.warn("There are no cluster nodes configured, will therefore not connect anywhere.");
            notifyClusterInitialized();
            return;
        }

        logger.debug("Starting to communicate to cluster using the configured hosts ({})...", initialClusterNodes);

        // Ask every configured node we can contact for the cluster nodes it knows and merge the answers. That list
        // will later be used to startup the consensus node.
        Set<RNodeAddress> allClusterNodes = new HashSet<>();
        try {
            for (NodeAddress nodeAddr : initialClusterNodes) {
                try (Connection<ClusterManagementService.Iface> conn = reserveConnection(nodeAddr)) {
                    allClusterNodes.addAll(conn.getService().getAllKnownClusterNodes());
                } catch (ConnectionException | TException | IOException e) {
                    logger.warn("Could not contact cluster node at {}.", nodeAddr, e);
                }
            }
        } catch (InterruptedException e) {
            // Keep the interruption visible to our caller instead of silently consuming it.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while starting to communicate with cluster", e);
            return;
        }

        if (allClusterNodes.isEmpty()) {
            logger.warn("There are no cluster nodes alive, will therefore not connect anywhere.");
            notifyClusterInitialized();
            return;
        }

        for (RNodeAddress remoteAddr : allClusterNodes)
            consensusClusterNodes.add(new NodeAddress(remoteAddr));

        logger.info("Gathered {} node addresses of the cluster (limit): {}", consensusClusterNodes.size(),
                Iterables.limit(consensusClusterNodes, 100));

        // enable activity when dead or alive nodes are identified.
        clusterNodeStatusDetailListenerDisabled = false;

        notifyClusterInitialized();
    }

    /**
     * Informs all registered {@link ClusterManagerListener}s, if there are any, that the cluster is initialized.
     */
    private void notifyClusterInitialized() {
        if (clusterManagerListeners != null)
            clusterManagerListeners.forEach(l -> l.clusterInitialized());
    }

    /**
     * Reserves a connection to the cluster management service of the given node from the connection pool.
     *
     * @param addr
     *            address of the node to connect to.
     * @return the reserved connection.
     */
    private Connection<ClusterManagementService.Iface> reserveConnection(NodeAddress addr)
            throws ConnectionException, InterruptedException {
        Class<ClusterManagementService.Iface> serviceClass = ClusterManagementService.Iface.class;

        // The last argument is null: node will be removed automatically from ClusterManager, therefore no separate
        // listener needed.
        return connectionPool.reserveConnection(serviceClass, addr.createRemote(), null);
    }

    /**
     * Called when the local server stopped serving. Intentionally does nothing.
     */
    @Override
    public void localServerStoppedServing() {
        // noop.
    }

    /**
     * Reacts to a node that seems to have died by removing it from the cluster layout, if the layout knows it.
     *
     * <p>
     * This will typically be called when a connection to a node fails. We will not remove the node from the consensus
     * cluster (as that would allow split-brains), but we ensure that its information is removed from the clusterLayout
     * across the consensus cluster. That way, no connections will be opened to that cluster node for queries any more
     * etc. We have to ensure that we integrate the current information again as soon as the node gets back online
     * (= the node gets restarted which would be a normal join to the consensus cluster or if the e.g. network
     * partition ends and we can communicate with the node again without it re-joining the cluster).
     *
     * @param diedAddr
     *            address of the node that died. Ignored if it has no default address set.
     */
    @Override
    public void nodeDied(RNodeAddress diedAddr) {
        if (clusterNodeStatusDetailListenerDisabled) {
            // Disable during startup, as we do not want to act on "dead" nodes of the config file.
            return;
        }

        if (!diedAddr.isSetDefaultAddr()) {
            return;
        }

        NodeAddress deadNode = new NodeAddress(diedAddr);

        logger.trace("Cluster node died. Checking consensus cluster if we need to distribute that information...");

        // execute asynchronously, as this might take some time and we might even still be in startup (e.g. internal
        // consensus cluster server startup).
        executorService.execute(() -> {
            try {
                if (!clusterLayout.isNodeKnown(deadNode)) {
                    logger.trace(
                            "Cluster node died. No need to distribute information since that node was unknown to the "
                                    + "consensus cluster anyway.");
                    return;
                }

                logger.info(
                        "Cluster node died: {}. Distributing information on changed cluster layout in consensus cluster.",
                        deadNode);

                // This might actually be executed by multiple cluster nodes in parallel, but that does not hurt that
                // much, as node deaths should be rare.
                try (ClosableProvider<ClusterLayoutStateMachine> provider = consensusClient
                        .getStateMachineClient(ClusterLayoutStateMachine.class)) {
                    provider.getClient().removeNode(RemoveNode.local(deadNode));
                } catch (ConsensusClusterUnavailableException e) {
                    logger.warn("Could not remove node {} from cluster layout since consensus cluster is unavailable",
                            deadNode);
                }
            } catch (InterruptedException | ConsensusStateMachineClientInterruptedException
                    | ConsensusClusterUnavailableException ignored) {
                // exit quietly.
            }
        });
    }

    /**
     * Reacts to a node that seems to be alive: if we are the consensus leader and the cluster layout does not know
     * the node yet, the node is asked to publish its loaded tables in the consensus cluster.
     *
     * <p>
     * This will typically be called on the consensus master node when a new node joined or became alive again, as the
     * consensus master periodically sends keepAlives to all nodes. We ensure here that we get current information
     * about that new node.
     *
     * @param remoteNodeAddr
     *            address of the node that is alive. Ignored if it has no default address set.
     * @throws InterruptedException
     *             if interrupted while querying the cluster layout or reserving a connection.
     */
    @Override
    public void nodeAlive(RNodeAddress remoteNodeAddr) throws InterruptedException {
        if (clusterNodeStatusDetailListenerDisabled)
            // Disable during startup, as we are not yet interesting in "alive" nodes - we will receive cluster layout
            // information automatically if we join a cluster (= our consensus log will be filled) or if we're a single node
            // setup, there are no nodes anyway.
            return;

        if (!consensusIsLeaderProvider.isLeader())
            // Only let the consensus leader find new alive nodes. This is to reduce the number of times a new node is asked
            // to "publishLoadedTables" and also to limit the number of times "clusterLayout.isNodeKnown" is called: This is
            // pretty slow on non-leader nodes, but we will receive a lot of "nodeAlive" calls.
            return;

        if (!remoteNodeAddr.isSetDefaultAddr())
            return;

        NodeAddress addr = new NodeAddress(remoteNodeAddr);
        try {
            if (clusterLayout.isNodeKnown(addr))
                return;

            logger.info(
                    "Cluster node seems to be accessible now: {}. As we do not have information on the tables this "
                            + "new node serves, we ask it to publicize that.",
                    addr);

            try (Connection<ClusterManagementService.Iface> conn = reserveConnection(addr)) {
                conn.getService().publishLoadedTablesInConsensus();
            } catch (ConnectionException | TException | IOException e) {
                logger.warn("Could not contact cluster node at {}.", addr, e);
            }
        } catch (ConsensusClusterUnavailableException e) {
            // Pass the address for the placeholder; the trailing exception is still logged with its stack trace.
            logger.warn("Could not inform cluster about the node {} becoming alive, since the consensus "
                    + "cluster is not reachable", addr, e);
        }
    }

    /**
     * Publishes the names of all tables currently loaded on this node to the consensus cluster after a table was
     * loaded.
     *
     * @param newTableName
     *            name of the table that was loaded; not used by this implementation.
     * @throws AbortTableLoadException
     *             if the consensus cluster is not available.
     */
    @Override
    public synchronized void tableLoaded(String newTableName) throws AbortTableLoadException {
        logger.info("Informing consensus cluster of our updated table list.");

        try (ClosableProvider<ClusterLayoutStateMachine> provider = consensusClient
                .getStateMachineClient(ClusterLayoutStateMachine.class)) {
            provider.getClient().setTablesOfNode(
                    SetTablesOfNode.local(ourHostAddr, loadedTablesProvider.getNamesOfLoadedTables()));
        } catch (ConsensusClusterUnavailableException e) {
            String abortReason = "Table cannot be loaded because consensus cluster is not available";
            logger.error(abortReason, e);
            throw new AbortTableLoadException(abortReason, e);
        }

        logger.trace("Informed consensus cluster of our updated table list.");
    }

    /**
     * Publishes the names of all tables currently loaded on this node to the consensus cluster after a table was
     * unloaded.
     *
     * <p>
     * Synchronized like {@link #tableLoaded(String)}, so that reading the current table list and publishing it cannot
     * interleave with a concurrent load notification and overwrite a newer list with a stale one.
     *
     * <p>
     * If the consensus cluster is not available, this is only logged.
     *
     * @param tableName
     *            name of the table that was unloaded; not used by this implementation.
     */
    @Override
    public synchronized void tableUnloaded(String tableName) {
        logger.info("Informing consensus cluster of our updated table list.");
        try (ClosableProvider<ClusterLayoutStateMachine> p = consensusClient
                .getStateMachineClient(ClusterLayoutStateMachine.class)) {
            p.getClient().setTablesOfNode(
                    SetTablesOfNode.local(ourHostAddr, loadedTablesProvider.getNamesOfLoadedTables()));
            logger.trace("Informed consensus cluster of our updated table list.");
        } catch (ConsensusClusterUnavailableException e) {
            logger.warn("Could not inform consensus cluster that we do not serve the table any more.", e);
        }
    }

    /**
     * @return the address of this node, as built in {@link #initialize()} from the host and port of this node.
     */
    @Override
    public NodeAddress getOurNodeAddress() {
        return ourHostAddr;
    }

    /**
     * @return the string representation (<code>toString()</code>) of the address of this node.
     */
    @Override
    public String getOurNodeAddressAsString() {
        return ourHostAddr.toString();
    }

    /**
     * @return the node addresses gathered in {@link #localServerStartedServing()}. This is the internal list itself,
     *         not a copy; it is empty as long as no addresses have been gathered.
     */
    @Override
    public List<NodeAddress> getClusterNodeAddressesForConsensus() {
        return consensusClusterNodes;
    }

}