com.splout.db.qnode.Deployer.java — source code

Java example

Introduction

Below is the full source code for com.splout.db.qnode.Deployer.java.

Source

package com.splout.db.qnode;

/*
 * #%L
 * Splout SQL Server
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */

import com.google.common.util.concurrent.*;
import com.hazelcast.core.ICountDownLatch;
import com.hazelcast.core.IMap;
import com.splout.db.common.JSONSerDe;
import com.splout.db.common.PartitionEntry;
import com.splout.db.common.ReplicationEntry;
import com.splout.db.common.Tablespace;
import com.splout.db.hazelcast.CoordinationStructures;
import com.splout.db.hazelcast.TablespaceVersion;
import com.splout.db.qnode.beans.*;
import com.splout.db.thrift.DNodeService;
import com.splout.db.thrift.DeployAction;
import com.splout.db.thrift.PartitionMetadata;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.thrift.transport.TTransportException;

import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

/**
 * The Deployer is a specialized module ({@link com.splout.db.qnode.QNodeHandlerModule}) of the
 * {@link com.splout.db.qnode.QNode} that performs the business logic associated with a distributed deployment. It is
 * used by the {@link com.splout.db.qnode.QNodeHandler}.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public class Deployer extends QNodeHandlerModule {

    private final static Log log = LogFactory.getLog(Deployer.class);
    private ListeningExecutorService deployExecutor;
    // Registry of currently running deployments. Useful for cancelling them.
    private ConcurrentHashMap<Long, Future<?>> runningDeployments = new ConcurrentHashMap<Long, Future<?>>();

    /**
     * Thrown when a version switch references a tablespace version that is not present in the
     * tablespace versions map (i.e. not reported by any DNode through Hazelcast).
     */
    @SuppressWarnings("serial")
    public static class UnexistingVersion extends Exception {

        public UnexistingVersion() {
            super();
        }

        public UnexistingVersion(String message) {
            super(message);
        }

        /**
         * @param cause underlying cause, preserved for exception chaining.
         */
        public UnexistingVersion(String message, Throwable cause) {
            super(message, cause);
        }
    }

    /**
     * Runnable that deals with the asynchronous part of the deployment. Particularly, it waits until DNodes finish their
     * work, and then performs the version switch.
     */
    public class ManageDeploy implements Runnable {

        // Number of seconds to wait between two consecutive checks
        // for a timeout, a DNode failure or a cancellation while the
        // deploy countdown latch has not completed yet.
        private long secondsToCheckFailureOrTimeout = 15l;

        // Version number being deployed.
        private long version;
        // DNodes involved in this deployment; used to signal an abort to all of them.
        private List<String> dnodes;
        // Global deploy timeout in seconds; negative means wait forever.
        private long timeoutSeconds;
        // Original deploy requests; used to build switch actions and to check replica survival.
        private List<DeployRequest> deployRequests;
        // Seconds to wait for DNodes to spread tablespace metadata through Hazelcast.
        private long dnodesSpreadMetadataTimeout;
        // When true, a deploy can survive failed DNodes as long as every partition keeps a replica.
        private boolean isReplicaBalancingEnabled;

        /**
         * @param dnodes DNodes participating in this deployment.
         * @param deployRequests the deploy requests this deployment executes.
         * @param version unique version id being deployed.
         * @param timeoutSeconds overall deploy timeout; negative waits forever.
         * @param secondsToCheckFailureOrTimeout polling period for failure/timeout checks.
         * @param dnodesSpreadMetadataTimeout timeout for metadata propagation (clamped to >= 1 second).
         * @param isReplicaBalancingEnabled whether failed DNodes may be tolerated via rebalancing.
         */
        public ManageDeploy(List<String> dnodes, List<DeployRequest> deployRequests, long version,
                long timeoutSeconds, long secondsToCheckFailureOrTimeout, long dnodesSpreadMetadataTimeout,
                boolean isReplicaBalancingEnabled) {
            this.dnodes = dnodes;
            this.deployRequests = deployRequests;
            this.version = version;
            this.timeoutSeconds = timeoutSeconds;
            this.secondsToCheckFailureOrTimeout = secondsToCheckFailureOrTimeout;
            // Clamp to at least 1 second so the metadata wait loop always has a positive timeout.
            this.dnodesSpreadMetadataTimeout = Math.max(dnodesSpreadMetadataTimeout, 1);
            this.isReplicaBalancingEnabled = isReplicaBalancingEnabled;
        }

        /**
         * Asynchronous part of the deployment: waits until all DNodes count down the deploy latch,
         * then waits until the deployed tablespace versions are visible through Hazelcast, and
         * finally performs the atomic version switch. Aborts the deploy on DNode failure, timeout
         * or interruption (cancellation).
         */
        @Override
        public void run() {
            log.info(context.getConfig().getProperty(QNodeProperties.PORT) + " Executing deploy for version ["
                    + version + "]");
            // Global counter of in-flight deployments; always decremented in the finally block.
            CoordinationStructures.DEPLOY_IN_PROGRESS.incrementAndGet();

            try {
                long waitSeconds = 0;
                ICountDownLatch countDownLatchForDeploy = context.getCoordinationStructures()
                        .getCountDownLatchForDeploy(version);
                boolean finished;
                // Phase 1: poll the latch so we can periodically check for failures, timeout and
                // cancellation instead of blocking indefinitely.
                do {
                    finished = countDownLatchForDeploy.await(secondsToCheckFailureOrTimeout, TimeUnit.SECONDS);
                    // We have to do this check as the await method seems to ignore the interrupt signal. Grrrrr!!
                    // We use interrupted as we want the interrupt flag do be cleared. Otherwise cancelling code
                    // could throw another InterruptedException further.
                    if (Thread.interrupted()) {
                        throw new InterruptedException("Deployment for version [" + version
                                + "] received an interrupt. Probably somebody is cancelling this deployment.");
                    }
                    waitSeconds += secondsToCheckFailureOrTimeout;
                    if (!finished) {
                        // If any of the DNodes failed, then we cancel the deployment.
                        if (checkForFailure()) {
                            explainErrors();
                            abortDeploy(dnodes, "One or more DNodes failed", version);
                            return;
                        }
                        // Let's see if we reached the timeout.
                        // Negative timeoutSeconds => waits forever
                        if (waitSeconds > timeoutSeconds && timeoutSeconds >= 0) {
                            log.warn("Deploy of version [" + version + "] timed out. Reached [" + waitSeconds
                                    + "] seconds.");
                            abortDeploy(dnodes, "Timeout reached", version);
                            return;
                        }
                    }
                } while (!finished);

                // It's still possible that the deploy failed so let's check it again
                if (checkForFailure()) {
                    explainErrors();
                    abortDeploy(dnodes, "One or more DNodes failed.", version);
                    return;
                }

                // Phase 2: check after the wait that the complete tablespaces are available to this
                // QNode. If that is the case for this QNode it will probably be the case for the
                // rest of QNodes too.
                long millisToWait = 50;
                double acumulatedMillis = 0.;
                List<SwitchVersionRequest> versionsToCheck = switchActions();
                do {
                    Thread.sleep(millisToWait);
                    acumulatedMillis += millisToWait;

                    // Let's see if we reached the timeout.
                    // Negative timeoutSeconds => waits forever
                    if ((acumulatedMillis / 1000) > dnodesSpreadMetadataTimeout) {
                        log.warn("Deploy of version [" + version
                                + "] timed out when waiting DNodes to spread the metadata. Reached ["
                                + (acumulatedMillis / 1000) + "] seconds.");
                        abortDeploy(dnodes, "Timeout reached", version);
                        return;
                    }

                    Iterator<SwitchVersionRequest> it = versionsToCheck.iterator();
                    while (it.hasNext()) {
                        SwitchVersionRequest req = it.next();
                        Tablespace t = context.getTablespaceVersionsMap()
                                .get(new TablespaceVersion(req.getTablespace(), req.getVersion()));
                        // Check that this TablespaceVersion has been reported by some node through Hazelcast
                        if (t != null && t.getReplicationMap() != null && t.getPartitionMap() != null
                                && t.getPartitionMap().getPartitionEntries() != null
                                && t.getReplicationMap().getReplicationEntries() != null
                                && t.getReplicationMap().getReplicationEntries().size() > 0) {
                            if (t.getPartitionMap().getPartitionEntries().size() == t.getReplicationMap()
                                    .getReplicationEntries().size()) {
                                log.info("Ok, TablespaceVersion [" + req.getTablespace() + ", " + req.getVersion()
                                        + "] being handled by enough DNodes as reported by Hazelcast.");
                                it.remove();
                            }
                        }
                    }
                } while (versionsToCheck.size() > 0);

                log.info("All DNodes performed the deploy of version [" + version + "]. Publishing tablespaces...");

                // We finish by publishing the versions table with the new versions.
                try {
                    switchVersions(switchActions());
                } catch (UnexistingVersion e) {
                    throw new RuntimeException(
                            "Unexisting version after deploying this version. Sounds like a bug.", e);
                }

                // If some replicas are under-replicated, start a balancing process
                context.maybeBalance();

                log.info("Deploy of version [" + version + "] Finished PROPERLY. :-)");
                context.getCoordinationStructures().logDeployMessage(version,
                        "Deploy of version [" + version + "] finished properly.");
                context.getCoordinationStructures().getDeploymentsStatusPanel().put(version, DeployStatus.FINISHED);
            } catch (InterruptedException e) {
                // Case when a deployment is cancelled.
                log.info("Deployment of [" + version + "] interrupted.");
                abortDeploy(dnodes, e.getMessage(), version);
            } catch (Throwable t) {
                // Log through the QNode logger instead of printStackTrace(), then rethrow so the
                // executor records this task as failed.
                // NOTE(review): the deployments status panel is NOT set to FAILED on this path, so
                // the status may remain ONGOING after an unexpected error — confirm intentional.
                log.error("Unexpected error in deployment of version [" + version + "]", t);
                throw new RuntimeException(t);
            } finally {
                CoordinationStructures.DEPLOY_IN_PROGRESS.decrementAndGet();
            }
        }

        /**
         * Builds one {@link SwitchVersionRequest} per deployed tablespace, all pointing at the
         * version being deployed.
         *
         * @return the list of switch actions to perform after the deploy completes.
         */
        private List<SwitchVersionRequest> switchActions() {
            List<SwitchVersionRequest> switchRequests = new ArrayList<SwitchVersionRequest>();
            for (DeployRequest deployRequest : deployRequests) {
                switchRequests.add(new SwitchVersionRequest(deployRequest.getTablespace(), version));
            }
            return switchRequests;
        }

        /**
         * Logs every per-DNode error reported for this deployment. Each error is written both to
         * the QNode logger and to the Hazelcast deploy log so the info is persisted in the session.
         */
        private void explainErrors() {
            IMap<String, String> errorsByDNode = context.getCoordinationStructures()
                    .getDeployErrorPanel(version);
            String prefix = "Deployment of version [" + version + "] failed in DNode[";
            for (Entry<String, String> failure : errorsByDNode.entrySet()) {
                String fullMsg = prefix + failure.getKey() + "] - it failed with the error ["
                        + failure.getValue() + "]";
                log.error(fullMsg);
                context.getCoordinationStructures().logDeployMessage(version, fullMsg);
            }
        }

        /**
         * Decides whether the deployment must be considered failed.
         * <p>
         * Without replica balancing, any DNode error fails the deploy. With replica balancing
         * enabled, the deploy only fails if some partition lost ALL of its replica DNodes — i.e.
         * the system could not auto-rebalance itself afterwards.
         *
         * @return true if the deployment has to be aborted.
         */
        private boolean checkForFailure() {
            IMap<String, String> errorsByDNode = context.getCoordinationStructures()
                    .getDeployErrorPanel(version);
            if (!isReplicaBalancingEnabled) {
                return !errorsByDNode.isEmpty();
            }
            // Replica balancing enabled: check whether we could survive the failed DNodes.
            Set<String> failed = new HashSet<String>(errorsByDNode.keySet());
            for (DeployRequest request : deployRequests) {
                for (ReplicationEntry replicationEntry : request.getReplicationMap()) {
                    if (failed.containsAll(replicationEntry.getNodes())) {
                        // At least one partition depends entirely on failed DNodes: deploy must fail.
                        return true;
                    }
                }
            }
            return false;
        }
    } /* End ManageDeploy */

    /**
     * The Deployer deals with deploy and switch version requests.
     *
     * @param context shared QNode state (config, coordination structures, DNode client pool).
     */
    public Deployer(QNodeHandlerContext context) {
        super(context);
        // Cached thread pool whose threads are named "deploy-N"; wrapped so submitted tasks
        // return Guava ListenableFutures (needed by registerAsRunning's callback).
        deployExecutor = MoreExecutors.listeningDecorator(
                Executors.newCachedThreadPool(new ThreadFactoryBuilder().setNameFormat("deploy-%d").build()));
    }

    /**
     * Call this method for starting an asynchronous deployment given a proper deploy request - proxy method for
     * {@link QNodeHandler}. Returns a {@link DeployInfo} with the status of the request.
     * <p>
     * Steps: generate a unique version id, publish deploy info and ONGOING status, compute the
     * per-DNode deploy actions, arm a countdown latch with one count per involved DNode, send the
     * deploy signal to each DNode, and submit an asynchronous {@link ManageDeploy} task that waits
     * for completion and performs the final version switch.
     *
     * @throws InterruptedException
     */
    public DeployInfo deploy(List<DeployRequest> deployRequests) throws InterruptedException {
        // A new unique version number is generated.
        final long version = context.getCoordinationStructures().uniqueVersionId();

        DeployInfo deployInfo = fillDeployInfo(deployRequests, version, context.getQNodeAddress());

        context.getCoordinationStructures().logDeployMessage(version,
                "Deploy [" + version + "] for tablespaces" + deployInfo.getTablespacesDeployed() + " started.");
        context.getCoordinationStructures().getDeploymentsStatusPanel().put(version, DeployStatus.ONGOING);

        // Generate the list of actions per DNode
        Map<String, List<DeployAction>> actionsPerDNode = generateDeployActionsPerDNode(deployRequests, version);

        // Starting the countdown latch: each involved DNode counts down once when its deploy work is done.
        ICountDownLatch countDownLatchForDeploy = context.getCoordinationStructures()
                .getCountDownLatchForDeploy(version);
        Set<String> dnodesInvolved = actionsPerDNode.keySet();
        countDownLatchForDeploy.trySetCount(dnodesInvolved.size());

        // Sending deploy signals to each DNode
        for (Map.Entry<String, List<DeployAction>> actionPerDNode : actionsPerDNode.entrySet()) {
            DNodeService.Client client = null;
            boolean renew = false;
            try {
                client = context.getDNodeClientFromPool(actionPerDNode.getKey());
                client.deploy(actionPerDNode.getValue(), version);
            } catch (TTransportException e) {
                // NOTE(review): a transport failure only flags the pooled client for renewal; the
                // deploy action is NOT resent to this DNode, so its latch count may never be
                // decremented and the deploy would end via ManageDeploy's failure/timeout checks —
                // confirm this best-effort behavior is intentional.
                renew = true;
            } catch (Exception e) {
                // Any other failure aborts the whole deployment and reports the error to the caller.
                String errorMsg = "Error sending deploy actions to DNode [" + actionPerDNode.getKey() + "]";
                log.error(errorMsg, e);
                abortDeploy(new ArrayList<String>(actionsPerDNode.keySet()), errorMsg, version);
                deployInfo.setError("Error connecting to DNode " + actionPerDNode.getKey());
                context.getCoordinationStructures().getDeployInfoPanel().put(version, deployInfo);
                return deployInfo;
            } finally {
                // Always return the Thrift client to the pool, renewing it after transport errors.
                if (client != null) {
                    context.returnDNodeClientToPool(actionPerDNode.getKey(), client, renew);
                }
            }
        }

        // Initiating an asynchronous process to manage the deployment
        ListenableFuture<?> future = deployExecutor.submit(new ManageDeploy(new ArrayList(actionsPerDNode.keySet()),
                deployRequests, version, context.getConfig().getLong(QNodeProperties.DEPLOY_TIMEOUT, -1),
                context.getConfig().getLong(QNodeProperties.DEPLOY_SECONDS_TO_CHECK_ERROR),
                context.getConfig().getLong(QNodeProperties.DEPLOY_DNODES_SPREAD_METADATA_TIMEOUT, 180),
                context.getConfig().getBoolean(QNodeProperties.REPLICA_BALANCE_ENABLE)));

        registerAsRunning(version, future);

        context.getCoordinationStructures().getDeployInfoPanel().put(version, deployInfo);
        return deployInfo;
    }

    /**
     * Registers a deployment future as running so it can be cancelled later, and attaches a
     * callback that automatically unregisters it when the future completes (success or failure).
     */
    protected void registerAsRunning(final long version, ListenableFuture<?> future) {
        runningDeployments.put(version, future);
        FutureCallback<Object> removeWhenDone = new FutureCallback<Object>() {
            @Override
            public void onSuccess(Object result) {
                runningDeployments.remove(version);
            }

            @Override
            public void onFailure(Throwable t) {
                runningDeployments.remove(version);
            }
        };
        Futures.addCallback(future, removeWhenDone);
    }

    /**
     * Builds the initial {@link DeployInfo} for a deployment: version, deployed tablespaces and
     * their data URIs, start timestamp (formatted "yyyy-MM-dd HH:mm:ss") and the QNode address.
     */
    protected DeployInfo fillDeployInfo(List<DeployRequest> deployRequests, long version, String qNodeAddress) {
        DeployInfo info = new DeployInfo();
        info.setVersion(version);

        List<String> deployedTablespaces = new ArrayList<String>();
        List<String> uris = new ArrayList<String>();
        for (DeployRequest request : deployRequests) {
            deployedTablespaces.add(request.getTablespace());
            uris.add(request.getData_uri());
        }
        info.setTablespacesDeployed(deployedTablespaces);
        info.setDataURIs(uris);

        // Local formatter instance: SimpleDateFormat is not thread-safe, so don't share it.
        info.setStartedAt(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
        info.setqNode(qNodeAddress);
        return info;
    }

    /**
     * DNodes are informed to stop the deployment, as something failed. This is best-effort: a
     * failure to reach one DNode is logged and does not stop the loop. Finally the failure reason
     * is persisted and the deployment status is set to FAILED.
     */
    public void abortDeploy(List<String> dnodes, String deployerErrorMessage, long version) {
        for (String dnodeAddress : dnodes) {
            DNodeService.Client thriftClient = null;
            boolean mustRenew = false;
            try {
                thriftClient = context.getDNodeClientFromPool(dnodeAddress);
                thriftClient.abortDeploy(version);
            } catch (TTransportException e) {
                // Broken connection: flag the pooled client for renewal.
                mustRenew = true;
            } catch (Exception e) {
                log.error("Error sending abort deploy flag to DNode [" + dnodeAddress + "]", e);
            } finally {
                if (thriftClient != null) {
                    context.returnDNodeClientToPool(dnodeAddress, thriftClient, mustRenew);
                }
            }
        }
        context.getCoordinationStructures().logDeployMessage(version,
                "Deploy failed due to: " + deployerErrorMessage);
        context.getCoordinationStructures().getDeploymentsStatusPanel().put(version, DeployStatus.FAILED);
    }

    /**
     * Switches current versions being served for some tablespaces, in an atomic way.
     *
     * @throws UnexistingVersion if any requested version has not been reported in the tablespace
     *         versions map by any DNode.
     */
    public void switchVersions(List<SwitchVersionRequest> switchRequest) throws UnexistingVersion {
        // We compute the new versions table, and then try to update it
        // We use optimistic locking: we read the original
        // map and try to update it. If the original has changed during
        // this process, we retry: reload the original map, ...
        Map<String, Long> versionsTable;
        Map<String, Long> newVersionsTable;
        do {
            versionsTable = context.getCoordinationStructures().getCopyVersionsBeingServed();
            newVersionsTable = new HashMap<String, Long>();
            if (versionsTable != null) {
                newVersionsTable.putAll(versionsTable);
            }

            for (SwitchVersionRequest req : switchRequest) {
                TablespaceVersion tsv = new TablespaceVersion(req.getTablespace(), req.getVersion());
                // Only allow switching to versions actually reported through Hazelcast.
                if (context.getTablespaceVersionsMap().get(tsv) == null) {
                    throw new UnexistingVersion("Trying to switch to unexisting version[" + req.getVersion()
                            + "] for tablespace[" + req.getTablespace() + "]");
                }
                newVersionsTable.put(tsv.getTablespace(), tsv.getVersion());
            }

            // Compare-and-set: if the map changed concurrently the update fails and we recompute.
        } while (!context.getCoordinationStructures().updateVersionsBeingServed(versionsTable, newVersionsTable));
    }

    /**
     * Generates the list of individual deploy actions that has to be sent to each DNode.
     * <p>
     * For every (partition, replica DNode) pair in each request, a {@link DeployAction} is created
     * carrying the data URI of that partition's .db file plus its {@link PartitionMetadata}.
     * NOTE(review): this method normalizes DNode ids by mutating the replication entries of the
     * input requests in place — callers should not rely on the original "tcp://"-prefixed ids
     * afterwards.
     *
     * @throws RuntimeException if a shard in the replication map has no partition metadata.
     */
    private static Map<String, List<DeployAction>> generateDeployActionsPerDNode(List<DeployRequest> deployRequests,
            long version) {
        HashMap<String, List<DeployAction>> actions = new HashMap<String, List<DeployAction>>();

        long deployDate = System.currentTimeMillis(); // Here is where we decide the data of the deployment for all deployed
        // tablespaces

        for (DeployRequest req : deployRequests) {
            for (Object obj : req.getReplicationMap()) {
                ReplicationEntry rEntry = (ReplicationEntry) obj;
                // Find the partition entry that matches this replication entry's shard.
                PartitionEntry pEntry = null;
                for (PartitionEntry partEntry : req.getPartitionMap()) {
                    if (partEntry.getShard().equals(rEntry.getShard())) {
                        pEntry = partEntry;
                    }
                }
                if (pEntry == null) {
                    String msg = "No Partition metadata for shard: " + rEntry.getShard()
                            + " this is very likely to be a software bug.";
                    log.error(msg);
                    try {
                        log.error("Partition map: " + JSONSerDe.ser(req.getPartitionMap()));
                        log.error("Replication map: " + JSONSerDe.ser(req.getReplicationMap()));
                    } catch (JSONSerDe.JSONSerDeException e) {
                        log.error("JSON error", e);
                    }
                    throw new RuntimeException(msg);
                }
                // Normalize DNode ids -> The convention is that DNodes are identified by host:port . So we need to strip the
                // protocol, if any
                for (int i = 0; i < rEntry.getNodes().size(); i++) {
                    String dnodeId = rEntry.getNodes().get(i);
                    if (dnodeId.startsWith("tcp://")) {
                        dnodeId = dnodeId.substring("tcp://".length(), dnodeId.length());
                    }
                    rEntry.getNodes().set(i, dnodeId);
                }
                // One DeployAction per replica DNode of this shard.
                for (String dNode : rEntry.getNodes()) {
                    // Get this DNode's action list, creating it on first use.
                    List<DeployAction> actionsSoFar = (List<DeployAction>) MapUtils.getObject(actions, dNode,
                            new ArrayList<DeployAction>());
                    actions.put(dNode, actionsSoFar);
                    DeployAction deployAction = new DeployAction();
                    deployAction.setDataURI(req.getData_uri() + "/" + rEntry.getShard() + ".db");
                    deployAction.setTablespace(req.getTablespace());
                    deployAction.setVersion(version);
                    deployAction.setPartition(rEntry.getShard());

                    // Add partition metadata to the deploy action for DNodes to save it
                    PartitionMetadata metadata = new PartitionMetadata();
                    metadata.setMinKey(pEntry.getMin());
                    metadata.setMaxKey(pEntry.getMax());
                    metadata.setNReplicas(rEntry.getNodes().size());
                    metadata.setDeploymentDate(deployDate);
                    metadata.setInitStatements(req.getInitStatements());
                    metadata.setEngineId(req.getEngine());

                    deployAction.setMetadata(metadata);
                    actionsSoFar.add(deployAction);
                }
            }
        }
        return actions;
    }

    /**
     * Requests cancellation of a running deployment by interrupting its managing task. Returns an
     * ERROR status if no deployment is registered for the given version.
     */
    public StatusMessage cancelDeployment(long version) {
        Future<?> deployment = runningDeployments.get(version);
        if (deployment != null) {
            deployment.cancel(true);
            return new StatusMessage(StatusMessage.Status.OK,
                    "Deployment for version " + version + " being cancelled.");
        }
        return new StatusMessage(StatusMessage.Status.ERROR,
                "No deployment running for " + version + " found.");
    }
}