org.apache.hadoop.ha.ActiveStandbyElector.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.ha.ActiveStandbyElector.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.ha;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.data.ACL;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher.Event;
import org.apache.zookeeper.ZKUtil;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.AsyncCallback.*;
import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.KeeperException.Code;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

/**
 * 
 * This class implements a simple library to perform leader election on top of
 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
 * can be performed by atomically creating an ephemeral lock file (znode) on
 * Zookeeper. The service instance that successfully creates the znode becomes
 * active and the rest become standbys. <br/>
 * This election mechanism is only efficient for small number of election
 * candidates (order of 10's) because contention on single znode by a large
 * number of candidates can result in Zookeeper overload. <br/>
 * The elector does not guarantee fencing (protection of shared resources) among
 * service instances. After it has notified an instance about becoming a leader,
 * then that instance must ensure that it meets the service consistency
 * requirements. If it cannot do so, then it is recommended to quit the
 * election. The application implements the {@link ActiveStandbyElectorCallback}
 * to interact with the elector
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class ActiveStandbyElector implements StatCallback, StringCallback {

    /**
     * Callback interface to interact with the ActiveStandbyElector object. <br/>
     * The application will be notified with a callback only on state changes
     * (i.e. there will never be successive calls to becomeActive without an
     * intermediate call to enterNeutralMode). <br/>
     * The callbacks will be running on Zookeeper client library threads. The
     * application should return from these callbacks quickly so as not to impede
     * Zookeeper client library performance and notifications. The app will
     * typically remember the state change and return from the callback. It will
     * then proceed with implementing actions around that state change. It is
     * possible to be called back again while these actions are in flight and the
     * app should handle this scenario.
     */
    public interface ActiveStandbyElectorCallback {
        /**
         * This method is called when the app becomes the active leader.
         * If the service fails to become active, it should throw
         * ServiceFailedException. This will cause the elector to
         * sleep for a short period, then re-join the election.
         * 
         * Callback implementations are expected to manage their own
         * timeouts (e.g. when making an RPC to a remote node).
         */
        void becomeActive() throws ServiceFailedException;

        /**
         * This method is called when the app becomes a standby
         */
        void becomeStandby();

        /**
         * If the elector gets disconnected from Zookeeper and does not know about
         * the lock state, then it will notify the service via the enterNeutralMode
         * interface. The service may choose to ignore this or stop doing state
         * changing operations. Upon reconnection, the elector verifies the leader
         * status and calls back on the becomeActive and becomeStandby app
         * interfaces. <br/>
         * Zookeeper disconnects can happen due to network issues or loss of
         * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
         * split-brain issues. In such situations it might be prudent to call
         * becomeStandby too. However, such state change operations might be
         * expensive and enterNeutralMode can help guard against doing that for
         * transient issues.
         */
        void enterNeutralMode();

        /**
         * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper
         * errors or Zookeeper persistent unavailability) then notifyFatalError is
         * called to notify the app about it.
         */
        void notifyFatalError(String errorMessage);

        /**
         * If an old active has failed, rather than exited gracefully, then
         * the new active may need to take some fencing actions against it
         * before proceeding with failover.
         * 
         * @param oldActiveData the application data provided by the prior active
         */
        void fenceOldActive(byte[] oldActiveData);
    }

    /**
     * Name of the lock znode used by the library. Protected for access in test
     * classes
     */
    @VisibleForTesting
    protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
    @VisibleForTesting
    protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";

    public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);

    private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;

    private static enum ConnectionState {
        DISCONNECTED, CONNECTED, TERMINATED
    };

    static enum State {
        INIT, ACTIVE, STANDBY, NEUTRAL
    };

    private State state = State.INIT;
    private int createRetryCount = 0;
    private int statRetryCount = 0;
    private ZooKeeper zkClient;
    private WatcherWithClientRef watcher;
    private ConnectionState zkConnectionState = ConnectionState.TERMINATED;

    private final ActiveStandbyElectorCallback appClient;
    private final String zkHostPort;
    private final int zkSessionTimeout;
    private final List<ACL> zkAcl;
    private final List<ZKAuthInfo> zkAuthInfo;
    private byte[] appData;
    private final String zkLockFilePath;
    private final String zkBreadCrumbPath;
    private final String znodeWorkingDir;
    private final int maxRetryNum;

    private Lock sessionReestablishLockForTests = new ReentrantLock();
    private boolean wantToBeInElection;
    private boolean monitorLockNodePending = false;
    private ZooKeeper monitorLockNodeClient;

    /**
     * Create a new ActiveStandbyElector object <br/>
     * The elector is created by providing to it the Zookeeper configuration, the
     * parent znode under which to create the znode and a reference to the
     * callback interface. <br/>
     * The parent znode name must be the same for all service instances and
     * different across services. <br/>
     * After the leader has been lost, a new leader will be elected after the
     * session timeout expires. Hence, the app must set this parameter based on
     * its needs for failure response time. The session timeout must be greater
     * than the Zookeeper disconnect timeout and is recommended to be 3X that
     * value to enable Zookeeper to retry transient disconnections. Setting a very
     * short session timeout may result in frequent transitions between active and
     * standby states during issues like network outages/GS pauses.
     * 
     * @param zookeeperHostPorts
     *          ZooKeeper hostPort for all ZooKeeper servers
     * @param zookeeperSessionTimeout
     *          ZooKeeper session timeout
     * @param parentZnodeName
     *          znode under which to create the lock
     * @param acl
     *          ZooKeeper ACL's
     * @param authInfo a list of authentication credentials to add to the
     *                 ZK connection
     * @param app
     *          reference to callback interface object
     * @throws IOException
     * @throws HadoopIllegalArgumentException
     */
    public ActiveStandbyElector(String zookeeperHostPorts, int zookeeperSessionTimeout, String parentZnodeName,
            List<ACL> acl, List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app, int maxRetryNum)
            throws IOException, HadoopIllegalArgumentException, KeeperException {
        this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl, authInfo, app, maxRetryNum, true);
    }

    /**
     * Create a new ActiveStandbyElector object <br/>
     * The elector is created by providing to it the Zookeeper configuration, the
     * parent znode under which to create the znode and a reference to the
     * callback interface. <br/>
     * The parent znode name must be the same for all service instances and
     * different across services. <br/>
     * After the leader has been lost, a new leader will be elected after the
     * session timeout expires. Hence, the app must set this parameter based on
     * its needs for failure response time. The session timeout must be greater
     * than the Zookeeper disconnect timeout and is recommended to be 3X that
     * value to enable Zookeeper to retry transient disconnections. Setting a very
     * short session timeout may result in frequent transitions between active and
     * standby states during issues like network outages/GS pauses.
     * 
     * @param zookeeperHostPorts
     *          ZooKeeper hostPort for all ZooKeeper servers
     * @param zookeeperSessionTimeout
     *          ZooKeeper session timeout
     * @param parentZnodeName
     *          znode under which to create the lock
     * @param acl
     *          ZooKeeper ACL's
     * @param authInfo a list of authentication credentials to add to the
     *                 ZK connection
     * @param app
     *          reference to callback interface object
     * @param failFast
     *          whether need to add the retry when establishing ZK connection.
     * @throws IOException
     * @throws HadoopIllegalArgumentException
     */
    public ActiveStandbyElector(String zookeeperHostPorts, int zookeeperSessionTimeout, String parentZnodeName,
            List<ACL> acl, List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app, int maxRetryNum,
            boolean failFast) throws IOException, HadoopIllegalArgumentException, KeeperException {
        if (app == null || acl == null || parentZnodeName == null || zookeeperHostPorts == null
                || zookeeperSessionTimeout <= 0) {
            throw new HadoopIllegalArgumentException("Invalid argument");
        }
        zkHostPort = zookeeperHostPorts;
        zkSessionTimeout = zookeeperSessionTimeout;
        zkAcl = acl;
        zkAuthInfo = authInfo;
        appClient = app;
        znodeWorkingDir = parentZnodeName;
        zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
        zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
        this.maxRetryNum = maxRetryNum;

        // establish the ZK Connection for future API calls
        if (failFast) {
            createConnection();
        } else {
            reEstablishSession();
        }
    }

    /**
     * To participate in election, the app will call joinElection. The result will
     * be notified by a callback on either the becomeActive or becomeStandby app
     * interfaces. <br/>
     * After this the elector will automatically monitor the leader status and
     * perform re-election if necessary<br/>
     * The app could potentially start off in standby mode and ignore the
     * becomeStandby call.
     * 
     * @param data
     *          to be set by the app. non-null data must be set.
     * @throws HadoopIllegalArgumentException
     *           if valid data is not supplied
     */
    public synchronized void joinElection(byte[] data) throws HadoopIllegalArgumentException {

        if (data == null) {
            throw new HadoopIllegalArgumentException("data cannot be null");
        }

        if (wantToBeInElection) {
            LOG.info("Already in election. Not re-connecting.");
            return;
        }

        appData = new byte[data.length];
        System.arraycopy(data, 0, appData, 0, data.length);

        if (LOG.isDebugEnabled()) {
            LOG.debug("Attempting active election for " + this);
        }
        joinElectionInternal();
    }

    /**
     * @return true if the configured parent znode exists
     */
    public synchronized boolean parentZNodeExists() throws IOException, InterruptedException {
        Preconditions.checkState(zkClient != null);
        try {
            return zkClient.exists(znodeWorkingDir, false) != null;
        } catch (KeeperException e) {
            throw new IOException("Couldn't determine existence of znode '" + znodeWorkingDir + "'", e);
        }
    }

    /**
     * Utility function to ensure that the configured base znode exists.
     * This recursively creates the znode as well as all of its parents.
     */
    public synchronized void ensureParentZNode() throws IOException, InterruptedException {
        Preconditions.checkState(!wantToBeInElection,
                "ensureParentZNode() may not be called while in the election");

        String pathParts[] = znodeWorkingDir.split("/");
        Preconditions.checkArgument(pathParts.length >= 1 && pathParts[0].isEmpty(), "Invalid path: %s",
                znodeWorkingDir);

        StringBuilder sb = new StringBuilder();
        for (int i = 1; i < pathParts.length; i++) {
            sb.append("/").append(pathParts[i]);
            String prefixPath = sb.toString();
            LOG.debug("Ensuring existence of " + prefixPath);
            try {
                createWithRetries(prefixPath, new byte[] {}, zkAcl, CreateMode.PERSISTENT);
            } catch (KeeperException e) {
                if (isNodeExists(e.code())) {
                    // This is OK - just ensuring existence.
                    continue;
                } else {
                    throw new IOException("Couldn't create " + prefixPath, e);
                }
            }
        }

        LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
    }

    /**
     * Clear all of the state held within the parent ZNode.
     * This recursively deletes everything within the znode as well as the
     * parent znode itself. It should only be used when it's certain that
     * no electors are currently participating in the election.
     */
    public synchronized void clearParentZNode() throws IOException, InterruptedException {
        Preconditions.checkState(!wantToBeInElection, "clearParentZNode() may not be called while in the election");

        try {
            LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");

            zkDoWithRetries(new ZKAction<Void>() {
                @Override
                public Void run() throws KeeperException, InterruptedException {
                    ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
                    return null;
                }
            });
        } catch (KeeperException e) {
            throw new IOException("Couldn't clear parent znode " + znodeWorkingDir, e);
        }
        LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
    }

    /**
     * Any service instance can drop out of the election by calling quitElection. 
     * <br/>
     * This will lose any leader status, if held, and stop monitoring of the lock
     * node. <br/>
     * If the instance wants to participate in election again, then it needs to
     * call joinElection(). <br/>
     * This allows service instances to take themselves out of rotation for known
     * impending unavailable states (e.g. long GC pause or software upgrade).
     * 
     * @param needFence true if the underlying daemon may need to be fenced
     * if a failover occurs due to dropping out of the election.
     */
    public synchronized void quitElection(boolean needFence) {
        LOG.info("Yielding from election");
        if (!needFence && state == State.ACTIVE) {
            // If active is gracefully going back to standby mode, remove
            // our permanent znode so no one fences us.
            tryDeleteOwnBreadCrumbNode();
        }
        reset();
        wantToBeInElection = false;
    }

    /**
     * Exception thrown when there is no active leader
     */
    public static class ActiveNotFoundException extends Exception {
        private static final long serialVersionUID = 3505396722342846462L;
    }

    /**
     * get data set by the active leader
     * 
     * @return data set by the active instance
     * @throws ActiveNotFoundException
     *           when there is no active leader
     * @throws KeeperException
     *           other zookeeper operation errors
     * @throws InterruptedException
     * @throws IOException
     *           when ZooKeeper connection could not be established
     */
    public synchronized byte[] getActiveData()
            throws ActiveNotFoundException, KeeperException, InterruptedException, IOException {
        try {
            if (zkClient == null) {
                createConnection();
            }
            Stat stat = new Stat();
            return getDataWithRetries(zkLockFilePath, false, stat);
        } catch (KeeperException e) {
            Code code = e.code();
            if (isNodeDoesNotExist(code)) {
                // handle the commonly expected cases that make sense for us
                throw new ActiveNotFoundException();
            } else {
                throw e;
            }
        }
    }

    /**
     * interface implementation of Zookeeper callback for create
     */
    @Override
    public synchronized void processResult(int rc, String path, Object ctx, String name) {
        if (isStaleClient(ctx))
            return;
        if (LOG.isDebugEnabled()) {
            LOG.debug("CreateNode result: " + rc + " for path: " + path + " connectionState: " + zkConnectionState
                    + "  for " + this);
        }

        Code code = Code.get(rc);
        if (isSuccess(code)) {
            // we successfully created the znode. we are the leader. start monitoring
            if (becomeActive()) {
                monitorActiveStatus();
            } else {
                reJoinElectionAfterFailureToBecomeActive();
            }
            return;
        }

        if (isNodeExists(code)) {
            if (createRetryCount == 0) {
                // znode exists and we did not retry the operation. so a different
                // instance has created it. become standby and monitor lock.
                becomeStandby();
            }
            // if we had retried then the znode could have been created by our first
            // attempt to the server (that we lost) and this node exists response is
            // for the second attempt. verify this case via ephemeral node owner. this
            // will happen on the callback for monitoring the lock.
            monitorActiveStatus();
            return;
        }

        String errorMessage = "Received create error from Zookeeper. code:" + code.toString() + " for path " + path;
        LOG.debug(errorMessage);

        if (shouldRetry(code)) {
            if (createRetryCount < maxRetryNum) {
                LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
                ++createRetryCount;
                createLockNodeAsync();
                return;
            }
            errorMessage = errorMessage + ". Not retrying further znode create connection errors.";
        } else if (isSessionExpired(code)) {
            // This isn't fatal - the client Watcher will re-join the election
            LOG.warn("Lock acquisition failed because session was lost");
            return;
        }

        fatalError(errorMessage);
    }

    /**
     * interface implementation of Zookeeper callback for monitor (exists)
     */
    @Override
    public synchronized void processResult(int rc, String path, Object ctx, Stat stat) {
        if (isStaleClient(ctx))
            return;
        monitorLockNodePending = false;

        assert wantToBeInElection : "Got a StatNode result after quitting election";

        if (LOG.isDebugEnabled()) {
            LOG.debug("StatNode result: " + rc + " for path: " + path + " connectionState: " + zkConnectionState
                    + " for " + this);
        }

        Code code = Code.get(rc);
        if (isSuccess(code)) {
            // the following owner check completes verification in case the lock znode
            // creation was retried
            if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
                // we own the lock znode. so we are the leader
                if (!becomeActive()) {
                    reJoinElectionAfterFailureToBecomeActive();
                }
            } else {
                // we dont own the lock znode. so we are a standby.
                becomeStandby();
            }
            // the watch set by us will notify about changes
            return;
        }

        if (isNodeDoesNotExist(code)) {
            // the lock znode disappeared before we started monitoring it
            enterNeutralMode();
            joinElectionInternal();
            return;
        }

        String errorMessage = "Received stat error from Zookeeper. code:" + code.toString();
        LOG.debug(errorMessage);

        if (shouldRetry(code)) {
            if (statRetryCount < maxRetryNum) {
                ++statRetryCount;
                monitorLockNodeAsync();
                return;
            }
            errorMessage = errorMessage + ". Not retrying further znode monitoring connection errors.";
        } else if (isSessionExpired(code)) {
            // This isn't fatal - the client Watcher will re-join the election
            LOG.warn("Lock monitoring failed because session was lost");
            return;
        }

        fatalError(errorMessage);
    }

    /**
     * We failed to become active. Re-join the election, but
     * sleep for a few seconds after terminating our existing
     * session, so that other nodes have a chance to become active.
     * The failure to become active is already logged inside
     * becomeActive().
     */
    private void reJoinElectionAfterFailureToBecomeActive() {
        reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE);
    }

    /**
     * interface implementation of Zookeeper watch events (connection and node),
     * proxied by {@link WatcherWithClientRef}.
     */
    synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
        Event.EventType eventType = event.getType();
        if (isStaleClient(zk))
            return;
        if (LOG.isDebugEnabled()) {
            LOG.debug("Watcher event type: " + eventType + " with state:" + event.getState() + " for path:"
                    + event.getPath() + " connectionState: " + zkConnectionState + " for " + this);
        }

        if (eventType == Event.EventType.None) {
            // the connection state has changed
            switch (event.getState()) {
            case SyncConnected:
                LOG.info("Session connected.");
                // if the listener was asked to move to safe state then it needs to
                // be undone
                ConnectionState prevConnectionState = zkConnectionState;
                zkConnectionState = ConnectionState.CONNECTED;
                if (prevConnectionState == ConnectionState.DISCONNECTED && wantToBeInElection) {
                    monitorActiveStatus();
                }
                break;
            case Disconnected:
                LOG.info("Session disconnected. Entering neutral mode...");

                // ask the app to move to safe state because zookeeper connection
                // is not active and we dont know our state
                zkConnectionState = ConnectionState.DISCONNECTED;
                enterNeutralMode();
                break;
            case Expired:
                // the connection got terminated because of session timeout
                // call listener to reconnect
                LOG.info("Session expired. Entering neutral mode and rejoining...");
                enterNeutralMode();
                reJoinElection(0);
                break;
            case SaslAuthenticated:
                LOG.info("Successfully authenticated to ZooKeeper using SASL.");
                break;
            default:
                fatalError("Unexpected Zookeeper watch event state: " + event.getState());
                break;
            }

            return;
        }

        // a watch on lock path in zookeeper has fired. so something has changed on
        // the lock. ideally we should check that the path is the same as the lock
        // path but trusting zookeeper for now
        String path = event.getPath();
        if (path != null) {
            switch (eventType) {
            case NodeDeleted:
                if (state == State.ACTIVE) {
                    enterNeutralMode();
                }
                joinElectionInternal();
                break;
            case NodeDataChanged:
                monitorActiveStatus();
                break;
            default:
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
                }
                monitorActiveStatus();
            }

            return;
        }

        // some unexpected error has occurred
        fatalError("Unexpected watch error from Zookeeper");
    }

    /**
     * Get a new zookeeper client instance. protected so that test class can
     * inherit and mock out the zookeeper instance
     * 
     * @return new zookeeper client instance
     * @throws IOException
     * @throws KeeperException zookeeper connectionloss exception
     */
    protected synchronized ZooKeeper connectToZooKeeper() throws IOException, KeeperException {

        // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
        // may trigger the Connected event immediately. So, if we register the
        // watcher after constructing ZooKeeper, we may miss that event. Instead,
        // we construct the watcher first, and have it block any events it receives
        // before we can set its ZooKeeper reference.
        watcher = new WatcherWithClientRef();
        ZooKeeper zk = createZooKeeper();
        watcher.setZooKeeperRef(zk);

        // Wait for the asynchronous success/failure. This may throw an exception
        // if we don't connect within the session timeout.
        watcher.waitForZKConnectionEvent(zkSessionTimeout);

        for (ZKAuthInfo auth : zkAuthInfo) {
            zk.addAuthInfo(auth.getScheme(), auth.getAuth());
        }
        return zk;
    }

    /**
     * Get a new zookeeper client instance. protected so that test class can
     * inherit and pass in a mock object for zookeeper
     *
     * @return new zookeeper client instance
     * @throws IOException
     */
    protected ZooKeeper createZooKeeper() throws IOException {
        return new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
    }

    private void fatalError(String errorMessage) {
        LOG.fatal(errorMessage);
        reset();
        appClient.notifyFatalError(errorMessage);
    }

    private void monitorActiveStatus() {
        assert wantToBeInElection;
        if (LOG.isDebugEnabled()) {
            LOG.debug("Monitoring active leader for " + this);
        }
        statRetryCount = 0;
        monitorLockNodeAsync();
    }

    private void joinElectionInternal() {
        Preconditions.checkState(appData != null, "trying to join election without any app data");
        if (zkClient == null) {
            if (!reEstablishSession()) {
                fatalError("Failed to reEstablish connection with ZooKeeper");
                return;
            }
        }

        createRetryCount = 0;
        wantToBeInElection = true;
        createLockNodeAsync();
    }

    private void reJoinElection(int sleepTime) {
        LOG.info("Trying to re-establish ZK session");

        // Some of the test cases rely on expiring the ZK sessions and
        // ensuring that the other node takes over. But, there's a race
        // where the original lease holder could reconnect faster than the other
        // thread manages to take the lock itself. This lock allows the
        // tests to block the reconnection. It's a shame that this leaked
        // into non-test code, but the lock is only acquired here so will never
        // be contended.
        sessionReestablishLockForTests.lock();
        try {
            terminateConnection();
            sleepFor(sleepTime);
            // Should not join election even before the SERVICE is reported
            // as HEALTHY from ZKFC monitoring.
            if (appData != null) {
                joinElectionInternal();
            } else {
                LOG.info("Not joining election since service has not yet been " + "reported as healthy.");
            }
        } finally {
            sessionReestablishLockForTests.unlock();
        }
    }

    /**
     * Sleep for the given number of milliseconds.
     * This is non-static, and separated out, so that unit tests
     * can override the behavior not to sleep.
     */
    @VisibleForTesting
    protected void sleepFor(int sleepMs) {
        if (sleepMs > 0) {
            try {
                Thread.sleep(sleepMs);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }

    @VisibleForTesting
    void preventSessionReestablishmentForTests() {
        sessionReestablishLockForTests.lock();
    }

    @VisibleForTesting
    void allowSessionReestablishmentForTests() {
        sessionReestablishLockForTests.unlock();
    }

    @VisibleForTesting
    synchronized long getZKSessionIdForTests() {
        if (zkClient != null) {
            return zkClient.getSessionId();
        } else {
            return -1;
        }
    }

    @VisibleForTesting
    synchronized State getStateForTests() {
        return state;
    }

    @VisibleForTesting
    synchronized boolean isMonitorLockNodePending() {
        return monitorLockNodePending;
    }

    private boolean reEstablishSession() {
        int connectionRetryCount = 0;
        boolean success = false;
        while (!success && connectionRetryCount < maxRetryNum) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Establishing zookeeper connection for " + this);
            }
            try {
                createConnection();
                success = true;
            } catch (IOException e) {
                LOG.warn(e);
                sleepFor(5000);
            } catch (KeeperException e) {
                LOG.warn(e);
                sleepFor(5000);
            }
            ++connectionRetryCount;
        }
        return success;
    }

    private void createConnection() throws IOException, KeeperException {
        if (zkClient != null) {
            try {
                zkClient.close();
            } catch (InterruptedException e) {
                throw new IOException("Interrupted while closing ZK", e);
            }
            zkClient = null;
            watcher = null;
        }
        zkClient = connectToZooKeeper();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Created new connection for " + this);
        }
    }

    @InterfaceAudience.Private
    public synchronized void terminateConnection() {
        if (zkClient == null) {
            return;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Terminating ZK connection for " + this);
        }
        ZooKeeper tempZk = zkClient;
        zkClient = null;
        watcher = null;
        try {
            tempZk.close();
        } catch (InterruptedException e) {
            LOG.warn(e);
        }
        zkConnectionState = ConnectionState.TERMINATED;
        wantToBeInElection = false;
    }

    private void reset() {
        state = State.INIT;
        terminateConnection();
    }

    private boolean becomeActive() {
        assert wantToBeInElection;
        if (state == State.ACTIVE) {
            // already active
            return true;
        }
        try {
            Stat oldBreadcrumbStat = fenceOldActive();
            writeBreadCrumbNode(oldBreadcrumbStat);

            if (LOG.isDebugEnabled()) {
                LOG.debug("Becoming active for " + this);
            }
            appClient.becomeActive();
            state = State.ACTIVE;
            return true;
        } catch (Exception e) {
            LOG.warn("Exception handling the winning of election", e);
            // Caller will handle quitting and rejoining the election.
            return false;
        }
    }

    /**
     * Write the "ActiveBreadCrumb" node, indicating that this node may need
     * to be fenced on failover.
     * @param oldBreadcrumbStat 
     */
    private void writeBreadCrumbNode(Stat oldBreadcrumbStat) throws KeeperException, InterruptedException {
        Preconditions.checkState(appData != null, "no appdata");

        LOG.info("Writing znode " + zkBreadCrumbPath
                + " to indicate that the local node is the most recent active...");
        if (oldBreadcrumbStat == null) {
            // No previous active, just create the node
            createWithRetries(zkBreadCrumbPath, appData, zkAcl, CreateMode.PERSISTENT);
        } else {
            // There was a previous active, update the node
            setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
        }
    }

    /**
     * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
     * active status.
     * If this fails, it will simply warn, since the graceful release behavior
     * is only an optimization.
     */
    private void tryDeleteOwnBreadCrumbNode() {
        assert state == State.ACTIVE;
        LOG.info("Deleting bread-crumb of active node...");

        // Sanity check the data. This shouldn't be strictly necessary,
        // but better to play it safe.
        Stat stat = new Stat();
        byte[] data = null;
        try {
            data = zkClient.getData(zkBreadCrumbPath, false, stat);

            if (!Arrays.equals(data, appData)) {
                throw new IllegalStateException(
                        "We thought we were active, but in fact " + "the active znode had the wrong data: "
                                + StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
            }

            deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
        } catch (Exception e) {
            LOG.warn("Unable to delete our own bread-crumb of being active at " + zkBreadCrumbPath + ": "
                    + e.getLocalizedMessage() + ". " + "Expecting to be fenced by the next active.");
        }
    }

    /**
     * If there is a breadcrumb node indicating that another node may need
     * fencing, try to fence that node.
     * @return the Stat of the breadcrumb node that was read, or null
     * if no breadcrumb node existed
     */
    private Stat fenceOldActive() throws InterruptedException, KeeperException {
        final Stat stat = new Stat();
        byte[] data;
        LOG.info("Checking for any old active which needs to be fenced...");
        try {
            data = zkDoWithRetries(new ZKAction<byte[]>() {
                @Override
                public byte[] run() throws KeeperException, InterruptedException {
                    return zkClient.getData(zkBreadCrumbPath, false, stat);
                }
            });
        } catch (KeeperException ke) {
            if (isNodeDoesNotExist(ke.code())) {
                LOG.info("No old node to fence");
                return null;
            }

            // If we failed to read for any other reason, then likely we lost
            // our session, or we don't have permissions, etc. In any case,
            // we probably shouldn't become active, and failing the whole
            // thing is the best bet.
            throw ke;
        }

        LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
        if (Arrays.equals(data, appData)) {
            LOG.info("But old node has our own data, so don't need to fence it.");
        } else {
            appClient.fenceOldActive(data);
        }
        return stat;
    }

    private void becomeStandby() {
        if (state != State.STANDBY) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Becoming standby for " + this);
            }
            state = State.STANDBY;
            appClient.becomeStandby();
        }
    }

    private void enterNeutralMode() {
        if (state != State.NEUTRAL) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Entering neutral mode for " + this);
            }
            state = State.NEUTRAL;
            appClient.enterNeutralMode();
        }
    }

    private void createLockNodeAsync() {
        zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL, this, zkClient);
    }

    private void monitorLockNodeAsync() {
        if (monitorLockNodePending && monitorLockNodeClient == zkClient) {
            LOG.info("Ignore duplicate monitor lock-node request.");
            return;
        }
        monitorLockNodePending = true;
        monitorLockNodeClient = zkClient;
        zkClient.exists(zkLockFilePath, watcher, this, zkClient);
    }

    private String createWithRetries(final String path, final byte[] data, final List<ACL> acl,
            final CreateMode mode) throws InterruptedException, KeeperException {
        return zkDoWithRetries(new ZKAction<String>() {
            @Override
            public String run() throws KeeperException, InterruptedException {
                return zkClient.create(path, data, acl, mode);
            }
        });
    }

    private byte[] getDataWithRetries(final String path, final boolean watch, final Stat stat)
            throws InterruptedException, KeeperException {
        return zkDoWithRetries(new ZKAction<byte[]>() {
            @Override
            public byte[] run() throws KeeperException, InterruptedException {
                return zkClient.getData(path, watch, stat);
            }
        });
    }

    private Stat setDataWithRetries(final String path, final byte[] data, final int version)
            throws InterruptedException, KeeperException {
        return zkDoWithRetries(new ZKAction<Stat>() {
            @Override
            public Stat run() throws KeeperException, InterruptedException {
                return zkClient.setData(path, data, version);
            }
        });
    }

    private void deleteWithRetries(final String path, final int version)
            throws KeeperException, InterruptedException {
        zkDoWithRetries(new ZKAction<Void>() {
            @Override
            public Void run() throws KeeperException, InterruptedException {
                zkClient.delete(path, version);
                return null;
            }
        });
    }

    private <T> T zkDoWithRetries(ZKAction<T> action) throws KeeperException, InterruptedException {
        int retry = 0;
        while (true) {
            try {
                return action.run();
            } catch (KeeperException ke) {
                if (shouldRetry(ke.code()) && ++retry < maxRetryNum) {
                    continue;
                }
                throw ke;
            }
        }
    }

    private interface ZKAction<T> {
        T run() throws KeeperException, InterruptedException;
    }

    /**
     * The callbacks and watchers pass a reference to the ZK client
     * which made the original call. We don't want to take action
     * based on any callbacks from prior clients after we quit
     * the election.
     * @param ctx the ZK client passed into the watcher
     * @return true if it matches the current client
     */
    private synchronized boolean isStaleClient(Object ctx) {
        Preconditions.checkNotNull(ctx);
        if (zkClient != (ZooKeeper) ctx) {
            LOG.warn("Ignoring stale result from old client with sessionId "
                    + String.format("0x%08x", ((ZooKeeper) ctx).getSessionId()));
            return true;
        }
        return false;
    }

    /**
     * Watcher implementation which keeps a reference around to the
     * original ZK connection, and passes it back along with any
     * events.
     */
    private final class WatcherWithClientRef implements Watcher {
        private ZooKeeper zk;

        /**
         * Latch fired whenever any event arrives. This is used in order
         * to wait for the Connected event when the client is first created.
         */
        private CountDownLatch hasReceivedEvent = new CountDownLatch(1);

        /**
         * Latch used to wait until the reference to ZooKeeper is set.
         */
        private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);

        /**
         * Waits for the next event from ZooKeeper to arrive.
         * 
         * @param connectionTimeoutMs zookeeper connection timeout in milliseconds
         * @throws KeeperException if the connection attempt times out. This will
         * be a ZooKeeper ConnectionLoss exception code.
         * @throws IOException if interrupted while connecting to ZooKeeper
         */
        private void waitForZKConnectionEvent(int connectionTimeoutMs) throws KeeperException, IOException {
            try {
                if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) {
                    LOG.error("Connection timed out: couldn't connect to ZooKeeper in " + connectionTimeoutMs
                            + " milliseconds");
                    zk.close();
                    throw KeeperException.create(Code.CONNECTIONLOSS);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IOException("Interrupted when connecting to zookeeper server", e);
            }
        }

        private void setZooKeeperRef(ZooKeeper zk) {
            Preconditions.checkState(this.zk == null, "zk already set -- must be set exactly once");
            this.zk = zk;
            hasSetZooKeeper.countDown();
        }

        @Override
        public void process(WatchedEvent event) {
            hasReceivedEvent.countDown();
            try {
                if (!hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS)) {
                    LOG.debug("Event received with stale zk");
                }
                ActiveStandbyElector.this.processWatchEvent(zk, event);
            } catch (Throwable t) {
                fatalError("Failed to process watcher event " + event + ": " + StringUtils.stringifyException(t));
            }
        }
    }

    private static boolean isSuccess(Code code) {
        return (code == Code.OK);
    }

    private static boolean isNodeExists(Code code) {
        return (code == Code.NODEEXISTS);
    }

    private static boolean isNodeDoesNotExist(Code code) {
        return (code == Code.NONODE);
    }

    private static boolean isSessionExpired(Code code) {
        return (code == Code.SESSIONEXPIRED);
    }

    private static boolean shouldRetry(Code code) {
        return code == Code.CONNECTIONLOSS || code == Code.OPERATIONTIMEOUT;
    }

    @Override
    public String toString() {
        return "elector id=" + System.identityHashCode(this) + " appData="
                + ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + " cb=" + appClient;
    }

    public String getHAZookeeperConnectionState() {
        return this.zkConnectionState.name();
    }
}