com.nokia.dempsy.mpcluster.zookeeper.TestZookeeperClusterResilience.java Source code

Java tutorial

Introduction

Here is the source code for com.nokia.dempsy.mpcluster.zookeeper.TestZookeeperClusterResilience.java

Source

/*
 * Copyright 2012 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nokia.dempsy.mpcluster.zookeeper;

import static com.nokia.dempsy.TestUtils.poll;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooKeeper;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.nokia.dempsy.Dempsy;
import com.nokia.dempsy.TestUtils.Condition;
import com.nokia.dempsy.config.ApplicationDefinition;
import com.nokia.dempsy.config.ClusterId;
import com.nokia.dempsy.messagetransport.tcp.TcpTransport;
import com.nokia.dempsy.monitoring.coda.StatsCollectorFactoryCoda;
import com.nokia.dempsy.mpcluster.MpCluster;
import com.nokia.dempsy.mpcluster.MpClusterException;
import com.nokia.dempsy.mpcluster.MpClusterSession;
import com.nokia.dempsy.mpcluster.MpClusterSessionFactory;
import com.nokia.dempsy.mpcluster.MpClusterWatcher;
import com.nokia.dempsy.router.ClusterInformation;
import com.nokia.dempsy.router.DefaultRoutingStrategy;
import com.nokia.dempsy.router.SlotInformation;
import com.nokia.dempsy.router.SpecificClusterCheck;
import com.nokia.dempsy.serialization.java.JavaSerializer;

/**
 * The goal here is to make sure the cluster is always consistent even if it loses
 * the zookeeper session connection or doesn't have it to begin with.
 */
public class TestZookeeperClusterResilience {
    /** Application name used as the first component of every {@link ClusterId} created by these tests. */
    public static final String appname = TestZookeeperClusterResilience.class.getSimpleName();

    /** Class logger; final so that it cannot be reassigned after class initialization. */
    private static final Logger logger = LoggerFactory.getLogger(TestZookeeperClusterResilience.class);

    /** Base timeout, in milliseconds, used by the polling loops and latch waits in these tests. */
    private static final long baseTimeoutMillis = 20000;

    /** Port obtained from {@link ZookeeperTestServer#findNextPort()} in {@link #setup()}. */
    private int port;

    /**
     * Obtains a port from {@link ZookeeperTestServer#findNextPort()} before each test and
     * remembers it for building the {@code "127.0.0.1:<port>"} connect strings.
     *
     * @throws IOException if the port lookup fails
     */
    @Before
    public void setup() throws IOException {
        port = ZookeeperTestServer.findNextPort();
        // Parameterized form: the message is only assembled when debug logging is enabled.
        logger.debug("Running zookeeper test server on port {}", port);
    }

    /**
     * Minimal {@link MpClusterWatcher} that records whether it has been notified.
     * Tests reset {@link #called} to {@code false} and then poll it to detect a callback.
     */
    public static class TestWatcher implements MpClusterWatcher {
        /**
         * Set to {@code true} by {@link #process()}. Declared final so the reference can
         * never be swapped out while a test thread is polling it; only the contained
         * boolean value changes.
         */
        final AtomicBoolean called = new AtomicBoolean(false);

        /** Records that the watcher was notified. */
        @Override
        public void process() {
            called.set(true);
        }

    }

    /**
     * Checks that a joined slot is restored after the ZooKeeper test server is shut
     * down and started again.
     * <p>
     * A watcher that joins {@code "slot1"} whenever it sees no active slots is registered
     * on the cluster. The test confirms through a second, short-lived session that one
     * slot is active, bounces the server, waits up to {@code baseTimeoutMillis} for the
     * watcher to join again, and finally confirms through another fresh session that one
     * slot is active once more.
     *
     * @throws Throwable if any server, cluster or assertion step fails
     */
    @Test
    public void testBouncingServer() throws Throwable {
        final ZookeeperTestServer server = new ZookeeperTestServer();
        ZookeeperSession<String, String> session = null;
        // Short-lived second session; tracked out here so a failed assertion cannot leak it.
        ZookeeperSession<String, String> observer = null;

        try {
            server.start();

            final ZookeeperSessionFactory<String, String> factory = new ZookeeperSessionFactory<String, String>(
                    "127.0.0.1:" + port, 5000);
            session = (ZookeeperSession<String, String>) factory.createSession();
            final MpCluster<String, String> cluster = session
                    .getCluster(new ClusterId(appname, "testBouncingServer"));

            final TestWatcher callback = new TestWatcher() {
                @Override
                public void process() {
                    try {
                        // Only flag the call when this invocation actually (re)joined the slot.
                        if (cluster.getActiveSlots().size() == 0) {
                            cluster.join("slot1");
                            called.set(true);
                        }
                    } catch (MpClusterException expected) {
                        // this will fail when the connection is severed... that's ok.
                    }
                }
            };

            cluster.addWatcher(callback);
            callback.process();

            // create another session and look
            observer = (ZookeeperSession<String, String>) factory.createSession();
            MpCluster<String, String> observed = observer.getCluster(new ClusterId(appname, "testBouncingServer"));
            assertEquals(1, observed.getActiveSlots().size());
            observer.stop();
            observer = null;

            // kill the server.
            server.shutdown();

            // reset the flags
            callback.called.set(false);

            // restart the server
            server.start();

            // wait for the call
            final long deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && !callback.called.get()) {
                Thread.sleep(1);
            }
            assertTrue(callback.called.get());

            // get the view from a new session.
            observer = (ZookeeperSession<String, String>) factory.createSession();
            observed = observer.getCluster(new ClusterId(appname, "testBouncingServer"));
            assertEquals(1, observed.getActiveSlots().size());
            observer.stop();
            observer = null;
        } finally {
            // Nested so that a failure in one cleanup step does not skip the remaining ones.
            try {
                if (observer != null) {
                    observer.stop();
                }
            } finally {
                try {
                    server.shutdown();
                } finally {
                    if (session != null) {
                        session.stop();
                    }
                }
            }
        }
    }

    /**
     * Checks that a session created before this test has started any server fails
     * cleanly, starts working once a server is started, and recovers again after its
     * ZooKeeper session has been expired.
     * <p>
     * Sequence: create factory, session and cluster; expect {@code getActiveSlots()} to
     * throw {@link MpClusterException}; start the server and wait for the watcher
     * callback; expire the session by closing a second client opened with the same
     * session id; wait for another callback; then poll until {@code getActiveSlots()}
     * and {@code join()} succeed again.
     *
     * @throws Throwable if any server, cluster or assertion step fails
     */
    @Test
    public void testNoServerOnStartup() throws Throwable {
        ZookeeperTestServer server = null;
        ZookeeperSession<String, String> session = null;
        try {
            // Factory, session and cluster are created inside the try block so that the
            // session is stopped even when one of the early assertions fails.
            final ZookeeperSessionFactory<String, String> factory = new ZookeeperSessionFactory<String, String>(
                    "127.0.0.1:" + port, 5000);
            session = (ZookeeperSession<String, String>) factory.createSession();
            final MpCluster<String, String> cluster = session
                    .getCluster(new ClusterId(appname, "testNoServerOnStartup"));
            assertNotNull(cluster);

            // hook a test watch to make sure that callbacks work correctly
            final TestWatcher callback = new TestWatcher();
            cluster.addWatcher(callback);

            // now accessing the cluster should get us an error.
            boolean gotCorrectError = false;
            try {
                cluster.getActiveSlots();
            } catch (MpClusterException expected) {
                gotCorrectError = true;
            }
            assertTrue(gotCorrectError);

            // now lets startup the server.
            server = new ZookeeperTestServer();
            server.start();

            // wait until this works.
            long deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && !callback.called.get()) {
                Thread.sleep(1);
            }
            assertTrue(callback.called.get());

            // now see if the cluster works.
            cluster.getActiveSlots();

            // now we should be all happycakes ... but with the server running lets sever the connection
            // according to the zookeeper faq we can force a session expired to occur by closing the session from another client.
            // see: http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A4
            final ZooKeeper origZk = session.zkref.get();
            final long sessionid = origZk.getSessionId();
            callback.called.set(false); // reset the callbacker right before provoking the expiry
            final ZooKeeper killer = new ZooKeeper("127.0.0.1:" + port, 5000, new Watcher() {
                @Override
                public void process(WatchedEvent arg0) {
                }
            }, sessionid, null);
            killer.close(); // tricks the server into expiring the other session

            // wait for the callback
            deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && !callback.called.get()) {
                Thread.sleep(10);
            }
            assertTrue(callback.called.get());

            // unfortunately I cannot check the getActiveSlots for failure because there's a race condition I can't fix.
            //  No matter how fast I check it's possible that it's okay again OR that allSlots hasn't been cleared.
            //
            // however, they should eventually recover.
            boolean stillFailing = true;
            deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && stillFailing) {
                Thread.sleep(1);
                try {
                    cluster.getActiveSlots();
                    stillFailing = false;
                } catch (MpClusterException retry) {
                    // not recovered yet; try again until the deadline passes
                }
            }

            // One last unguarded attempt: if the cluster never recovered this throws and fails the test.
            cluster.getActiveSlots();

            // And join should work
            stillFailing = true;
            deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && stillFailing) {
                Thread.sleep(1);
                try {
                    cluster.join("join-1");
                    stillFailing = false;
                } catch (MpClusterException retry) {
                    // not recovered yet; try again until the deadline passes
                }
            }

            assertFalse(stillFailing);
        } finally {
            // Nested so that a failing server shutdown does not prevent the session from stopping.
            try {
                if (server != null) {
                    server.shutdown();
                }
            } finally {
                if (session != null) {
                    session.stop();
                }
            }
        }
    }

    /**
     * Checks that expiring the ZooKeeper session leads to another
     * {@code ZookeeperCluster.process(WatchedEvent)} call and, eventually, to a watcher callback.
     * <p>
     * The session is subclassed so that every {@code process} call on its clusters is
     * counted and the completion of the first one is signalled through a latch. The
     * expiry is provoked by opening a second ZooKeeper client with the same session id
     * and closing it.
     *
     * @throws Throwable if any server, cluster or assertion step fails
     */
    @Test
    public void testSessionExpired() throws Throwable {
        ZookeeperTestServer server = null;
        ZookeeperSession<String, String> session = null;
        // Second client used to expire the session; tracked here so it is closed on every path.
        ZooKeeper killer = null;
        final AtomicLong processCount = new AtomicLong(0);
        final CountDownLatch processFinishLatch = new CountDownLatch(1);

        try {
            // now lets startup the server.
            server = new ZookeeperTestServer();
            server.start();

            session = new ZookeeperSession<String, String>("127.0.0.1:" + port, 5000) {
                @Override
                public ZookeeperCluster makeZookeeperCluster(ClusterId clusterId) throws MpClusterException {
                    return new ZookeeperCluster(clusterId) {
                        @Override
                        public void process(WatchedEvent event) {
                            processCount.incrementAndGet();
                            super.process(event);
                            // signal only after the superclass has completely finished
                            processFinishLatch.countDown();
                        }
                    };
                }
            };

            assertEquals(0, processCount.intValue()); // no calls yet

            // This will create the cluster itself and so will call process.
            final MpCluster<String, String> cluster = session
                    .getCluster(new ClusterId(appname, "testSessionExpired"));
            assertNotNull(cluster);

            // now the count should reach 1
            assertTrue(poll(5000, null, new Condition<Object>() {
                @Override
                public boolean conditionMet(Object o) {
                    return processCount.intValue() == 1;
                }
            }));
            final TestWatcher callback = new TestWatcher();

            // wait until the process call is actually finished ...
            assertTrue(processFinishLatch.await(5, TimeUnit.SECONDS));

            // ... before adding the watcher. There's a race condition (without the latch)
            // where the watcher could get added but the process loop is still running.
            cluster.addWatcher(callback);

            // now see if the cluster works.
            cluster.getActiveSlots();

            // cause a problem with the server running lets sever the connection
            // according to the zookeeper faq we can force a session expired to occur by closing the session from another client.
            // see: http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A4
            final ZooKeeper origZk = session.zkref.get();
            final long sessionid = origZk.getSessionId();
            killer = new ZooKeeper("127.0.0.1:" + port, 5000, new Watcher() {
                @Override
                public void process(WatchedEvent arg0) {
                }
            }, sessionid, null);

            // now the count should still be 1
            Thread.sleep(10);
            assertEquals(1, processCount.intValue());

            // and the callback wasn't called.
            assertFalse(callback.called.get());

            killer.close(); // tricks the server into expiring the other session
            killer = null;

            // now I should get a process call
            assertTrue(poll(5000, null, new Condition<Object>() {
                @Override
                public boolean conditionMet(Object o) {
                    return processCount.intValue() > 1;
                }
            }));

            // and eventually a callback
            assertTrue(poll(5000, callback, new Condition<TestWatcher>() {
                @Override
                public boolean conditionMet(TestWatcher o) {
                    return o.called.get();
                }
            }));
        } finally {
            // Nested so that a failure in one cleanup step does not skip the remaining ones.
            try {
                // only non-null when an assertion failed before the regular close above
                if (killer != null) {
                    killer.close();
                }
            } finally {
                try {
                    if (server != null) {
                        server.shutdown();
                    }
                } finally {
                    if (session != null) {
                        session.stop();
                    }
                }
            }
        }
    }

    /**
     * Assembles a {@link Dempsy} instance by hand, without Spring or any other
     * dependency-injection container.
     * <p>
     * The instance is given the single application definition, a
     * {@link SpecificClusterCheck} for {@code clusterId}, a
     * {@code DefaultRoutingStrategy(20, 1)}, a {@link JavaSerializer}, a
     * {@link StatsCollectorFactoryCoda} and a {@link TcpTransport}. No cluster session
     * factory is set here; callers configure that themselves.
     *
     * @param clusterId the cluster handed to the {@link SpecificClusterCheck}
     * @param ad        the application definition to register
     * @return the configured, not yet started, instance
     * @throws Throwable if any of the configuration calls fails
     */
    private static Dempsy getDempsyFor(ClusterId clusterId, ApplicationDefinition ad) throws Throwable {
        final Dempsy node = new Dempsy();

        // the setter takes a list, so wrap the single definition
        final List<ApplicationDefinition> definitions = new ArrayList<ApplicationDefinition>();
        definitions.add(ad);
        node.setApplicationDefinitions(definitions);

        node.setClusterCheck(new SpecificClusterCheck(clusterId));
        node.setDefaultRoutingStrategy(new DefaultRoutingStrategy(20, 1));
        node.setDefaultSerializer(new JavaSerializer<Object>());
        node.setDefaultStatsCollectorFactory(new StatsCollectorFactoryCoda());
        node.setDefaultTransport(new TcpTransport());

        return node;
    }

    /**
     * Runs the three clusters of {@link FullApplication} as separate {@link Dempsy}
     * instances and checks that messages keep flowing after the ZooKeeper session used by
     * the third instance has been expired.
     * <p>
     * The first two instances get their own {@link ZookeeperSessionFactory}; the third is
     * handed a factory that always returns the one session created by this test, so the
     * test can reach that session's ZooKeeper handle and expire it. Progress is measured
     * by {@code finalMessageCount} growing by more than 100 within 30 seconds, once before
     * and once after the expiry.
     *
     * @throws Throwable if any server, cluster or assertion step fails
     */
    @Test
    public void testSessionExpiredWithFullApp() throws Throwable {
        ZookeeperTestServer server = null;
        final AtomicReference<ZookeeperSession<ClusterInformation, SlotInformation>> sessionRef =
                new AtomicReference<ZookeeperSession<ClusterInformation, SlotInformation>>();
        ZookeeperSession<ClusterInformation, SlotInformation> session = null;
        final AtomicLong processCount = new AtomicLong(0);

        final Dempsy[] dempsy = new Dempsy[3];
        try {
            // now lets startup the server.
            server = new ZookeeperTestServer();
            server.start();

            session = new ZookeeperSession<ClusterInformation, SlotInformation>("127.0.0.1:" + port, 5000) {
                @Override
                public ZookeeperCluster makeZookeeperCluster(ClusterId clusterId) throws MpClusterException {
                    return new ZookeeperCluster(clusterId) {
                        @Override
                        public void process(WatchedEvent event) {
                            processCount.incrementAndGet();
                            super.process(event);
                        }
                    };
                }
            };
            sessionRef.set(session);

            final FullApplication app = new FullApplication();
            final ApplicationDefinition ad = app.getTopology();
            final String appName = FullApplication.class.getSimpleName();

            assertEquals(0, processCount.intValue()); // no calls yet

            dempsy[0] = getDempsyFor(new ClusterId(appName, FullApplication.MyAdaptor.class.getSimpleName()), ad);
            dempsy[0].setClusterSessionFactory(
                    new ZookeeperSessionFactory<ClusterInformation, SlotInformation>("127.0.0.1:" + port, 5000));

            dempsy[1] = getDempsyFor(new ClusterId(appName, FullApplication.MyMp.class.getSimpleName()), ad);
            dempsy[1].setClusterSessionFactory(
                    new ZookeeperSessionFactory<ClusterInformation, SlotInformation>("127.0.0.1:" + port, 5000));

            // The third instance shares the session created above instead of opening its own.
            dempsy[2] = getDempsyFor(new ClusterId(appName, FullApplication.MyRankMp.class.getSimpleName()), ad);
            dempsy[2].setClusterSessionFactory(new MpClusterSessionFactory<ClusterInformation, SlotInformation>() {
                @Override
                public MpClusterSession<ClusterInformation, SlotInformation> createSession()
                        throws MpClusterException {
                    return sessionRef.get();
                }
            });

            // start everything in reverse order
            for (int i = 2; i >= 0; i--) {
                dempsy[i].start();
            }

            // Met once the final message count exceeds the given baseline by more than 100.
            final Condition<Long> advancedPastBaseline = new Condition<Long>() {
                @Override
                public boolean conditionMet(Long baseline) {
                    return app.finalMessageCount.get() > (baseline + 100L);
                }
            };

            // make sure the final count is incrementing
            long curCount = app.finalMessageCount.get();
            assertTrue(poll(30000, curCount, advancedPastBaseline));

            // cause a problem with the server running lets sever the connection
            // according to the zookeeper faq we can force a session expired to occur by closing the session from another client.
            // see: http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A4
            final ZooKeeper origZk = session.zkref.get();
            final long sessionid = origZk.getSessionId();
            final ZooKeeper killer = new ZooKeeper("127.0.0.1:" + port, 5000, new Watcher() {
                @Override
                public void process(WatchedEvent arg0) {
                }
            }, sessionid, null);

            killer.close(); // tricks the server into expiring the other session

            Thread.sleep(300);

            // make sure the final count is STILL incrementing
            curCount = app.finalMessageCount.get();
            assertTrue(poll(30000, curCount, advancedPastBaseline));

        } finally {
            // Nested so that a failure in one cleanup step does not leave Dempsy nodes running.
            try {
                if (server != null) {
                    server.shutdown();
                }
            } finally {
                try {
                    if (session != null) {
                        session.stop();
                    }
                } finally {
                    for (int i = 0; i < dempsy.length; i++) {
                        if (dempsy[i] != null) {
                            dempsy[i].stop();
                        }
                    }
                }
            }
        }
    }

    /**
     * While {@code true}, the session subclass in {@link #testRecoverWithIOException()}
     * throws an {@link IOException} instead of creating a ZooKeeper instance.
     */
    private final AtomicBoolean forceIOException = new AtomicBoolean(false);

    /** Counted down once per forced {@link IOException}; the test waits for five of them. */
    private final CountDownLatch forceIOExceptionLatch = new CountDownLatch(5);

    /**
     * Checks that the session recovers after a period in which every attempt to create a
     * new ZooKeeper instance fails with an {@link IOException}.
     * <p>
     * Sequence: expire the session while IOExceptions are being forced and shut the
     * server down; wait for five failed attempts; confirm that {@code join()} fails;
     * stop forcing the exception and start a new server; wait for the watcher callback;
     * then poll until {@code getActiveSlots()} and {@code join()} succeed again.
     *
     * @throws Throwable if any server, cluster or assertion step fails
     */
    @Test
    public void testRecoverWithIOException() throws Throwable {
        ZookeeperTestServer server = null;
        ZookeeperSession<String, String> session = null;
        try {
            // now lets startup the server.
            server = new ZookeeperTestServer();
            server.start();

            session = new ZookeeperSession<String, String>("127.0.0.1:" + port, 5000) {
                @Override
                protected ZooKeeper makeZookeeperInstance(String connectString, int sessionTimeout)
                        throws IOException {
                    if (forceIOException.get()) {
                        forceIOExceptionLatch.countDown();
                        throw new IOException("Fake IO Problem.");
                    }
                    return super.makeZookeeperInstance(connectString, sessionTimeout);
                }
            };

            final MpCluster<String, String> cluster = session
                    .getCluster(new ClusterId(appname, "testRecoverWithIOException"));
            assertNotNull(cluster);

            final TestWatcher callback = new TestWatcher();
            cluster.addWatcher(callback);

            // now see if the cluster works.
            cluster.getActiveSlots();

            // cause a problem with the server running lets sever the connection
            // according to the zookeeper faq we can force a session expired to occur by closing the session from another client.
            // see: http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A4
            final ZooKeeper origZk = session.zkref.get();
            final long sessionid = origZk.getSessionId();
            final ZooKeeper killer = new ZooKeeper("127.0.0.1:" + port, 5000, new Watcher() {
                @Override
                public void process(WatchedEvent arg0) {
                }
            }, sessionid, null);

            // force the ioexception to happen
            forceIOException.set(true);

            killer.close(); // tricks the server into expiring the other session

            // just stop the damn server
            server.shutdown();

            // now in the background it should be retrying but hosed.
            assertTrue(forceIOExceptionLatch.await(baseTimeoutMillis * 3, TimeUnit.MILLISECONDS));

            // There is no longer a callback on a disconnect....only a callback when the reconnect is successful,
            // so the callback flag is deliberately not checked at this point.

            // TODO: do I really need this sleep?
            Thread.sleep(1000);

            // now the join call should fail since i'm preventing the recovery by throwing IOExceptions
            boolean gotCorrectError = false;
            long deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && !gotCorrectError) {
                Thread.sleep(1);
                try {
                    cluster.join("yo");
                } catch (MpClusterException expected) {
                    gotCorrectError = true;
                }
            }
            assertTrue(gotCorrectError);

            callback.called.set(false); // reset the callbacker ...

            // now we should allow the code to proceed.
            forceIOException.set(false);

            // we might want the server running.
            server = new ZookeeperTestServer();
            server.start();

            // wait for the callback
            deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && !callback.called.get()) {
                Thread.sleep(1);
            }
            assertTrue(callback.called.get());

            // this should eventually recover.
            boolean stillFailing = true;
            deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && stillFailing) {
                Thread.sleep(1);
                try {
                    cluster.getActiveSlots();
                    stillFailing = false;
                } catch (MpClusterException retry) {
                    // not recovered yet; try again until the deadline passes
                }
            }

            // One last unguarded attempt: if the cluster never recovered this throws and fails the test.
            cluster.getActiveSlots();

            // And join should work
            stillFailing = true;
            deadline = System.currentTimeMillis() + baseTimeoutMillis;
            while (System.currentTimeMillis() < deadline && stillFailing) {
                Thread.sleep(1);
                try {
                    cluster.join("join-1");
                    stillFailing = false;
                } catch (MpClusterException retry) {
                    // not recovered yet; try again until the deadline passes
                }
            }

            assertFalse(stillFailing);
        } finally {
            // Nested so that a failing server shutdown does not prevent the session from stopping.
            try {
                if (server != null) {
                    server.shutdown();
                }
            } finally {
                if (session != null) {
                    session.stop();
                }
            }
        }
    }

}