com.yahoo.gondola.tsunami.Tsunami.java Source code

Java tutorial

Introduction

Here is the source code for com.yahoo.gondola.tsunami.Tsunami.java

Source

/*
 * Copyright 2015, Yahoo Inc.
 * Copyrights licensed under the New BSD License.
 * See the accompanying LICENSE file for terms.
 */

package com.yahoo.gondola.tsunami;

import com.yahoo.gondola.Config;
import com.yahoo.gondola.cli.CliClient;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.log4j.PropertyConfigurator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.InetSocketAddress;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

/**
 * This test creates a 3-server cluster and randomly kills and restarts the servers.
 * On each server are N writer threads, each thread committing their thread id and a sequence number.
 * The writers to the leader will succeed while the writers on the non-leaders will all fail.
 * The writers will retry a command until it succeeds.
 * <p>
 * A verifier thread reads the committed commands from every server and verifies that the commands are
 * read in sequence. Duplicate commands are allowed. But missing commands are flagged as a major error.
 * The commands are read from each server in round-robin fashion.
 * <p>
 * The test runs three phases continuously.
 * <li> Safe - the N writers are allowed to write without interruption for N seconds.
 * <li> Nasty - the servers are randomly restarted to induce errors.
 * <li> Sync - the writers are stopped until the reader has caught up.
 * <p>
 * For even more stress, the storage system is wrapped in NastyStorage, which introduces random delays
 * and exceptions into storage operations.
 */
public class Tsunami {
    static Logger logger = LoggerFactory.getLogger(Tsunami.class);

    // Loaded from conf/gondola-tsunami.conf by main()
    static Config config;
    // One "hostId:cliPort" entry per member; parsed by setup()
    List<String> hostIds = Arrays.asList("host1:1099", "host2:1100", "host3:1101");
    // One agent client per entry in hostIds, populated by setup()
    AgentClient[] agents = new AgentClient[hostIds.size()];

    // Runs the status, killer, verifier and writer tasks
    ExecutorService executorService = Executors.newCachedThreadPool();

    ReentrantLock lock = new ReentrantLock();

    // This map contains the last index written by every writer. Used to verify consecutive indices.
    // map<writer-id, index>
    // Writers register themselves from the thread that constructs them while the verifier thread
    // reads and updates the map, so a thread-safe map is used.
    Map<String, Integer> lastIndices = new ConcurrentHashMap<>();

    // Progress counters reported by the status thread
    AtomicInteger writes = new AtomicInteger();
    AtomicInteger reads = new AtomicInteger();
    AtomicInteger errors = new AtomicInteger();

    // Number of writer threads per member; can be overridden with the -w option
    static int numWriters = 5;

    // The remaining fields are written by one task and read by others (status, killer, writers).
    // They are volatile so that an update made by one thread is visible to the others.

    // Used in sync phase
    volatile int lastWrittenIndex = 0;
    volatile int lastReadIndex = 0;

    // The instance of gondola that the verifier is waiting to get a committed command
    volatile int verifyWaitingFor = 0;
    volatile int verifyWaitingForIndex = 0;

    // When no read has succeeded for a while, stop killing gondola instances
    volatile long lastReadTs = 0;

    public enum Phase {
        SYNC, // Writers stop until the reader has caught up
        SAFE, // The instances are not restarted during this phase
        NASTY // Instances are randomly restarted
    }

    // Current phase
    volatile Phase phase = Phase.SYNC;

    /**
     * Entry point of the test. Configures log4j from conf/log4j.properties, loads
     * conf/gondola-tsunami.conf, handles the command line and then starts the test by
     * constructing a {@link Tsunami}.
     * <p>
     * Options: {@code -w <n>} sets the number of writers per member, {@code -h} prints the usage
     * and returns without starting the test.
     *
     * @param args the command line arguments
     * @throws Exception if the configuration, the argument parsing or the test setup fails
     */
    public static void main(String[] args) throws Exception {
        PropertyConfigurator.configure("conf/log4j.properties");
        config = new Config(new File("conf/gondola-tsunami.conf"));

        // Declare the supported options, then parse the arguments against them
        Options cliOptions = new Options();
        cliOptions.addOption("w", true, "Number of writers per member, default = " + numWriters);
        cliOptions.addOption("h", false, "help");
        CommandLine parsed = new DefaultParser().parse(cliOptions, args);

        // -h only prints the usage
        if (parsed.hasOption("h")) {
            new HelpFormatter().printHelp("Tsunami Test", cliOptions);
            return;
        }

        // -w overrides the default number of writers
        if (parsed.hasOption("w")) {
            String writersArg = parsed.getOptionValue("w");
            numWriters = Integer.parseInt(writersArg);
        }

        new Tsunami();
    }

    public Tsunami() throws Exception {
        setup();

        // Print status thread
        executorService.execute(() -> {
            while (true) {
                logger.info("writes: {}, reads: {}, errors: {}, waiting for index={} on {}", writes.get(),
                        reads.get(), errors.get(), verifyWaitingForIndex, agents[verifyWaitingFor].hostId);
                logger.info("  " + Arrays.stream(agents).map(agent -> agent.hostId + ": up=" + agent.up)
                        .collect(Collectors.joining(", ")));
                logger.info("  lastWrite: {}, lastRead: {}", lastWrittenIndex, lastReadIndex);
                sleep(10000);
            }
        });

        executorService.execute(new Killer());
        executorService.execute(new Verifier());

        // Initialize writer threads. numWriter writers per gondola
        for (int a = 0; a < agents.length; a++) {
            AgentClient agent = agents[a];
            for (int w = 0; w < numWriters; w++) {
                String writerId = String.format("%c%d", (char) ('A' + a), w);

                executorService.execute(new Writer(writerId, agent.hostname, agent.gondolaCc.getPort()));
            }
        }
    }

    /**
     * Fills {@code agents} with one {@link AgentClient} per entry of {@code hostIds}. Each entry
     * has the form {@code hostId:cliPort}; the host name comes from the address that the
     * configuration returns for the host id.
     *
     * @throws Exception if a client cannot be created
     */
    void setup() throws Exception {
        int slot = 0;
        for (String entry : hostIds) {
            String[] fields = entry.split(":");
            String hostId = fields[0];
            String cliPort = fields[1];
            InetSocketAddress address = config.getAddressForHost(hostId);
            String hostname = address.getHostName();

            // The CLI client gets a 1m timeout. The instance itself is not created here
            // (agents[i].createInstance() is intentionally not called).
            CliClient cliClient = new CliClient(hostname, Integer.parseInt(cliPort), 60000);
            agents[slot] = new AgentClient(hostId, hostname, 1200, cliClient);
            slot++;
        }
    }

    /**
     * Pauses the calling thread for {@code delayMs} milliseconds. If the thread is interrupted
     * while pausing, the interruption is logged and the method returns early.
     *
     * @param delayMs the length of the pause in milliseconds
     */
    static void sleep(int delayMs) {
        try {
            Thread.sleep(delayMs);
        } catch (InterruptedException interrupted) {
            // Logged only; callers carry on as if the pause had completed
            logger.error("", interrupted);
        }
    }

    /**
     * ************* threads ***************
     */

    /**
     * This thread continuously tries to write a command into the log. If a failure occurs, it waits for a small delay
     * and then tries again. The command has the form {@code <writer-id>-<counter>}, where the counter starts at 1 and
     * is incremented after every successful write. The verifier confirms that the counter increases consecutively.
     * <p>
     * While the test is in the SYNC phase the writer pauses so that the verifier can catch up.
     */
    class Writer implements Runnable {
        String id;
        CliClient cliClient;

        /**
         * Creates the writer's own client connection and registers the writer in {@code lastIndices}
         * with a last index of 0.
         *
         * @param id       the writer id, used as the prefix of every command
         * @param hostname the host to send commands to
         * @param cliPort  the port to send commands to
         * @throws Exception if the client cannot be created
         */
        public Writer(String id, String hostname, int cliPort) throws Exception {
            this.id = id;
            cliClient = new CliClient(hostname, cliPort, 0); // no timeout
            lastIndices.put(id, 0);
        }

        @Override
        public void run() {
            // Name the thread that actually runs the writer. Doing this in the constructor would
            // rename the thread that constructs the writer instead.
            Thread.currentThread().setName("Writer");

            boolean lastSuccess = false;
            for (int index = 1;; index++) {
                String command = String.format("%s-%d", id, index);

                // Keep retrying until the write succeeds
                while (true) {
                    try {
                        // Wait for the reader to catch up
                        while (phase == Phase.SYNC) {
                            Thread.sleep(1000);
                        }

                        // Occasionally (about 1 in 1000 attempts) ask this member to become the leader
                        if (Math.random() < .001) {
                            cliClient.forceLeader(1000);
                        }

                        // Write the command. The commit is done outside the lock so that a slow
                        // commit never blocks the other writers.
                        int committedIndex = cliClient.commit(command);

                        // lastWrittenIndex is shared by all writers; the lock makes the
                        // read-modify-write atomic so that a concurrent update cannot be lost.
                        lock.lock();
                        try {
                            lastWrittenIndex = Math.max(lastWrittenIndex, committedIndex);
                        } finally {
                            lock.unlock();
                        }
                        //logger.info("Wrote {}", command);
                        writes.incrementAndGet();
                        lastSuccess = true;
                        sleep((int) (Math.random() * 500));
                        break;
                    } catch (Exception e) {
                        if (lastSuccess) {
                            // Show the error only if the last command succeeded, to cut down on output noise
                            logger.info("Failed to write {}: {}", command, e.getMessage());
                            lastSuccess = false;
                        }
                        errors.incrementAndGet();
                        sleep(1000);
                    }
                }
            }
        }
    }

    /**
     * This thread reads the committed commands from each member and checks that they're all
     * identical.
     * It also does a second check to make sure that the counters from each writer is monotonically increasing.
     */
    class Verifier implements Runnable {
        Verifier() {
            Thread.currentThread().setName("Verifier");
        }

        @Override
        public void run() {
            CliClient[] clients = new CliClient[agents.length];
            try {
                for (int i = 0; i < clients.length; i++) {
                    clients[i] = new CliClient(agents[i].hostname, agents[i].gondolaCc.getPort(), 60000);
                }
            } catch (Exception e) {
                logger.error(e.getMessage(), e);
            }

            for (int index = 1;; index++) {
                while (true) {
                    try {
                        verifyWaitingFor = 0;
                        verifyWaitingForIndex = index;
                        String command = clients[0].getCommand(index, 30000);
                        lastReadTs = System.currentTimeMillis();
                        //logger.info("[{}] Read index={}: {}", agents[0].hostId, index, command);

                        // Verify all three hosts are the same
                        for (int i = 1; i < agents.length; i++) {
                            verifyWaitingFor = i;
                            verifyWaitingForIndex = index;
                            String command2 = clients[i].getCommand(index, 30000);
                            lastReadTs = System.currentTimeMillis();

                            if (!command.equals(command2)) {
                                logger.error(
                                        "Command at index {} from {}={} does not match command from {}={}. lastIndices={}",
                                        index, agents[0].hostId, command, agents[i].hostId, command2, lastIndices);
                                System.exit(1);
                            }
                        }

                        // Verify that the indexes are monotonically increasing
                        if (!command.equals("")) {
                            String[] parts = command.split("-");
                            String writerId = parts[0];
                            int ix = Integer.parseInt(parts[1]);

                            int lastIndex = lastIndices.get(writerId);
                            // Allow duplicates (for now until we can determine if can be avoided)
                            if (ix == lastIndex) {
                                logger.warn("Command at index {} for writer {}={} is a duplicate", index, writerId,
                                        ix);
                            } else if (ix == lastIndex + 1) {
                                lastIndices.put(writerId, ix);
                            } else {
                                logger.error(
                                        "Command at index {} for writer {}={} is not one bigger than last index {}. lastIndices={}",
                                        index, writerId, ix, lastIndex, lastIndices);
                                System.exit(1);
                            }
                        }
                        reads.incrementAndGet();
                        lastReadIndex = index;
                        break;
                    } catch (EOFException e) {
                        logger.error("EOF: Failed to execute: {}", e.getMessage());
                        errors.incrementAndGet();
                    } catch (Exception e) {
                        String msg = e.getMessage();
                        if (msg.indexOf("Timeout") >= 0 || msg.indexOf("ERROR: Unhandled") >= 0) {
                            logger.error(msg);
                        } else {
                            logger.error(e.getMessage(), e);
                        }
                        errors.incrementAndGet();
                        try {
                            Thread.sleep(1000);
                        } catch (InterruptedException e1) {
                            logger.error(e.getMessage(), e);
                        }
                    }
                }
            }
        }
    }

    /**
     * This thread drives the phases of the test and randomly kills and restarts members.
     * <p>
     * Whenever the current phase has timed out, it moves on: SAFE is followed by NASTY (nasty storage enabled),
     * NASTY by SYNC, and SYNC by SAFE once the verifier has read up to the last written index. While in SYNC,
     * instances that are not operational are recreated and nasty storage is disabled.
     * <p>
     * After each phase check the thread pauses for a random time of up to 5s and picks a random member: a member
     * that is not up is recreated; a member that is up is killed with a 20% chance during the NASTY phase, or
     * recreated when it is not operational and no read has succeeded for at least 60s.
     */
    class Killer implements Runnable {
        // Phase timeouts in milliseconds
        static final int SYNC_PERIOD = 1000;
        static final int NASTY_PERIOD = 60000;
        static final int SAFE_PERIOD = 1000;

        Killer() {
        }

        @Override
        public void run() {
            // Name the thread that actually runs the killer. Doing this in the constructor would
            // rename the thread that constructs the killer instead.
            Thread.currentThread().setName("Killer");

            // Time at which the current phase ends; 0 forces a phase check on the first iteration
            long phaseTimeoutTs = 0;

            while (true) {
                if (System.currentTimeMillis() >= phaseTimeoutTs) {
                    switch (phase) {
                    case SAFE:
                        phase = Phase.NASTY;
                        logger.info("--- " + phase + " phase ---");
                        enableNastyStorage(true);
                        phaseTimeoutTs = System.currentTimeMillis() + NASTY_PERIOD;
                        break;
                    case NASTY:
                        phase = Phase.SYNC;
                        logger.info("--- " + phase + " phase ---");
                        phaseTimeoutTs = System.currentTimeMillis() + SYNC_PERIOD;
                        break;
                    case SYNC:
                        // Restart instances
                        for (int i = 0; i < agents.length; i++) {
                            if (!agents[i].gondolaCc.isOperational()) {
                                logger.info("Creating instance {}", agents[i].hostId);
                                agents[i].createInstance();
                            }
                        }
                        enableNastyStorage(false);
                        if (lastReadIndex >= lastWrittenIndex) {
                            // The reader has caught up: let the writers run again
                            phase = Phase.SAFE;
                            logger.info("--- " + phase + " phase ---");
                            phaseTimeoutTs = System.currentTimeMillis() + SAFE_PERIOD;
                        } else {
                            // Stay in SYNC and check again after another period
                            phaseTimeoutTs = System.currentTimeMillis() + SYNC_PERIOD;
                        }
                        break;
                    }
                }
                sleep((int) (Math.random() * 5000));

                // Pick a random member to act on
                int r = (int) (Math.random() * agents.length);
                AgentClient agent = agents[r];
                // Milliseconds since the verifier last read a command successfully
                long lastReadTime = System.currentTimeMillis() - lastReadTs;

                if (agent.up) {
                    if (lastReadTime >= 60000 && !agent.gondolaCc.isOperational()) {
                        logger.info("Creating instance {} which appears to be down", agent.hostId);
                        agent.createInstance();
                        // Restart the 60s window before another instance is recreated this way
                        lastReadTs = System.currentTimeMillis();
                    } else if (phase == Phase.NASTY && Math.random() > .8) {
                        logger.info("Killing instance {}", agent.hostId);

                        // Ask the agent to destroy the instance
                        agent.destroyInstance();
                    }
                } else {
                    logger.info("Creating instance {}", agent.hostId);
                    agent.createInstance();
                }
            }
        }

        /**
         * Switches nasty storage on or off on every agent. A failure on one agent is logged and
         * does not prevent the remaining agents from being updated.
         *
         * @param on true to enable nasty storage, false to disable it
         */
        void enableNastyStorage(boolean on) {
            // Enable nasty storage
            for (int i = 0; i < agents.length; i++) {
                try {
                    agents[i].gondolaCc.enableNastyStorage(on);
                } catch (Exception e) {
                    logger.error(e.getMessage(), e);
                }
            }
        }
    }
}