uk.ac.sanger.cgp.wwdocker.daemon.WorkerDaemon.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.sanger.cgp.wwdocker.daemon.WorkerDaemon.java

Source

/**
 * Copyright (c) 2015 Genome Research Ltd.
 * 
 * Author: Cancer Genome Project cgpit@sanger.ac.uk
 * 
 * This file is part of WwDocker.
 * 
 * WwDocker is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * 1. The usage of a range of years within a copyright statement contained within
 * this distribution should be interpreted as being equivalent to a list of years
 * including the first and last year specified and all consecutive years between
 * them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007-
 * 2009, 2011-2012' should be interpreted as being identical to a statement that
 * reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright
 * statement that reads "Copyright (c) 2005-2012' should be interpreted as being
 * identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008,
 * 2009, 2010, 2011, 2012'."
 */

package uk.ac.sanger.cgp.wwdocker.daemon;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeoutException;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import uk.ac.sanger.cgp.wwdocker.Config;
import uk.ac.sanger.cgp.wwdocker.actions.Local;
import uk.ac.sanger.cgp.wwdocker.callable.Docker;
import uk.ac.sanger.cgp.wwdocker.actions.Utils;
import uk.ac.sanger.cgp.wwdocker.beans.WorkerState;
import uk.ac.sanger.cgp.wwdocker.beans.WorkerResources;
import uk.ac.sanger.cgp.wwdocker.beans.WorkflowIni;
import uk.ac.sanger.cgp.wwdocker.enums.HostStatus;
import uk.ac.sanger.cgp.wwdocker.factories.WorkflowFactory;
import uk.ac.sanger.cgp.wwdocker.interfaces.Daemon;
import uk.ac.sanger.cgp.wwdocker.interfaces.Workflow;
import uk.ac.sanger.cgp.wwdocker.messages.Messaging;

/**
 *
 * @author kr2
 */
public class WorkerDaemon implements Daemon {
    private static final Logger logger = LogManager.getLogger();
    private static PropertiesConfiguration config;
    private static Messaging messaging;
    private static Docker dockerThread = null;
    private static ExecutorService executor = null;
    private static FutureTask<Integer> futureTask = null;

    public WorkerDaemon(PropertiesConfiguration config, Messaging rmq) {
        WorkerDaemon.config = config;
        WorkerDaemon.messaging = rmq;
    }

    @Override
    public void run(String mode)
            throws IOException, InterruptedException, TimeoutException, ConfigurationException {
        WorkerResources hr = new WorkerResources();
        logger.debug(Utils.objectToJson(hr));

        Thread shutdownThread = null;

        String qPrefix = config.getString("qPrefix");

        File thisConfig = new File("/opt/wwdocker/" + qPrefix + ".remote.cfg");
        File thisJar = Utils.thisJarFile();

        // build a local WorkerState
        WorkerState thisState = new WorkerState(thisJar, thisConfig);
        thisState.setStatus(HostStatus.CLEAN);
        String hostName = thisState.getResource().getHostName();

        // Remove from all queues as I'll set my state again now
        messaging.removeFromStateQueue(qPrefix.concat(".").concat("BROKEN"), hostName);
        messaging.removeFromStateQueue(qPrefix.concat(".").concat("CLEAN"), hostName);
        messaging.removeFromStateQueue(qPrefix.concat(".").concat("DONE"), hostName);
        messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERROR"), hostName);
        messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOGS"), hostName);
        messaging.removeFromStateQueue(qPrefix.concat(".").concat("RECEIVE"), hostName);
        messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);

        // I'm running so send a message to the CLEAN queue
        messaging.sendMessage(qPrefix.concat(".CLEAN"), thisState);
        boolean firstCleanIter = true;
        String myQueue = qPrefix.concat(".").concat(hostName);

        int counter = 30;
        Workflow workflowImp = new WorkflowFactory().getWorkflow(config);
        int failedRmqGet = 0;
        while (true) {
            Thread.sleep(500); // don't eat cpu
            //Only control messages will be sent directly to the host now

            WorkerState recievedState = null;
            try {
                recievedState = (WorkerState) messaging.getWorkerState(myQueue, 10);
                failedRmqGet = 0;
            } catch (IOException e) {
                failedRmqGet++;
                if (failedRmqGet == 10) {
                    logger.fatal("Failed to communicate with RMQ server 10 times, aborting.", e);
                    System.exit(1);
                }
                logger.warn("Failed to communicate with RMQ server, allowable for 10 iterations only.", e);
            }

            thisState.getResource().init();

            if (recievedState != null) {
                if (!recievedState.equals(thisState) && thisState.getStatus().equals(HostStatus.CLEAN)) {
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()),
                            hostName);
                    logger.fatal("Host refresh required, shutting down...");
                    System.exit(0);
                }
                if (recievedState.getChangeStatusTo() != null) {
                    if (recievedState.getChangeStatusTo().equals(HostStatus.KILL)) {
                        messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()),
                                hostName);
                        messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName); // this is never changed unless a host dies/killed
                        if (thisState.getStatus().equals(HostStatus.ERROR)) {
                            messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOGS"), hostName);
                        }
                        if (!thisState.getStatus().equals(HostStatus.CLEAN)) {
                            if (shutdownThread == null) {
                                messaging.sendMessage(qPrefix.concat(".").concat("PEND"),
                                        Utils.objectToJson(thisState.getWorkflowIni()));
                            }
                        }
                        logger.fatal("FORCED SHUTDOWN...");
                        if (dockerThread != null) {
                            Local.execCommand("docker ps | tail -n +2 | cut -d ' ' -f 1 | xargs docker kill",
                                    Config.getEnvs(config), true);
                            futureTask.cancel(true);
                            executor.shutdownNow();
                        }
                        System.exit(0);
                    } else if (recievedState.getChangeStatusTo().equals(HostStatus.CHECKIN)) {
                        logger.info(recievedState.toString());
                        messaging.sendMessage(recievedState.getReplyToQueue(), thisState);
                    } else if (recievedState.getChangeStatusTo().equals(HostStatus.RUNNING)) {
                        // this is only sent if we want to retry the execution of an errored workflow
                        throw new RuntimeException("Restart attempted, I don't know how yet");
                    }
                }
            }

            // then we do the actual work
            if (thisState.getStatus().equals(HostStatus.CLEAN)) {

                // clean up any other queues that may have legacy entries, boolean to prevent rapid query rates
                if (firstCleanIter) {
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("DONE"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERROR"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOGS"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("RECEIVE"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("BROKEN"), hostName);
                    firstCleanIter = false;
                }

                //We pull data from the wwd_PEND queue
                WorkflowIni workIni = (WorkflowIni) messaging.getMessageObject(qPrefix.concat(".").concat("PEND"),
                        WorkflowIni.class, 10);
                if (workIni == null) {
                    continue;
                }
                logger.debug(thisState.toString());
                thisState.setWorkflowIni(workIni);
                shutdownThread = attachWorkIniShutdownHook(thisState.getWorkflowIni(), messaging, qPrefix,
                        hostName);
                workflowImp.cleanDockerPath(config); // clean up the workarea

                dockerThread = new Docker(workIni, config);

                futureTask = new FutureTask<>(dockerThread);
                executor = Executors.newSingleThreadExecutor();
                executor.execute(futureTask);
                // this section saves having to check you've got it right
                messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()), hostName);
                thisState.setStatus(HostStatus.RUNNING);
                messaging.sendMessage(qPrefix.concat(".").concat(thisState.getStatus().name()), thisState);
            } else if (thisState.getStatus().equals(HostStatus.RUNNING)) {
                if (futureTask.isDone()) {
                    try {
                        int dockerExitCode = futureTask.get();
                        logger.info("Exit code: " + dockerExitCode);
                        if (dockerExitCode == 0) {
                            thisState.setStatus(HostStatus.DONE);
                            messaging.sendMessage(qPrefix.concat(".").concat("UPLOADED"),
                                    thisState.getWorkflowIni());
                        } else {
                            if (dockerThread.getLogArchive() != null) {
                                messaging.sendFile(qPrefix.concat(".").concat("ERRORLOGS"), hostName,
                                        dockerThread.getLogArchive());
                            }
                            thisState.setStatus(HostStatus.ERROR);
                        }

                        Runtime.getRuntime().removeShutdownHook(shutdownThread);
                        messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);
                        messaging.sendMessage(qPrefix.concat(".").concat(thisState.getStatus().name()), thisState);
                        shutdownThread = null;

                        executor.shutdown();

                        dockerThread = null;
                        executor = null;
                        futureTask = null;
                    } catch (InterruptedException | ExecutionException | IOException e) {
                        logger.warn(e.getMessage(), e);
                        thisState.setStatus(HostStatus.ERROR);
                    }
                }
            } else if (thisState.getStatus().equals(HostStatus.DONE)) {
                /* if we need to handle working without GNOS access on images
                   then we need to change the logic here to wait for a
                   state change pushed from the control code */
                messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()), hostName);
                thisState.setStatus(HostStatus.CLEAN);
                firstCleanIter = true;
                thisState.setWorkflowIni(null);
                messaging.sendMessage(qPrefix.concat(".").concat(thisState.getStatus().name()), thisState);
            } else if (thisState.getStatus().equals(HostStatus.ERROR)) {
                if (counter == 60) {
                    logger.debug("I'm set to error, waiting for directions...");
                    counter = 0;
                }
                counter++;
                Thread.sleep(500); // sleep at top too
            } else {
                throw new RuntimeException("Don't know what to do yet");
            }
        }
    }

    private Thread attachWorkIniShutdownHook(WorkflowIni ini, Messaging messaging, String qPrefix,
            String hostName) {
        Thread sdt = new Thread() {
            @Override
            public void run() {
                try {
                    // We need to know which INI's may have been lost and which hosts they were on when things go odd.
                    // This way we can track which hosts may have issues unrelated to workflow state.
                    messaging.sendMessage(qPrefix.concat(".").concat("UNCLEAN"), Utils.objectToJson(ini));
                    // really not running if this has been executed.
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("BROKEN"), hostName); // broken means failed to provision
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("CLEAN"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERROR"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOG"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("RECEIVE"), hostName);
                    messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);
                } catch (IOException | InterruptedException | TimeoutException e) {
                    throw new RuntimeException("Error while executing shutdownHook", e);
                }
            }
        };

        Runtime.getRuntime().addShutdownHook(sdt);
        return sdt;
    }

}