org.lamport.tla.toolbox.jcloud.CloudDistributedTLCJob.java Source code

Introduction

Here is the source code for org.lamport.tla.toolbox.jcloud.CloudDistributedTLCJob.java, the Eclipse Job that provisions cloud instances via Apache jclouds and runs the TLC model checker on them.

Source

/*******************************************************************************
 * Copyright (c) 2014 Microsoft Research. All rights reserved. 
 *
 * The MIT License (MIT)
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy 
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software. 
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Contributors:
 *   Markus Alexander Kuppe - initial API and implementation
 ******************************************************************************/

package org.lamport.tla.toolbox.jcloud;

import static com.google.common.base.Predicates.not;
import static org.jclouds.compute.predicates.NodePredicates.TERMINATED;
import static org.jclouds.compute.predicates.NodePredicates.inGroup;
import static org.jclouds.scriptbuilder.domain.Statements.exec;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.IStatus;
import org.eclipse.core.runtime.Status;
import org.eclipse.core.runtime.jobs.Job;
import org.jclouds.ContextBuilder;
import org.jclouds.compute.ComputeService;
import org.jclouds.compute.ComputeServiceContext;
import org.jclouds.compute.RunNodesException;
import org.jclouds.compute.RunScriptOnNodesException;
import org.jclouds.compute.domain.ComputeMetadata;
import org.jclouds.compute.domain.ExecChannel;
import org.jclouds.compute.domain.ExecResponse;
import org.jclouds.compute.domain.NodeMetadata;
import org.jclouds.compute.domain.TemplateBuilder;
import org.jclouds.compute.domain.internal.NodeMetadataImpl;
import org.jclouds.compute.options.TemplateOptions;
import org.jclouds.domain.LoginCredentials;
import org.jclouds.io.Payload;
import org.jclouds.logging.config.ConsoleLoggingModule;
import org.jclouds.logging.slf4j.config.SLF4JLoggingModule;
import org.jclouds.rest.AuthorizationException;
import org.jclouds.scriptbuilder.statements.login.AdminAccess;
import org.jclouds.ssh.SshClient;
import org.jclouds.ssh.SshException;
import org.jclouds.sshj.config.SshjSshClientModule;
import org.lamport.tla.toolbox.tool.tlc.job.ITLCJobStatus;
import org.lamport.tla.toolbox.tool.tlc.job.TLCJobFactory;

import com.google.common.base.Charsets;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
import com.google.inject.AbstractModule;

/*
 * TODO
 * ====
 * - Reverse PTR records in DNS to make it less likely that emails coming out of the VM are classified as SPAM
 * -- Azure only supports this in its service API, not in jclouds
 * --- Also see https://docs.microsoft.com/en-us/azure/dns/dns-reverse-dns-for-azure-services
 * -- AWS just has a form where users can request a PTR record  
 * - Send test mail during instance startup and communicate back to user on failure
 */
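/**
 * Provisions a group of cloud instances via Apache jclouds (or reuses a running
 * instance where possible), uploads a tla2tools.jar into which the spec and model
 * have been packaged, and launches TLC on the master node plus TLCWorker
 * processes on any additional nodes. Progress is reported through the Eclipse
 * Jobs framework; on success the result is a CloudStatus that exposes the remote
 * output stream and the node's munin URL.
 */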
public class CloudDistributedTLCJob extends Job {

    private static final String SHUTDOWN_AFTER = System
            .getProperty(CloudDistributedTLCJob.class.getName() + ".shutdownMinutes", "10");
    /**
     * The groupName has to be unique per job. This is how cloud instances are
     * associated with this job. If two jobs use the same groupName, they will talk
     * to the same set of nodes.
     */
    private final String providerName;
    private final String groupNameUUID;
    private final Path modelPath;
    private final int nodes;
    private final Properties props;
    private final CloudTLCInstanceParameters params;
    private boolean isCLI = false;
    private boolean doJfr = false;

    public CloudDistributedTLCJob(String aName, File aModelFolder, int numberOfWorkers, final Properties properties,
            CloudTLCInstanceParameters params) {
        super(aName);
        this.providerName = aName;
        this.nodes = numberOfWorkers;
        this.params = params;
        //TODO groupNameUUID is used by some providers (azure) as a hostname/DNS name. Thus, format it accordingly.
        groupNameUUID = aName.toLowerCase() + "-" + UUID.randomUUID().toString();
        props = properties;
        modelPath = aModelFolder.toPath();
    }

    @Override
    protected IStatus run(final IProgressMonitor monitor) {
        monitor.beginTask("Starting TLC model checker in the cloud", 90 + (nodes > 1 ? 20 : 0));
        // Validate credentials and fail fast if null or syntactically incorrect
        final IStatus credentialStatus = params.validateCredentials();
        if (!credentialStatus.equals(Status.OK_STATUS)) {
            return credentialStatus;
        }

        ComputeServiceContext context = null;
        try {
            // Tweak tla2tools in a background thread. It takes a couple of seconds to run
            // pack200 to shrink the file size, but we can look up or launch a cloud instance
            // in the meantime.
            monitor.subTask("Tweaking tla2tools.jar to contain the spec & model (in background)");
            final ExecutorService executor = Executors.newSingleThreadExecutor();
            final Future<Payload> future = executor.submit(() -> {
                return PayloadHelper.appendModel2Jar(modelPath, props.getProperty(TLCJobFactory.MAIN_CLASS), props,
                        monitor);
            });
            executor.shutdown();
            // If the user has canceled the job, stop it here (there are more
            // cancellation checks further down).
            if (monitor.isCanceled()) {
                return Status.CANCEL_STATUS;
            }

            // Example of provider-specific properties, in this case optimizing the
            // image list to only Amazon-supplied images.
            final Properties properties = new Properties();
            params.mungeProperties(properties);

            // Create compute environment in the cloud and inject an ssh
            // implementation. ssh is our means of communicating with the node.
            final Iterable<AbstractModule> modules = ImmutableSet.<AbstractModule>of(new SshjSshClientModule(),
                    isCLI ? new ConsoleLoggingModule() : new SLF4JLoggingModule());

            final ContextBuilder builder = ContextBuilder.newBuilder(params.getCloudProvider())
                    .credentials(params.getIdentity(), params.getCredentials()).modules(modules)
                    .overrides(properties);
            params.mungeBuilder(builder);

            monitor.subTask("Initializing " + builder.getApiMetadata().getName());
            context = builder.buildView(ComputeServiceContext.class);
            final ComputeService compute = context.getComputeService();
            monitor.worked(10);
            if (monitor.isCanceled()) {
                return Status.CANCEL_STATUS;
            }

            //TODO Support instance reuse with Cloud distributed TLC.
            monitor.subTask("Looking for resusable nodes to quick-start model checking");
            final Set<NodeMetadata> createNodesInGroup = nodes > 1 ? new HashSet<>()
                    : findReusableNodes(compute, monitor);
            monitor.worked(5);
            if (monitor.isCanceled()) {
                return Status.CANCEL_STATUS;
            } else if (createNodesInGroup.isEmpty()) {
                createNodesInGroup.addAll(provisionNodes(compute, monitor));
                if (monitor.isCanceled()) {
                    return Status.CANCEL_STATUS;
                }
            } else {
                // skipped provisionNodes(...) which takes 35 steps.
                monitor.worked(35);
            }

            // Choose one of the nodes to be the master and create an
            // identifying predicate.
            final NodeMetadata master = Iterables.getLast(createNodesInGroup);
            final String hostname = Iterables.getOnlyElement(master.getPublicAddresses()); // master.getHostname() only returns internal name

            // Copy tla2tools.jar to _one_ remote host only (do not exhaust the
            // upload bandwidth of the machine running the Toolbox).
            // TODO Share the tla2tools.jar with the worker nodes by making it
            // available on the master's webserver for the clients to download.
            // On the other hand, this means making the spec world-readable. It is
            // already cloud-readable through the RMI API.
            monitor.subTask("Copying tla2tools.jar to master node at " + hostname);
            SshClient sshClient = context.utils().sshForNode().apply(master);
            sshClient.put("/tmp/tla2tools.pack.gz", future.get());
            monitor.worked(10);
            if (monitor.isCanceled()) {
                return Status.CANCEL_STATUS;
            }

            final String tlcMasterCommand = " shutdown -c && rm -rf /mnt/tlc/* && " // Cancel and remove any pending shutdown and leftovers from previous runs.
                    + "cd /mnt/tlc/ && "
                    // Decompress tla2tools.pack.gz
                    + "unpack200 /tmp/tla2tools.pack.gz /tmp/tla2tools.jar" + " && "
                    // Execute TLC (java) process inside screen
                    // and shutdown on TLC's completion. But
                    // detach from screen directly. Name screen 
                    // session "tlc".
                    // (see http://stackoverflow.com/a/10126799)
                    + (isCLI ? "" : "screen -dm -S tlc bash -c \" ")
                    // This requires a modified version where all parameters and
                    // all spec modules are stored in files in a model/ folder
                    // inside of the jar.
                    // This is done in anticipation of other cloud providers
                    // where one cannot easily pass in parameters on the command
                    // line because there is no command line.
                    + "java " + params.getJavaVMArgs() + " " + (doJfr ? params.getFlightRecording() + " " : "")
                    // Write all tmp files to the ephemeral instance
                    // storage which is expected to have a higher IOPS
                    // compared to non-local storage.
                    + "-Djava.io.tmpdir=/mnt/tlc/ "
                    // These properties cannot be "baked" into
                    // the payload jar because java itself does not
                    // support this.
                    // Java might be able to read the properties from
                    // a config file with 'com.sun.management.config.file=path',
                    // but it is untested whether the path can point into the jar.
                    + "-Dcom.sun.management.jmxremote " + "-Dcom.sun.management.jmxremote.port=5400 "
                    + "-Dcom.sun.management.jmxremote.ssl=false "
                    + "-Dcom.sun.management.jmxremote.authenticate=false "
                    // TLC tuning options
                    + params.getJavaSystemProperties() + " " + "-jar /tmp/tla2tools.jar "
                    + params.getTLCParameters() + " " + (isCLI ? "|& tee /mnt/tlc/MC.out " : "") + "&& "
                    // Run any cloud specific cleanup tasks.
                    // When CloudDistributedTLCJob runs in synchronous CLI mode (isCLI), it will destroy
                    // the VMs (nodes) via the jclouds API. No need to deallocate nodes
                    // via special logic.
                    + (isCLI ? "/bin/true" : params.getCloudAPIShutdown()) + " && "
                    // Let the machine power down immediately after
                    // finishing model checking to cut costs. However,
                    // do not shut down (hence "&&") when TLC finished
                    // with an error.
                    // It uses "sudo" because the script is explicitly
                    // run as a user. No need to run the TLC process as
                    // root.
                    + "sudo shutdown -h +" + SHUTDOWN_AFTER + (isCLI ? "" : "\""); // closing opening '"' of screen/bash -c
            if (isCLI) {
                monitor.subTask("Starting TLC model checker process");
                // Execute the command via ssh instead of as a script to get access to the TLC
                // process's stdout and stderr.
                //TODO Better handle error case.
                ExecChannel channel = sshClient.execChannel(tlcMasterCommand);
                // Send remote TLC's stdout to local stdout (this throws a TransportException
                // unless shutdown is postponed by a few minutes above).
                ByteStreams.copy(channel.getOutput(), System.out);
                if (doJfr) {
                    // Get the Java Flight Recording from the remote machine and save it to a local
                    // file in the current working directory. We call "cat" because SftpClient#get
                    // fails with the old net.schmizz.sshj, and an update to the newer com.hierynomus
                    // fork seems like an awful lot of work.
                    channel = sshClient.execChannel("cat /mnt/tlc/tlc.jfr");
                    final InputStream output = channel.getOutput();
                    final String cwd = Paths.get(".").toAbsolutePath().normalize().toString() + File.separator;
                    final File jfr = new File(cwd + "tlc.jfr");
                    try (final FileOutputStream fos = new FileOutputStream(jfr)) {
                        ByteStreams.copy(output, fos);
                    }
                    if (jfr.length() == 0) {
                        System.err.println("Received empty Java Flight recording. Not creating tlc.jfr file");
                        jfr.delete();
                    }
                }
                // Finally close the ssh connection.
                sshClient.disconnect();
                monitor.subTask("TLC model checker process finished");
                // Eagerly destroy the instance after we have pulled the tlc.jfr file from it. No
                // point in waiting for 'shutdown -h +10' to shut down the instance.
                destroyNodes(context, groupNameUUID);
            } else {
                sshClient.disconnect();

                // Run model checker master on master
                monitor.subTask("Starting TLC model checker process on the master node (in background)");
                final ExecResponse response = compute.runScriptOnNode(master.getId(), exec(tlcMasterCommand),
                        new TemplateOptions().overrideLoginCredentials(master.getCredentials()).runAsRoot(false)
                                .wrapInInitScript(true).blockOnComplete(false).blockUntilRunning(false));
                throwExceptionOnErrorResponse(master, response,
                        "Starting TLC model checker process on the master node");
                monitor.worked(5);

                if (nodes > 1) {
                    // The predicate will be applied to ALL instances owned by the
                    // cloud account (ie AWS), even the ones in different regions
                    // completely unrelated to TLC.
                    final Predicate<NodeMetadata> isMaster = new Predicate<NodeMetadata>() {
                        private final String masterHostname = master.getHostname();

                        public boolean apply(NodeMetadata nodeMetadata) {
                            // hostname can be null if instance is terminated.
                            final String hostname = nodeMetadata.getHostname();
                            return masterHostname.equals(hostname);
                        };
                    };
                    // copy the tla2tools.jar to the root of the master's webserver
                    // to make it available to workers. However, strip the spec
                    // (*.tla/*.cfg/...) from the jar file to not share the spec
                    // with the world.
                    monitor.subTask("Make TLC code available to all worker node(s)");
                    Map<? extends NodeMetadata, ExecResponse> execResponse = compute.runScriptOnNodesMatching(
                            isMaster,
                            exec("cp /tmp/tla2tools.jar /var/www/html/tla2tools.jar && "
                                    + "zip -d /var/www/html/tla2tools.jar model/*.tla model/*.cfg model/generated.properties"),
                            new TemplateOptions().runAsRoot(true).wrapInInitScript(false));
                    throwExceptionOnErrorResponse(execResponse, "Make TLC code available to all worker node");
                    monitor.worked(10);
                    if (monitor.isCanceled()) {
                        return Status.CANCEL_STATUS;
                    }

                    // The predicate will be applied to ALL instances owned by the
                    // AWS account, even the ones in different regions completely
                    // unrelated to TLC.
                    final Predicate<NodeMetadata> onWorkers = new Predicate<NodeMetadata>() {
                        // Remove the master from the set of our nodes.
                        private final Iterable<? extends NodeMetadata> workers = Iterables
                                .filter(createNodesInGroup, new Predicate<NodeMetadata>() {
                                    private final String masterHostname = master.getHostname();

                                    public boolean apply(NodeMetadata nodeMetadata) {
                                        // nodeMetadata.getHostname is null for terminated hosts.
                                        return !masterHostname.equals(nodeMetadata.getHostname());
                                    };
                                });

                        public boolean apply(NodeMetadata nodeMetadata) {
                            return Iterables.contains(workers, nodeMetadata);
                        };
                    };

                    // see master startup for comments
                    monitor.subTask("Starting TLC workers on the remaining node(s) (in background)");
                    final String privateHostname = Iterables.getOnlyElement(master.getPrivateAddresses());
                    execResponse = compute.runScriptOnNodesMatching(onWorkers,
                            exec("cd /mnt/tlc/ && " + "wget http://" + privateHostname + "/tla2tools.jar && "
                                    + "screen -dm -S tlc bash -c \" " + "java " + params.getJavaWorkerVMArgs() + " "
                                    + "-Djava.io.tmpdir=/mnt/tlc/ " + "-Dcom.sun.management.jmxremote "
                                    + "-Dcom.sun.management.jmxremote.port=5400 "
                                    + "-Dcom.sun.management.jmxremote.ssl=false "
                                    + "-Dcom.sun.management.jmxremote.authenticate=false "
                                    + params.getJavaWorkerSystemProperties() + " " + "-cp /mnt/tlc/tla2tools.jar "
                                    + params.getTLCWorkerParameters() + " " + privateHostname + " " // Use the host's internal IP for firewall reasons.
                                    + "&& "
                                    // Terminate regardless of TLCWorker process
                                    // exit value. E.g. TLCWorker can terminate due
                                    // to a NoRouteToHostException when the master
                                    // has shut down because of a violation among the
                                    // init states.
                                    // Run any cloud specific cleanup tasks.
                                    + params.getCloudAPIShutdown() + " && " + "sudo shutdown -h now" + "\""),
                            new TemplateOptions().runAsRoot(false).wrapInInitScript(true).blockOnComplete(false)
                                    .blockUntilRunning(false));
                    throwExceptionOnErrorResponse(execResponse, "Starting TLC workers");
                    monitor.worked(10);
                }

            }

            // Get the output from the remote instance and attach the corresponding
            // InputStream to the CloudStatus. A UI can then read the InputStream and show
            // the output of the TLC process to a user. The SSH connection automatically
            // terminates when the TLC process finishes.
            // https://askubuntu.com/questions/509881/tail-reading-an-entire-file-and-then-following         
            ExecChannel execChannel = null;
            if (!isCLI) {
                execChannel = sshClient
                        .execChannel("tail -q -f -n +1 /mnt/tlc/MC.out --pid $(pgrep -f tla2tools.jar)");
            }

            // Communicate result to user
            monitor.done();
            return new CloudStatus(Status.OK, "org.lamport.tla.toolbox.jcloud", Status.OK,
                    String.format(
                            "TLC is model checking at host %s. "
                                    + "Expect to receive an email at %s with the model checking result eventually.",
                            hostname, props.get("result.mail.address")),
                    null, new URL("http://" + hostname + "/munin/"),
                    execChannel == null ? null : execChannel.getOutput(), sshClient);
        } catch (ExecutionException | InterruptedException | RunNodesException | IOException
                | RunScriptOnNodesException | NoSuchElementException | AuthorizationException | SshException e) {
            e.printStackTrace();
            if (context != null) {
                destroyNodes(context, groupNameUUID);
            }
            // signal error to caller
            return new Status(Status.ERROR, "org.lamport.tla.toolbox.jcloud", e.getMessage(), e);
        } catch (ScriptException e) {
            if (context != null) {
                destroyNodes(context, groupNameUUID);
            }
            // signal error to caller
            return new Status(Status.ERROR, "org.lamport.tla.toolbox.jcloud", e.getTitle(), e);
        } finally {
            if (context != null) {
                // The user has canceled the Toolbox job, take this as a request
                // to destroy all nodes this job has created.
                if (monitor.isCanceled()) {
                    destroyNodes(context, groupNameUUID);
                }
                context.close();
            }
        }
    }

    private Set<NodeMetadata> findReusableNodes(final ComputeService compute, final IProgressMonitor monitor)
            throws IOException {
        // Filter out those nodes which haven't been created by the Toolbox. We can't be
        // sure if they are reusable or that we are allowed to reuse them. Also, skip
        // nodes which are CLI nodes. CLI nodes get destroyed by the Toolbox. Lastly, we
        // are only interested in RUNNING instances.
        final Set<? extends ComputeMetadata> runningNodes = compute.listNodesDetailsMatching(
                node -> node.getName() != null && node.getName().startsWith(providerName.toLowerCase())
                        && !node.getTags().contains("CLI") && node.getStatus() == NodeMetadata.Status.RUNNING);

        // TODO what happens if a node terminates before we tried to runScriptOnNode
        // below? Does runScriptOnNode throw an exception or does the response indicate
        // it?

        for (final ComputeMetadata node : runningNodes) {
            final String id = node.getId();

            // busctl call... atomically cancels a scheduled system shutdown OR returns
            // false if none is scheduled. This is similar to the pre-systemd behavior of
            // 'shutdown -c', which changed with systemd. Thanks to user grawity on IRC
            // (systemd@freenode) for coming up with this.
            // For more context:
            // https://askubuntu.com/questions/835222/detect-pending-sheduled-shutdown-ubuntu-server-16-04-1
            // https://askubuntu.com/questions/838019/shutdownd-service-missing-on-ubuntu-server-16-lts
            // https://askubuntu.com/questions/994339/check-if-shutdown-schedule-is-active-and-when-it-is
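            // CancelScheduledShutdown returns a single boolean; busctl prints the reply
            // as "b true" when a pending shutdown was cancelled and "b false" when none
            // was scheduled. The indexOf("true") check below keys off that output (the
            // exact formatting is an assumption about busctl's default output).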
            try {
                final ExecResponse response = compute.runScriptOnNode(id,
                        "busctl call --system org.freedesktop.login1 /org/freedesktop/login1 org.freedesktop.login1.Manager CancelScheduledShutdown",
                        TemplateOptions.Builder.overrideLoginCredentials(getLoginForCommandExecution())
                                .runAsRoot(true).wrapInInitScript(false));
                if (response.getExitStatus() == 0 && (response.getError() == null || response.getError().isEmpty())
                        && response.getOutput().indexOf("true") > 0) { // Part of output of busctl call...
                    return new HashSet<>(Arrays.asList(
                            new WrapperNodeMetadata(compute.getNodeMetadata(id), getLoginForCommandExecution())));
                }
            } catch (RuntimeException e) {
                // - NodeMetadata.Status could have changed after we queried the cloud API but
                // before we connected to the instance, i.e. the instance was terminated/shut down.
                // - We might not have valid credentials.
                //TODO Needs better reporting?!
                continue;
            }
        }

        return new HashSet<>();
    }

    private Set<? extends NodeMetadata> provisionNodes(ComputeService compute, IProgressMonitor monitor)
            throws RunNodesException, RunScriptOnNodesException {

        // start a node, but first configure it
        final TemplateOptions templateOptions = compute.templateOptions();

        // Open up the node's firewall to allow ssh, http, and https traffic in. This
        // allows users to look at the munin/ stats generated for the OS as well as for
        // TLC specifically (see below where munin gets installed manually).
        // This makes us dependent on EC2 (for now).
        templateOptions.inboundPorts(22, 80, 443);

        // Note this will create a user on the node with the same name as the local
        // user, e.g. so you can connect via ssh to the node's public IP.
        templateOptions.runScript(AdminAccess.standard());
        if (isCLI) {
            templateOptions.tags(Arrays.asList("CLI"));
        }
        params.mungeTemplateOptions(templateOptions);

        final TemplateBuilder templateBuilder = compute.templateBuilder();
        templateBuilder.options(templateOptions);
        templateBuilder.imageId(params.getImageId());
        templateBuilder.hardwareId(params.getHardwareId());

        // Everything configured, now launch node
        monitor.subTask(String.format("Starting %s %s instance%s in region %s.", nodes > 1 ? nodes : "a",
                params.getHardwareId(), nodes > 1 ? "s" : "", params.getRegion()));
        final Set<? extends NodeMetadata> createNodesInGroup = compute.createNodesInGroup(groupNameUUID, nodes,
                templateBuilder.build());
        monitor.worked(20);
        if (monitor.isCanceled()) {
            return createNodesInGroup;
        }

        // Install custom tailored jmx2munin to monitor the TLC process. Can
        // either monitor standalone tlc2.TLC or TLCServer.
        monitor.subTask("Provisioning TLC environment on all node(s)");
        final String email = props.getProperty(TLCJobFactory.MAIL_ADDRESS);
        Map<? extends NodeMetadata, ExecResponse> execResponse = compute.runScriptOnNodesMatching(
                inGroup(groupNameUUID),
                // Creating an entry in /etc/aliases that makes sure system email sent
                // to root ends up at the address given by the user. Note that this
                // has to be done before postfix gets installed later; postfix
                // re-generates the aliases file for us.
                exec("echo root: " + email + " >> /etc/aliases" + " && "
                // OS tuning parameters for (distributed) TLC:
                // - Disable hugepage defragmentation (especially important on highmem instances)
                        + "echo never > /sys/kernel/mm/transparent_hugepage/defrag" + " && "
                        // - Turn off NUMA balancing
                        + "echo 0 > /proc/sys/kernel/numa_balancing" + " && "
                        // Don't want dpkg to require user interaction.
                        + "export DEBIAN_FRONTEND=noninteractive" + " && " + params.getHostnameSetup() + " && "
                        // Oracle Java 8
                        + (doJfr ? "add-apt-repository ppa:webupd8team/java -y && " : "/bin/true && ")
                        // Accept license before apt (dpkg) tries to present it to us (which fails due to 'noninteractive' mode below)
                        // see http://stackoverflow.com/a/19391042
                        + "echo debconf shared/accepted-oracle-license-v1-1 select true | sudo debconf-set-selections && "
                        + "echo debconf shared/accepted-oracle-license-v1-1 seen true | sudo debconf-set-selections && "
                        // http://repos.azulsystems.com/
                        //                            + "apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 0x219BD9C9 && "
                        //                            + "apt-add-repository 'deb http://repos.azulsystems.com/ubuntu stable main' && "
                        + params.getExtraRepositories() + " && "
                        // Update Ubuntu's package index. The public/remote
                        // package mirrors might have been updated. Without
                        // updating first, we might try to install outdated packages
                        // below. This most likely fails with a 404 because
                        // the packages have been purged from the mirrors.
                        + "apt-get update" + " && "
                        // Never be prompted for input
                        // Download jmx2munin from the INRIA host
                        // TODO make it part of Toolbox and upload from
                        // there (it's tiny 48kb anyway) instead.
                        // This needs some better build-time integration
                        // between TLA and jmx2munin (probably best to make
                        // jmx2munin a submodule of the TLA git repo).
                        // Download an explicit version (commit) to allow
                        // us to continue development in HEAD. Otherwise,
                        // an old Toolbox would use the newest jmx2munin
                        // which might not be compatible with its CDTLCJob.
                        + "wget https://github.com/lemmy/jmx2munin/raw/df6ce053a6d178e7a70434ab2f91089acadf0525/jmx2munin_1.0_all.deb"
                        + " && "
                        //                     + "wget http://tla.msr-inria.inria.fr/jmx2munin/jmx2munin_1.0_all.deb && "
                        // Install jmx2munin into the system
                        + "dpkg -i jmx2munin_1.0_all.deb ; "
                        // Force apt to download and install the
                        // missing dependencies of jmx2munin without
                        // user interaction
                        + "apt-get install --no-install-recommends -fy" + " && "
                        // Install all security relevant system packages
                        + "echo unattended-upgrades unattended-upgrades/enable_auto_updates boolean true | debconf-set-selections"
                        + " && "
                        // screen is needed to allow us to re-attach
                        // to the TLC process if logged in to the
                        // instance directly (it's probably already
                        // installed). zip is used to prepare the
                        // worker tla2tools.jar (strip spec) and
                        // unattended-upgrades makes sure the instance
                        // is up-to-date security-wise. 
                        + "apt-get install --no-install-recommends mdadm e2fsprogs screen zip unattended-upgrades "
                        + params.getExtraPackages() + " -y" + " && " + (doJfr
                                ? "apt-get install --no-install-recommends oracle-java8-installer oracle-java8-set-default -y"
                                : "/bin/true")
                        //                            : "apt-get install --no-install-recommends zulu-8 -y") // Azul Zulu seems to beat Oracle but doesn't have support for JFR. Also, it adds another external dependency, ie repos.azulsystems.com
                        + " && "
                        // Delegate file system tuning to cloud specific code.
                        + params.getOSFilesystemTuning()
                        // Install Oracle Java8. It supports Java Mission
                        // Control, an honest profiler. But first,
                        // automatically accept the Oracle license because
                        // installation will fail otherwise.
                        //                     + " && "
                        //                     + "echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections && "
                        //                     + "add-apt-repository ppa:webupd8team/java -y && "
                        //                     + "apt-get update && "
                        //                     + "apt-get --no-install-recommends install oracle-java8-installer oracle-java8-set-default -y"
                        // Create /mnt/tlc and change permission to be world writable
                        // Requires package 'apache2' to be already installed. apache2
                        // creates /var/www/html.
                        // "/mnt/tlc" is on the ephemeral and thus faster storage of the
                        // instance.
                        + " && " + "mkdir -p /mnt/tlc/ && chmod 777 /mnt/tlc/ && "
                        + "ln -s /mnt/tlc/MC.out /var/www/html/MC.out && "
                        + "ln -s /mnt/tlc/MC.out /var/www/html/MC.txt && " // Microsoft IE and Edge fail to show line breaks correctly unless ".txt" extension.
                        + "ln -s /mnt/tlc/MC.err /var/www/html/MC.err && "
                        + "ln -s /mnt/tlc/tlc.jfr /var/www/html/tlc.jfr"),
                new TemplateOptions().runAsRoot(true).wrapInInitScript(false));
        throwExceptionOnErrorResponse(execResponse, "Provisioning TLC environment on all nodes");
        monitor.worked(10);
        if (monitor.isCanceled()) {
            return createNodesInGroup;
        }

        // Install all security relevant system packages
        monitor.subTask("Installing security relevant system package upgrades (in background)");
        execResponse = compute.runScriptOnNodesMatching(inGroup(groupNameUUID),
                exec("screen -dm -S security bash -c \"/usr/bin/unattended-upgrades\""), new TemplateOptions()
                        .runAsRoot(true).wrapInInitScript(true).blockOnComplete(false).blockUntilRunning(false));
        throwExceptionOnErrorResponse(execResponse, "Installing security relevant system package upgrades");
        monitor.worked(5);

        return createNodesInGroup;
    }

    private void throwExceptionOnErrorResponse(final Map<? extends NodeMetadata, ExecResponse> execResponse,
            final String step) {
        execResponse.forEach((node, exec) -> {
            // If the script above failed on any number of nodes for whatever reason, don't
            // continue but destroy all nodes.
            if (exec.getExitStatus() > 0) {
                throw new ScriptException(node, exec, step);
            }
        });
    }

    private void throwExceptionOnErrorResponse(final NodeMetadata node, final ExecResponse execResponse,
            final String step) {
        // If the script failed on the node for whatever reason, don't
        // continue but destroy all nodes.
        if (execResponse.getExitStatus() > 0) {
            throw new ScriptException(node, execResponse, step);
        }
    }

    public void setIsCLI(boolean cli) {
        this.isCLI = cli;
    }

    public void setDoJfr(boolean doIt) {
        this.doJfr = doIt;
    }

    private static void destroyNodes(final ComputeServiceContext ctx, final String groupname) {
        // Destroy all workers identified by the given group
        final ComputeService computeService = ctx.getComputeService();
        if (computeService != null) {
            Set<? extends NodeMetadata> destroyed = computeService
                    .destroyNodesMatching(Predicates.<NodeMetadata>and(not(TERMINATED), inGroup(groupname)));
            System.out.printf("<< destroyed nodes %s%n", destroyed);
        }
    }

    @SuppressWarnings("serial")
    class ScriptException extends RuntimeException {

        private final String title;

        public ScriptException(final NodeMetadata node, final ExecResponse exec, final String step) {
            super(exec.getOutput());
            this.title = String.format("Launching TLC on %s unsuccessful.\nStep '%s' failed on node '%s'.",
                    params.getCloudProvider(), step, node.getName());
        }

        public String getTitle() {
            return title;
        }
    }

    class CloudStatus extends Status implements ITLCJobStatus {

        private final URL url;
        private final InputStream output;
        private final SshClient sshClient;

        public CloudStatus(int severity, String pluginId, int code, String message, Throwable exception, URL url,
                InputStream output, SshClient sshClient) {
            super(severity, pluginId, code, message, exception);
            this.url = url;
            this.output = output;
            this.sshClient = sshClient;
        }

        @Override
        public URL getURL() {
            return url;
        }

        @Override
        public InputStream getOutput() {
            return this.output;
        }

        public void killTLC() {
            // For some reason shutdown has to be called before kill (shrugs).
            sshClient.execChannel(
                    String.format("sudo shutdown -h +%s && kill $(pgrep -f tla2tools.jar)", SHUTDOWN_AFTER));
        }
    }

    /*
       <nacx> jclouds uses a Credential store to persist node credentials
       <nacx> if you use the same context after creating a node, the credentials should already be in the credential store
       <nacx> the default implementation is in-memory and lives per-context
       <nacx> but you can easily create a persistent credential store and use it
       <nacx> when creating the context, you can pass your custom credential store module
       <nacx> https://jclouds.apache.org/reference/javadoc/2.1.x/org/jclouds/rest/config/CredentialStoreModule.html
       <nacx> you can instantiate it providing a Map
       <nacx> that will hold the credentials.
       <nacx> It is up to you to persist that map the way you prefer
     */
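    // A minimal sketch (not used by this class) of how a persistent credential store
    // could be wired in, following the conversation above. The backing map, its
    // persistence, and the map's value type are assumptions (jclouds 2.x uses
    // Map<String, ByteSource>):
    //
    //   Map<String, ByteSource> store = new ConcurrentHashMap<>();
    //   ContextBuilder.newBuilder(params.getCloudProvider())
    //           .credentials(params.getIdentity(), params.getCredentials())
    //           .modules(ImmutableSet.of(new SshjSshClientModule(),
    //                   new org.jclouds.rest.config.CredentialStoreModule(store)))
    //           .buildView(ComputeServiceContext.class);
    //   // Persisting and reloading the map between runs is left to the caller.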

    //TODO Does this work on Mac and Windows?
    private static LoginCredentials getLoginForCommandExecution() throws IOException {
        final String user = System.getProperty("user.name");
        final String privateKey = Files.toString(new File(System.getProperty("user.home") + "/.ssh/id_rsa"),
                Charsets.UTF_8);
        return LoginCredentials.builder().user(user).privateKey(privateKey).build();
    }

    // TODO Somehow override credentials in wrapped instead of creating wrapper
    // instance?
    class WrapperNodeMetadata extends NodeMetadataImpl {
        public WrapperNodeMetadata(final NodeMetadata m, final LoginCredentials credentials) {
            super(m.getProviderId(), m.getName(), m.getId(), m.getLocation(), m.getUri(), m.getUserMetadata(),
                    m.getTags(), m.getGroup(), m.getHardware(), m.getImageId(), m.getOperatingSystem(),
                    m.getStatus(), m.getBackendStatus(), m.getLoginPort(), m.getPublicAddresses(),
                    m.getPrivateAddresses(), credentials, m.getHostname());
        }
    }
}
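
For orientation, here is a minimal sketch of how this Job might be driven from client code. It only uses members visible in the source above (the constructor, setIsCLI, setDoJfr, and the Eclipse Jobs API inherited from Job); the provider name, model folder, property values, and the concrete CloudTLCInstanceParameters subclass are placeholders.

import java.io.File;
import java.util.Properties;

import org.eclipse.core.runtime.IStatus;
import org.lamport.tla.toolbox.jcloud.CloudDistributedTLCJob;
import org.lamport.tla.toolbox.jcloud.CloudTLCInstanceParameters;
import org.lamport.tla.toolbox.tool.tlc.job.TLCJobFactory;

public class CloudDistributedTLCJobExample {

    // 'params' is assumed to be a provider-specific CloudTLCInstanceParameters subclass.
    static void launch(final CloudTLCInstanceParameters params, final File modelFolder)
            throws InterruptedException {
        final Properties props = new Properties();
        props.setProperty(TLCJobFactory.MAIN_CLASS, "tlc2.TLC");           // placeholder main class
        props.setProperty(TLCJobFactory.MAIL_ADDRESS, "user@example.org"); // receives system mail from the node
        props.put("result.mail.address", "user@example.org");              // used in the status message

        final CloudDistributedTLCJob job = new CloudDistributedTLCJob(
                "aws-ec2", modelFolder, 1, props, params);
        job.setIsCLI(true);   // synchronous run: stream TLC output and destroy the node afterwards
        job.setDoJfr(false);  // no Java Flight Recording
        job.schedule();       // standard Eclipse Jobs API
        job.join();
        final IStatus result = job.getResult(); // a CloudStatus on success
        System.out.println(result.getMessage());
    }
}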