com.spotify.styx.docker.KubernetesDockerRunner.java Source code

Introduction

Here is the source code for com.spotify.styx.docker.KubernetesDockerRunner.java. KubernetesDockerRunner is the Spotify Styx scheduler's DockerRunner implementation for Kubernetes: it submits each workflow instance execution to the cluster as a pod, then tracks pod lifecycle changes through a combination of a Kubernetes watch and periodic polling, translating them into Styx state-machine events.

Source

/*-
 * -\-\-
 * Spotify Styx Scheduler Service
 * --
 * Copyright (C) 2016 Spotify AB
 * --
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * -/-/-
 */

package com.spotify.styx.docker;

import static com.spotify.styx.docker.KubernetesPodEventTranslator.translate;
import static com.spotify.styx.serialization.Json.OBJECT_MAPPER;
import static com.spotify.styx.state.RunState.State.RUNNING;
import static java.util.stream.Collectors.toSet;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.spotify.styx.model.Event;
import com.spotify.styx.model.EventVisitor;
import com.spotify.styx.model.ExecutionDescription;
import com.spotify.styx.model.WorkflowConfiguration;
import com.spotify.styx.model.WorkflowInstance;
import com.spotify.styx.monitoring.Stats;
import com.spotify.styx.state.Message;
import com.spotify.styx.state.RunState;
import com.spotify.styx.state.StateManager;
import com.spotify.styx.state.Trigger;
import com.spotify.styx.util.TriggerUtil;
import io.fabric8.kubernetes.api.model.EnvVar;
import io.fabric8.kubernetes.api.model.EnvVarBuilder;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.api.model.PodBuilder;
import io.fabric8.kubernetes.api.model.PodFluent;
import io.fabric8.kubernetes.api.model.PodList;
import io.fabric8.kubernetes.api.model.PodSpecFluent;
import io.fabric8.kubernetes.api.model.Secret;
import io.fabric8.kubernetes.api.model.VolumeMount;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientException;
import io.fabric8.kubernetes.client.Watch;
import io.fabric8.kubernetes.client.Watcher;
import java.io.IOException;
import java.net.ProtocolException;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

/**
 * A {@link DockerRunner} implementation that submits container executions to a Kubernetes cluster.
 */
class KubernetesDockerRunner implements DockerRunner {

    static final String NAMESPACE = "default";
    static final String STYX_RUN = "styx-run";
    static final String STYX_WORKFLOW_INSTANCE_ANNOTATION = "styx-workflow-instance";
    static final String DOCKER_TERMINATION_LOGGING_ANNOTATION = "styx-docker-termination-logging";
    static final String COMPONENT_ID = "STYX_COMPONENT_ID";
    // TODO: for backward compatibility, delete later
    static final String ENDPOINT_ID = "STYX_ENDPOINT_ID";
    static final String WORKFLOW_ID = "STYX_WORKFLOW_ID";
    static final String PARAMETER = "STYX_PARAMETER";
    static final String EXECUTION_ID = "STYX_EXECUTION_ID";
    static final String TERMINATION_LOG = "STYX_TERMINATION_LOG";
    static final String TRIGGER_ID = "STYX_TRIGGER_ID";
    static final String TRIGGER_TYPE = "STYX_TRIGGER_TYPE";
    static final int POLL_PODS_INTERVAL_SECONDS = 60;

    private static final ScheduledExecutorService EXECUTOR = Executors.newSingleThreadScheduledExecutor(
            new ThreadFactoryBuilder().setDaemon(true).setNameFormat("k8s-scheduler-thread-%d").build());

    private final KubernetesClient client;
    private final StateManager stateManager;
    private final Stats stats;

    private Watch watch;

    KubernetesDockerRunner(KubernetesClient client, StateManager stateManager, Stats stats) {
        this.stateManager = Objects.requireNonNull(stateManager);
        this.client = Objects.requireNonNull(client).inNamespace(NAMESPACE);
        this.stats = Objects.requireNonNull(stats);
    }

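    /**
     * Submits one execution: verifies that any secret referenced by the run spec exists in the
     * cluster (throwing InvalidExecutionException if not), creates the pod, and returns the
     * generated pod name, which doubles as the Styx execution ID.
     */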
    @Override
    public String start(WorkflowInstance workflowInstance, RunSpec runSpec) throws IOException {
        try {
            runSpec.secret().ifPresent(specSecret -> {
                final Secret secret = client.secrets().withName(specSecret.name()).get();
                if (secret == null) {
                    throw new InvalidExecutionException(
                            "Referenced secret '" + specSecret.name() + "' was not found");
                }
            });

            final Pod pod = client.pods().create(createPod(workflowInstance, runSpec));
            return pod.getMetadata().getName();
        } catch (KubernetesClientException kce) {
            throw new IOException("Failed to create Kubernetes pod", kce);
        }
    }

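    /**
     * Builds the pod manifest for a workflow instance execution: defaults the image tag to
     * ":latest" when none is given, generates a unique "styx-run-<uuid>" pod name, injects the
     * STYX_* environment variables, records the workflow instance key and termination-logging
     * flag as pod annotations, sets restartPolicy to "Never", and mounts any referenced secret
     * as a read-only volume.
     */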
    @VisibleForTesting
    static Pod createPod(WorkflowInstance workflowInstance, RunSpec runSpec) {
        final String imageWithTag = runSpec.imageName().contains(":") ? runSpec.imageName()
                : runSpec.imageName() + ":latest";

        final String podName = STYX_RUN + "-" + UUID.randomUUID().toString();

        // inject environment variables
        List<EnvVar> env = Lists.newArrayList();
        env.add(new EnvVarBuilder().withName(COMPONENT_ID).withValue(workflowInstance.workflowId().componentId())
                .build());
        // TODO: for backward compatibility, delete later
        env.add(new EnvVarBuilder().withName(ENDPOINT_ID).withValue(workflowInstance.workflowId().id()).build());
        env.add(new EnvVarBuilder().withName(WORKFLOW_ID).withValue(workflowInstance.workflowId().id()).build());
        env.add(new EnvVarBuilder().withName(PARAMETER).withValue(workflowInstance.parameter()).build());
        env.add(new EnvVarBuilder().withName(EXECUTION_ID).withValue(podName).build());
        env.add(new EnvVarBuilder().withName(TERMINATION_LOG).withValue("/dev/termination-log").build());
        env.add(new EnvVarBuilder().withName(TRIGGER_ID)
                .withValue(runSpec.trigger().map(TriggerUtil::triggerId).orElse(null)).build());
        env.add(new EnvVarBuilder().withName(TRIGGER_TYPE)
                .withValue(runSpec.trigger().map(TriggerUtil::triggerType).orElse(null)).build());

        PodBuilder podBuilder = new PodBuilder().withNewMetadata().withName(podName)
                .addToAnnotations(STYX_WORKFLOW_INSTANCE_ANNOTATION, workflowInstance.toKey())
                .addToAnnotations(DOCKER_TERMINATION_LOGGING_ANNOTATION,
                        String.valueOf(runSpec.terminationLogging()))
                .endMetadata();
        PodFluent.SpecNested<PodBuilder> spec = podBuilder.withNewSpec().withRestartPolicy("Never");
        PodSpecFluent.ContainersNested<PodFluent.SpecNested<PodBuilder>> container = spec.addNewContainer()
                .withName(STYX_RUN).withImage(imageWithTag).withArgs(runSpec.args()).withEnv(env);

        if (runSpec.secret().isPresent()) {
            final WorkflowConfiguration.Secret secret = runSpec.secret().get();
            spec = spec.addNewVolume().withName(secret.name()).withNewSecret().withSecretName(secret.name())
                    .endSecret().endVolume();
            container = container.addToVolumeMounts(new VolumeMount(secret.mountPath(), secret.name(), true));
        }
        container.endContainer();

        return spec.endSpec().build();
    }

    @Override
    public void cleanup(String executionId) {
        client.pods().withName(executionId).delete();
    }

    @Override
    public void close() throws IOException {
        if (watch != null) {
            watch.close();
        }
    }

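    /**
     * Starts the runner: schedules pollPods() at a fixed delay, and opens a pod watch starting
     * from the resource version of the initial pod list.
     */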
    public void init() {
        EXECUTOR.scheduleWithFixedDelay(this::pollPods, POLL_PODS_INTERVAL_SECONDS, POLL_PODS_INTERVAL_SECONDS,
                TimeUnit.SECONDS);

        final String resourceVersion = client.pods().list().getMetadata().getResourceVersion();
        watch = client.pods().withResourceVersion(resourceVersion)
                .watch(new PodWatcher(Integer.parseInt(resourceVersion)));
    }

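    /**
     * Cross-checks workflow instances in the RUNNING state against the annotated pods in the
     * cluster, and emits a runError event for every instance that no longer has a backing pod.
     */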
    void examineRunningWFISandAssociatedPods(PodList podList) {
        final Set<WorkflowInstance> runningWorkflowInstances = stateManager.activeStates().values().stream()
                .filter(runState -> runState.state().equals(RUNNING)).map(RunState::workflowInstance)
                .collect(toSet());

        final Set<WorkflowInstance> workflowInstancesForPods = podList.getItems().stream()
                .filter(pod -> pod.getMetadata().getAnnotations().containsKey(STYX_WORKFLOW_INSTANCE_ANNOTATION))
                .map(pod -> WorkflowInstance
                        .parseKey(pod.getMetadata().getAnnotations().get(STYX_WORKFLOW_INSTANCE_ANNOTATION)))
                .collect(toSet());

        runningWorkflowInstances.removeAll(workflowInstancesForPods);
        runningWorkflowInstances.forEach(workflowInstance -> stateManager
                .receiveIgnoreClosed(Event.runError(workflowInstance, "No pod associated with this instance")));
    }

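    /**
     * Periodic fallback that lists all pods and runs them through the same inspection path as
     * watch events, so that missed or dropped watch events do not leave instances stuck.
     */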
    @VisibleForTesting
    void pollPods() {
        try {
            final PodList list = client.pods().list();
            examineRunningWFISandAssociatedPods(list);

            final int resourceVersion = Integer.parseInt(list.getMetadata().getResourceVersion());

            for (Pod pod : list.getItems()) {
                logEvent(Watcher.Action.MODIFIED, pod, resourceVersion, true);
                inspectPod(Watcher.Action.MODIFIED, pod);
            }
        } catch (Throwable t) {
            LOG.warn("Error while polling pods", t);
        }
    }

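    /**
     * Translates a pod change into Styx events, skipping pods without the workflow instance
     * annotation, pods for unknown or inactive instances, and pods whose name does not match
     * the instance's current execution ID; translated events are forwarded to the state manager.
     */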
    private void inspectPod(Watcher.Action action, Pod pod) {
        final Map<String, String> annotations = pod.getMetadata().getAnnotations();
        final String podName = pod.getMetadata().getName();
        if (!annotations.containsKey(KubernetesDockerRunner.STYX_WORKFLOW_INSTANCE_ANNOTATION)) {
            LOG.warn("Got pod without workflow instance annotation {}", podName);
            return;
        }

        final WorkflowInstance workflowInstance = WorkflowInstance
                .parseKey(annotations.get(KubernetesDockerRunner.STYX_WORKFLOW_INSTANCE_ANNOTATION));

        final RunState runState = stateManager.get(workflowInstance);
        if (runState == null) {
            LOG.warn("Pod event for unknown or inactive workflow instance {}", workflowInstance);
            return;
        }

        final Optional<String> executionIdOpt = runState.data().executionId();
        if (!executionIdOpt.isPresent()) {
            LOG.warn("Pod event for state with no current executionId: {}", podName);
            return;
        }

        final String executionId = executionIdOpt.get();
        if (!podName.equals(executionId)) {
            LOG.warn("Pod event not matching current exec id, current:{} != pod:{}", executionId, podName);
            return;
        }

        final List<Event> events = translate(workflowInstance, runState, action, pod, stats);

        for (Event event : events) {
            if (event.accept(new PullImageErrorMatcher())) {
                stats.pullImageError();
            }

            try {
                stateManager.receive(event);
            } catch (StateManager.IsClosed isClosed) {
                LOG.warn("Could not receive kubernetes event", isClosed);
                throw Throwables.propagate(isClosed);
            }
        }
    }

    private void logEvent(Watcher.Action action, Pod pod, int resourceVersion, boolean polled) {
        final String podName = pod.getMetadata().getName();
        final String workflowInstance = pod.getMetadata().getAnnotations()
                .getOrDefault(KubernetesDockerRunner.STYX_WORKFLOW_INSTANCE_ANNOTATION, "N/A");
        final String status = readStatus(pod);

        LOG.info("{}Pod event for {} at resource version {}, action: {}, workflow instance: {}, status: {}",
                polled ? "Polled: " : "", podName, resourceVersion, action, workflowInstance, status);
    }

    private String readStatus(Pod pod) {
        try {
            return OBJECT_MAPPER.writeValueAsString(pod.getStatus());
        } catch (JsonProcessingException e) {
            return pod.getStatus().toString();
        }
    }

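    /**
     * Receives pod events from the Kubernetes watch, tracking the last seen resource version so
     * that the watch can be re-established from that point after a disconnect or error.
     */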
    public class PodWatcher implements Watcher<Pod> {

        private static final int RECONNECT_DELAY_SECONDS = 1;

        private int lastResourceVersion;

        PodWatcher(int resourceVersion) {
            this.lastResourceVersion = resourceVersion;
        }

        @Override
        public void eventReceived(Action action, Pod pod) {
            if (pod == null) {
                return;
            }

            logEvent(action, pod, lastResourceVersion, false);

            try {
                inspectPod(action, pod);
            } finally {
                // fixme: this breaks the kubernetes api convention of not interpreting the resource version
                // https://github.com/kubernetes/kubernetes/blob/release-1.2/docs/devel/api-conventions.md#metadata
                lastResourceVersion = Integer.parseInt(pod.getMetadata().getResourceVersion());
            }
        }

        private void reconnect() {
            LOG.warn("Re-establishing watching from {}", lastResourceVersion);

            try {
                watch = client.pods().withResourceVersion(Integer.toString(lastResourceVersion)).watch(this);
            } catch (Throwable e) {
                LOG.warn("Retry threw", e);
                scheduleReconnect();
            }
        }

        private void scheduleReconnect() {
            EXECUTOR.schedule(this::reconnect, RECONNECT_DELAY_SECONDS, TimeUnit.SECONDS);
        }

        @Override
        public void onClose(KubernetesClientException e) {
            LOG.warn("Watch closed", e);

            // kube seems to gc old resource versions
            if (e != null && e.getCause() instanceof ProtocolException) {
                // todo: this is racy : more events can be purged while we're playing catch up
                lastResourceVersion++;
                reconnect();
            } else {
                scheduleReconnect();
            }
        }
    }

    // fixme: add a Cause enum to the runError() event instead of this string matching
    private static class PullImageErrorMatcher implements EventVisitor<Boolean> {

        @Override
        public Boolean timeTrigger(WorkflowInstance workflowInstance) {
            return false;
        }

        @Override
        public Boolean triggerExecution(WorkflowInstance workflowInstance, Trigger trigger) {
            return false;
        }

        @Override
        public Boolean info(WorkflowInstance workflowInstance, Message message) {
            return false;
        }

        @Override
        public Boolean dequeue(WorkflowInstance workflowInstance) {
            return false;
        }

        @Override
        public Boolean created(WorkflowInstance workflowInstance, String executionId, String dockerImage) {
            return false;
        }

        @Override
        public Boolean submit(WorkflowInstance workflowInstance, ExecutionDescription executionDescription) {
            return false;
        }

        @Override
        public Boolean submitted(WorkflowInstance workflowInstance, String executionId) {
            return false;
        }

        @Override
        public Boolean started(WorkflowInstance workflowInstance) {
            return false;
        }

        @Override
        public Boolean terminate(WorkflowInstance workflowInstance, Optional<Integer> exitCode) {
            return false;
        }

        @Override
        public Boolean runError(WorkflowInstance workflowInstance, String message) {
            return message.contains("failed to pull");
        }

        @Override
        public Boolean success(WorkflowInstance workflowInstance) {
            return false;
        }

        @Override
        public Boolean retryAfter(WorkflowInstance workflowInstance, long delayMillis) {
            return false;
        }

        @Override
        public Boolean retry(WorkflowInstance workflowInstance) {
            return false;
        }

        @Override
        public Boolean stop(WorkflowInstance workflowInstance) {
            return false;
        }

        @Override
        public Boolean timeout(WorkflowInstance workflowInstance) {
            return false;
        }

        @Override
        public Boolean halt(WorkflowInstance workflowInstance) {
            return false;
        }
    }
}
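
Usage

The snippet below is a minimal sketch of how this runner might be wired together; it is not part of the original source. DefaultKubernetesClient comes from the same fabric8 kubernetes-client library used above, RunSpec is assumed to be the nested type on DockerRunner (as its unqualified use in the source suggests), and the stateManager, stats, workflowInstance, and runSpec values are assumed to be supplied by the surrounding Styx service. Since the class and its constructor are package-private, such code would have to live in the com.spotify.styx.docker package.

package com.spotify.styx.docker;

import com.spotify.styx.model.WorkflowInstance;
import com.spotify.styx.monitoring.Stats;
import com.spotify.styx.state.StateManager;
import io.fabric8.kubernetes.client.DefaultKubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClient;

class KubernetesDockerRunnerExample {

    // Hypothetical wiring; stateManager, stats, workflowInstance and runSpec are
    // assumed to be provided by the surrounding service.
    static void run(StateManager stateManager, Stats stats,
                    WorkflowInstance workflowInstance, DockerRunner.RunSpec runSpec)
            throws Exception {
        // Connects using the default kubeconfig or in-cluster configuration.
        KubernetesClient client = new DefaultKubernetesClient();

        KubernetesDockerRunner runner = new KubernetesDockerRunner(client, stateManager, stats);
        runner.init(); // opens the pod watch and schedules periodic pod polling

        // Submit the execution; the returned execution ID is the generated pod name.
        String executionId = runner.start(workflowInstance, runSpec);

        // ... once the execution has terminated:
        runner.cleanup(executionId); // deletes the pod
        runner.close();              // closes the watch
    }
}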