com.netflix.genie.web.tasks.leader.ClusterCheckerTask.java Source code

Java tutorial

Introduction

Here is the source code for com.netflix.genie.web.tasks.leader.ClusterCheckerTask.java

Source

/*
 *
 *  Copyright 2016 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.tasks.leader;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.type.TypeFactory;
import com.google.common.base.Splitter;
import com.netflix.genie.common.dto.Job;
import com.netflix.genie.common.dto.JobExecution;
import com.netflix.genie.common.dto.JobStatus;
import com.netflix.genie.common.exceptions.GenieException;
import com.netflix.genie.core.services.JobPersistenceService;
import com.netflix.genie.core.services.JobSearchService;
import com.netflix.genie.web.properties.ClusterCheckerProperties;
import com.netflix.genie.web.tasks.GenieTaskScheduleType;
import com.netflix.spectator.api.Counter;
import com.netflix.spectator.api.Registry;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.actuate.autoconfigure.ManagementServerProperties;
import org.springframework.boot.actuate.health.Status;
import org.springframework.stereotype.Component;
import org.springframework.web.client.HttpStatusCodeException;
import org.springframework.web.client.RestTemplate;

import javax.validation.constraints.NotNull;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A task which checks to see if this leader node can communicate with all other nodes in the cluster. If it can't
 * it will keep track of which nodes it can't communicate with and perform various actions based on the number of times
 * it can't communicate with that node. Currently (as of 3.0) this task will mark jobs as lost if they miss a certain
 * number of checks.
 *
 * @author tgianos
 * @since 3.0.0
 */
@Component
@Slf4j
public class ClusterCheckerTask extends LeadershipTask {
    private static final String PROPERTY_STATUS = "status";

    private final String hostName;
    private final ClusterCheckerProperties properties;
    private final JobSearchService jobSearchService;
    private final JobPersistenceService jobPersistenceService;
    private final RestTemplate restTemplate;
    private final String scheme;
    private final String healthEndpoint;
    private final ObjectMapper mapper = new ObjectMapper();
    private final List<String> healthIndicatorsToIgnore;

    private final Map<String, Integer> errorCounts = new HashMap<>();

    // TODO: Add metrics
    private final Counter lostJobsCounter;
    private final Counter unableToUpdateJobCounter;

    /**
     * Constructor.
     *
     * @param hostName                   The host name of this node
     * @param properties                 The properties to use to configure the task
     * @param jobSearchService           The job search service to use
     * @param jobPersistenceService      The job persistence service to use
     * @param restTemplate               The rest template for http calls
     * @param managementServerProperties The properties where Spring actuator is running
     * @param registry                   The spectator registry for getting metrics
     */
    @Autowired
    public ClusterCheckerTask(@NotNull final String hostName, @NotNull final ClusterCheckerProperties properties,
            @NotNull final JobSearchService jobSearchService,
            @NotNull final JobPersistenceService jobPersistenceService,
            @Qualifier("genieRestTemplate") @NotNull final RestTemplate restTemplate,
            @NotNull final ManagementServerProperties managementServerProperties,
            @NotNull final Registry registry) {
        this.hostName = hostName;
        this.properties = properties;
        this.jobSearchService = jobSearchService;
        this.jobPersistenceService = jobPersistenceService;
        this.restTemplate = restTemplate;
        this.scheme = this.properties.getScheme() + "://";
        this.healthEndpoint = ":" + this.properties.getPort() + managementServerProperties.getContextPath()
                + "/health";
        this.healthIndicatorsToIgnore = Splitter.on(",").omitEmptyStrings().trimResults()
                .splitToList(properties.getHealthIndicatorsToIgnore());
        // Keep track of the number of nodes currently unreachable from the the master
        registry.mapSize("genie.tasks.clusterChecker.errorCounts.gauge", this.errorCounts);
        this.lostJobsCounter = registry.counter("genie.tasks.clusterChecker.lostJobs.rate");
        this.unableToUpdateJobCounter = registry.counter("genie.tasks.clusterChecker.unableToUpdateJob.rate");
    }

    /**
     * Ping the health check endpoint of all other nodes which have running jobs. Track results.
     */
    @Override
    public void run() {
        log.info("Checking for cluster node health...");
        this.jobSearchService.getAllHostsWithActiveJobs().stream().filter(host -> !this.hostName.equals(host))
                .forEach(this::validateHostAndUpdateErrorCount);

        this.errorCounts.entrySet().removeIf(entry -> {
            final String host = entry.getKey();
            boolean result = true;
            if (entry.getValue() >= properties.getLostThreshold()) {
                try {
                    updateJobsToFailedOnHost(host);
                } catch (Exception e) {
                    log.error("Unable to update jobs on host {} due to exception", host, e);
                    unableToUpdateJobCounter.increment();
                    result = false;
                }
            } else {
                result = false;
            }
            return result;
        });
        log.info("Finished checking for cluster node health.");
    }

    private void updateJobsToFailedOnHost(final String host) {
        final Set<Job> jobs = jobSearchService.getAllActiveJobsOnHost(host);
        jobs.forEach(job -> {
            try {
                jobPersistenceService.setJobCompletionInformation(
                        job.getId().orElseThrow(IllegalArgumentException::new), JobExecution.LOST_EXIT_CODE,
                        JobStatus.FAILED,
                        "Genie leader can't reach node running job. Assuming node and job are lost.", null, null);
                lostJobsCounter.increment();
            } catch (final GenieException ge) {
                log.error("Unable to update job {} to failed due to exception", job.getId(), ge);
                unableToUpdateJobCounter.increment();
            }
        });
    }

    private void validateHostAndUpdateErrorCount(final String host) {
        //
        // If node is healthy, remove the entry from the errorCounts.
        // If node is not healthy, update the entry in errorCounts
        //
        if (isNodeHealthy(host)) {
            if (errorCounts.containsKey(host)) {
                errorCounts.remove(host);
            }
        } else {
            if (this.errorCounts.containsKey(host)) {
                this.errorCounts.put(host, this.errorCounts.get(host) + 1);
            } else {
                this.errorCounts.put(host, 1);
            }
        }
    }

    private boolean isNodeHealthy(final String host) {
        //
        // A node is valid and healthy if all health indicators excluding the ones mentioned in healthIndicatorsToIgnore
        // are UP.
        //
        boolean result = true;
        try {
            restTemplate.getForObject(this.scheme + host + this.healthEndpoint, String.class);
        } catch (final HttpStatusCodeException e) {
            log.error("Failed validating host {}", host, e);
            try {
                final Map<String, Object> responseMap = mapper.readValue(e.getResponseBodyAsByteArray(),
                        TypeFactory.defaultInstance().constructMapType(Map.class, String.class, Object.class));
                for (Map.Entry<String, Object> responseEntry : responseMap.entrySet()) {
                    if (responseEntry.getValue() instanceof Map
                            && !healthIndicatorsToIgnore.contains(responseEntry.getKey())
                            && !Status.UP.getCode().equals(((Map) responseEntry.getValue()).get(PROPERTY_STATUS))) {
                        result = false;
                        break;
                    }
                }
            } catch (Exception ex) {
                log.error("Failed reading the error response when validating host {}", host, ex);
                result = false;
            }
        } catch (final Exception e) {
            log.error("Unable to reach {}", host, e);
            result = false;
        }
        return result;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public GenieTaskScheduleType getScheduleType() {
        return GenieTaskScheduleType.FIXED_RATE;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public long getFixedRate() {
        return this.properties.getRate();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void cleanup() {
        this.errorCounts.clear();
    }

    /**
     * Get the current size of error counts. Mainly used for testing.
     *
     * @return Number of nodes currently in an error state
     */
    int getErrorCountsSize() {
        return this.errorCounts.size();
    }
}