io.cassandrareaper.service.SegmentRunner.java Source code

Introduction

Here is the source code for io.cassandrareaper.service.SegmentRunner.java
Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.cassandrareaper.service;

import io.cassandrareaper.AppContext;
import io.cassandrareaper.ReaperApplicationConfiguration.DatacenterAvailability;
import io.cassandrareaper.ReaperException;
import io.cassandrareaper.core.Node;
import io.cassandrareaper.core.NodeMetrics;
import io.cassandrareaper.core.RepairRun;
import io.cassandrareaper.core.RepairSegment;
import io.cassandrareaper.core.RepairUnit;
import io.cassandrareaper.jmx.EndpointSnitchInfoProxy;
import io.cassandrareaper.jmx.JmxProxy;
import io.cassandrareaper.jmx.RepairStatusHandler;
import io.cassandrareaper.jmx.SnapshotProxy;
import io.cassandrareaper.storage.IDistributedStorage;

import java.lang.management.ManagementFactory;
import java.lang.management.OperatingSystemMXBean;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.management.JMException;

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.sun.management.UnixOperatingSystemMXBean;
import org.apache.cassandra.repair.RepairParallelism;
import org.apache.cassandra.service.ActiveRepairService;
import org.apache.cassandra.utils.progress.ProgressEventType;
import org.apache.commons.lang3.concurrent.ConcurrentException;
import org.apache.commons.lang3.concurrent.LazyInitializer;
import org.apache.commons.lang3.tuple.Pair;
import org.joda.time.DateTime;
import org.joda.time.Seconds;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

final class SegmentRunner implements RepairStatusHandler, Runnable {

    // Caching all active SegmentRunners.
    static final Map<UUID, SegmentRunner> SEGMENT_RUNNERS = Maps.newConcurrentMap();

    private static final Logger LOG = LoggerFactory.getLogger(SegmentRunner.class);

    private static final int MAX_TIMEOUT_EXTENSIONS = 10;
    private static final Pattern REPAIR_UUID_PATTERN = Pattern
            .compile("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}");

    private static final long SLEEP_TIME_AFTER_POSTPONE_IN_MS = 10000;
    private static final ExecutorService METRICS_GRABBER_EXECUTOR = Executors.newFixedThreadPool(10);
    private static final long METRICS_POLL_INTERVAL_MS = TimeUnit.SECONDS.toMillis(10);
    private static final long METRICS_MAX_WAIT_MS = TimeUnit.MINUTES.toMillis(2);

    private final AppContext context;
    private final UUID segmentId;
    private final Condition condition = new SimpleCondition();
    private final Collection<String> potentialCoordinators;
    private final long timeoutMillis;
    private final double intensity;
    private final RepairParallelism validationParallelism;
    private final String clusterName;
    private final RepairRunner repairRunner;
    private final RepairUnit repairUnit;
    private volatile int repairNo;
    private final AtomicBoolean segmentFailed;
    private final UUID leaderElectionId;
    private final AtomicBoolean successOrFailedNotified = new AtomicBoolean(false);
    private final AtomicBoolean completeNotified = new AtomicBoolean(false);

    SegmentRunner(AppContext context, UUID segmentId, Collection<String> potentialCoordinators, long timeoutMillis,
            double intensity, RepairParallelism validationParallelism, String clusterName, RepairUnit repairUnit,
            RepairRunner repairRunner) throws ReaperException {

        if (SEGMENT_RUNNERS.containsKey(segmentId)) {
            LOG.error("SegmentRunner already exists for segment with ID: {}", segmentId);
            throw new ReaperException("SegmentRunner already exists for segment with ID: " + segmentId);
        }
        this.context = context;
        this.segmentId = segmentId;
        this.potentialCoordinators = potentialCoordinators;
        this.timeoutMillis = timeoutMillis;
        this.intensity = intensity;
        this.validationParallelism = validationParallelism;
        this.clusterName = clusterName;
        this.repairUnit = repairUnit;
        this.repairRunner = repairRunner;
        this.segmentFailed = new AtomicBoolean(false);
        this.leaderElectionId = repairUnit.getIncrementalRepair() ? repairRunner.getRepairRunId() : segmentId;
    }

    @Override
    public void run() {
        if (takeLead()) {
            try {
                if (runRepair()) {
                    long delay = intensityBasedDelayMillis(intensity);
                    try {
                        Thread.sleep(delay);
                    } catch (InterruptedException e) {
                        LOG.warn("Slept shorter than intended delay.");
                    }
                }
            } finally {
                releaseLead();
            }
        }
    }

    static void postponeSegment(AppContext context, RepairSegment segment) {
        postpone(context, segment, context.storage.getRepairUnit(segment.getRepairUnitId()));
    }

    private static void postpone(AppContext context, RepairSegment segment, RepairUnit repairUnit) {
        LOG.info("Postponing segment {}", segment.getId());
        try {
            context.storage.updateRepairSegment(segment.reset()
                    // set coordinator host to null only for full repairs
                    .withCoordinatorHost(repairUnit.getIncrementalRepair() ? segment.getCoordinatorHost() : null)
                    .withFailCount(segment.getFailCount() + 1).withId(segment.getId()).build());
        } finally {
            SEGMENT_RUNNERS.remove(segment.getId());
            context.metricRegistry.counter(metricNameForPostpone(repairUnit, segment)).inc();
        }
    }

    static void abort(AppContext context, RepairSegment segment, JmxProxy jmxConnection) {
        postpone(context, segment, context.storage.getRepairUnit(segment.getRepairUnitId()));
        LOG.info("Aborting repair on segment with id {} on coordinator {}", segment.getId(),
                segment.getCoordinatorHost());

        String metric = MetricRegistry.name(SegmentRunner.class, "abort",
                Optional.ofNullable(segment.getCoordinatorHost()).orElse("null").replace('.', '-'));

        context.metricRegistry.counter(metric).inc();
        jmxConnection.cancelAllRepairs();
    }

    private void abort(RepairSegment segment, JmxProxy jmxConnection) {
        abort(context, segment, jmxConnection);
    }

    /**
     * Remember to call method postponeCurrentSegment() outside of synchronized(condition) block.
     */
    void postponeCurrentSegment() {
        synchronized (condition) {
            RepairSegment segment = context.storage.getRepairSegment(repairRunner.getRepairRunId(), segmentId)
                    .get();
            postpone(context, segment, context.storage.getRepairUnit(segment.getRepairUnitId()));
        }

        try {
            Thread.sleep(SLEEP_TIME_AFTER_POSTPONE_IN_MS);
        } catch (InterruptedException e) {
            LOG.debug("Interrupted while sleeping after a segment was postponed... weird stuff...");
        }
    }

    /**
     * This method is intended to be temporary, until we find the root issue of too many open files issue.
     */
    private static long getOpenFilesAmount() {
        OperatingSystemMXBean os = ManagementFactory.getOperatingSystemMXBean();
        long amountOfOpenFiles = -1;
        if (os instanceof UnixOperatingSystemMXBean) {
            amountOfOpenFiles = ((UnixOperatingSystemMXBean) os).getOpenFileDescriptorCount();
        }
        return amountOfOpenFiles;
    }

    private boolean runRepair() {
        LOG.debug("Run repair for segment #{}", segmentId);
        RepairSegment segment = context.storage.getRepairSegment(repairRunner.getRepairRunId(), segmentId).get();
        Thread.currentThread().setName(clusterName + ":" + segment.getRunId() + ":" + segmentId);

        try (Timer.Context cxt = context.metricRegistry.timer(metricNameForRunRepair(segment)).time()) {
            JmxProxy coordinator = context.jmxConnectionFactory
                    .connectAny(
                            potentialCoordinators.stream()
                                    .map(host -> Node.builder().withClusterName(clusterName).withHostname(host)
                                            .build())
                                    .collect(Collectors.toSet()),
                            context.config.getJmxConnectionTimeoutInSeconds());

            if (SEGMENT_RUNNERS.containsKey(segmentId)) {
                LOG.error("SegmentRunner already exists for segment with ID: {}", segmentId);
                throw new ReaperException("SegmentRunner already exists for segment with ID: " + segmentId);
            }

            String keyspace = repairUnit.getKeyspaceName();
            boolean fullRepair = !repairUnit.getIncrementalRepair();

            LazyInitializer<Set<String>> busyHosts = new BusyHostsInitializer(coordinator);
            if (!canRepair(segment, keyspace, coordinator, busyHosts)) {
                LOG.info("Cannot run segment {} for repair {} at the moment. Will try again later", segmentId,
                        segment.getRunId());
                SEGMENT_RUNNERS.remove(segment.getId());
                try {
                    Thread.sleep(SLEEP_TIME_AFTER_POSTPONE_IN_MS);
                } catch (InterruptedException e) {
                    LOG.debug("Interrupted while sleeping after a segment was postponed... weird stuff...");
                }
                return false;
            }

            try (Timer.Context cxt1 = context.metricRegistry.timer(metricNameForRepairing(segment)).time()) {
                Set<String> tablesToRepair;
                try {
                    tablesToRepair = getTablesToRepair(coordinator, repairUnit);
                } catch (IllegalStateException e) {
                    String msg = "Invalid blacklist definition. It filtered all tables in the keyspace.";
                    LOG.error(msg, e);
                    RepairRun repairRun = context.storage.getRepairRun(segment.getRunId()).get();
                    context.storage.updateRepairRun(repairRun.with().runState(RepairRun.RunState.ERROR)
                            .lastEvent(String.format(msg)).endTime(DateTime.now()).build(segment.getRunId()));
                    repairRunner.killAndCleanupRunner();

                    context.storage.updateRepairSegment(segment.with().withState(RepairSegment.State.DONE)
                            .withStartTime(DateTime.now()).withEndTime(DateTime.now()).withId(segmentId).build());
                    return false;
                }

                try {
                    LOG.debug("Enter synchronized section with segment ID {}", segmentId);
                    synchronized (condition) {

                        segment = segment.with().withCoordinatorHost(coordinator.getHost())
                                .withStartTime(DateTime.now()).withId(segmentId).build();
                        context.storage.updateRepairSegment(segment);

                        repairNo = coordinator.triggerRepair(segment.getStartToken(), segment.getEndToken(),
                                keyspace, validationParallelism, tablesToRepair, fullRepair,
                                repairUnit.getDatacenters(), this, segment.getTokenRange().getTokenRanges(),
                                repairUnit.getRepairThreadCount());

                        if (0 != repairNo) {
                            processTriggeredSegment(segment, coordinator, repairNo);
                        } else {
                            LOG.info("Nothing to repair for segment {} in keyspace {}", segmentId, keyspace);

                            context.storage.updateRepairSegment(segment.with().withState(RepairSegment.State.DONE)
                                    .withEndTime(DateTime.now()).withId(segmentId).build());

                            SEGMENT_RUNNERS.remove(segment.getId());
                        }
                    }
                } finally {
                    LOG.debug("Exiting synchronized section with segment ID {}", segmentId);
                }
            }
        } catch (RuntimeException | ReaperException e) {
            LOG.warn("Failed to connect to a coordinator node for segment {}", segmentId, e);
            String msg = "Postponed a segment because no coordinator was reachable";
            repairRunner.updateLastEvent(msg);
            postponeCurrentSegment();
            LOG.warn("Open files amount for process: " + getOpenFilesAmount());
            return false;
        } finally {
            SEGMENT_RUNNERS.remove(segment.getId());
            context.metricRegistry.histogram(MetricRegistry.name(SegmentRunner.class, "open-files"))
                    .update(getOpenFilesAmount());
        }
        return true;
    }

    private void processTriggeredSegment(final RepairSegment segment, final JmxProxy coordinator, int repairNo) {

        repairRunner.updateLastEvent(String.format("Triggered repair of segment %s via host %s", segment.getId(),
                coordinator.getHost()));

        {
            long timeout = repairUnit.getIncrementalRepair() ? timeoutMillis * MAX_TIMEOUT_EXTENSIONS
                    : timeoutMillis;
            LOG.info("Repair for segment {} started, status wait will timeout in {} millis", segmentId, timeout);
        }

        try {
            final long startTime = System.currentTimeMillis();
            final long maxTime = startTime + timeoutMillis;
            final long waitTime = Math.min(timeoutMillis, 60000);
            long lastLoopTime = startTime;

            while (System.currentTimeMillis() < maxTime) {
                condition.await(waitTime, TimeUnit.MILLISECONDS);

                boolean isDoneOrTimedOut = lastLoopTime + 60_000 > System.currentTimeMillis();

                isDoneOrTimedOut |= RepairSegment.State.DONE == context.storage
                        .getRepairSegment(segment.getRunId(), segmentId).get().getState();

                if (isDoneOrTimedOut) {
                    break;
                }
                renewLead();
                lastLoopTime = System.currentTimeMillis();
            }
        } catch (InterruptedException e) {
            LOG.warn("Repair command {} on segment {} interrupted", this.repairNo, segmentId, e);
        } finally {
            coordinator.removeRepairStatusHandler(repairNo);
            RepairSegment resultingSegment = context.storage
                    .getRepairSegment(repairRunner.getRepairRunId(), segmentId).get();

            LOG.info("Repair command {} on segment {} returned with state {}", this.repairNo, segmentId,
                    resultingSegment.getState());

            if (RepairSegment.State.RUNNING == resultingSegment.getState()) {
                LOG.info("Repair command {} on segment {} has been cancelled while running", this.repairNo,
                        segmentId);
                segmentFailed.set(true);
                abort(resultingSegment, coordinator);

            } else if (RepairSegment.State.DONE == resultingSegment.getState()) {

                LOG.debug("Repair segment with id '{}' was repaired in {} seconds", resultingSegment.getId(),
                        Seconds.secondsBetween(resultingSegment.getStartTime(), resultingSegment.getEndTime())
                                .getSeconds());

                SEGMENT_RUNNERS.remove(resultingSegment.getId());
            } else {
                // Something went wrong on the coordinator node and we never got the RUNNING notification
                // or we are in an undetermined state.
                // Let's just abort and reschedule the segment.
                LOG.info("Repair command {} on segment {} never managed to start within timeout.", this.repairNo,
                        segmentId);
                segmentFailed.set(true);
                abort(resultingSegment, coordinator);
            }
            // Repair is still running, we'll renew lead on the segment when using Cassandra as storage backend
            renewLead();
        }
    }

    private static String metricNameForPostpone(RepairUnit unit, RepairSegment segment) {
        return MetricRegistry.name(SegmentRunner.class, "postpone",
                Optional.ofNullable(segment.getCoordinatorHost()).orElse("null").replace('.', '-'),
                unit.getClusterName().replace('.', '-'), unit.getKeyspaceName());
    }

    private String metricNameForRepairing(RepairSegment rs) {
        return MetricRegistry.name(SegmentRunner.class, "repairing",
                Optional.ofNullable(rs.getCoordinatorHost()).orElse("null").replace('.', '-'),
                clusterName.replace('.', '-'), repairUnit.getKeyspaceName());
    }

    private String metricNameForRunRepair(RepairSegment rs) {
        return MetricRegistry.name(SegmentRunner.class, "runRepair",
                Optional.ofNullable(rs.getCoordinatorHost()).orElse("null").replace('.', '-'),
                clusterName.replace('.', '-'), repairUnit.getKeyspaceName());
    }

    private void declineRun() {
        LOG.info("SegmentRunner declined to repair segment {} "
                + "because only one segment is allowed at once for incremental repairs", segmentId);

        String msg = "Postponed due to already running segment";
        repairRunner.updateLastEvent(msg);
    }

    boolean canRepair(RepairSegment segment, String keyspace, JmxProxy coordinator,
            LazyInitializer<Set<String>> busyHosts) {

        if (repairUnit.getIncrementalRepair()) {
            // In incremental repairs, only one segment is allowed at once (one segment == the full primary range of one node)
            if (repairHasSegmentRunning(segment.getRunId())) {
                declineRun();
                return false;
            }
            if (isRepairRunningOnOneNode(segment)) {
                declineRun();
                return false;
            }
            return true;
        }

        Collection<String> nodes;
        try {
            // when hosts are coming up or going down, this method can throw an
            //  UndeclaredThrowableException
            nodes = coordinator.tokenRangeToEndpoint(keyspace, segment.getTokenRange());
        } catch (RuntimeException e) {
            LOG.warn("SegmentRunner couldn't get token ranges from coordinator: ", e);
            String msg = "SegmentRunner couldn't get token ranges from coordinator";
            repairRunner.updateLastEvent(msg);
            return false;
        }
        String dc = EndpointSnitchInfoProxy.create(coordinator).getDataCenter();
        boolean allLocalDcHosts = true;
        boolean allHosts = true;
        Map<String, String> dcByNode = Maps.newHashMap();
        nodes.forEach(node -> dcByNode.put(node, EndpointSnitchInfoProxy.create(coordinator).getDataCenter(node)));

        List<Callable<Pair<String, Optional<NodeMetrics>>>> getMetricsTasks = nodes.stream()
                .filter(node -> repairUnit.getDatacenters().isEmpty()
                        || repairUnit.getDatacenters().contains(dcByNode.get(node)))
                .map(node -> getNodeMetrics(node, dc != null ? dc : "",
                        dcByNode.get(node) != null ? dcByNode.get(node) : ""))
                .collect(Collectors.toList());

        try {
            for (Future<Pair<String, Optional<NodeMetrics>>> future : METRICS_GRABBER_EXECUTOR
                    .invokeAll(getMetricsTasks)) {
                try {
                    Pair<String, Optional<NodeMetrics>> result = future.get();
                    if (!result.getRight().isPresent()) {
                        // We failed at getting metrics for that node
                        allHosts = false;
                        if (dcByNode.get(result.getLeft()).equals(dc)) {
                            allLocalDcHosts = false;
                        }
                    } else {
                        NodeMetrics metrics = result.getRight().get();
                        int pendingCompactions = metrics.getPendingCompactions();
                        if (pendingCompactions > context.config.getMaxPendingCompactions()) {
                            LOG.info(
                                    "SegmentRunner declined to repair segment {} because of"
                                            + " too many pending compactions (> {}) on host \"{}\"",
                                    segmentId, context.config.getMaxPendingCompactions(), metrics.getNode());

                            String msg = String.format("Postponed due to pending compactions (%d)",
                                    pendingCompactions);
                            repairRunner.updateLastEvent(msg);
                            return false;
                        }
                        if (metrics.hasRepairRunning()) {
                            LOG.info(
                                    "SegmentRunner declined to repair segment {} because one of the hosts ({}) was "
                                            + "already involved in a repair",
                                    segmentId, metrics.getNode());

                            String msg = "Postponed due to affected hosts already doing repairs";
                            repairRunner.updateLastEvent(msg);
                            handlePotentialStuckRepairs(busyHosts, metrics.getNode());
                            return false;
                        }
                    }
                } catch (InterruptedException | ExecutionException | ConcurrentException e) {
                    LOG.warn("Failed grabbing metrics from at least one node. Cannot repair segment :'(", e);
                    allLocalDcHosts = false;
                    allHosts = false;
                }
            }
        } catch (InterruptedException e) {
            LOG.debug("failed grabbing nodes metrics", e);
        }

        if (okToRepairSegment(allLocalDcHosts, allHosts, context.config.getDatacenterAvailability())) {
            LOG.info("It is ok to repair segment '{}' on repair run with id '{}'", segment.getId(),
                    segment.getRunId());
            return true;
        } else {
            LOG.info(
                    "Not ok to repair segment '{}' on repair run with id '{}' because we couldn't get all hosts metrics :'(",
                    segment.getId(), segment.getRunId());
            return false;
        }
    }

    static boolean okToRepairSegment(boolean allLocalDcHosts, boolean allHosts,
            DatacenterAvailability dcAvailability) {
        return allHosts || (allLocalDcHosts && DatacenterAvailability.LOCAL == dcAvailability);
    }

    private void handlePotentialStuckRepairs(LazyInitializer<Set<String>> busyHosts, String hostName)
            throws ConcurrentException {

        if (!busyHosts.get().contains(hostName) && context.storage instanceof IDistributedStorage) {
            try {
                JmxProxy hostProxy = context.jmxConnectionFactory.connect(
                        Node.builder().withClusterName(clusterName).withHostname(hostName).build(),
                        context.config.getJmxConnectionTimeoutInSeconds());
                // We double check that repair is still running there before actually canceling repairs
                if (hostProxy.isRepairRunning()) {
                    LOG.warn("A host ({}) reported that it is involved in a repair, but there is no record "
                            + "of any ongoing repair involving the host. Sending command to abort all repairs "
                            + "on the host.", hostName);
                    hostProxy.cancelAllRepairs();
                }
            } catch (ReaperException | RuntimeException | InterruptedException | JMException e) {
                LOG.debug("failed to cancel repairs on host {}", hostName, e);
            }
        }
    }

    Callable<Pair<String, Optional<NodeMetrics>>> getNodeMetrics(String node, String localDc, String nodeDc) {

        return () -> {
            LOG.debug("getMetricsForHost {} / {} / {}", node, localDc, nodeDc);

            if (DatacenterAvailability.ALL != context.config.getDatacenterAvailability() && !nodeDc.equals(localDc)
                    && context.storage instanceof IDistributedStorage) {
                // If DatacenterAvailability is not ALL, we should assume jmx on remote dc is not reachable.
                return Pair.of(node, getRemoteNodeMetrics(node, nodeDc));
            } else {
                try {
                    JmxProxy nodeProxy = context.jmxConnectionFactory.connect(
                            Node.builder().withClusterName(clusterName).withHostname(node).build(),
                            context.config.getJmxConnectionTimeoutInSeconds());

                    NodeMetrics metrics = NodeMetrics.builder().withNode(node).withDatacenter(nodeDc)
                            .withCluster(nodeProxy.getClusterName())
                            .withPendingCompactions(nodeProxy.getPendingCompactions())
                            .withHasRepairRunning(nodeProxy.isRepairRunning()).withActiveAnticompactions(0) // for future use
                            .build();

                    return Pair.of(node, Optional.of(metrics));
                } catch (RuntimeException | ReaperException e) {
                    LOG.debug("failed to query metrics for host {}, trying to get metrics from storage...", node,
                            e);
                    return Pair.of(node, getRemoteNodeMetrics(node, nodeDc));
                }
            }
        };
    }

    private Optional<NodeMetrics> getRemoteNodeMetrics(String node, String nodeDc) {
        Preconditions.checkState(DatacenterAvailability.ALL != context.config.getDatacenterAvailability());

        Optional<NodeMetrics> result = Optional.empty();
        if (context.storage instanceof IDistributedStorage) {
            IDistributedStorage storage = ((IDistributedStorage) context.storage);
            result = storage.getNodeMetrics(repairRunner.getRepairRunId(), node);

            if (!result.isPresent() && DatacenterAvailability.EACH == context.config.getDatacenterAvailability()) {
                // Sending a request for metrics to the other reaper instances through the Cassandra backend
                storeNodeMetrics(NodeMetrics.builder().withCluster(clusterName).withDatacenter(nodeDc)
                        .withNode(node).withRequested(true).build());

                long start = System.currentTimeMillis();

                while ((!result.isPresent() || result.get().isRequested())
                        && start + METRICS_MAX_WAIT_MS > System.currentTimeMillis()) {

                    try {
                        Thread.sleep(METRICS_POLL_INTERVAL_MS);
                    } catch (InterruptedException ignore) {
                    }
                    LOG.info("Trying to get metrics from remote DCs for {} in {} of {}", node, nodeDc, clusterName);
                    result = storage.getNodeMetrics(repairRunner.getRepairRunId(), node);
                }
            }
        }
        return result;
    }

    private boolean isRepairRunningOnOneNode(RepairSegment segment) {
        for (RepairSegment segmentInRun : context.storage.getRepairSegmentsForRun(segment.getRunId())) {
            try {
                JmxProxy hostProxy = context.jmxConnectionFactory
                        .connect(
                                Node.builder().withClusterName(clusterName)
                                        .withHostname(segmentInRun.getCoordinatorHost()).build(),
                                context.config.getJmxConnectionTimeoutInSeconds());
                if (hostProxy.isRepairRunning()) {
                    return true;
                }
            } catch (ReaperException | JMException | NumberFormatException | InterruptedException e) {
                LOG.error("Unreachable node when trying to determine if repair is running on a node."
                        + " Crossing fingers and continuing...", e);
            }
        }

        return false;
    }

    private boolean repairHasSegmentRunning(UUID repairRunId) {
        Collection<RepairSegment> segments = context.storage.getRepairSegmentsForRun(repairRunId);
        for (RepairSegment segment : segments) {
            if (segment.getState() == RepairSegment.State.RUNNING) {
                LOG.info("segment '{}' is running on host '{}'", segment.getId(), segment.getCoordinatorHost());
                return true;
            }
        }

        return false;
    }

    private void storeNodeMetrics(NodeMetrics metrics) {
        assert context.storage instanceof IDistributedStorage;
        if (DatacenterAvailability.ALL != context.config.getDatacenterAvailability()) {
            ((IDistributedStorage) context.storage).storeNodeMetrics(repairRunner.getRepairRunId(), metrics);
        }
    }

    /**
     * Called when there is an event coming either from JMX or this runner regarding on-going repairs.
     *
     * @param repairNo repair sequence number, obtained when triggering a repair
     * @param status new status of the repair
     * @param message additional information about the repair
     */
    @Override
    public void handle(int repairNo, Optional<ActiveRepairService.Status> status,
            Optional<ProgressEventType> progress, String message, JmxProxy jmxProxy) {

        final RepairSegment segment = context.storage.getRepairSegment(repairRunner.getRepairRunId(), segmentId)
                .get();
        Thread.currentThread().setName(clusterName + ":" + segment.getRunId() + ":" + segmentId);
        LOG.debug("handle called for repairCommandId {}, outcome {} / {} and message: {}", repairNo, status,
                progress, message);

        Preconditions.checkArgument(repairNo == this.repairNo,
                "Handler for command id %s not handling message with number %s", this.repairNo, repairNo);

        boolean failOutsideSynchronizedBlock = false;
        // DO NOT ADD EXTERNAL CALLS INSIDE THIS SYNCHRONIZED BLOCK (JMX PROXY ETC)
        synchronized (condition) {
            RepairSegment currentSegment = context.storage
                    .getRepairSegment(repairRunner.getRepairRunId(), segmentId).get();
            // See status explanations at: https://wiki.apache.org/cassandra/RepairAsyncAPI
            // Old repair API  up to Cassandra-2.1.x
            if (status.isPresent()) {
                failOutsideSynchronizedBlock = handleJmxNotificationForCassandra21(status, currentSegment, repairNo,
                        failOutsideSynchronizedBlock, progress, jmxProxy);
            }
            // New repair API  Cassandra-2.2 onwards
            if (progress.isPresent()) {
                failOutsideSynchronizedBlock = handleJmxNotificationForCassandra22(progress, currentSegment,
                        repairNo, failOutsideSynchronizedBlock, jmxProxy);
            }
        }

        if (failOutsideSynchronizedBlock) {
            if (takeLead() || renewLead()) {
                try {
                    postponeCurrentSegment();
                    tryClearSnapshots(message);
                } finally {
                    // if someone else does hold the lease, ie renewLead(..) was true,
                    // then their writes to repair_run table and any call to releaseLead(..) will throw an exception
                    try {
                        releaseLead();
                    } catch (AssertionError ignore) {
                    }
                }
            }
        }
    }

    private boolean handleJmxNotificationForCassandra22(Optional<ProgressEventType> progress,
            RepairSegment currentSegment, int repairNumber, boolean failOutsideSynchronizedBlock,
            JmxProxy jmxProxy) {

        switch (progress.get()) {
        case START:
            try {
                // avoid changing state to RUNNING if later notifications have already arrived
                if (!successOrFailedNotified.get() && RepairSegment.State.NOT_STARTED == currentSegment.getState()
                        && renewLead()) {

                    context.storage.updateRepairSegment(
                            currentSegment.with().withState(RepairSegment.State.RUNNING).withId(segmentId).build());

                    LOG.debug("updated segment {} with state {}", segmentId, RepairSegment.State.RUNNING);
                    break;
                }
            } catch (AssertionError er) {
                // ignore. segment repair has since timed out.
            }
            segmentFailed.set(true);
            break;

        case SUCCESS:

            Preconditions.checkState(!successOrFailedNotified.get(),
                    "illegal multiple 'SUCCESS' and 'FAILURE', %s:%s", repairRunner.getRepairRunId(), segmentId);

            try {
                if (segmentFailed.get()) {
                    LOG.debug(
                            "Got SUCCESS for segment with id '{}' and repair number '{}', but it had already timed out",
                            segmentId, repairNumber);
                } else if (renewLead()) {
                    LOG.debug("repair session succeeded for segment with id '{}' and repair number '{}'", segmentId,
                            repairNumber);

                    context.storage.updateRepairSegment(currentSegment.with().withState(RepairSegment.State.DONE)
                            .withEndTime(DateTime.now()).withId(segmentId).build());

                    successOrFailedNotified.set(true);
                    // Since we can get out of order notifications,
                    // we need to exit if we already got the COMPLETE notification.
                    if (completeNotified.get()) {
                        condition.signalAll();
                        jmxProxy.removeRepairStatusHandler(repairNumber);
                    }
                    break;
                }
            } catch (AssertionError er) {
                // ignore. segment repair has since timed out.
            }
            segmentFailed.set(true);
            break;

        case ERROR:
        case ABORT:

            Preconditions.checkState(!successOrFailedNotified.get(),
                    "illegal multiple 'SUCCESS' and 'FAILURE', %s:%s", repairRunner.getRepairRunId(), segmentId);

            LOG.warn("repair session failed for segment with id '{}' and repair number '{}'", segmentId,
                    repairNumber);
            failOutsideSynchronizedBlock = true;
            successOrFailedNotified.set(true);
            // Since we can get out of order notifications,
            // we need to exit if we already got the COMPLETE notification.
            if (completeNotified.get()) {
                condition.signalAll();
                jmxProxy.removeRepairStatusHandler(repairNumber);
            }
            break;

        case COMPLETE:
            // This gets called through the JMX proxy at the end
            // regardless of succeeded or failed sessions.
            // Since we can get out of order notifications,
            // we won't exit unless we already got a SUCCESS or ERROR notification.

            Preconditions.checkState(!completeNotified.get(), "illegal multiple 'COMPLETE', %s:%s",
                    repairRunner.getRepairRunId(), segmentId);

            LOG.debug("repair session finished for segment with id '{}' and repair number '{}'", segmentId,
                    repairNumber);
            completeNotified.set(true);
            if (successOrFailedNotified.get()) {
                condition.signalAll();
                jmxProxy.removeRepairStatusHandler(repairNumber);
            }
            break;
        default:
            LOG.debug("Unidentified progressStatus {} for segment with id '{}' and repair number '{}'",
                    progress.get(), segmentId, repairNumber);
        }
        return failOutsideSynchronizedBlock;
    }

    private boolean handleJmxNotificationForCassandra21(Optional<ActiveRepairService.Status> status,
            RepairSegment currentSegment, int repairNumber, boolean failOutsideSynchronizedBlock,
            Optional<ProgressEventType> progress, JmxProxy jmxProxy) {

        switch (status.get()) {
        case STARTED:
            try {
                // avoid changing state to RUNNING if later notifications have already arrived
                if (!successOrFailedNotified.get() && RepairSegment.State.NOT_STARTED == currentSegment.getState()
                        && renewLead()) {

                    context.storage.updateRepairSegment(
                            currentSegment.with().withState(RepairSegment.State.RUNNING).withId(segmentId).build());

                    LOG.debug("updated segment {} with state {}", segmentId, RepairSegment.State.RUNNING);
                    break;
                }
            } catch (AssertionError er) {
                // ignore. segment repair has since timed out.
            }
            segmentFailed.set(true);
            break;

        case SESSION_SUCCESS:
            // Cassandra 2.1 sends several SUCCESS/FAILED notifications during incremental repair
            if (!(repairUnit.getIncrementalRepair() && successOrFailedNotified.get())) {
                Preconditions.checkState(!successOrFailedNotified.get(),
                        "illegal multiple 'SUCCESS' and 'FAILURE', %s:%s", repairRunner.getRepairRunId(),
                        segmentId);

                try {
                    if (segmentFailed.get()) {
                        LOG.debug(
                                "Got SESSION_SUCCESS for segment with id '{}' and repair number '{}', but it had already timed out",
                                segmentId, repairNumber);
                    } else if (renewLead()) {
                        LOG.debug("repair session succeeded for segment with id '{}' and repair number '{}'",
                                segmentId, repairNumber);

                        context.storage
                                .updateRepairSegment(currentSegment.with().withState(RepairSegment.State.DONE)
                                        .withEndTime(DateTime.now()).withId(segmentId).build());

                        // Since we can get out of order notifications,
                        // we need to exit if we already got the COMPLETE notification.
                        successOrFailedNotified.set(true);
                        if (completeNotified.get()) {
                            condition.signalAll();
                            jmxProxy.removeRepairStatusHandler(repairNumber);
                        }

                        break;
                    }
                } catch (AssertionError er) {
                    // ignore. segment repair has since timed out.
                }
                segmentFailed.set(true);
                break;
            }
            break;

        case SESSION_FAILED:
            // Cassandra 2.1 sends several SUCCESS/FAILED notifications during incremental repair
            if (!(repairUnit.getIncrementalRepair() && successOrFailedNotified.get())) {
                Preconditions.checkState(!successOrFailedNotified.get(),
                        "illegal multiple 'SUCCESS' and 'FAILURE', %s:%s", repairRunner.getRepairRunId(),
                        segmentId);

                LOG.warn("repair session failed for segment with id '{}' and repair number '{}'", segmentId,
                        repairNumber);
                failOutsideSynchronizedBlock = true;
                // Since we can get out of order notifications,
                // we need to exit if we already got the COMPLETE notification.
                successOrFailedNotified.set(true);
                if (completeNotified.get()) {
                    condition.signalAll();
                    jmxProxy.removeRepairStatusHandler(repairNumber);
                }
                break;
            }
            break;

        case FINISHED:

            Preconditions.checkState(!completeNotified.get(), "illegal multiple 'COMPLETE', %s:%s",
                    repairRunner.getRepairRunId(), segmentId);

            // This gets called through the JMX proxy at the end
            // regardless of succeeded or failed sessions.
            // Since we can get out of order notifications,
            // we won't exit unless we already got a SUCCESS or ERROR notification.
            LOG.debug("repair session finished for segment with id '{}' and repair number '{}'", segmentId,
                    repairNumber);
            if (successOrFailedNotified.get()) {
                condition.signalAll();
                jmxProxy.removeRepairStatusHandler(repairNumber);
            }
            break;
        default:
            LOG.debug("Unidentified progressStatus {} for segment with id '{}' and repair number '{}'",
                    progress.get(), segmentId, repairNumber);
        }
        return failOutsideSynchronizedBlock;
    }

    /**
     * Attempts to clear snapshots that are possibly left behind after failed repair sessions.
     */
    void tryClearSnapshots(String message) {
        String keyspace = repairUnit.getKeyspaceName();
        String repairId = parseRepairId(message);
        if (repairId != null) {
            for (String involvedNode : potentialCoordinators) {
                try {
                    JmxProxy jmx = context.jmxConnectionFactory.connect(
                            Node.builder().withClusterName(clusterName).withHostname(involvedNode).build(),
                            context.config.getJmxConnectionTimeoutInSeconds());

                    // there is no way of telling if the snapshot was cleared or not :(
                    SnapshotProxy.create(jmx).clearSnapshot(repairId, keyspace);
                } catch (ReaperException | NumberFormatException | InterruptedException e) {
                    LOG.warn("Failed to clear snapshot after failed session for host {}, keyspace {}: {}",
                            involvedNode, keyspace, e.getMessage(), e);
                }
            }
        }
    }

    static String parseRepairId(String message) {
        Matcher uuidMatcher = REPAIR_UUID_PATTERN.matcher(message);
        if (uuidMatcher.find()) {
            return uuidMatcher.group();
        } else {
            return null;
        }
    }

    /**
     * Calculate the delay that should be used before starting the next repair segment.
     *
     * @return the delay in milliseconds.
     */
    long intensityBasedDelayMillis(double intensity) {
        RepairSegment repairSegment = context.storage.getRepairSegment(repairRunner.getRepairRunId(), segmentId)
                .get();
        if (repairSegment.getEndTime() == null && repairSegment.getStartTime() == null) {
            return 0;
        } else if (repairSegment.getEndTime() != null && repairSegment.getStartTime() != null) {
            long repairEnd = repairSegment.getEndTime().getMillis();
            long repairStart = repairSegment.getStartTime().getMillis();
            long repairDuration = Math.max(1, repairEnd - repairStart);
            long delay = (long) (repairDuration / intensity - repairDuration);
            LOG.debug("Scheduling next runner run() with delay {} ms", delay);
            int nbRunningReapers = countRunningReapers();
            LOG.debug("Concurrent reaper instances : {}", nbRunningReapers);
            return delay * nbRunningReapers;
        } else {
            LOG.error(
                    "Segment {} returned with startTime {} and endTime {}. This should not happen."
                            + "Intensity cannot apply, so next run will start immediately.",
                    repairSegment.getId(), repairSegment.getStartTime(), repairSegment.getEndTime());
            return 0;
        }
    }

    private boolean takeLead() {
        try (Timer.Context cx = context.metricRegistry.timer(MetricRegistry.name(SegmentRunner.class, "takeLead"))
                .time()) {

            boolean result = context.storage instanceof IDistributedStorage
                    ? ((IDistributedStorage) context.storage).takeLead(leaderElectionId)
                    : true;

            if (!result) {
                context.metricRegistry.counter(MetricRegistry.name(SegmentRunner.class, "takeLead", "failed"))
                        .inc();
            }
            return result;
        }
    }

    private boolean renewLead() {
        try (Timer.Context cx = context.metricRegistry.timer(MetricRegistry.name(SegmentRunner.class, "renewLead"))
                .time()) {

            boolean result = context.storage instanceof IDistributedStorage
                    ? ((IDistributedStorage) context.storage).renewLead(leaderElectionId)
                    : true;

            if (!result) {
                context.metricRegistry.counter(MetricRegistry.name(SegmentRunner.class, "renewLead", "failed"))
                        .inc();
            }
            return result;
        }
    }

    private void releaseLead() {
        try (Timer.Context cx = context.metricRegistry
                .timer(MetricRegistry.name(SegmentRunner.class, "releaseLead")).time()) {
            if (context.storage instanceof IDistributedStorage) {
                ((IDistributedStorage) context.storage).releaseLead(leaderElectionId);
            }
        }
    }

    private int countRunningReapers() {
        return context.storage instanceof IDistributedStorage
                ? ((IDistributedStorage) context.storage).countRunningReapers()
                : 1;
    }

    /**
     * Applies blacklist filter on tables for the given repair unit.
     *
     * @param coordinator : a JMX proxy instance
     * @param unit : the repair unit for the current run
     * @return the list of tables to repair for the keyspace without the blacklisted ones
     * @throws ReaperException, IllegalStateException
     */
    static Set<String> getTablesToRepair(JmxProxy coordinator, RepairUnit unit)
            throws ReaperException, IllegalStateException {
        Set<String> tables = unit.getColumnFamilies();

        if (!unit.getBlacklistedTables().isEmpty() && unit.getColumnFamilies().isEmpty()) {
            tables = coordinator.getTableNamesForKeyspace(unit.getKeyspaceName()).stream()
                    .filter(tableName -> !unit.getBlacklistedTables().contains(tableName))
                    .collect(Collectors.toSet());
        }

        if (!unit.getBlacklistedTables().isEmpty() && !unit.getColumnFamilies().isEmpty()) {
            tables = unit.getColumnFamilies().stream()
                    .filter(tableName -> !unit.getBlacklistedTables().contains(tableName))
                    .collect(Collectors.toSet());
        }

        Preconditions.checkState(!(!unit.getBlacklistedTables().isEmpty() && tables.isEmpty())); // if we have a blacklist, we should have tables in the output.

        return tables;
    }

    private class BusyHostsInitializer extends LazyInitializer<Set<String>> {

        private final JmxProxy coordinator;

        BusyHostsInitializer(JmxProxy coordinator) {
            this.coordinator = coordinator;
        }

        @Override
        protected Set<String> initialize() {
            Collection<RepairParameters> ongoingRepairs = context.storage.getOngoingRepairsInCluster(clusterName);
            Set<String> busyHosts = Sets.newHashSet();
            ongoingRepairs.forEach((ongoingRepair) -> {
                busyHosts.addAll(
                        coordinator.tokenRangeToEndpoint(ongoingRepair.keyspaceName, ongoingRepair.tokenRange));
            });
            return busyHosts;
        }
    }

}