Java tutorial
/*
 * Copyright (c) 2014 Spotify AB.
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.helios.agent;

import com.google.common.base.Function;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.AbstractIdleService;
import com.google.common.util.concurrent.MoreExecutors;

import com.fasterxml.jackson.core.type.TypeReference;
import com.spotify.helios.common.descriptors.JobId;
import com.spotify.helios.common.descriptors.TaskStatus;
import com.spotify.helios.common.descriptors.TaskStatusEvent;
import com.spotify.helios.servicescommon.PersistentAtomicReference;
import com.spotify.helios.servicescommon.coordination.Paths;
import com.spotify.helios.servicescommon.coordination.ZooKeeperClient;

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.ConnectionLossException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import static com.google.common.base.Preconditions.checkState;
import static java.util.concurrent.TimeUnit.SECONDS;

/**
 * Writes task history to ZooKeeper, attempting to handle gracefully the case where ZK is down,
 * and tries to lose the right things if it has to drop events.
 *
 * Just some breadcrumbs so that, next time, the person who follows me can understand why things
 * are the way they are.
 *
 * Theory of operation:
 * 1. saveHistoryItem should never block for any significant amount of time. Specifically, it
 *    should not block on ZK being in any particular state, and ideally not while a file write
 *    is occurring, as the file may get large if ZK has been away for a long time.
 * 2. We limit each job to at most 30 events in memory (and in ZK, for that matter).
 * 3. We keep a maximum of 600 events in total, so as not to consume all available memory.
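 *
 * To illustrate points 2 and 3 (the {@code writer}, {@code jobId}, and {@code taskStatus} below
 * are assumed to exist in the caller; they are not defined by this class): if a job already has
 * 30 queued events because ZK has been unreachable, then
 * <pre>{@code
 *   writer.saveHistoryItem(jobId, taskStatus);  // 31st event for this job
 * }</pre>
 * silently drops that job's oldest queued event rather than blocking or growing without bound.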
 */
public class QueueingHistoryWriter extends AbstractIdleService implements Runnable {

  private static final Logger log = LoggerFactory.getLogger(QueueingHistoryWriter.class);

  public static final int MAX_NUMBER_STATUS_EVENTS_TO_RETAIN = 30;

  private static final int MAX_QUEUE_SIZE = 30;
  private static final int MAX_TOTAL_SIZE = 600;

  private final ConcurrentMap<JobId, Deque<TaskStatusEvent>> items;
  private final ScheduledExecutorService zkWriterExecutor =
      MoreExecutors.getExitingScheduledExecutorService(
          (ScheduledThreadPoolExecutor) Executors.newScheduledThreadPool(1), 0, SECONDS);
  private final String hostname;
  private final AtomicInteger count;
  private final ZooKeeperClient client;
  private final PersistentAtomicReference<ConcurrentMap<JobId, Deque<TaskStatusEvent>>>
      backingStore;

  public QueueingHistoryWriter(final String hostname, final ZooKeeperClient client,
                               final Path backingFile)
      throws IOException, InterruptedException {
    this.hostname = hostname;
    this.client = client;
    this.backingStore = PersistentAtomicReference.create(backingFile,
        new TypeReference<ConcurrentMap<JobId, Deque<TaskStatusEvent>>>() {},
        new Supplier<ConcurrentMap<JobId, Deque<TaskStatusEvent>>>() {
          @Override
          public ConcurrentMap<JobId, Deque<TaskStatusEvent>> get() {
            return Maps.newConcurrentMap();
          }
        });
    this.items = backingStore.get();

    // Clean out any errant null values. Normally shouldn't have any, but we did have a few
    // where it happened, and this will make sure we can get out of a bad state if we get into it.
    final ImmutableSet<JobId> curKeys = ImmutableSet.copyOf(this.items.keySet());
    for (JobId key : curKeys) {
      if (this.items.get(key) == null) {
        this.items.remove(key);
      }
    }

    int itemCount = 0;
    for (Deque<TaskStatusEvent> deque : items.values()) {
      itemCount += deque.size();
    }
    this.count = new AtomicInteger(itemCount);
  }

  @Override
  protected void startUp() throws Exception {
    zkWriterExecutor.scheduleAtFixedRate(this, 1, 1, TimeUnit.SECONDS);
  }

  @Override
  protected void shutDown() throws Exception {
    zkWriterExecutor.shutdownNow();
    zkWriterExecutor.awaitTermination(1, TimeUnit.MINUTES);
  }

  private void add(TaskStatusEvent item) throws InterruptedException {
    // If too many "globally", toss them
    while (count.get() >= MAX_TOTAL_SIZE) {
      getNext();
    }

    final JobId key = item.getStatus().getJob().getId();
    final Deque<TaskStatusEvent> deque = getDeque(key);
    synchronized (deque) {
      // if too many in the particular deque, toss them
      while (deque.size() >= MAX_QUEUE_SIZE) {
        deque.remove();
        count.decrementAndGet();
      }
      deque.add(item);
      count.incrementAndGet();
    }

    try {
      backingStore.set(items);
    } catch (ClosedByInterruptException e) {
      log.debug("Writing task status event to backing store was interrupted");
    } catch (IOException e) {
      // We are best effort after all...
      log.warn("Failed to write task status event to backing store", e);
    }
  }

  private Deque<TaskStatusEvent> getDeque(final JobId key) {
    synchronized (items) {
      final Deque<TaskStatusEvent> deque = items.get(key);
      if (deque == null) {
        // No deque registered for this job yet, so create one and register it.
        final ConcurrentLinkedDeque<TaskStatusEvent> newDeque =
            new ConcurrentLinkedDeque<TaskStatusEvent>();
        items.put(key, newDeque);
        return newDeque;
      }
      return deque;
    }
  }

  public void saveHistoryItem(final JobId jobId, final TaskStatus status)
      throws InterruptedException {
    saveHistoryItem(jobId, status, System.currentTimeMillis());
  }

  public void saveHistoryItem(final JobId jobId, final TaskStatus status, long timestamp)
      throws InterruptedException {
    add(new TaskStatusEvent(status, timestamp, hostname));
  }

  private TaskStatusEvent getNext() {
    // Some explanation: We first find the eldest event from amongst the queues (ok, they're
    // deques, but we really use them as put-back queues), and only then do we try to get
    // a lock on the relevant queue from which we got the event. Assuming that all worked
    // *and* that the event we have wasn't rolled off due to max-size limitations, we then
    // pull the item off the queue and return it. We're basically doing optimistic concurrency,
    // and skewing things so that adding to this should be cheap.
    while (true) {
      final TaskStatusEvent current = findEldestEvent();

      // Didn't find anything that needed processing?
      if (current == null) {
        return null;
      }

      final JobId id = current.getStatus().getJob().getId();
      final Deque<TaskStatusEvent> deque = items.get(id);
      if (deque == null) {
        // shouldn't happen because we should be the only one pulling items off, but....
        continue;
      }

      synchronized (deque) {
        final TaskStatusEvent head = deque.peek();
        if (head == null || !head.equals(current)) {
          // Item got rolled off (or the deque was drained concurrently), try again.
          continue;
        }

        // Pull it off the queue and be paranoid.
        final TaskStatusEvent newCurrent = deque.poll();
        count.decrementAndGet();
        checkState(current.equals(newCurrent), "current should equal newCurrent");

        // Safe because this is the *only* place we hold these two locks at the same time.
        synchronized (items) {
          // Extra paranoia: curDeque should always == deque
          final Deque<TaskStatusEvent> curDeque = items.get(id);
          if (curDeque != null && curDeque.isEmpty()) {
            items.remove(id);
          }
        }

        return current;
      }
    }
  }

  public boolean isEmpty() {
    return count.get() == 0;
  }

  private void putBack(TaskStatusEvent event) {
    final JobId key = event.getStatus().getJob().getId();
    final Deque<TaskStatusEvent> queue = getDeque(key);
    synchronized (queue) {
      if (queue.size() >= MAX_QUEUE_SIZE) {
        // Already full, just toss the event.
        return;
      }
      queue.push(event);
      count.incrementAndGet();
    }
  }

  private TaskStatusEvent findEldestEvent() {
    // We don't lock anything because, in the worst case, we just put things in out of order,
    // which, while not perfect, won't cause any actual harm. Out of order meaning between
    // job ids, not within the same job id. Whether this is the best strategy (as opposed to
    // picking the fullest deque) is arguable.
    TaskStatusEvent current = null;
    for (Deque<TaskStatusEvent> queue : items.values()) {
      if (queue == null) {
        continue;
      }
      final TaskStatusEvent item = queue.peek();
      if (item == null) {
        // The deque was drained by a concurrent caller; skip it.
        continue;
      }
      if (current == null || (item.getTimestamp() < current.getTimestamp())) {
        current = item;
      }
    }
    return current;
  }

  @Override
  public void run() {
    while (true) {
      final TaskStatusEvent item = getNext();
      if (item == null) {
        return;
      }

      try {
        final JobId jobId = item.getStatus().getJob().getId();
        final String historyPath = Paths.historyJobHostEventsTimestamp(
            jobId, hostname, item.getTimestamp());
        log.debug("writing queued item to zookeeper {} {}", item.getStatus().getJob().getId(),
                  item.getTimestamp());
        client.ensurePath(historyPath, true);
        client.createAndSetData(historyPath, item.getStatus().toJsonBytes());

        // See if there are now too many events retained for this job and host.
        final List<String> events = client.getChildren(
            Paths.historyJobHostEvents(jobId, hostname));
        if (events.size() > MAX_NUMBER_STATUS_EVENTS_TO_RETAIN) {
          trimStatusEvents(events, jobId);
        }
      } catch (NodeExistsException e) {
        // Ahh, the two generals problem... We handle it by doing nothing, since the thing
        // we wanted in is in.
        log.debug("item we wanted in is already there");
      } catch (ConnectionLossException e) {
        log.warn("Connection lost while putting item into zookeeper, will retry");
        putBack(item);
        break;
      } catch (KeeperException e) {
        log.error("Error putting item into zookeeper, will retry", e);
        putBack(item);
        break;
      }
    }
  }

  private void trimStatusEvents(List<String> events, JobId jobId) {
    // The zkWriterExecutor only has one thread, so we can assume no one else is fiddling with
    // these events as we do this.

    // All this just to sort numerically instead of lexically....
    final List<Long> eventsAsLongs = Lists.newArrayList(Iterables.transform(events,
        new Function<String, Long>() {
          @Override
          public Long apply(String name) {
            return Long.valueOf(name);
          }
        }));
    Collections.sort(eventsAsLongs);

    for (int i = 0; i < (eventsAsLongs.size() - MAX_NUMBER_STATUS_EVENTS_TO_RETAIN); i++) {
      try {
        client.delete(Paths.historyJobHostEventsTimestamp(jobId, hostname, eventsAsLongs.get(i)));
      } catch (KeeperException e) {
        log.warn("failure deleting overflow of status items - we're hoping a later"
                 + " execution will fix", e);
      }
    }
  }
}
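For context, here is a minimal sketch of how an agent-side caller might wire this writer up. It is illustrative only: the ZooKeeperClient, backing file, JobId, and TaskStatus values are assumed to be supplied by the surrounding agent code, and the lifecycle calls rely on the Guava Service API that AbstractIdleService provides.

// Hypothetical caller, for illustration only; assumes the same package and imports as above.
// All parameters are assumptions supplied by the surrounding agent code. QueueingHistoryWriter
// itself only defines the constructor, saveHistoryItem, and the inherited Service lifecycle.
public class QueueingHistoryWriterExample {

  public static void recordStatusChange(final String hostname,
                                        final ZooKeeperClient zooKeeperClient,
                                        final java.nio.file.Path backingFile,
                                        final JobId jobId,
                                        final TaskStatus taskStatus) throws Exception {
    final QueueingHistoryWriter historyWriter =
        new QueueingHistoryWriter(hostname, zooKeeperClient, backingFile);

    // startUp() schedules the once-per-second background flush of queued events to ZooKeeper.
    historyWriter.startAsync().awaitRunning();

    // Queues the event and returns quickly; the background thread writes it to ZooKeeper,
    // and the queue itself is persisted to the backing file when events are added.
    historyWriter.saveHistoryItem(jobId, taskStatus);

    // shutDown() stops the background flush; events that were queued but not yet written
    // remain in the backing file for the next run.
    historyWriter.stopAsync().awaitTerminated();
  }
}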