com.linkedin.pinot.core.query.scheduler.MultiLevelPriorityQueue.java Source code

Introduction

Here is the source code for com.linkedin.pinot.core.query.scheduler.MultiLevelPriorityQueue.java
Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.linkedin.pinot.core.query.scheduler;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.linkedin.pinot.common.query.ServerQueryRequest;
import com.linkedin.pinot.core.query.scheduler.resources.ResourceManager;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.apache.commons.configuration.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Priority queue of scheduler groups that determines query priority based on tokens.
 *
 * This is a multi-level query scheduling queue with each sublevel maintaining a waitlist of
 * queries for the group. The priority between groups is provided by the specific SchedulerGroup
 * implementation. If two groups have the same priority then the group with lower
 * resource utilization is selected first. The oldest query from the winning SchedulerGroup
 * is selected for execution.
 *
 * The group map and the per-group waitlists are only read or modified while {@code queueLock}
 * is held ({@link #put}, {@link #take} and {@link #drain} all acquire it).
 */
public class MultiLevelPriorityQueue implements SchedulerPriorityQueue {

    private static final Logger LOGGER = LoggerFactory.getLogger(MultiLevelPriorityQueue.class);

    /** Config key: query deadline in seconds (default 30). */
    public static final String QUERY_DEADLINE_SECONDS_KEY = "query_deadline_seconds";
    /** Config key: pending-query threshold per group used by the capacity check (default 10). */
    public static final String MAX_PENDING_PER_GROUP_KEY = "max_pending_per_group";
    /** Config key: how long {@link #take()} waits between scheduling attempts, in microseconds. */
    public static final String QUEUE_WAKEUP_MICROS = "queue_wakeup_micros";

    private static final int DEFAULT_WAKEUP_MICROS = 1000;

    // Per-instance setting: each queue keeps the wake-up interval it was configured with, so
    // constructing another queue cannot change the interval of an existing one.
    private final int wakeUpTimeMicros;
    private final int maxPendingPerGroup;

    // Guarded by queueLock.
    private final Map<String, SchedulerGroup> schedulerGroups = new HashMap<>();
    private final Lock queueLock = new ReentrantLock();
    // Signalled by put() whenever a query is added; take() waits on it.
    private final Condition queryReaderCondition = queueLock.newCondition();
    private final ResourceManager resourceManager;
    private final SchedulerGroupMapper groupSelector;
    private final int queryDeadlineMillis;
    private final SchedulerGroupFactory groupFactory;
    private final Configuration config;

    /**
     * Creates an empty queue.
     *
     * @param config source of {@link #QUERY_DEADLINE_SECONDS_KEY}, {@link #QUEUE_WAKEUP_MICROS} and
     *               {@link #MAX_PENDING_PER_GROUP_KEY}; also handed to the group factory when a
     *               new group is created
     * @param resourceManager consulted for schedulability and for thread soft/hard limits
     * @param groupFactory creates a SchedulerGroup the first time a group name is seen
     * @param groupMapper maps an incoming query to the name of its scheduler group
     */
    public MultiLevelPriorityQueue(@Nonnull Configuration config, @Nonnull ResourceManager resourceManager,
            @Nonnull SchedulerGroupFactory groupFactory, @Nonnull SchedulerGroupMapper groupMapper) {
        Preconditions.checkNotNull(config);
        Preconditions.checkNotNull(resourceManager);
        Preconditions.checkNotNull(groupFactory);
        Preconditions.checkNotNull(groupMapper);

        // The deadline is configured in seconds but used in milliseconds.
        queryDeadlineMillis = config.getInt(QUERY_DEADLINE_SECONDS_KEY, 30) * 1000;
        wakeUpTimeMicros = config.getInt(QUEUE_WAKEUP_MICROS, DEFAULT_WAKEUP_MICROS);
        maxPendingPerGroup = config.getInt(MAX_PENDING_PER_GROUP_KEY, 10);
        this.config = config;
        this.resourceManager = resourceManager;
        this.groupFactory = groupFactory;
        this.groupSelector = groupMapper;
    }

    /**
     * Appends the query to the waitlist of its scheduler group, creating the group on first use,
     * and signals one waiting reader.
     *
     * @param query query to enqueue; must not be null
     * @throws OutOfCapacityError if the group already has at least the configured maximum number
     *         of pending queries and its reserved threads have reached the table hard limit
     */
    @Override
    public void put(@Nonnull SchedulerQueryContext query) throws OutOfCapacityError {
        Preconditions.checkNotNull(query);
        queueLock.lock();
        try {
            // Resolved inside the try block so the lock is released even if the mapper throws.
            String groupName = groupSelector.getSchedulerGroupName(query);
            SchedulerGroup groupContext = getOrCreateGroupContext(groupName);
            checkGroupHasCapacity(groupContext);
            query.setSchedulerGroupContext(groupContext);
            groupContext.addLast(query);
            queryReaderCondition.signal();
        } finally {
            queueLock.unlock();
        }
    }

    /**
     * Blocking call to read the next query in order of priority.
     *
     * While nothing can be scheduled, the caller waits on the reader condition for at most the
     * configured wake-up interval and then tries again, so selection is re-evaluated periodically
     * even when no new query is added.
     *
     * @return the next query to execute, or {@code null} if the calling thread was interrupted
     *         while waiting (the thread's interrupt status is restored before returning)
     */
    @Override
    public @Nullable SchedulerQueryContext take() {
        queueLock.lock();
        try {
            SchedulerQueryContext schedulerQueryContext;
            while ((schedulerQueryContext = takeNextInternal()) == null) {
                try {
                    queryReaderCondition.await(wakeUpTimeMicros, TimeUnit.MICROSECONDS);
                } catch (InterruptedException e) {
                    // Preserve the interrupt for the caller instead of silently consuming it.
                    Thread.currentThread().interrupt();
                    return null;
                }
            }
            return schedulerQueryContext;
        } finally {
            queueLock.unlock();
        }
    }

    /**
     * Removes every pending query from every group.
     *
     * @return all queries that were pending, grouped in the iteration order of the group map and
     *         in waitlist order within each group; never null
     */
    @Nonnull
    @Override
    public List<SchedulerQueryContext> drain() {
        List<SchedulerQueryContext> pending = new ArrayList<>();
        queueLock.lock();
        try {
            for (SchedulerGroup group : schedulerGroups.values()) {
                while (!group.isEmpty()) {
                    pending.add(group.removeFirst());
                }
            }
        } finally {
            queueLock.unlock();
        }
        return pending;
    }

    /**
     * Picks the winning group among all non-empty, schedulable groups and removes its first query.
     * Must be called with {@code queueLock} held.
     *
     * @return the first query of the winning group, or {@code null} if no group can be scheduled
     */
    private SchedulerQueryContext takeNextInternal() {
        SchedulerGroup currentWinnerGroup = null;
        long startTime = System.nanoTime();
        // Evaluate the log level once so the debug trace is either built completely or not at all.
        final boolean debugEnabled = LOGGER.isDebugEnabled();
        StringBuilder sb = debugEnabled ? new StringBuilder("SchedulerInfo:") : null;
        // Cut-off timestamp handed to trimExpired(): now minus the configured query deadline.
        long deadlineEpochMillis = currentTimeMillis() - queryDeadlineMillis;
        for (SchedulerGroup group : schedulerGroups.values()) {
            if (debugEnabled) {
                sb.append(group.toString());
            }
            group.trimExpired(deadlineEpochMillis);
            if (group.isEmpty() || !resourceManager.canSchedule(group)) {
                continue;
            }

            if (currentWinnerGroup == null) {
                currentWinnerGroup = group;
                continue;
            }

            // Both the candidate and the current winner passed canSchedule() above.
            // The candidate replaces the current winner as follows:
            //  - candidate compares lower than the winner: only if the winner has reserved more
            //    threads than the soft limit while the candidate is still below the soft limit;
            //  - candidate compares equal or higher: if the candidate is below the soft limit, or
            //    has reserved fewer threads than the current winner.
            int comparison = group.compareTo(currentWinnerGroup);
            if (comparison < 0) {
                if (currentWinnerGroup.totalReservedThreads() > resourceManager.getTableThreadsSoftLimit()
                        && group.totalReservedThreads() < resourceManager.getTableThreadsSoftLimit()) {
                    currentWinnerGroup = group;
                }
            } else {
                if (group.totalReservedThreads() < resourceManager.getTableThreadsSoftLimit()
                        || group.totalReservedThreads() < currentWinnerGroup.totalReservedThreads()) {
                    currentWinnerGroup = group;
                }
            }
        }

        SchedulerQueryContext query = null;
        if (currentWinnerGroup != null) {
            if (debugEnabled) {
                // Only needed for the trace; the winner is non-empty because empty groups are skipped.
                ServerQueryRequest queryRequest = currentWinnerGroup.peekFirst().getQueryRequest();
                sb.append(String.format(" Winner: %s: [%d,%d,%d,%d]", currentWinnerGroup.name(),
                        queryRequest.getTimerContext().getQueryArrivalTimeMs(),
                        queryRequest.getInstanceRequest().getRequestId(),
                        queryRequest.getInstanceRequest().getSearchSegments().size(), startTime));
            }
            query = currentWinnerGroup.removeFirst();
        }
        if (debugEnabled) {
            LOGGER.debug(sb.toString());
        }
        return query;
    }

    /**
     * Rejects a new query only when both limits are reached: the group has at least
     * {@code maxPendingPerGroup} pending queries and its reserved threads are at or above the
     * table hard limit.
     *
     * @throws OutOfCapacityError if both conditions hold
     */
    private void checkGroupHasCapacity(SchedulerGroup groupContext) throws OutOfCapacityError {
        if (groupContext.numPending() >= maxPendingPerGroup
                && groupContext.totalReservedThreads() >= resourceManager.getTableThreadsHardLimit()) {
            throw new OutOfCapacityError(String.format(
                    "SchedulerGroup %s is out of capacity. numPending: %d, maxPending: %d, reservedThreads: %d threadsHardLimit: %d",
                    groupContext.name(), groupContext.numPending(), maxPendingPerGroup,
                    groupContext.totalReservedThreads(), resourceManager.getTableThreadsHardLimit()));
        }
    }

    /**
     * Returns the group registered under the given name, creating and registering it through the
     * group factory if it does not exist yet. Must be called with {@code queueLock} held.
     */
    private SchedulerGroup getOrCreateGroupContext(String groupName) {
        SchedulerGroup groupContext = schedulerGroups.get(groupName);
        if (groupContext == null) {
            groupContext = groupFactory.create(config, groupName);
            schedulerGroups.put(groupName, groupContext);
        }
        return groupContext;
    }

    // separate method to allow mocking for unit testing
    private long currentTimeMillis() {
        return System.currentTimeMillis();
    }

    /** Returns the wake-up interval, in microseconds, this queue was configured with. */
    @VisibleForTesting
    long getWakeupTimeMicros() {
        return wakeUpTimeMicros;
    }
}