Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.exec.tez; import org.apache.hadoop.hive.metastore.api.WMPoolSchedulingPolicy; import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.math.DoubleMath; import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; import com.google.common.util.concurrent.ThreadFactoryBuilder; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.llap.tezplugins.LlapTaskSchedulerService; import org.apache.hadoop.hive.metastore.api.WMFullResourcePlan; import org.apache.hadoop.hive.metastore.api.WMPool; import org.apache.hadoop.hive.metastore.api.WMPoolTrigger; import org.apache.hadoop.hive.metastore.api.WMTrigger; import org.apache.hadoop.hive.ql.exec.tez.AmPluginNode.AmPluginInfo; import org.apache.hadoop.hive.ql.exec.tez.TezSessionState.HiveResources; import org.apache.hadoop.hive.ql.exec.tez.UserPoolMapping.MappingInput; import org.apache.hadoop.hive.ql.exec.tez.WmEvent.EventType; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.session.KillQuery; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.wm.ExecutionTrigger; import org.apache.hadoop.hive.ql.wm.SessionTriggerProvider; import org.apache.hadoop.hive.ql.wm.Trigger; import org.apache.hadoop.hive.ql.wm.TriggerActionHandler; import org.apache.hadoop.hive.ql.wm.WmContext; import org.apache.hadoop.metrics2.MetricsSystem; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hive.common.util.Ref; import org.apache.tez.dag.api.TezConfiguration; import org.codehaus.jackson.annotate.JsonAutoDetect; import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.SerializationConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** Workload management entry point for HS2. * Note on how this class operates. * There are tons of things that could be happening in parallel that are a real pain to sync. * Therefore, it uses an actor-ish model where the master thread, in processCurrentEvents method, * processes a bunch of events that have accumulated since the previous iteration, repeatedly * and quickly, doing physical work via async calls or via worker threads. * That way the bulk of the state (pools, etc.) does not require any sync, and we mostly have * a consistent view of the conflicting events when we process things. However, that also means * none of that state can be accessed directly - most changes that touch pool state, or interact * with background operations like init, need to go thru eventstate; see e.g. returnAfterUse. */ public class WorkloadManager extends TezSessionPoolSession.AbstractTriggerValidator implements TezSessionPoolSession.Manager, SessionExpirationTracker.RestartImpl, WorkloadManagerMxBean { private static final Logger LOG = LoggerFactory.getLogger(WorkloadManager.class); private static final char POOL_SEPARATOR = '.'; private static final String POOL_SEPARATOR_STR = "" + POOL_SEPARATOR; private final ObjectMapper objectMapper; // Various final services, configs, etc. private final HiveConf conf; private final TezSessionPool<WmTezSession> tezAmPool; private final SessionExpirationTracker expirationTracker; private final RestrictedConfigChecker restrictedConfig; private final QueryAllocationManager allocationManager; private final String yarnQueue; private final int amRegistryTimeoutMs; private final boolean allowAnyPool; private final MetricsSystem metricsSystem; // Note: it's not clear that we need to track this - unlike PoolManager we don't have non-pool // sessions, so the pool itself could internally track the ses sions it gave out, since // calling close on an unopened session is probably harmless. private final IdentityHashMap<TezSessionPoolSession, Boolean> openSessions = new IdentityHashMap<>(); // We index the get requests to make sure there are no ordering artifacts when we requeue. private final AtomicLong getRequestVersion = new AtomicLong(Long.MIN_VALUE); // The below group of fields (pools, etc.) can only be modified by the master thread. private Map<String, PoolState> pools; private String rpName, defaultPool; // For information only. private int totalQueryParallelism; /** * The queries being killed. This is used to sync between the background kill finishing and the * query finishing and user returning the sessions, which can happen in separate iterations * of the master thread processing, yet need to be aware of each other. */ private Map<WmTezSession, KillQueryContext> killQueryInProgress = new IdentityHashMap<>(); // Used to make sure that waiting getSessions don't block update. private UserPoolMapping userPoolMapping; // End of master thread state // Note: we could use RW lock to allow concurrent calls for different sessions, however all // those calls do is add elements to lists and maps; and we'd need to sync those separately // separately, plus have an object to notify because RW lock does not support conditions // in any sensible way. So, for now the lock is going to be epic. private final ReentrantLock currentLock = new ReentrantLock(); private final Condition hasChangesCondition = currentLock.newCondition(); // The processing thread will switch between these two objects. private final EventState one = new EventState(), two = new EventState(); private boolean hasChanges = false; private EventState current = one; private final WmThreadSyncWork syncWork = new WmThreadSyncWork(); // End sync stuff. private PerPoolTriggerValidatorRunnable triggerValidatorRunnable; private Map<String, SessionTriggerProvider> perPoolProviders = new ConcurrentHashMap<>(); // The master thread and various workers. /** The master thread the processes the events from EventState. */ @VisibleForTesting protected final Thread wmThread; /** Used by the master thread to offload calls blocking on smth other than fast locks. */ private final ExecutorService workPool; /** Used to schedule timeouts for some async operations. */ private final ScheduledExecutorService timeoutPool; private LlapPluginEndpointClientImpl amComm; private static final FutureCallback<Object> FATAL_ERROR_CALLBACK = new FutureCallback<Object>() { @Override public void onSuccess(Object result) { } @Override public void onFailure(Throwable t) { // TODO: shut down HS2? LOG.error("Workload management fatal error", t); } }; private static volatile WorkloadManager INSTANCE; public static WorkloadManager getInstance() { return INSTANCE; } /** Called once, when HS2 initializes. */ public static WorkloadManager create(String yarnQueue, HiveConf conf, WMFullResourcePlan plan) throws ExecutionException, InterruptedException { assert INSTANCE == null; // We could derive the expected number of AMs to pass in. // Note: we pass a null token here; the tokens to talk to plugin endpoints will only be // known once the AMs register, and they are different for every AM (unlike LLAP token). LlapPluginEndpointClientImpl amComm = new LlapPluginEndpointClientImpl(conf, null, -1); QueryAllocationManager qam = new GuaranteedTasksAllocator(conf, amComm); return (INSTANCE = new WorkloadManager(amComm, yarnQueue, conf, qam, plan)); } @VisibleForTesting WorkloadManager(LlapPluginEndpointClientImpl amComm, String yarnQueue, HiveConf conf, QueryAllocationManager qam, WMFullResourcePlan plan) throws ExecutionException, InterruptedException { this.yarnQueue = yarnQueue; this.conf = conf; this.totalQueryParallelism = determineQueryParallelism(plan); this.allocationManager = qam; this.allocationManager.setClusterChangedCallback(() -> notifyOfClusterStateChange()); this.amComm = amComm; if (this.amComm != null) { this.amComm.init(conf); } LOG.info("Initializing with " + totalQueryParallelism + " total query parallelism"); this.amRegistryTimeoutMs = (int) HiveConf.getTimeVar(conf, ConfVars.HIVE_SERVER2_TEZ_WM_AM_REGISTRY_TIMEOUT, TimeUnit.MILLISECONDS); tezAmPool = new TezSessionPool<>(conf, totalQueryParallelism, true, oldSession -> createSession(oldSession == null ? null : oldSession.getConf())); restrictedConfig = new RestrictedConfigChecker(conf); // Only creates the expiration tracker if expiration is configured. expirationTracker = SessionExpirationTracker.create(conf, this); workPool = Executors.newFixedThreadPool(HiveConf.getIntVar(conf, ConfVars.HIVE_SERVER2_WM_WORKER_THREADS), new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Workload management worker %d").build()); timeoutPool = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true) .setNameFormat("Workload management timeout thread").build()); allowAnyPool = HiveConf.getBoolVar(conf, ConfVars.HIVE_SERVER2_WM_ALLOW_ANY_POOL_VIA_JDBC); if (HiveConf.getBoolVar(conf, ConfVars.HIVE_SERVER2_WM_POOL_METRICS)) { metricsSystem = DefaultMetricsSystem.instance(); } else { metricsSystem = null; } wmThread = new Thread(() -> runWmThread(), "Workload management master"); wmThread.setDaemon(true); wmThread.start(); updateResourcePlanAsync(plan).get(); // Wait for the initial resource plan to be applied. objectMapper = new ObjectMapper(); objectMapper.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false); // serialize json based on field annotations only objectMapper.setVisibilityChecker(objectMapper.getSerializationConfig().getDefaultVisibilityChecker() .withGetterVisibility(JsonAutoDetect.Visibility.NONE) .withSetterVisibility(JsonAutoDetect.Visibility.NONE)); } private static int determineQueryParallelism(WMFullResourcePlan plan) { if (plan == null) return 0; int result = 0; for (WMPool pool : plan.getPools()) { result += pool.getQueryParallelism(); } return result; } public void start() throws Exception { initTriggers(); tezAmPool.start(); if (expirationTracker != null) { expirationTracker.start(); } if (amComm != null) { amComm.start(); } allocationManager.start(); } private void initTriggers() { if (triggerValidatorRunnable == null) { final long triggerValidationIntervalMs = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HIVE_TRIGGER_VALIDATION_INTERVAL, TimeUnit.MILLISECONDS); TriggerActionHandler<?> triggerActionHandler = new KillMoveTriggerActionHandler(this); triggerValidatorRunnable = new PerPoolTriggerValidatorRunnable(perPoolProviders, triggerActionHandler, triggerValidationIntervalMs); startTriggerValidator(triggerValidationIntervalMs); } } public void stop() throws Exception { List<TezSessionPoolSession> sessionsToClose = null; synchronized (openSessions) { sessionsToClose = new ArrayList<TezSessionPoolSession>(openSessions.keySet()); } for (TezSessionState sessionState : sessionsToClose) { sessionState.close(false); } if (expirationTracker != null) { expirationTracker.stop(); } allocationManager.stop(); if (wmThread != null) { wmThread.interrupt(); } if (amComm != null) { amComm.stop(); } workPool.shutdownNow(); timeoutPool.shutdownNow(); if (triggerValidatorRunnable != null) { stopTriggerValidator(); } INSTANCE = null; } private void updateSessionTriggerProvidersOnMasterThread() { for (Map.Entry<String, PoolState> entry : pools.entrySet()) { String poolName = entry.getKey(); PoolState poolState = entry.getValue(); final List<Trigger> triggers = Collections.unmodifiableList(poolState.getTriggers()); final List<TezSessionState> sessionStates = Collections.unmodifiableList(poolState.getSessions()); SessionTriggerProvider sessionTriggerProvider = perPoolProviders.get(poolName); if (sessionTriggerProvider != null) { perPoolProviders.get(poolName).setTriggers(triggers); perPoolProviders.get(poolName).setSessions(sessionStates); } else { perPoolProviders.put(poolName, new SessionTriggerProvider(sessionStates, triggers)); } } } @VisibleForTesting Map<String, SessionTriggerProvider> getAllSessionTriggerProviders() { return perPoolProviders; } /** Represent a single iteration of work for the master thread. */ private final static class EventState { private final Set<WmTezSession> toReturn = Sets.newIdentityHashSet(), toDestroy = Sets.newIdentityHashSet(); private final Map<WmTezSession, Boolean> killQueryResults = new IdentityHashMap<>(); private final LinkedList<SessionInitContext> initResults = new LinkedList<>(); private final IdentityHashMap<WmTezSession, SettableFuture<WmTezSession>> toReopen = new IdentityHashMap<>(); private final IdentityHashMap<WmTezSession, Integer> updateErrors = new IdentityHashMap<>(); private final LinkedList<GetRequest> getRequests = new LinkedList<>(); private final IdentityHashMap<WmTezSession, GetRequest> toReuse = new IdentityHashMap<>(); private WMFullResourcePlan resourcePlanToApply = null; private boolean doClearResourcePlan = false; private boolean hasClusterStateChanged = false; private List<SettableFuture<Boolean>> testEvents = new LinkedList<>(); private SettableFuture<Boolean> applyRpFuture; private SettableFuture<List<String>> dumpStateFuture; private final List<MoveSession> moveSessions = new LinkedList<>(); } private final static class MoveSession { private final WmTezSession srcSession; private final String destPool; private final SettableFuture<Boolean> future; public MoveSession(final WmTezSession srcSession, final String destPool) { this.srcSession = srcSession; this.destPool = destPool; this.future = SettableFuture.create(); } @Override public String toString() { return srcSession.getSessionId() + " moving from " + srcSession.getPoolName() + " to " + destPool; } } /** * The work delegated from the master thread that doesn't have an async implementation * (mostly opening and closing the sessions). */ private final static class WmThreadSyncWork { private List<WmTezSession> toRestartInUse = new LinkedList<>(), toDestroyNoRestart = new LinkedList<>(); private Map<WmTezSession, KillQueryContext> toKillQuery = new IdentityHashMap<>(); private List<Path> pathsToDelete = Lists.newArrayList(); } private void runWmThread() { while (true) { EventState currentEvents = null; currentLock.lock(); try { while (!hasChanges) { try { hasChangesCondition.await(1, TimeUnit.SECONDS); } catch (InterruptedException e) { LOG.warn("WM thread was interrupted and will now exit"); return; } } hasChanges = false; currentEvents = current; current = (currentEvents == one) ? two : one; } finally { currentLock.unlock(); } try { LOG.info("Processing current events"); processCurrentEvents(currentEvents, syncWork); scheduleWork(syncWork); updateSessionTriggerProvidersOnMasterThread(); } catch (InterruptedException ex) { LOG.warn("WM thread was interrupted and will now exit"); return; } catch (Exception | AssertionError ex) { LOG.error("WM thread encountered an error but will attempt to continue", ex); for (SettableFuture<Boolean> testEvent : currentEvents.testEvents) { LOG.info("Failing test event " + System.identityHashCode(testEvent)); testEvent.setException(ex); } currentEvents.testEvents.clear(); if (currentEvents.applyRpFuture != null) { currentEvents.applyRpFuture.setException(ex); currentEvents.applyRpFuture = null; } // TODO: we either have to kill HS2 or, as the non-actor model would implicitly, // hope for the best and continue on other threads. Do the latter for now. continue; } } } private void scheduleWork(WmThreadSyncWork context) { // Do the work that cannot be done via async calls. // 1. Kill queries. for (KillQueryContext killCtx : context.toKillQuery.values()) { final WmTezSession toKill = killCtx.session; final String reason = killCtx.reason; LOG.info("Killing query for {}", toKill); workPool.submit(() -> { SessionState ss = new SessionState(new HiveConf()); ss.setIsHiveServerQuery(true); SessionState.start(ss); // Note: we get query ID here, rather than in the caller, where it would be more correct // because we know which exact query we intend to kill. This is valid because we // are not expecting query ID to change - we never reuse the session for which a // query is being killed until both the kill, and the user, return it. String queryId = toKill.getQueryId(); KillQuery kq = toKill.getKillQuery(); try { if (kq != null && queryId != null) { WmEvent wmEvent = new WmEvent(WmEvent.EventType.KILL); LOG.info("Invoking KillQuery for " + queryId + ": " + reason); try { kq.killQuery(queryId, reason, toKill.getConf()); addKillQueryResult(toKill, true); killCtx.killSessionFuture.set(true); wmEvent.endEvent(toKill); LOG.debug("Killed " + queryId); return; } catch (HiveException ex) { LOG.error("Failed to kill " + queryId + "; will try to restart AM instead", ex); } } else { LOG.info("Will queue restart for {}; queryId {}, killQuery {}", toKill, queryId, kq); } } finally { toKill.setQueryId(null); } // We cannot restart in place because the user might receive a failure and return the // session to the master thread without the "irrelevant" flag set. In fact, the query might // have succeeded in the gap and the session might already be returned. Queue restart thru // the master thread. addKillQueryResult(toKill, false); }); } context.toKillQuery.clear(); // 2. Restart pool sessions. for (final WmTezSession toRestart : context.toRestartInUse) { LOG.info("Replacing {} with a new session", toRestart); toRestart.setQueryId(null); workPool.submit(() -> { try { WmEvent wmEvent = new WmEvent(WmEvent.EventType.RESTART); // Note: sessions in toRestart are always in use, so they cannot expire in parallel. tezAmPool.replaceSession(toRestart); wmEvent.endEvent(toRestart); } catch (Exception ex) { LOG.error("Failed to restart an old session; ignoring", ex); } }); } context.toRestartInUse.clear(); // 3. Destroy the sessions that we don't need anymore. for (final WmTezSession toDestroy : context.toDestroyNoRestart) { LOG.info("Closing {} without restart", toDestroy); workPool.submit(() -> { try { WmEvent wmEvent = new WmEvent(WmEvent.EventType.DESTROY); toDestroy.close(false); wmEvent.endEvent(toDestroy); } catch (Exception ex) { LOG.error("Failed to close an old session; ignoring " + ex.getMessage()); } }); } context.toDestroyNoRestart.clear(); // 4. Delete unneeded directories that were replaced by other ones via reopen. for (final Path path : context.pathsToDelete) { LOG.info("Deleting {}", path); workPool.submit(() -> { try { path.getFileSystem(conf).delete(path, true); } catch (Exception ex) { LOG.error("Failed to delete an old path; ignoring " + ex.getMessage()); } }); } context.pathsToDelete.clear(); } /** * This is the main method of the master thread the processes one set of events. * Be mindful of the fact that events can be queued while we are processing events, so * in addition to making sure we keep the current set consistent (e.g. no need to handle * update errors for a session that should already be destroyed), this needs to guard itself * against the future iterations - e.g. what happens if we kill a query due to plan change, * but the DAG finished before the kill happens and the user queues a "return" event? Etc. * DO NOT block for a long time in this method. * @param e Input events. * @param syncWork Output tasks that cannot be called via async methods. */ private void processCurrentEvents(EventState e, WmThreadSyncWork syncWork) throws Exception { // The order of processing is as follows. We'd reclaim or kill all the sessions that we can // reclaim from various user actions and errors, then apply the new plan if any, // then give out all we can give out (restart, get and reopen callers) and rebalance the // resource allocations in all the affected pools. // For every session, we'd check all the concurrent things happening to it. // TODO: also account for Tez-internal session restarts; // AM reg info changes; add notifications, ignore errors, and update alloc. HashSet<String> poolsToRedistribute = new HashSet<>(); // 0. Handle initialization results. for (SessionInitContext sw : e.initResults) { handleInitResultOnMasterThread(sw, syncWork, poolsToRedistribute); } e.initResults.clear(); // 1. Handle kill query results - part 1, just put them in place. We will resolve what // to do with the sessions after we go thru all the concurrent user actions. for (Map.Entry<WmTezSession, Boolean> entry : e.killQueryResults.entrySet()) { WmTezSession killQuerySession = entry.getKey(); boolean killResult = entry.getValue(); LOG.debug("Processing KillQuery {} for {}", killResult ? "success" : "failure", killQuerySession); // Note: do not cancel any user actions here; user actions actually interact with kills. KillQueryContext killCtx = killQueryInProgress.get(killQuerySession); if (killCtx == null) { LOG.error("Internal error - cannot find the context for killing {}", killQuerySession); continue; } killCtx.handleKillQueryCallback(!killResult); } e.killQueryResults.clear(); // 2. Handle sessions that are being destroyed by users. Destroy implies return. for (WmTezSession sessionToDestroy : e.toDestroy) { if (e.toReturn.remove(sessionToDestroy)) { LOG.warn("The session was both destroyed and returned by the user; destroying"); } LOG.info("Destroying {}", sessionToDestroy); RemoveSessionResult rr = handleReturnedInUseSessionOnMasterThread(e, sessionToDestroy, poolsToRedistribute, false); if (rr == RemoveSessionResult.OK || rr == RemoveSessionResult.NOT_FOUND) { // Restart even if there's an internal error. syncWork.toRestartInUse.add(sessionToDestroy); } } e.toDestroy.clear(); // 3. Now handle actual returns. Sessions may be returned to the pool or may trigger expires. for (WmTezSession sessionToReturn : e.toReturn) { LOG.info("Returning {}", sessionToReturn); RemoveSessionResult rr = handleReturnedInUseSessionOnMasterThread(e, sessionToReturn, poolsToRedistribute, true); switch (rr) { case OK: WmEvent wmEvent = new WmEvent(WmEvent.EventType.RETURN); boolean wasReturned = tezAmPool.returnSessionAsync(sessionToReturn); if (!wasReturned) { syncWork.toDestroyNoRestart.add(sessionToReturn); } else { if (sessionToReturn.getWmContext() != null && sessionToReturn.getWmContext().isQueryCompleted()) { sessionToReturn.resolveReturnFuture(); } wmEvent.endEvent(sessionToReturn); } break; case NOT_FOUND: syncWork.toRestartInUse.add(sessionToReturn); // Restart if there's an internal error. break; case IGNORE: break; default: throw new AssertionError("Unknown state " + rr); } } e.toReturn.clear(); // 4. Reopen is essentially just destroy + get a new session for a session in use. for (Map.Entry<WmTezSession, SettableFuture<WmTezSession>> entry : e.toReopen.entrySet()) { LOG.info("Reopening {}", entry.getKey()); handeReopenRequestOnMasterThread(e, entry.getKey(), entry.getValue(), poolsToRedistribute, syncWork); } e.toReopen.clear(); // 5. All the sessions in use that were not destroyed or returned with a failed update now die. for (Map.Entry<WmTezSession, Integer> entry : e.updateErrors.entrySet()) { WmTezSession sessionWithUpdateError = entry.getKey(); int failedEndpointVersion = entry.getValue(); LOG.info("Update failed for {}", sessionWithUpdateError); handleUpdateErrorOnMasterThread(sessionWithUpdateError, failedEndpointVersion, e.toReuse, syncWork, poolsToRedistribute); } e.updateErrors.clear(); // 6. Now apply a resource plan if any. This is expected to be pretty rare. boolean hasRequeues = false; if (e.resourcePlanToApply != null || e.doClearResourcePlan) { LOG.info("Applying new resource plan"); int getReqCount = e.getRequests.size(); applyNewResourcePlanOnMasterThread(e, syncWork, poolsToRedistribute); hasRequeues = getReqCount != e.getRequests.size(); } e.resourcePlanToApply = null; e.doClearResourcePlan = false; // 7. Handle any move session requests. The way move session works right now is // a) sessions get moved to destination pool if there is capacity in destination pool // b) if there is no capacity in destination pool, the session gets killed (since we cannot pause a query) // TODO: in future this the process of killing can be delayed until the point where a session is actually required. // We could consider delaying the move (when destination capacity is full) until there is claim in src pool. // May be change command to support ... DELAYED MOVE TO etl ... which will run under src cluster fraction as long // as possible Map<WmTezSession, WmEvent> recordMoveEvents = new HashMap<>(); for (MoveSession moveSession : e.moveSessions) { handleMoveSessionOnMasterThread(moveSession, syncWork, poolsToRedistribute, e.toReuse, recordMoveEvents); } e.moveSessions.clear(); // 8. Handle all the get/reuse requests. We won't actually give out anything here, but merely // map all the requests and place them in an appropriate order in pool queues. The only // exception is the reuse without queue contention; can be granted immediately. If we can't // reuse the session immediately, we will convert the reuse to a normal get, because we // want query level fairness, and don't want the get in queue to hold up a session. GetRequest req; while ((req = e.getRequests.pollFirst()) != null) { LOG.info("Processing a new get request from " + req.mappingInput); queueGetRequestOnMasterThread(req, poolsToRedistribute, syncWork); } e.toReuse.clear(); // 9. Resolve all the kill query requests in flight. Nothing below can affect them. Iterator<KillQueryContext> iter = killQueryInProgress.values().iterator(); while (iter.hasNext()) { KillQueryContext ctx = iter.next(); KillQueryResult kr = ctx.process(); switch (kr) { case IN_PROGRESS: continue; // Either the user or the kill is not done yet. case OK: { iter.remove(); LOG.debug("Kill query succeeded; returning to the pool: {}", ctx.session); ctx.killSessionFuture.set(true); WmEvent wmEvent = new WmEvent(WmEvent.EventType.RETURN); if (!tezAmPool.returnSessionAsync(ctx.session)) { syncWork.toDestroyNoRestart.add(ctx.session); } else { if (ctx.session.getWmContext() != null && ctx.session.getWmContext().isQueryCompleted()) { ctx.session.resolveReturnFuture(); } wmEvent.endEvent(ctx.session); } break; } case RESTART_REQUIRED: { iter.remove(); ctx.killSessionFuture.set(true); LOG.debug("Kill query failed; restarting: {}", ctx.session); // Note: we assume here the session, before we resolve killQuery result here, is still // "in use". That is because all the user ops above like return, reopen, etc. // don't actually return/reopen/... when kill query is in progress. syncWork.toRestartInUse.add(ctx.session); break; } default: throw new AssertionError("Unknown state " + kr); } } // 10. If there was a cluster state change, make sure we redistribute all the pools. if (e.hasClusterStateChanged) { LOG.info("Processing a cluster state change"); poolsToRedistribute.addAll(pools.keySet()); e.hasClusterStateChanged = false; } // 11. Finally, for all the pools that have changes, promote queued queries and rebalance. for (String poolName : poolsToRedistribute) { if (LOG.isDebugEnabled()) { LOG.info("Processing changes for pool " + poolName + ": " + pools.get(poolName)); } processPoolChangesOnMasterThread(poolName, hasRequeues, syncWork); } // 12. Save state for future iterations. for (KillQueryContext killCtx : syncWork.toKillQuery.values()) { if (killQueryInProgress.put(killCtx.session, killCtx) != null) { LOG.error("One query killed several times - internal error {}", killCtx.session); } } // 13. To record move events, we need to cluster fraction updates that happens at step 11. for (Map.Entry<WmTezSession, WmEvent> entry : recordMoveEvents.entrySet()) { entry.getValue().endEvent(entry.getKey()); } // 14. Give our final state to UI/API requests if any. if (e.dumpStateFuture != null) { List<String> result = new ArrayList<>(); result.add("RESOURCE PLAN " + rpName + "; default pool " + defaultPool); for (PoolState ps : pools.values()) { dumpPoolState(ps, result); } e.dumpStateFuture.set(result); e.dumpStateFuture = null; } // 15. Notify tests and global async ops. for (SettableFuture<Boolean> testEvent : e.testEvents) { LOG.info("Triggering test event " + System.identityHashCode(testEvent)); testEvent.set(null); } e.testEvents.clear(); if (e.applyRpFuture != null) { e.applyRpFuture.set(true); e.applyRpFuture = null; } } private void dumpPoolState(PoolState ps, List<String> set) { StringBuilder sb = new StringBuilder(); sb.append("POOL ").append(ps.fullName).append(": qp ").append(ps.queryParallelism).append(", %% ") .append(ps.finalFraction).append(", sessions: ").append(ps.sessions.size()) .append(", initializing: ").append(ps.initializingSessions.size()).append(", queued: ") .append(ps.queue.size()); set.add(sb.toString()); sb.setLength(0); for (WmTezSession session : ps.sessions) { double cf = session.hasClusterFraction() ? session.getClusterFraction() : 0; sb.append("RUNNING: ").append(cf).append(" (").append(session.getAllocationState()).append(") => ") .append(session.getSessionId()); set.add(sb.toString()); sb.setLength(0); } for (SessionInitContext session : ps.initializingSessions) { sb.append("INITIALIZING: state ").append(session.state); set.add(sb.toString()); sb.setLength(0); } for (GetRequest session : ps.queue) { sb.append("QUEUED: from ").append(session.mappingInput); set.add(sb.toString()); sb.setLength(0); } } private void handleMoveSessionOnMasterThread(final MoveSession moveSession, final WmThreadSyncWork syncWork, final HashSet<String> poolsToRedistribute, final Map<WmTezSession, GetRequest> toReuse, final Map<WmTezSession, WmEvent> recordMoveEvents) { String destPoolName = moveSession.destPool; LOG.info("Handling move session event: {}", moveSession); if (validMove(moveSession.srcSession, destPoolName)) { WmEvent moveEvent = new WmEvent(WmEvent.EventType.MOVE); // remove from src pool RemoveSessionResult rr = checkAndRemoveSessionFromItsPool(moveSession.srcSession, poolsToRedistribute, true, true); if (rr == RemoveSessionResult.OK) { // check if there is capacity in dest pool, if so move else kill the session if (capacityAvailable(destPoolName)) { // add to destination pool Boolean added = checkAndAddSessionToAnotherPool(moveSession.srcSession, destPoolName, poolsToRedistribute); if (added != null && added) { moveSession.future.set(true); recordMoveEvents.put(moveSession.srcSession, moveEvent); return; } else { LOG.error("Failed to move session: {}. Session is not added to destination.", moveSession); } } else { WmTezSession session = moveSession.srcSession; KillQueryContext killQueryContext = new KillQueryContext(session, "Destination pool " + destPoolName + " is full. Killing query."); resetAndQueueKill(syncWork.toKillQuery, killQueryContext, toReuse); } } else { LOG.error("Failed to move session: {}. Session is not removed from its pool.", moveSession); } } else { LOG.error("Validation failed for move session: {}. Invalid move or session/pool got removed.", moveSession); } moveSession.future.set(false); } private Boolean capacityAvailable(final String destPoolName) { PoolState destPool = pools.get(destPoolName); return destPool.getTotalActiveSessions() < destPool.queryParallelism; } private boolean validMove(final WmTezSession srcSession, final String destPool) { return srcSession != null && destPool != null && !srcSession.isIrrelevantForWm() && srcSession.getPoolName() != null && pools.containsKey(srcSession.getPoolName()) && pools.containsKey(destPool) && !srcSession.getPoolName().equalsIgnoreCase(destPool); } // ========= Master thread methods private void handleInitResultOnMasterThread(SessionInitContext sw, WmThreadSyncWork syncWork, HashSet<String> poolsToRedistribute) { // For the failures, the users have been notified, we just need to clean up. There's no // session here (or it's unused), so no conflicts are possible. We just remove it. // For successes, the user has also been notified, so various requests are also possible; // however, to start, we'd just put the session into the sessions list and go from there. WmTezSession session = null; sw.lock.lock(); try { if (sw.state == SessionInitState.CANCELED) { // We have processed this on the previous run, after it has already queued the message. return; } assert sw.state == SessionInitState.DONE; session = sw.session; sw.session = null; } finally { sw.lock.unlock(); } LOG.info("Processing " + ((session == null) ? "failed" : "successful") + " initialization result for pool " + sw.poolName); // We could not have removed the pool for this session, or we would have CANCELED the init. PoolState pool = pools.get(sw.poolName); if (pool == null || !pool.initializingSessions.remove(sw)) { // Query parallelism might be fubar. LOG.error("Cannot remove initializing session from the pool " + sw.poolName + " - internal error"); } poolsToRedistribute.add(sw.poolName); if (session != null) { if (pool != null) { pool.sessions.add(session); } else { LOG.error("Cannot add new session to the pool " + sw.poolName + " because it was removed unexpectedly - internal error " + session); syncWork.toRestartInUse.add(session); } } } private RemoveSessionResult handleReturnedInUseSessionOnMasterThread(EventState e, WmTezSession session, HashSet<String> poolsToRedistribute, boolean isReturn) { // This handles the common logic for destroy and return - everything except // the invalid combination of destroy and return themselves, as well as the actual // statement that destroys or returns it. if (e.updateErrors.remove(session) != null) { LOG.info("Ignoring an update error for a session being destroyed or returned"); } SettableFuture<WmTezSession> future = e.toReopen.remove(session); if (future != null) { future.setException(new AssertionError("Invalid reopen attempt")); } GetRequest reuseRequest = e.toReuse.remove(session); if (reuseRequest != null) { reuseRequest.future.setException(new AssertionError("Invalid reuse attempt")); } session.setQueryId(null); return checkAndRemoveSessionFromItsPool(session, poolsToRedistribute, isReturn, true); } private void handeReopenRequestOnMasterThread(EventState e, WmTezSession session, SettableFuture<WmTezSession> future, HashSet<String> poolsToRedistribute, WmThreadSyncWork syncWork) throws Exception { if (e.updateErrors.remove(session) != null) { LOG.info("Ignoring an update error for a session being reopened"); } GetRequest reuseRequest = e.toReuse.remove(session); if (reuseRequest != null) { reuseRequest.future.setException(new AssertionError("Invalid reuse attempt")); } // In order to expedite things in a general case, we are not actually going to reopen // anything. Instead, we will try to give out an existing session from the pool, and restart // the problematic one in background. String poolName = session.getPoolName(); // Do not update metrics, we'd immediately add the session back if we are able to remove. RemoveSessionResult rr = checkAndRemoveSessionFromItsPool(session, poolsToRedistribute, false, false); switch (rr) { case OK: // If pool didn't exist, checkAndRemoveSessionFromItsPool wouldn't have returned OK. PoolState pool = pools.get(poolName); SessionInitContext sw = new SessionInitContext(future, poolName, session.getQueryId(), session.getWmContext(), session.extractHiveResources()); // We have just removed the session from the same pool, so don't check concurrency here. pool.initializingSessions.add(sw); // Do not update metrics - see above. sw.start(); syncWork.toRestartInUse.add(session); return; case IGNORE: // Reopen implies the use of the reopened session for the same query that we gave it out // for; so, as we would have failed an active query, fail the user before it's started. future.setException( new RuntimeException("WM killed this session during reopen: " + session.getReasonForKill())); return; // No longer relevant for WM. case NOT_FOUND: // If we fail to remove, it's probably an internal error. We'd try to handle it the same way // as above - by restarting the session. We'd fail the caller to avoid exceeding parallelism. future.setException(new RuntimeException("Reopen failed due to an internal error")); syncWork.toRestartInUse.add(session); return; default: throw new AssertionError("Unknown state " + rr); } } private void handleUpdateErrorOnMasterThread(WmTezSession session, int failedEndpointVersion, IdentityHashMap<WmTezSession, GetRequest> toReuse, WmThreadSyncWork syncWork, HashSet<String> poolsToRedistribute) { // First, check if the registry has been updated since the error, and skip the error if // we have received new, valid registry info (TODO: externally, add a grace period for this?). Ref<Integer> endpointVersion = new Ref<>(-1); AmPluginInfo info = session.getAmPluginInfo(endpointVersion); if (info != null && endpointVersion.value > failedEndpointVersion) { LOG.info("Ignoring an update error; endpoint information has been updated to {}", info); return; } GetRequest reuseRequest = toReuse.remove(session); if (reuseRequest != null) { // This session is bad, so don't allow reuse; just convert it to normal get. reuseRequest.sessionToReuse = null; } // We are assuming the update-error AM is bad and just try to kill it. RemoveSessionResult rr = checkAndRemoveSessionFromItsPool(session, poolsToRedistribute, null, true); switch (rr) { case OK: case NOT_FOUND: // Regardless whether it was removed successfully or after failing to remove, restart it. // Since we just restart this from under the user, mark it so we handle it properly when // the user tries to actually use this session and fails, proceeding to return/destroy it. session.setIsIrrelevantForWm("Failed to update resource allocation"); // We assume AM might be bad so we will not try to kill the query here; just scrap the AM. // TODO: propagate this error to TezJobMonitor somehow? Without using killQuery syncWork.toRestartInUse.add(session); break; case IGNORE: return; // An update error for some session that was actually already killed by us. default: throw new AssertionError("Unknown state " + rr); } } private void applyNewResourcePlanOnMasterThread(EventState e, WmThreadSyncWork syncWork, HashSet<String> poolsToRedistribute) { int totalQueryParallelism = 0; WMFullResourcePlan plan = e.resourcePlanToApply; if (plan == null) { // NULL plan means WM is disabled via a command; it could still be reenabled. LOG.info("Disabling workload management because the resource plan has been removed"); this.rpName = null; this.defaultPool = null; this.userPoolMapping = new UserPoolMapping(null, null); } else { this.rpName = plan.getPlan().getName(); this.defaultPool = plan.getPlan().getDefaultPoolPath(); this.userPoolMapping = new UserPoolMapping(plan.getMappings(), defaultPool); } // Note: we assume here that plan has been validated beforehand, so we don't verify // that fractions or query parallelism add up, etc. Map<String, PoolState> oldPools = pools; pools = new HashMap<>(); ArrayList<List<WMPool>> poolsByLevel = new ArrayList<>(); if (plan != null) { // For simplicity, to always have parents while storing pools in a flat structure, we'll // first distribute them by levels, then add level by level. for (WMPool pool : plan.getPools()) { String fullName = pool.getPoolPath(); int ix = StringUtils.countMatches(fullName, POOL_SEPARATOR_STR); while (poolsByLevel.size() <= ix) { poolsByLevel.add(new LinkedList<WMPool>()); // We expect all the levels to have items. } poolsByLevel.get(ix).add(pool); } } for (int level = 0; level < poolsByLevel.size(); ++level) { List<WMPool> poolsOnLevel = poolsByLevel.get(level); for (WMPool pool : poolsOnLevel) { String fullName = pool.getPoolPath(); int qp = pool.getQueryParallelism(); double fraction = pool.getAllocFraction(); if (level > 0) { String parentName = fullName.substring(0, fullName.lastIndexOf(POOL_SEPARATOR)); PoolState parent = pools.get(parentName); fraction = parent.finalFraction * fraction; parent.finalFractionRemaining -= fraction; } PoolState state = oldPools == null ? null : oldPools.remove(fullName); if (state == null) { state = new PoolState(fullName, qp, fraction, pool.getSchedulingPolicy(), metricsSystem); } else { // This will also take care of the queries if query parallelism changed. state.update(qp, fraction, syncWork, e, pool.getSchedulingPolicy()); poolsToRedistribute.add(fullName); } state.setTriggers(new LinkedList<Trigger>()); LOG.info("Adding Hive pool: " + state); pools.put(fullName, state); totalQueryParallelism += qp; } } for (PoolState pool : pools.values()) { if (pool.metrics != null) { pool.metrics .setMaxExecutors(allocationManager.translateAllocationToCpus(pool.finalFractionRemaining)); } } // TODO: in the current impl, triggers are added to RP. For tez, no pool triggers (mapping between trigger name and // pool name) will exist which means all triggers applies to tez. For LLAP, pool triggers has to exist for attaching // triggers to specific pools. // For usability, // Provide a way for triggers sharing/inheritance possibly with following modes // ONLY - only to pool // INHERIT - child pools inherit from parent // GLOBAL - all pools inherit if (plan != null && plan.isSetTriggers() && plan.isSetPoolTriggers()) { Map<String, Trigger> triggers = new HashMap<>(); for (WMTrigger trigger : plan.getTriggers()) { ExecutionTrigger execTrigger = ExecutionTrigger.fromWMTrigger(trigger); triggers.put(trigger.getTriggerName(), execTrigger); } for (WMPoolTrigger poolTrigger : plan.getPoolTriggers()) { PoolState pool = pools.get(poolTrigger.getPool()); Trigger trigger = triggers.get(poolTrigger.getTrigger()); pool.triggers.add(trigger); poolsToRedistribute.add(pool.fullName); LOG.info("Adding pool " + pool.fullName + " trigger " + trigger); } } if (oldPools != null && !oldPools.isEmpty()) { // Looks like some pools were removed; kill running queries, re-queue the queued ones. for (PoolState oldPool : oldPools.values()) { oldPool.destroy(syncWork, e.getRequests, e.toReuse); } } LOG.info("Updating with " + totalQueryParallelism + " total query parallelism"); int deltaSessions = totalQueryParallelism - this.totalQueryParallelism; this.totalQueryParallelism = totalQueryParallelism; if (deltaSessions == 0) return; // Nothing to do. if (deltaSessions < 0) { // First, see if we have sessions that we were planning to restart/kill; get rid of those. deltaSessions = transferSessionsToDestroy(syncWork.toKillQuery.keySet(), syncWork.toDestroyNoRestart, deltaSessions); deltaSessions = transferSessionsToDestroy(syncWork.toRestartInUse, syncWork.toDestroyNoRestart, deltaSessions); } if (deltaSessions != 0) { failOnFutureFailure(tezAmPool.resizeAsync(deltaSessions, syncWork.toDestroyNoRestart)); } } private static int transferSessionsToDestroy(Collection<WmTezSession> source, List<WmTezSession> toDestroy, int deltaSessions) { // We were going to kill some queries and reuse the sessions, or maybe restart and put the new // ones back into the AM pool. However, the AM pool has shrunk, so we will close them instead. if (deltaSessions >= 0) return deltaSessions; int toTransfer = Math.min(-deltaSessions, source.size()); Iterator<WmTezSession> iter = source.iterator(); for (int i = 0; i < toTransfer; ++i) { WmTezSession session = iter.next(); LOG.debug("Will destroy {} instead of restarting", session); if (!session.isIrrelevantForWm()) { session.setIsIrrelevantForWm("Killed due to workload management plan change"); } toDestroy.add(session); iter.remove(); } return deltaSessions + toTransfer; } private void failOnFutureFailure(ListenableFuture<?> future) { Futures.addCallback(future, FATAL_ERROR_CALLBACK); } private void queueGetRequestOnMasterThread(GetRequest req, HashSet<String> poolsToRedistribute, WmThreadSyncWork syncWork) { String poolName = userPoolMapping.mapSessionToPoolName(req.mappingInput, allowAnyPool, allowAnyPool ? pools.keySet() : null); if (poolName == null) { req.future.setException( new NoPoolMappingException("Cannot find any pool mapping for " + req.mappingInput)); returnSessionOnFailedReuse(req, syncWork, poolsToRedistribute); return; } PoolState pool = pools.get(poolName); if (pool == null) { req.future.setException(new AssertionError(poolName + " not found (internal error).")); returnSessionOnFailedReuse(req, syncWork, poolsToRedistribute); return; } PoolState oldPool = null; if (req.sessionToReuse != null) { // Given that we are trying to reuse, this session MUST be in some pool.sessions. // Kills that could have removed it must have cleared sessionToReuse. String oldPoolName = req.sessionToReuse.getPoolName(); oldPool = pools.get(oldPoolName); RemoveSessionResult rr = checkAndRemoveSessionFromItsPool(req.sessionToReuse, poolsToRedistribute, true, false); if (rr != RemoveSessionResult.OK) { if (oldPool.metrics != null) { oldPool.metrics.removeRunningQueries(1); } // Abandon the reuse attempt. returnSessionOnFailedReuse(req, syncWork, null); req.sessionToReuse = null; } else if (pool.getTotalActiveSessions() + pool.queue.size() >= pool.queryParallelism) { if (oldPool.metrics != null) { oldPool.metrics.removeRunningQueries(1); } // One cannot simply reuse the session if there are other queries waiting; to maintain // fairness, we'll try to take a query slot instantly, and if that fails we'll return // this session back to the pool and give the user a new session later. returnSessionOnFailedReuse(req, syncWork, null); req.sessionToReuse = null; } } if (req.sessionToReuse != null) { // If we can immediately reuse a session, there's nothing to wait for - just return. req.sessionToReuse.setPoolName(poolName); req.sessionToReuse.setQueueName(yarnQueue); req.sessionToReuse.setQueryId(req.queryId); // Do not update metrics - we didn't update on removal. pool.sessions.add(req.sessionToReuse); if (pool != oldPool) { poolsToRedistribute.add(poolName); } req.future.set(req.sessionToReuse); return; } // Otherwise, queue the session and make sure we update this pool. pool.queue.addLast(req); if (pool.metrics != null) { pool.metrics.addQueuedQuery(); } poolsToRedistribute.add(poolName); } private void processPoolChangesOnMasterThread(String poolName, boolean hasRequeues, WmThreadSyncWork syncWork) throws Exception { PoolState pool = pools.get(poolName); if (pool == null) return; // Might be from before the new resource plan. // 1. First, start the queries from the queue. int queriesToStart = Math.min(pool.queue.size(), pool.queryParallelism - pool.getTotalActiveSessions()); if (queriesToStart > 0) { LOG.info("Starting {} queries in pool {}", queriesToStart, pool); } if (hasRequeues) { // Sort the queue - we may have put items here out of order. Collections.sort(pool.queue, GetRequest.ORDER_COMPARATOR); } for (int i = 0; i < queriesToStart; ++i) { GetRequest queueReq = pool.queue.pollFirst(); if (pool.metrics != null) { pool.metrics.moveQueuedToRunning(); } assert queueReq.sessionToReuse == null; // Note that in theory, we are guaranteed to have a session waiting for us here, but // the expiration, failures, etc. may cause one to be missing pending restart. // See SessionInitContext javadoc. SessionInitContext sw = new SessionInitContext(queueReq.future, poolName, queueReq.queryId, queueReq.wmContext, null); sw.start(); // It is possible that all the async methods returned on the same thread because the // session with registry data and stuff was available in the pool. // If this happens, we'll take the session out here and "cancel" the init so we skip // processing the message that the successful init has queued for us. boolean isDone = sw.extractSessionAndCancelIfDone(pool.sessions, syncWork.pathsToDelete); if (!isDone) { pool.initializingSessions.add(sw); } // The user has already been notified of completion by SessionInitContext. } // 2. Then, update pool allocations. double totalAlloc = pool.updateAllocationPercentages(); // We are calling this here because we expect the method to be completely async. We also don't // want this call itself to go on a thread because we want the percent-to-physics conversion // logic to be consistent between all the separate calls in one master thread processing round. // Note: If allocation manager does not have cluster state, it won't update anything. When the // cluster state changes, it will notify us, and we'd update the queries again. int cpusAllocated = allocationManager.updateSessionsAsync(totalAlloc, pool.sessions); if (pool.metrics != null) { pool.metrics.setExecutors(cpusAllocated); if (cpusAllocated > 0) { // Update max executors now that cluster info is definitely available. pool.metrics.setMaxExecutors(allocationManager.translateAllocationToCpus(totalAlloc)); } } } private void returnSessionOnFailedReuse(GetRequest req, WmThreadSyncWork syncWork, HashSet<String> poolsToRedistribute) { WmTezSession session = req.sessionToReuse; if (session == null) return; req.sessionToReuse = null; session.setQueryId(null); if (poolsToRedistribute != null) { RemoveSessionResult rr = checkAndRemoveSessionFromItsPool(session, poolsToRedistribute, true, true); // The session cannot have been killed just now; this happens after all the kills in // the current iteration, so we would have cleared sessionToReuse when killing this. boolean isOk = (rr == RemoveSessionResult.OK); assert isOk || rr == RemoveSessionResult.IGNORE; if (!isOk) return; } WmEvent wmEvent = new WmEvent(WmEvent.EventType.RETURN); if (!tezAmPool.returnSessionAsync(session)) { syncWork.toDestroyNoRestart.add(session); } else { if (session.getWmContext() != null && session.getWmContext().isQueryCompleted()) { session.resolveReturnFuture(); } wmEvent.endEvent(session); } } /** The result of trying to remove a presumably-active session from a pool on a user request. */ private static enum RemoveSessionResult { OK, // Normal case - an active session was removed from the pool. IGNORE, // Session was restarted out of bounds, any user-side handling should be ignored. // Or, session is being killed, need to coordinate between that and the user. // These two cases don't need to be distinguished for now. NOT_FOUND // The session is active but not found in the pool - internal error. } /** * Checks if the session is still relevant for WM and if yes, removes it from its thread. * @param isSessionOk Whether the user thinks the session being returned in some way is ok; * true means it is (return, reuse); false mean it isn't (reopen, destroy); * null means this is not a user call. * @return true if the session was removed; false if the session was already processed by WM * thread (so we are dealing with an outdated request); null if the session should be * in WM but wasn't found in the requisite pool (internal error?). */ private RemoveSessionResult checkAndRemoveSessionFromItsPool(WmTezSession session, Set<String> poolsToRedistribute, Boolean isSessionOk, boolean updateMetrics) { // It is possible for some request to be queued after a main thread has decided to kill this // session; on the next iteration, we'd be processing that request with an irrelevant session. if (session.isIrrelevantForWm()) { return RemoveSessionResult.IGNORE; } if (killQueryInProgress.containsKey(session)) { if (isSessionOk != null) { killQueryInProgress.get(session).handleUserCallback(!isSessionOk); } return RemoveSessionResult.IGNORE; } // If we did not kill this session we expect everything to be present. String poolName = session.getPoolName(); if (poolName != null) { poolsToRedistribute.add(poolName); PoolState pool = pools.get(poolName); session.clearWm(); if (pool != null && pool.sessions.remove(session)) { if (updateMetrics && pool.metrics != null) { pool.metrics.removeRunningQueries(1); } return RemoveSessionResult.OK; } } LOG.error("Session was not in the pool (internal error) " + poolName + ": " + session); return RemoveSessionResult.NOT_FOUND; } private Boolean checkAndAddSessionToAnotherPool(WmTezSession session, String destPoolName, Set<String> poolsToRedistribute) { if (session.isIrrelevantForWm()) { // This is called only during move session handling, removing session already checks this. // So this is not expected as remove failing will not even invoke this method LOG.error( "Unexpected during add session to another pool. If remove failed this should not have been called."); return false; } PoolState destPool = pools.get(destPoolName); if (destPool != null && destPool.sessions.add(session)) { if (destPool.metrics != null) { destPool.metrics.addRunningQuery(); } session.setPoolName(destPoolName); updateTriggers(session); poolsToRedistribute.add(destPoolName); return true; } LOG.error("Session {} was not added to pool {}", session, destPoolName); return null; } // ===== EVENT METHODS public ListenableFuture<Boolean> updateResourcePlanAsync(WMFullResourcePlan plan) { SettableFuture<Boolean> applyRpFuture = SettableFuture.create(); currentLock.lock(); try { // TODO: if there's versioning/etc., it will come in here. For now we rely on external // locking or ordering of calls. This should potentially return a Future for that. if (current.resourcePlanToApply != null) { LOG.warn("Several resource plans are being applied at the same time; using the latest"); current.applyRpFuture.setException(new HiveException("Another plan was applied in parallel")); } current.applyRpFuture = applyRpFuture; if (plan == null) { current.resourcePlanToApply = null; current.doClearResourcePlan = true; } else { current.resourcePlanToApply = plan; current.doClearResourcePlan = false; } notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } return applyRpFuture; } Future<Boolean> applyMoveSessionAsync(WmTezSession srcSession, String destPoolName) { currentLock.lock(); MoveSession moveSession; try { moveSession = new MoveSession(srcSession, destPoolName); current.moveSessions.add(moveSession); LOG.info("Queued move session: {}", moveSession); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } return moveSession.future; } Future<Boolean> applyKillSessionAsync(WmTezSession wmTezSession, String killReason) { KillQueryContext killQueryContext; currentLock.lock(); try { killQueryContext = new KillQueryContext(wmTezSession, killReason); resetAndQueueKill(syncWork.toKillQuery, killQueryContext, current.toReuse); LOG.info("Queued session for kill: {}", killQueryContext.session); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } return killQueryContext.killSessionFuture; } private final static class GetRequest { public static final Comparator<GetRequest> ORDER_COMPARATOR = (o1, o2) -> { if (o1.order == o2.order) return 0; return o1.order < o2.order ? -1 : 1; }; private final long order; private final MappingInput mappingInput; private final SettableFuture<WmTezSession> future; private WmTezSession sessionToReuse; private final String queryId; private final WmContext wmContext; private GetRequest(MappingInput mappingInput, String queryId, SettableFuture<WmTezSession> future, WmTezSession sessionToReuse, long order, final WmContext wmContext) { assert mappingInput != null; this.mappingInput = mappingInput; this.queryId = queryId; this.future = future; this.sessionToReuse = sessionToReuse; this.order = order; this.wmContext = wmContext; } @Override public String toString() { return "[#" + order + ", " + mappingInput + ", reuse " + sessionToReuse + "]"; } } @VisibleForTesting public WmTezSession getSession(TezSessionState session, MappingInput input, HiveConf conf) throws Exception { return getSession(session, input, conf, null); } public WmTezSession getSession(TezSessionState session, MappingInput input, HiveConf conf, final WmContext wmContext) throws Exception { WmEvent wmEvent = new WmEvent(WmEvent.EventType.GET); // Note: not actually used for pool sessions; verify some things like doAs are not set. validateConfig(conf); String queryId = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYID); SettableFuture<WmTezSession> future = SettableFuture.create(); WmTezSession wmSession = checkSessionForReuse(session); GetRequest req = new GetRequest(input, queryId, future, wmSession, getRequestVersion.incrementAndGet(), wmContext); currentLock.lock(); try { current.getRequests.add(req); if (req.sessionToReuse != null) { // Note: we assume reuse is only possible for the same user and config. current.toReuse.put(wmSession, req); } notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } try { WmTezSession sessionState = future.get(); wmEvent.endEvent(sessionState); return sessionState; } catch (ExecutionException ex) { Throwable realEx = ex.getCause(); throw realEx instanceof Exception ? (Exception) realEx : ex; } } @Override public void destroy(TezSessionState session) throws Exception { WmTezSession wmTezSession = ensureOwnedSession(session); resetGlobalTezSession(wmTezSession); currentLock.lock(); try { current.toDestroy.add(wmTezSession); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } } private void resetGlobalTezSession(WmTezSession wmTezSession) { // This has to be done synchronously to avoid the caller getting this session again. // Ideally we'd get rid of this thread-local nonsense. SessionState sessionState = SessionState.get(); if (sessionState != null && sessionState.getTezSession() == wmTezSession) { sessionState.setTezSession(null); } } @Override public void returnAfterUse(TezSessionPoolSession session) throws Exception { WmTezSession wmTezSession = ensureOwnedSession(session); resetGlobalTezSession(wmTezSession); currentLock.lock(); try { wmTezSession.createAndSetReturnFuture(); current.toReturn.add(wmTezSession); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } } public void notifyOfInconsistentAllocation(WmTezSession session) { // We just act as a pass-thru between the session and allocation manager. We don't change the // allocation target (only WM thread can do that); therefore we can do this directly and // actualState-based sync will take care of multiple potential message senders. allocationManager.updateSessionAsync(session); } public void notifyOfClusterStateChange() { currentLock.lock(); try { current.hasClusterStateChanged = true; notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } } void addUpdateError(WmTezSession wmTezSession, int endpointVersion) { currentLock.lock(); try { Integer existing = current.updateErrors.get(wmTezSession); // Only store the latest error, if there are multiple. if (existing != null && existing >= endpointVersion) return; current.updateErrors.put(wmTezSession, endpointVersion); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } } @Override public List<String> getWmStateDescription() { Future<List<String>> future = null; currentLock.lock(); try { if (current.dumpStateFuture != null) { future = current.dumpStateFuture; } else { future = current.dumpStateFuture = SettableFuture.create(); notifyWmThreadUnderLock(); } } finally { currentLock.unlock(); } try { return future.get(); } catch (InterruptedException | ExecutionException e) { LOG.error("Error getting description", e); return Lists.newArrayList("Error: " + e.toString()); } } private void addKillQueryResult(WmTezSession toKill, boolean success) { currentLock.lock(); try { current.killQueryResults.put(toKill, success); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } } @VisibleForTesting /** * Adds a test event that's processed at the end of WM iteration. * This allows tests to wait for an iteration to finish without messing with the threading * logic (that is prone to races if we e.g. remember the state before and wait for it to change, * self-deadlocking when triggering things explicitly and calling a blocking API, and hanging * forever if we wait for "another iteration"). If addTestEvent is called after all the other * calls of interest, it is guaranteed that the events from those calls will be processed * fully when the future is triggered. */ Future<Boolean> addTestEvent() { SettableFuture<Boolean> testEvent = SettableFuture.create(); currentLock.lock(); try { LOG.info("Adding test event " + System.identityHashCode(testEvent)); current.testEvents.add(testEvent); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } return testEvent; } public void notifyInitializationCompleted(SessionInitContext initCtx) { currentLock.lock(); try { current.initResults.add(initCtx); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } } @Override public TezSessionState reopen(TezSessionState session) throws Exception { WmTezSession wmTezSession = ensureOwnedSession(session); HiveConf sessionConf = wmTezSession.getConf(); if (sessionConf == null) { // TODO: can this ever happen? LOG.warn("Session configuration is null for " + wmTezSession); sessionConf = new HiveConf(conf, WorkloadManager.class); } SettableFuture<WmTezSession> future = SettableFuture.create(); currentLock.lock(); try { if (current.toReopen.containsKey(wmTezSession)) { throw new AssertionError("The session is being reopened more than once " + session); } current.toReopen.put(wmTezSession, future); notifyWmThreadUnderLock(); } finally { currentLock.unlock(); } return future.get(); } @Override public void closeAndReopenExpiredSession(TezSessionPoolSession session) throws Exception { // By definition, this session is not in use and can no longer be in use, so it only // affects the session pool. We can handle this inline. tezAmPool.replaceSession(ensureOwnedSession(session)); } // ======= VARIOUS UTILITY METHOD private void notifyWmThreadUnderLock() { if (hasChanges) return; hasChanges = true; hasChangesCondition.signalAll(); } private WmTezSession checkSessionForReuse(TezSessionState session) throws Exception { if (session == null) return null; WmTezSession result = null; if (session instanceof WmTezSession) { result = (WmTezSession) session; if (result.isOwnedBy(this)) { return result; } // This should never happen, at least for now. Throw? LOG.warn("Attempting to reuse a session not belonging to us: " + result); result.returnToSessionManager(); return null; } LOG.warn("Attempting to reuse a non-WM session for workload management:" + session); if (session instanceof TezSessionPoolSession) { session.returnToSessionManager(); } else { session.close(false); // This is a non-pool session, get rid of it. } return null; } private void validateConfig(HiveConf conf) throws HiveException { String queueName = conf.get(TezConfiguration.TEZ_QUEUE_NAME); if ((queueName != null) && !queueName.isEmpty()) { LOG.warn("Ignoring " + TezConfiguration.TEZ_QUEUE_NAME + "=" + queueName); conf.set(TezConfiguration.TEZ_QUEUE_NAME, yarnQueue); } if (conf.getBoolVar(ConfVars.HIVE_SERVER2_ENABLE_DOAS)) { // Should this also just be ignored? Throw for now, doAs unlike queue is often set by admin. throw new HiveException(ConfVars.HIVE_SERVER2_ENABLE_DOAS.varname + " is not supported"); } if (restrictedConfig != null) { restrictedConfig.validate(conf); } } private WmTezSession createSession(HiveConf conf) { WmTezSession session = createSessionObject(TezSessionState.makeSessionId(), conf); session.setQueueName(yarnQueue); session.setDefault(); LOG.info("Created new interactive session object " + session.getSessionId()); return session; } @VisibleForTesting protected WmTezSession createSessionObject(String sessionId, HiveConf conf) { conf = (conf == null) ? new HiveConf(this.conf) : conf; conf.set(LlapTaskSchedulerService.LLAP_PLUGIN_ENDPOINT_ENABLED, "true"); return new WmTezSession(sessionId, this, expirationTracker, conf); } private WmTezSession ensureOwnedSession(TezSessionState oldSession) { if (!(oldSession instanceof WmTezSession) || !((WmTezSession) oldSession).isOwnedBy(this)) { throw new AssertionError("Not a WM session " + oldSession); } WmTezSession session = (WmTezSession) oldSession; return session; } /** Called by TezSessionPoolSession when opened. */ @Override public void registerOpenSession(TezSessionPoolSession session) { synchronized (openSessions) { openSessions.put(session, true); } } /** Called by TezSessionPoolSession when closed. */ @Override public void unregisterOpenSession(TezSessionPoolSession session) { synchronized (openSessions) { openSessions.remove(session); } tezAmPool.notifyClosed(session); } @VisibleForTesting public SessionExpirationTracker getExpirationTracker() { return expirationTracker; } @VisibleForTesting int getNumSessions() { return tezAmPool.getInitialSize(); } protected final HiveConf getConf() { return conf; } void updateTriggers(final WmTezSession session) { WmContext wmContext = session.getWmContext(); String poolName = session.getPoolName(); PoolState poolState = pools.get(poolName); if (wmContext != null && poolState != null) { wmContext.addTriggers(poolState.getTriggers()); LOG.info("Subscribed to counters: {}", wmContext.getSubscribedCounters()); } } @Override Runnable getTriggerValidatorRunnable() { return triggerValidatorRunnable; } /** * State of a single pool. * Unless otherwise specified, the members are only modified by the master thread. */ private static class PoolState { // Add stuff here as WM is implemented. private final LinkedList<SessionInitContext> initializingSessions = new LinkedList<>(); // Note: the list is expected to be a few items; if it's longer we may want an IHM. private final LinkedList<WmTezSession> sessions = new LinkedList<>(); private final LinkedList<GetRequest> queue = new LinkedList<>(); private final WmPoolMetrics metrics; private final String fullName; private double finalFraction; private double finalFractionRemaining; private int queryParallelism = -1; private List<Trigger> triggers = new ArrayList<>(); private WMPoolSchedulingPolicy schedulingPolicy; public PoolState(String fullName, int queryParallelism, double fraction, String schedulingPolicy, MetricsSystem ms) { this.fullName = fullName; // TODO: this actually calls the metrics system and getMetrics - that may be expensive. // For now it looks like it should be ok to do on WM thread. this.metrics = ms == null ? null : WmPoolMetrics.create(fullName, ms); update(queryParallelism, fraction, null, null, schedulingPolicy); } public int getTotalActiveSessions() { return sessions.size() + initializingSessions.size(); } public void update(int queryParallelism, double fraction, WmThreadSyncWork syncWork, EventState e, String schedulingPolicy) { this.finalFraction = this.finalFractionRemaining = fraction; this.queryParallelism = queryParallelism; if (metrics != null) { metrics.setParallelQueries(queryParallelism); } try { this.schedulingPolicy = MetaStoreUtils.parseSchedulingPolicy(schedulingPolicy); } catch (IllegalArgumentException ex) { // This should be validated at change time; let's fall back to a default here. LOG.error("Unknown scheduling policy " + schedulingPolicy + "; using FAIR"); this.schedulingPolicy = WMPoolSchedulingPolicy.FAIR; } // TODO: two possible improvements // 1) Right now we kill all the queries here; we could just kill -qpDelta. // 2) After the queries are killed queued queries would take their place. // If we could somehow restart queries we could instead put them at the front // of the queue (esp. in conjunction with (1)) and rerun them. if (queryParallelism < getTotalActiveSessions()) { extractAllSessionsToKill("The query pool was resized by administrator", e.toReuse, syncWork); } // We will requeue, and not kill, the queries that are not running yet. // Insert them all before the get requests from this iteration. GetRequest req; if (metrics != null) { metrics.removeQueuedQueries(queue.size()); } while ((req = queue.pollLast()) != null) { e.getRequests.addFirst(req); } } public void destroy(WmThreadSyncWork syncWork, LinkedList<GetRequest> globalQueue, IdentityHashMap<WmTezSession, GetRequest> toReuse) { extractAllSessionsToKill("The query pool was removed by administrator", toReuse, syncWork); // All the pending get requests should just be requeued elsewhere. // Note that we never queue session reuse so sessionToReuse would be null. globalQueue.addAll(0, queue); if (metrics != null) { metrics.removeQueuedQueries(queue.size()); metrics.destroy(); } queue.clear(); } public double updateAllocationPercentages() { switch (schedulingPolicy) { case FAIR: int totalSessions = sessions.size() + initializingSessions.size(); if (totalSessions == 0) return 0; double allocation = finalFractionRemaining / totalSessions; for (WmTezSession session : sessions) { updateSessionAllocationWithEvent(session, allocation); } // Do not give out the capacity of the initializing sessions to the running ones; // we expect init to be fast. return finalFractionRemaining - allocation * initializingSessions.size(); case FIFO: if (sessions.isEmpty()) return 0; boolean isFirst = true; for (WmTezSession session : sessions) { updateSessionAllocationWithEvent(session, isFirst ? finalFractionRemaining : 0); isFirst = false; } return finalFractionRemaining; default: throw new AssertionError("Unexpected enum value " + schedulingPolicy); } } private void updateSessionAllocationWithEvent(WmTezSession session, double allocation) { WmEvent event = null; WmContext ctx = session.getWmContext(); if (ctx != null && session.hasClusterFraction() && !DoubleMath.fuzzyEquals(session.getClusterFraction(), allocation, 0.0001f)) { event = new WmEvent(EventType.UPDATE); } session.setClusterFraction(allocation); if (event != null) { event.endEvent(session); } } public LinkedList<WmTezSession> getSessions() { return sessions; } public LinkedList<SessionInitContext> getInitializingSessions() { return initializingSessions; } @Override public String toString() { return "[" + fullName + ", query parallelism " + queryParallelism + ", fraction of the cluster " + finalFraction + ", fraction used by child pools " + (finalFraction - finalFractionRemaining) + ", active sessions " + sessions.size() + ", initializing sessions " + initializingSessions.size() + "]"; } private void extractAllSessionsToKill(String killReason, IdentityHashMap<WmTezSession, GetRequest> toReuse, WmThreadSyncWork syncWork) { int totalCount = sessions.size() + initializingSessions.size(); for (WmTezSession sessionToKill : sessions) { resetRemovedSessionToKill(syncWork.toKillQuery, new KillQueryContext(sessionToKill, killReason), toReuse); } sessions.clear(); for (SessionInitContext initCtx : initializingSessions) { // It is possible that the background init thread has finished in parallel, queued // the message for us but also returned the session to the user. WmTezSession sessionToKill = initCtx.cancelAndExtractSessionIfDone(killReason, syncWork.pathsToDelete); if (sessionToKill == null) { continue; // Async op in progress; the callback will take care of this. } resetRemovedSessionToKill(syncWork.toKillQuery, new KillQueryContext(sessionToKill, killReason), toReuse); } initializingSessions.clear(); if (metrics != null) { metrics.removeRunningQueries(totalCount); } } public void setTriggers(final LinkedList<Trigger> triggers) { this.triggers = triggers; } public List<Trigger> getTriggers() { return triggers; } } private enum SessionInitState { GETTING, // We are getting a session from TezSessionPool WAITING_FOR_REGISTRY, // We have the session but it doesn't have registry info yet. DONE, // We have the session with registry info, or we have failed. CANCELED // The master thread has CANCELED this and will never look at it again. } /** * The class that serves as a synchronization point, and future callback, * for async session initialization, as well as parallel cancellation. */ private final class SessionInitContext implements FutureCallback<WmTezSession> { private final static int MAX_ATTEMPT_NUMBER = 1; // Retry once. private final String poolName, queryId; private final ReentrantLock lock = new ReentrantLock(); private WmTezSession session; private SettableFuture<WmTezSession> future; private SessionInitState state; private String cancelReason; private HiveResources prelocalizedResources; private Path pathToDelete; private WmContext wmContext; private int attemptNumber = 0; public SessionInitContext(SettableFuture<WmTezSession> future, String poolName, String queryId, WmContext wmContext, HiveResources prelocalizedResources) { this.state = SessionInitState.GETTING; this.future = future; this.poolName = poolName; this.queryId = queryId; this.prelocalizedResources = prelocalizedResources; this.wmContext = wmContext; } public void start() throws Exception { ListenableFuture<WmTezSession> getFuture = tezAmPool.getSessionAsync(); Futures.addCallback(getFuture, this); } @Override public void onSuccess(WmTezSession session) { SessionInitState oldState; SettableFuture<WmTezSession> future = null; lock.lock(); try { oldState = state; switch (oldState) { case GETTING: { LOG.info("Received a session from AM pool {}", session); assert this.state == SessionInitState.GETTING; session.setPoolName(poolName); session.setQueueName(yarnQueue); session.setQueryId(queryId); if (prelocalizedResources != null) { pathToDelete = session.replaceHiveResources(prelocalizedResources, true); } if (wmContext != null) { session.setWmContext(wmContext); } this.session = session; this.state = SessionInitState.WAITING_FOR_REGISTRY; break; } case WAITING_FOR_REGISTRY: { assert this.session != null; this.state = SessionInitState.DONE; assert session == this.session; future = this.future; this.future = null; break; } case CANCELED: { future = this.future; this.session = null; this.future = null; break; } default: { future = this.future; this.future = null; break; } } } finally { lock.unlock(); } switch (oldState) { case GETTING: { ListenableFuture<WmTezSession> waitFuture = session.waitForAmRegistryAsync(amRegistryTimeoutMs, timeoutPool); Futures.addCallback(waitFuture, this); break; } case WAITING_FOR_REGISTRY: { // Notify the master thread and the user. notifyInitializationCompleted(this); future.set(session); break; } case CANCELED: { // Return session to the pool; we can do it directly here. future.setException( new HiveException("The query was killed by workload management: " + cancelReason)); session.clearWm(); session.setQueryId(null); session.setWmContext(null); tezAmPool.returnSession(session); break; } default: { AssertionError error = new AssertionError("Unexpected state " + state); future.setException(error); throw error; } } } @Override public void onFailure(Throwable t) { SettableFuture<WmTezSession> future; WmTezSession session; boolean wasCanceled = false, doRetry = false; lock.lock(); try { wasCanceled = (state == SessionInitState.CANCELED); session = this.session; this.session = null; doRetry = !wasCanceled && (attemptNumber < MAX_ATTEMPT_NUMBER); if (doRetry) { ++attemptNumber; this.state = SessionInitState.GETTING; future = null; } else { future = this.future; this.future = null; if (!wasCanceled) { this.state = SessionInitState.DONE; } } } finally { lock.unlock(); } if (doRetry) { try { start(); return; } catch (Exception e) { LOG.error("Failed to retry; propagating original error. The new error is ", e); } finally { discardSessionOnFailure(session); } } if (!wasCanceled) { if (LOG.isDebugEnabled()) { LOG.info("Queueing the initialization failure with " + session); } notifyInitializationCompleted(this); // Report failure to the main thread. } future.setException(t); discardSessionOnFailure(session); } public void discardSessionOnFailure(WmTezSession session) { if (session == null) return; session.clearWm(); session.setQueryId(null); // We can just restart the session if we have received one. try { tezAmPool.replaceSession(session); } catch (Exception e) { LOG.error("Failed to restart a failed session", e); } } /** Cancel the async operation (even if it's done), and return the session if done. */ public WmTezSession cancelAndExtractSessionIfDone(String cancelReason, List<Path> toDelete) { lock.lock(); try { SessionInitState state = this.state; this.state = SessionInitState.CANCELED; this.cancelReason = cancelReason; if (state == SessionInitState.DONE) { WmTezSession result = this.session; this.session = null; if (pathToDelete != null) { toDelete.add(pathToDelete); } return result; } else { // In the states where a background operation is in progress, wait for the callback. // Also, ignore any duplicate calls; also don't kill failed ones - handled elsewhere. if (state == SessionInitState.CANCELED) { LOG.warn("Duplicate call to extract " + session); } return null; } } finally { lock.unlock(); } } /** Extracts the session and cancel the operation, both only if done. */ public boolean extractSessionAndCancelIfDone(List<WmTezSession> results, List<Path> toDelete) { lock.lock(); try { if (state != SessionInitState.DONE) return false; this.state = SessionInitState.CANCELED; if (pathToDelete != null) { toDelete.add(pathToDelete); } if (this.session != null) { results.add(this.session); } // Otherwise we have failed; the callback has taken care of the failure. this.session = null; return true; } finally { lock.unlock(); } } @Override public String toString() { return "[state=" + state + ", session=" + session + "]"; } } boolean isManaged(MappingInput input) { // This is always replaced atomically, so we don't care about concurrency here. UserPoolMapping mapping = userPoolMapping; if (mapping != null) { // Don't pass in the pool set - not thread safe; if the user is trying to force us to // use a non-existent pool, we want to fail anyway. We will fail later during get. String mappedPool = mapping.mapSessionToPoolName(input, allowAnyPool, null); LOG.info("Mapping input: {} mapped to pool: {}", input, mappedPool); return true; } return false; } private enum KillQueryResult { OK, RESTART_REQUIRED, IN_PROGRESS } /** * When we kill a query without killing a session, we need two things to come back before reuse. * First of all, kill query itself should come back, and second the user should handle it * and let go of the session (or, the query could finish and it could give the session back * even before we try to kill the query). We also need to handle cases where the user doesn't * like the session even before we kill it, or the kill fails and the user is happily computing * away. This class is to collect and make sense of the state around all this. */ static final class KillQueryContext { private SettableFuture<Boolean> killSessionFuture; private final String reason; private final WmTezSession session; // Note: all the fields are only modified by master thread. private boolean isUserDone = false, isKillDone = false, hasKillFailed = false, hasUserFailed = false; KillQueryContext(WmTezSession session, String reason) { this.session = session; this.reason = reason; this.killSessionFuture = SettableFuture.create(); } private void handleKillQueryCallback(boolean hasFailed) { isKillDone = true; hasKillFailed = hasFailed; } private void handleUserCallback(boolean hasFailed) { if (isUserDone) { LOG.warn("Duplicate user call for a session being killed; ignoring"); return; } isUserDone = true; hasUserFailed = hasFailed; } private KillQueryResult process() { if (!isUserDone && hasKillFailed) { // The user has not returned and the kill has failed. // We are going to brute force kill the AM; whatever user does is now irrelevant. session.setIsIrrelevantForWm(reason); return KillQueryResult.RESTART_REQUIRED; } if (!isUserDone || !isKillDone) return KillQueryResult.IN_PROGRESS; // Someone is not done. // Both user and the kill have returned. if (hasUserFailed && hasKillFailed) { // If the kill failed and the user also thinks the session is invalid, restart it. session.setIsIrrelevantForWm(reason); return KillQueryResult.RESTART_REQUIRED; } // Otherwise, we can reuse the session. Either the kill has failed but the user managed to // return early (in fact, can it fail because the query has completed earlier?), or the user // has failed because the query was killed from under it. return KillQueryResult.OK; } @Override public String toString() { return "KillQueryContext [isUserDone=" + isUserDone + ", isKillDone=" + isKillDone + ", hasKillFailed=" + hasKillFailed + ", hasUserFailed=" + hasUserFailed + ", session=" + session + ", reason=" + reason + "]"; } } private static void resetRemovedSessionToKill(Map<WmTezSession, KillQueryContext> toKillQuery, KillQueryContext killQueryContext, Map<WmTezSession, GetRequest> toReuse) { toKillQuery.put(killQueryContext.session, killQueryContext); killQueryContext.session.clearWm(); GetRequest req = toReuse.remove(killQueryContext.session); if (req != null) { req.sessionToReuse = null; } } private void resetAndQueueKill(Map<WmTezSession, KillQueryContext> toKillQuery, KillQueryContext killQueryContext, Map<WmTezSession, GetRequest> toReuse) { WmTezSession toKill = killQueryContext.session; toKillQuery.put(toKill, killQueryContext); // The way this works is, a session in WM pool will move back to tez AM pool on a kill and will get // reassigned back to WM pool on GetRequest based on user pool mapping. Only if we remove the session from active // sessions list of its WM pool will the queue'd GetRequest be processed String poolName = toKill.getPoolName(); if (poolName != null) { PoolState poolState = pools.get(poolName); if (poolState != null) { poolState.getSessions().remove(toKill); Iterator<SessionInitContext> iter = poolState.getInitializingSessions().iterator(); while (iter.hasNext()) { if (iter.next().session == toKill) { iter.remove(); break; } } } } toKill.clearWm(); GetRequest req = toReuse.remove(toKill); if (req != null) { req.sessionToReuse = null; } } @VisibleForTesting TezSessionPool<WmTezSession> getTezAmPool() { return tezAmPool; } public final static class NoPoolMappingException extends Exception { public NoPoolMappingException(String message) { super(message); } private static final long serialVersionUID = 346375346724L; } }