Java tutorial: StormSchedulerImpl, the default scheduler used by the mesos-storm framework
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package storm.mesos.schedulers;

import org.apache.commons.lang3.StringUtils;
import org.apache.mesos.Protos;
import org.apache.mesos.SchedulerDriver;
import org.apache.storm.scheduler.Cluster;
import org.apache.storm.scheduler.ExecutorDetails;
import org.apache.storm.scheduler.IScheduler;
import org.apache.storm.scheduler.SchedulerAssignment;
import org.apache.storm.scheduler.SupervisorDetails;
import org.apache.storm.scheduler.Topologies;
import org.apache.storm.scheduler.TopologyDetails;
import org.apache.storm.scheduler.WorkerSlot;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.mesos.resources.AggregatedOffers;
import storm.mesos.resources.ResourceNotAvailableException;
import storm.mesos.util.MesosCommon;

import static storm.mesos.util.PrettyProtobuf.offerMapKeySetToString;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;

/**
 * Default Scheduler used by mesos-storm framework.
 */
public class StormSchedulerImpl implements IScheduler, IMesosStormScheduler {
  private final Logger log = LoggerFactory.getLogger(StormSchedulerImpl.class);

  private Map mesosStormConf;
  private final Map<String, MesosWorkerSlot> mesosWorkerSlotMap = new HashMap<>();
  private volatile boolean offersSuppressed = false;
  private SchedulerDriver driver;
  private Set<String> offersRequestTracker = new HashSet<>();

  private StormSchedulerImpl() {
    // We make this constructor private so that calling it results in a compile time error
  }

  public StormSchedulerImpl(final SchedulerDriver driver) {
    this.driver = driver;
  }

  @Override
  public void prepare(Map conf) {
    mesosStormConf = conf;
  }

  public void addOfferRequest(String key) {
    offersRequestTracker.add(key);
  }

  public void removeOfferRequest(String key) {
    offersRequestTracker.remove(key);
  }

  private List<MesosWorkerSlot> getMesosWorkerSlots(Map<String, AggregatedOffers> aggregatedOffersPerNode,
                                                    Collection<String> nodesWithExistingSupervisors,
                                                    TopologyDetails topologyDetails) {

    double requestedWorkerCpu = MesosCommon.topologyWorkerCpu(mesosStormConf, topologyDetails);
    double requestedWorkerMem = MesosCommon.topologyWorkerMem(mesosStormConf, topologyDetails);
    int requestedWorkerMemInt = (int) requestedWorkerMem;

    List<MesosWorkerSlot> mesosWorkerSlots = new ArrayList<>();
    boolean slotFound = false;
    int slotsNeeded = topologyDetails.getNumWorkers();

    /* XXX(erikdw): For now we clear out our knowledge of pre-existing supervisors while searching for slots
     * for this topology, to make the behavior of allSlotsAvailableForScheduling() mimic that of assignSlots().
     *
     * See this issue: https://github.com/mesos/storm/issues/160
     *
     * Until that issue is fixed, we must not discount the resources used by pre-existing supervisors.
     * Otherwise we will under-represent the resources needed as compared to what the more ignorant
     * assignSlots() will believe is needed, and thus may prevent MesosWorkerSlots from actually being
     * used. i.e., assignSlots() doesn't know if supervisors already exist, since it doesn't receive the
     * existingSupervisors input parameter that allSlotsAvailableForScheduling() does.
     */
    nodesWithExistingSupervisors.clear();

    do {
      slotFound = false;
      List<String> hostsWithOffers = new ArrayList<String>(aggregatedOffersPerNode.keySet());
      Collections.shuffle(hostsWithOffers);
      for (String currentNode : hostsWithOffers) {
        AggregatedOffers aggregatedOffers = aggregatedOffersPerNode.get(currentNode);
        boolean supervisorExists = nodesWithExistingSupervisors.contains(currentNode);

        if (!aggregatedOffers.isFit(mesosStormConf, topologyDetails, supervisorExists)) {
          if (!supervisorExists) {
            log.info("{} with requestedWorkerCpu {} and requestedWorkerMem {} plus the requirements to launch a supervisor does not fit onto {} with resources {}",
                     topologyDetails.getId(), requestedWorkerCpu, requestedWorkerMemInt, aggregatedOffers.getHostname(), aggregatedOffers.toString());
          } else {
            log.info("{} with requestedWorkerCpu {} and requestedWorkerMem {} does not fit onto {} with resources {}",
                     topologyDetails.getId(), requestedWorkerCpu, requestedWorkerMemInt, aggregatedOffers.getHostname(), aggregatedOffers.toString());
          }
          continue;
        }

        log.info("{} with requestedWorkerCpu {} and requestedWorkerMem {} does fit onto {} with resources {}",
                 topologyDetails.getId(), requestedWorkerCpu, requestedWorkerMemInt, aggregatedOffers.getHostname(), aggregatedOffers.toString());

        MesosWorkerSlot mesosWorkerSlot;
        try {
          mesosWorkerSlot = SchedulerUtils.createMesosWorkerSlot(mesosStormConf, aggregatedOffers, topologyDetails, supervisorExists);
        } catch (ResourceNotAvailableException rexp) {
          log.warn(rexp.getMessage());
          continue;
        }

        nodesWithExistingSupervisors.add(currentNode);
        mesosWorkerSlots.add(mesosWorkerSlot);
        slotFound = true;
        if (--slotsNeeded == 0) {
          break;
        }
      }
    } while (slotFound && slotsNeeded > 0);

    return mesosWorkerSlots;
  }

  /*
   * Different topologies have different resource requirements in terms of cpu and memory. So when Mesos asks
   * this scheduler for a list of available worker slots, we create "MesosWorkerSlot" objects and store them into
   * mesosWorkerSlotMap. Notably, we return a list of MesosWorkerSlot objects, even though Storm is only aware of
   * the WorkerSlot type. However, since a MesosWorkerSlot *is* a WorkerSlot (in the polymorphic sense), Storm
   * treats the list as WorkerSlot objects.
   *
   * Note:
   * 1. "MesosWorkerSlot" is the same as WorkerSlot except that it is dedicated to a topology upon creation. This means
   *    that a MesosWorkerSlot belonging to one topology cannot be used to launch a worker belonging to a different topology.
   * 2. Please note that this method is called before schedule is invoked. We use this opportunity to assign the MesosWorkerSlot
   *    to a specific topology and store the state in "mesosWorkerSlotMap". This way, when Storm later calls schedule, we can just
   *    look up the "mesosWorkerSlotMap" for the list of available slots for the particular topology.
   * 3. Given that MesosWorkerSlot extends WorkerSlot, we shouldn't really have to create a "mesosWorkerSlotMap". Instead, in the
   *    schedule method, we could have just downcast the "WorkerSlot" to "MesosWorkerSlot".
   *    But this is not currently possible because storm passes a recreated version of WorkerSlot to the schedule method
   *    instead of passing the WorkerSlot returned by this method as is.
   */
  @Override
  public List<WorkerSlot> allSlotsAvailableForScheduling(Map<Protos.OfferID, Protos.Offer> offers,
                                                         Collection<SupervisorDetails> existingSupervisors,
                                                         Topologies topologies, Set<String> topologiesMissingAssignments) {
    if (topologiesMissingAssignments.isEmpty()) {
      removeOfferRequest(MesosCommon.TOPOLOGIES_OFFERS_REQUEST_KEY);
    } else {
      addOfferRequest(MesosCommon.TOPOLOGIES_OFFERS_REQUEST_KEY);
    }

    if (offersRequestTracker.isEmpty()) {
      if (!offers.isEmpty()) {
        log.info("Declining all offers that are currently buffered because no topologies nor tasks need assignments. Declined offer ids: {}",
                 offerMapKeySetToString(offers));
        for (Protos.OfferID offerId : offers.keySet()) {
          driver.declineOffer(offerId);
        }
        offers.clear();
      }
      if (!offersSuppressed) {
        log.info("(SUPPRESS OFFERS) We don't have any topologies nor tasks that need assignments, but offers are still flowing. Suppressing offers.");
        driver.suppressOffers();
        offersSuppressed = true;
      }
      return new ArrayList<>();
    }

    log.info("Topologies that need assignments: {}", topologiesMissingAssignments.toString());

    if (offersSuppressed) {
      log.info("(REVIVE OFFERS) We have topologies or tasks that need assignments, but offers are currently suppressed. Reviving offers.");
      driver.reviveOffers();
      offersSuppressed = false;
      // Note: We still have _offersLock at this point, so we return the empty ArrayList if we happen to have no offers;
      // this way we can release the lock and acquire new offers. Otherwise proceed through the logic below to see if we
      // can make any slots on the offer(s) we do have.
      if (offers.isEmpty()) {
        return new ArrayList<>();
      }
    }

    List<WorkerSlot> allSlots = new ArrayList<>();
    Map<String, AggregatedOffers> aggregatedOffersPerNode = MesosCommon.getAggregatedOffersPerNode(offers);

    for (String currentTopology : topologiesMissingAssignments) {
      TopologyDetails topologyDetails = topologies.getById(currentTopology);
      int slotsNeeded = topologyDetails.getNumWorkers();

      log.info("Trying to find {} slots for {}", slotsNeeded, topologyDetails.getId());
      if (slotsNeeded <= 0) {
        continue;
      }

      Set<String> nodesWithExistingSupervisors = new HashSet<>();
      for (String currentNode : aggregatedOffersPerNode.keySet()) {
        if (SchedulerUtils.supervisorExists(MesosCommon.getMesosFrameworkName(mesosStormConf), currentNode, existingSupervisors, currentTopology)) {
          nodesWithExistingSupervisors.add(currentNode);
        }
      }

      List<MesosWorkerSlot> mesosWorkerSlotList = getMesosWorkerSlots(aggregatedOffersPerNode, nodesWithExistingSupervisors, topologyDetails);
      for (MesosWorkerSlot mesosWorkerSlot : mesosWorkerSlotList) {
        String slotId = String.format("%s:%s", mesosWorkerSlot.getNodeId(), mesosWorkerSlot.getPort());
        mesosWorkerSlotMap.put(slotId, mesosWorkerSlot);
        allSlots.add(mesosWorkerSlot);
      }
      log.info("Number of available slots for {}: {}", topologyDetails.getId(), mesosWorkerSlotList.size());
    }

    List<String> slotsStrings = new ArrayList<String>();
    for (WorkerSlot slot : allSlots) {
      slotsStrings.add("" + slot.getNodeId() + ":" + slot.getPort());
    }
    log.info("allSlotsAvailableForScheduling: {} available slots: [{}]", allSlots.size(), StringUtils.join(slotsStrings, ", "));
    return allSlots;
  }

  Map<String, List<MesosWorkerSlot>> getMesosWorkerSlotPerTopology(List<WorkerSlot> workerSlots) {
    HashMap<String, List<MesosWorkerSlot>> perTopologySlotList = new HashMap<>();

    for (WorkerSlot workerSlot : workerSlots) {
      if (workerSlot.getNodeId() == null) {
        log.warn("Unexpected: Node id is null for worker slot while scheduling");
        continue;
      }
      MesosWorkerSlot mesosWorkerSlot = mesosWorkerSlotMap.get(String.format("%s:%d", workerSlot.getNodeId(), workerSlot.getPort()));
      String topologyId = mesosWorkerSlot.getTopologyId();
      if (perTopologySlotList.get(topologyId) == null) {
        perTopologySlotList.put(topologyId, new ArrayList<MesosWorkerSlot>());
      }
      perTopologySlotList.get(topologyId).add(mesosWorkerSlot);
    }

    return perTopologySlotList;
  }

  List<List<ExecutorDetails>> executorsPerWorkerList(Cluster cluster, TopologyDetails topologyDetails,
                                                     int slotsRequested, int slotsAssigned, int slotsAvailable) {
    Collection<ExecutorDetails> executors = cluster.getUnassignedExecutors(topologyDetails);
    String topologyId = topologyDetails.getId();

    // Check if we don't actually need to schedule any executors because all requested slots are assigned already.
    if (slotsRequested == slotsAssigned) {
      if (!executors.isEmpty()) {
        // TODO: print executors list cleanly in a single line
        String msg = String.format("executorsPerWorkerList: for %s, slotsRequested: %d == slotsAssigned: %d, BUT there are " +
                                   "unassigned executors which is nonsensical", topologyId, slotsRequested, slotsAssigned);
        log.error(msg);
        throw new RuntimeException(msg);
      }
      log.debug("executorsPerWorkerList: for {}, slotsRequested: {} == slotsAssigned: {}, so no need to schedule any executors",
                topologyId, slotsRequested, slotsAssigned);
      return null;
    }

    int slotsToUse = 0;

    // If there aren't any unassigned executors, we need to re-distribute all currently assigned executors across workers
    if (executors.isEmpty()) {
      if (slotsAssigned < slotsAvailable) {
        log.info("All executors are already assigned for {}, but only onto {} slots. Redistributing all assigned executors to new set of {} slots.",
                 topologyId, slotsAssigned, slotsAvailable);
        SchedulerAssignment schedulerAssignment = cluster.getAssignmentById(topologyId);
        // Un-assign them
        int slotsFreed = schedulerAssignment.getSlots().size();
        cluster.freeSlots(schedulerAssignment.getSlots());
        log.info("executorsPerWorkerList: for {}, slotsAvailable: {}, slotsAssigned: {}, slotsFreed: {}",
                 topologyId, slotsAvailable, slotsAssigned, slotsFreed);
        executors = cluster.getUnassignedExecutors(topologyDetails);
        slotsToUse = slotsAvailable;
      } else {
        log.info("All executors are already assigned for {}. Not going to redistribute work because slotsAvailable is {} and slotsAssigned is {}",
                 topologyId, slotsAvailable, slotsAssigned);
        return null;
      }
    } else {
      /*
       * Spread the unassigned executors onto however many available slots we can possibly use.
       * i.e., there might be more than we need.
       *
       * Note that this logic can lead to an imbalance of executors/worker between various workers.
       *
       * We propose to avoid such problems by having an option (perhaps on by default) which will only
       * ever schedule onto the exact requested number of workers.
       * See https://github.com/mesos/storm/issues/158
       * For now we just issue a warning when we detect such a situation.
       */
      int slotsNeeded = slotsRequested - slotsAssigned;
      // Just in case something strange happens, we don't want this to be negative
      slotsToUse = Math.max(Math.min(slotsNeeded, slotsAvailable), 0);
      // Notably, if slotsAssigned was 0, then this would be a full rebalance onto fewer workers than requested,
      // and hence wouldn't lead to an imbalance.
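      // Illustrative numbers (not from the source): with slotsRequested = 4, slotsAssigned = 1 and slotsAvailable = 2,
      // slotsNeeded is 3 but slotsToUse is only 2, so the unassigned executors meant for 3 new workers get spread over
      // just 2 new slots while the 1 existing slot keeps its original share; that is the imbalance the warning below flags.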
      if (slotsToUse + slotsAssigned < slotsRequested && slotsAssigned != 0) {
        log.warn("For {}, assigning {} storm executors onto {} new slots when we already have {} executors assigned to {} slots, " +
                 "this may lead to executor imbalance.",
                 topologyId, executors.size(), slotsToUse, cluster.getAssignmentById(topologyId).getExecutors().size(), slotsAssigned);
      }
    }

    List<String> executorsStrings = new ArrayList<String>();
    List<List<ExecutorDetails>> executorsPerWorkerList = new ArrayList<>();

    for (ExecutorDetails exec : executors) {
      executorsStrings.add(exec.toString());
    }
    String info = String.format("executorsPerWorkerList: available executors for %s: %s",
                                topologyId, StringUtils.join(executorsStrings, ", "));
    log.info(info);

    for (int i = 0; i < slotsToUse; i++) {
      executorsPerWorkerList.add(new ArrayList<ExecutorDetails>());
    }

    List<ExecutorDetails> executorList = new ArrayList<>(executors);

    /* The goal of this scheduler is to mimic Storm's default version. Storm's default scheduler sorts the
     * executors by their id before spreading them across the available workers.
     */
    Collections.sort(executorList, new Comparator<ExecutorDetails>() {
      public int compare(ExecutorDetails e1, ExecutorDetails e2) {
        return e1.getStartTask() - e2.getStartTask();
      }
    });

    int index = -1;
    for (ExecutorDetails executorDetails : executorList) {
      index = ++index % slotsToUse;
      // log.info("executorsPerWorkerList -- adding {} to list at index {}", executorDetails.toString(), index);
      executorsPerWorkerList.get(index).add(executorDetails);
    }

    return executorsPerWorkerList;
  }

  /**
   * Schedule function looks in the "mesosWorkerSlotMap" to determine which topology owns the particular
   * WorkerSlot and assigns the executors accordingly.
   */
  @Override
  public void schedule(Topologies topologies, Cluster cluster) {
    List<WorkerSlot> workerSlots = cluster.getAvailableSlots();

    String info = "";
    if (!workerSlots.isEmpty()) {
      info = "Scheduling the following worker slots from cluster.getAvailableSlots: ";
      List<String> workerSlotsStrings = new ArrayList<String>();
      for (WorkerSlot ws : workerSlots) {
        workerSlotsStrings.add(ws.toString());
      }
      info += String.format("[%s]", StringUtils.join(workerSlotsStrings, ", "));
      log.info(info);
    }

    Map<String, List<MesosWorkerSlot>> perTopologySlotList = getMesosWorkerSlotPerTopology(workerSlots);
    if (perTopologySlotList.isEmpty()) {
      return;
    }

    info = "Schedule the per-topology slots:";
    for (String topo : perTopologySlotList.keySet()) {
      List<String> mwsAssignments = new ArrayList<>();
      for (MesosWorkerSlot mws : perTopologySlotList.get(topo)) {
        mwsAssignments.add(mws.getNodeId() + ":" + mws.getPort());
      }
      info += String.format(" {%s, [%s]}", topo, StringUtils.join(mwsAssignments, ", "));
    }
    log.info(info);

    // So far we know how many MesosWorkerSlots each of the topologies has got.
    // Let's assign executors for each of them.
    for (String topologyId : perTopologySlotList.keySet()) {
      TopologyDetails topologyDetails = topologies.getById(topologyId);
      List<MesosWorkerSlot> mesosWorkerSlots = perTopologySlotList.get(topologyId);

      int slotsRequested = topologyDetails.getNumWorkers();
      int slotsAssigned = cluster.getAssignedNumWorkers(topologyDetails);
      int slotsAvailable = mesosWorkerSlots.size();

      if (slotsAvailable == 0) {
        log.warn("No slots found for topology {} while scheduling", topologyId);
        continue;
      }

      log.info("topologyId: {}, slotsRequested: {}, slotsAssigned: {}, slotsAvailable: {}",
               topologyId, slotsRequested, slotsAssigned, slotsAvailable);

      List<List<ExecutorDetails>> executorsPerWorkerList = executorsPerWorkerList(cluster, topologyDetails, slotsRequested, slotsAssigned, slotsAvailable);
      if (executorsPerWorkerList == null || executorsPerWorkerList.isEmpty()) {
        continue;
      }

      info = "schedule: Cluster assignment for " + topologyId + "."
             + " Requesting " + slotsRequested + " slots, with " + slotsAvailable + " slots available, and "
             + slotsAssigned + " currently assigned."
             + " Setting new assignment (node:port, executorsPerWorkerList) as: ";

      List<String> slotAssignmentStrings = new ArrayList<String>();
      ListIterator<List<ExecutorDetails>> iterator = executorsPerWorkerList.listIterator();
      while (iterator.hasNext()) {
        List<ExecutorDetails> executorsPerWorker = iterator.next();
        slotAssignmentStrings.add("(" + mesosWorkerSlots.get(0).getNodeId() + ":" + mesosWorkerSlots.get(0).getPort() + ", " + executorsPerWorker.toString() + ")");
        iterator.remove();
        cluster.assign(mesosWorkerSlots.remove(0), topologyId, executorsPerWorker);
      }

      if (slotsAvailable == 0) {
        info += "[]";
      } else {
        info += StringUtils.join(slotAssignmentStrings, ", ");
      }
      log.info(info);
    }
    mesosWorkerSlotMap.clear();
  }
}
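To see the round-robin spreading used by executorsPerWorkerList() in isolation, here is a minimal, self-contained sketch. It is illustrative only: the RoundRobinSpreadExample class and the plain integer start-task ids are stand-ins for Storm's ExecutorDetails and are not part of the scheduler above.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Hypothetical, standalone example (not part of storm-mesos): distributes sorted
// "executors" (represented here by their integer start-task ids) round-robin across
// slotsToUse worker slots, mirroring the index = ++index % slotsToUse loop above.
public class RoundRobinSpreadExample {

  static List<List<Integer>> spread(List<Integer> executorStartTasks, int slotsToUse) {
    List<List<Integer>> executorsPerWorker = new ArrayList<>();
    for (int i = 0; i < slotsToUse; i++) {
      executorsPerWorker.add(new ArrayList<Integer>());
    }

    // Storm's default scheduler sorts executors by id before spreading them; do the same here.
    List<Integer> sorted = new ArrayList<>(executorStartTasks);
    Collections.sort(sorted);

    int index = -1;
    for (Integer startTask : sorted) {
      index = ++index % slotsToUse;  // same round-robin index update as the scheduler uses
      executorsPerWorker.get(index).add(startTask);
    }
    return executorsPerWorker;
  }

  public static void main(String[] args) {
    // 7 "executors" onto 3 slots
    List<Integer> executors = Arrays.asList(5, 1, 3, 7, 2, 6, 4);
    System.out.println(spread(executors, 3));
  }
}

Running main prints [[1, 4, 7], [2, 5], [3, 6]]: when the executor count is not a multiple of slotsToUse, the earlier slots each receive one extra executor.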