Java tutorial: Apache Solr's cluster leader, org.apache.solr.cloud.Overseer
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.cloud;

import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.cloud.overseer.ClusterStateMutator;
import org.apache.solr.cloud.overseer.CollectionMutator;
import org.apache.solr.cloud.overseer.NodeMutator;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.cloud.overseer.ReplicaMutator;
import org.apache.solr.cloud.overseer.SliceMutator;
import org.apache.solr.cloud.overseer.ZkStateWriter;
import org.apache.solr.cloud.overseer.ZkWriteCommand;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.handler.admin.CollectionsHandler;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.solr.util.stats.Clock;
import org.apache.solr.util.stats.Timer;
import org.apache.solr.util.stats.TimerContext;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.solr.cloud.OverseerCollectionMessageHandler.ONLY_ACTIVE_NODES;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.SHARD_UNIQUE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.BALANCESHARDUNIQUE;

/**
 * Cluster leader. Responsible for processing state updates, node assignments, creating/deleting
 * collections, shards, replicas and setting various properties.
*/ public class Overseer implements Closeable { public static final String QUEUE_OPERATION = "operation"; public static final int STATE_UPDATE_DELAY = 1500; // delay between cloud state updates public static final int NUM_RESPONSES_TO_STORE = 10000; private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); enum LeaderStatus { DONT_KNOW, NO, YES } private class ClusterStateUpdater implements Runnable, Closeable { private final ZkStateReader reader; private final SolrZkClient zkClient; private final String myId; //queue where everybody can throw tasks private final DistributedQueue stateUpdateQueue; //Internal queue where overseer stores events that have not yet been published into cloudstate //If Overseer dies while extracting the main queue a new overseer will start from this queue private final DistributedQueue workQueue; // Internal map which holds the information about running tasks. private final DistributedMap runningMap; // Internal map which holds the information about successfully completed tasks. private final DistributedMap completedMap; // Internal map which holds the information about failed tasks. private final DistributedMap failureMap; private final Stats zkStats; private boolean isClosed = false; public ClusterStateUpdater(final ZkStateReader reader, final String myId, Stats zkStats) { this.zkClient = reader.getZkClient(); this.zkStats = zkStats; this.stateUpdateQueue = getStateUpdateQueue(zkClient, zkStats); this.workQueue = getInternalWorkQueue(zkClient, zkStats); this.failureMap = getFailureMap(zkClient); this.runningMap = getRunningMap(zkClient); this.completedMap = getCompletedMap(zkClient); this.myId = myId; this.reader = reader; } public Stats getStateUpdateQueueStats() { return stateUpdateQueue.getStats(); } public Stats getWorkQueueStats() { return workQueue.getStats(); } @Override public void run() { LeaderStatus isLeader = amILeader(); while (isLeader == LeaderStatus.DONT_KNOW) { log.debug("am_i_leader unclear {}", isLeader); isLeader = amILeader(); // not a no, not a yes, try ask again } log.info("Starting to work on the main queue"); try { ZkStateWriter zkStateWriter = null; ClusterState clusterState = null; boolean refreshClusterState = true; // let's refresh in the first iteration while (!this.isClosed) { isLeader = amILeader(); if (LeaderStatus.NO == isLeader) { break; } else if (LeaderStatus.YES != isLeader) { log.debug("am_i_leader unclear {}", isLeader); continue; // not a no, not a yes, try ask again } //TODO consider removing 'refreshClusterState' and simply check if clusterState is null if (refreshClusterState) { try { reader.updateClusterState(); clusterState = reader.getClusterState(); zkStateWriter = new ZkStateWriter(reader, stats); refreshClusterState = false; // if there were any errors while processing // the state queue, items would have been left in the // work queue so let's process those first byte[] data = workQueue.peek(); boolean hadWorkItems = data != null; while (data != null) { final ZkNodeProps message = ZkNodeProps.load(data); log.info("processMessage: workQueueSize: {}, message = {}", workQueue.getStats().getQueueLength(), message); // force flush to ZK after each message because there is no fallback if workQueue items // are removed from workQueue but fail to be written to ZK clusterState = processQueueItem(message, clusterState, zkStateWriter, false, null); workQueue.poll(); // poll-ing removes the element we got by peek-ing data = workQueue.peek(); } // force flush at the end of the loop if 
(hadWorkItems) { clusterState = zkStateWriter.writePendingUpdates(); } } catch (KeeperException e) { if (e.code() == KeeperException.Code.SESSIONEXPIRED) { log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e); return; } log.error("Exception in Overseer work queue loop", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); return; } catch (Exception e) { log.error("Exception in Overseer work queue loop", e); } } byte[] head = null; try { head = stateUpdateQueue.peek(true); } catch (KeeperException e) { if (e.code() == KeeperException.Code.SESSIONEXPIRED) { log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e); return; } log.error("Exception in Overseer main queue loop", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); return; } catch (Exception e) { log.error("Exception in Overseer main queue loop", e); } try { while (head != null) { byte[] data = head; final ZkNodeProps message = ZkNodeProps.load(data); log.info("processMessage: queueSize: {}, message = {} current state version: {}", stateUpdateQueue.getStats().getQueueLength(), message, clusterState.getZkClusterStateVersion()); // we can batch here because workQueue is our fallback in case a ZK write failed clusterState = processQueueItem(message, clusterState, zkStateWriter, true, new ZkStateWriter.ZkWriteCallback() { @Override public void onEnqueue() throws Exception { workQueue.offer(data); } @Override public void onWrite() throws Exception { // remove everything from workQueue while (workQueue.poll() != null) ; } }); // it is safer to keep this poll here because an invalid message might never be queued // and therefore we can't rely on the ZkWriteCallback to remove the item stateUpdateQueue.poll(); if (isClosed) break; // if an event comes in the next 100ms batch it together head = stateUpdateQueue.peek(100); } // we should force write all pending updates because the next iteration might sleep until there // are more items in the main queue clusterState = zkStateWriter.writePendingUpdates(); // clean work queue while (workQueue.poll() != null) ; } catch (KeeperException.BadVersionException bve) { log.warn( "Bad version writing to ZK using compare-and-set, will force refresh cluster state", bve); refreshClusterState = true; } catch (KeeperException e) { if (e.code() == KeeperException.Code.SESSIONEXPIRED) { log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e); return; } log.error("Exception in Overseer main queue loop", e); refreshClusterState = true; // force refresh state in case of all errors } catch (InterruptedException e) { Thread.currentThread().interrupt(); return; } catch (Exception e) { log.error("Exception in Overseer main queue loop", e); refreshClusterState = true; // it might have been a bad version error } } } finally { log.info("Overseer Loop exiting : {}", LeaderElector.getNodeName(myId)); new Thread("OverseerExitThread") { //do this in a separate thread because any wait is interrupted in this main thread @Override public void run() { checkIfIamStillLeader(); } }.start(); } } private ClusterState processQueueItem(ZkNodeProps message, ClusterState clusterState, ZkStateWriter zkStateWriter, boolean enableBatching, ZkStateWriter.ZkWriteCallback callback) throws Exception { final String operation = message.getStr(QUEUE_OPERATION); List<ZkWriteCommand> zkWriteCommands = null; final TimerContext timerContext = stats.time(operation); try { zkWriteCommands = processMessage(clusterState, message, operation); 
stats.success(operation); } catch (Exception e) { // generally there is nothing we can do - in most cases, we have // an issue that will fail again on retry or we cannot communicate with a // ZooKeeper in which case another Overseer should take over // TODO: if ordering for the message is not important, we could // track retries and put it back on the end of the queue log.error( "Overseer could not process the current clusterstate state update message, skipping the message.", e); stats.error(operation); } finally { timerContext.stop(); } if (zkWriteCommands != null) { for (ZkWriteCommand zkWriteCommand : zkWriteCommands) { clusterState = zkStateWriter.enqueueUpdate(clusterState, zkWriteCommand, callback); } if (!enableBatching) { clusterState = zkStateWriter.writePendingUpdates(); } } return clusterState; } private void checkIfIamStillLeader() { if (zkController != null && zkController.getCoreContainer().isShutDown()) return;//shutting down no need to go further org.apache.zookeeper.data.Stat stat = new org.apache.zookeeper.data.Stat(); String path = OverseerElectionContext.OVERSEER_ELECT + "/leader"; byte[] data; try { data = zkClient.getData(path, null, stat, true); } catch (Exception e) { log.error("could not read the data", e); return; } try { Map m = (Map) Utils.fromJSON(data); String id = (String) m.get("id"); if (overseerCollectionConfigSetProcessor.getId().equals(id)) { try { log.info("I'm exiting , but I'm still the leader"); zkClient.delete(path, stat.getVersion(), true); } catch (KeeperException.BadVersionException e) { //no problem ignore it some other Overseer has already taken over } catch (Exception e) { log.error("Could not delete my leader node ", e); } } else { log.info("somebody else has already taken up the overseer position"); } } finally { //if I am not shutting down, Then I need to rejoin election try { if (zkController != null && !zkController.getCoreContainer().isShutDown()) { zkController.rejoinOverseerElection(null, false); } } catch (Exception e) { log.warn("Unable to rejoinElection ", e); } } } private List<ZkWriteCommand> processMessage(ClusterState clusterState, final ZkNodeProps message, final String operation) { CollectionParams.CollectionAction collectionAction = CollectionParams.CollectionAction.get(operation); if (collectionAction != null) { switch (collectionAction) { case CREATE: return Collections.singletonList( new ClusterStateMutator(getZkStateReader()).createCollection(clusterState, message)); case DELETE: return Collections.singletonList( new ClusterStateMutator(getZkStateReader()).deleteCollection(clusterState, message)); case CREATESHARD: return Collections.singletonList( new CollectionMutator(getZkStateReader()).createShard(clusterState, message)); case DELETESHARD: return Collections.singletonList( new CollectionMutator(getZkStateReader()).deleteShard(clusterState, message)); case ADDREPLICA: return Collections .singletonList(new SliceMutator(getZkStateReader()).addReplica(clusterState, message)); case ADDREPLICAPROP: return Collections.singletonList( new ReplicaMutator(getZkStateReader()).addReplicaProperty(clusterState, message)); case DELETEREPLICAPROP: return Collections.singletonList( new ReplicaMutator(getZkStateReader()).deleteReplicaProperty(clusterState, message)); case BALANCESHARDUNIQUE: ExclusiveSliceProperty dProp = new ExclusiveSliceProperty(clusterState, message); if (dProp.balanceProperty()) { String collName = message.getStr(ZkStateReader.COLLECTION_PROP); return Collections.singletonList(new ZkWriteCommand(collName, 
dProp.getDocCollection())); } break; case MODIFYCOLLECTION: CollectionsHandler.verifyRuleParams(zkController.getCoreContainer(), message.getProperties()); return Collections .singletonList(new CollectionMutator(reader).modifyCollection(clusterState, message)); case MIGRATESTATEFORMAT: return Collections.singletonList( new ClusterStateMutator(reader).migrateStateFormat(clusterState, message)); default: throw new RuntimeException( "unknown operation:" + operation + " contents:" + message.getProperties()); } } else { OverseerAction overseerAction = OverseerAction.get(operation); if (overseerAction == null) { throw new RuntimeException( "unknown operation:" + operation + " contents:" + message.getProperties()); } switch (overseerAction) { case STATE: return Collections .singletonList(new ReplicaMutator(getZkStateReader()).setState(clusterState, message)); case LEADER: return Collections.singletonList( new SliceMutator(getZkStateReader()).setShardLeader(clusterState, message)); case DELETECORE: return Collections.singletonList( new SliceMutator(getZkStateReader()).removeReplica(clusterState, message)); case ADDROUTINGRULE: return Collections.singletonList( new SliceMutator(getZkStateReader()).addRoutingRule(clusterState, message)); case REMOVEROUTINGRULE: return Collections.singletonList( new SliceMutator(getZkStateReader()).removeRoutingRule(clusterState, message)); case UPDATESHARDSTATE: return Collections.singletonList( new SliceMutator(getZkStateReader()).updateShardState(clusterState, message)); case QUIT: if (myId.equals(message.get("id"))) { log.info("Quit command received {}", LeaderElector.getNodeName(myId)); overseerCollectionConfigSetProcessor.close(); close(); } else { log.warn("Overseer received wrong QUIT message {}", message); } break; case DOWNNODE: return new NodeMutator(getZkStateReader()).downNode(clusterState, message); default: throw new RuntimeException( "unknown operation:" + operation + " contents:" + message.getProperties()); } } return Collections.singletonList(ZkStateWriter.NO_OP); } private LeaderStatus amILeader() { TimerContext timerContext = stats.time("am_i_leader"); boolean success = true; try { ZkNodeProps props = ZkNodeProps.load( zkClient.getData(OverseerElectionContext.OVERSEER_ELECT + "/leader", null, null, true)); if (myId.equals(props.getStr("id"))) { return LeaderStatus.YES; } } catch (KeeperException e) { success = false; if (e.code() == KeeperException.Code.CONNECTIONLOSS) { log.error("", e); return LeaderStatus.DONT_KNOW; } else if (e.code() == KeeperException.Code.SESSIONEXPIRED) { log.info("", e); } else { log.warn("", e); } } catch (InterruptedException e) { success = false; Thread.currentThread().interrupt(); } finally { timerContext.stop(); if (success) { stats.success("am_i_leader"); } else { stats.error("am_i_leader"); } } log.info("According to ZK I (id=" + myId + ") am no longer a leader."); return LeaderStatus.NO; } @Override public void close() { this.isClosed = true; } } // Class to encapsulate processing replica properties that have at most one replica hosting a property per slice. private class ExclusiveSliceProperty { private ClusterState clusterState; private final boolean onlyActiveNodes; private final String property; private final DocCollection collection; private final String collectionName; // Key structure. For each node, list all replicas on it regardless of whether they have the property or not. private final Map<String, List<SliceReplica>> nodesHostingReplicas = new HashMap<>(); // Key structure. 
For each node, a list of the replicas _currently_ hosting the property. private final Map<String, List<SliceReplica>> nodesHostingProp = new HashMap<>(); Set<String> shardsNeedingHosts = new HashSet<String>(); Map<String, Slice> changedSlices = new HashMap<>(); // Work on copies rather than the underlying cluster state. private int origMaxPropPerNode = 0; private int origModulo = 0; private int tmpMaxPropPerNode = 0; private int tmpModulo = 0; Random rand = new Random(); private int assigned = 0; ExclusiveSliceProperty(ClusterState clusterState, ZkNodeProps message) { this.clusterState = clusterState; String tmp = message.getStr(ZkStateReader.PROPERTY_PROP); if (StringUtils.startsWith(tmp, OverseerCollectionMessageHandler.COLL_PROP_PREFIX) == false) { tmp = OverseerCollectionMessageHandler.COLL_PROP_PREFIX + tmp; } this.property = tmp.toLowerCase(Locale.ROOT); collectionName = message.getStr(ZkStateReader.COLLECTION_PROP); if (StringUtils.isBlank(collectionName) || StringUtils.isBlank(property)) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Overseer '" + message.getStr(QUEUE_OPERATION) + "' requires both the '" + ZkStateReader.COLLECTION_PROP + "' and '" + ZkStateReader.PROPERTY_PROP + "' parameters. No action taken "); } Boolean shardUnique = Boolean.parseBoolean(message.getStr(SHARD_UNIQUE)); if (shardUnique == false && SliceMutator.SLICE_UNIQUE_BOOLEAN_PROPERTIES.contains(this.property) == false) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Balancing properties amongst replicas in a slice requires that" + " the property be a pre-defined property (e.g. 'preferredLeader') or that 'shardUnique' be set to 'true' " + " Property: " + this.property + " shardUnique: " + Boolean.toString(shardUnique)); } collection = clusterState.getCollection(collectionName); if (collection == null) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Could not find collection ' " + collectionName + "' for overseer operation '" + message.getStr(QUEUE_OPERATION) + "'. No action taken."); } onlyActiveNodes = Boolean.parseBoolean(message.getStr(ONLY_ACTIVE_NODES, "true")); } private DocCollection getDocCollection() { return collection; } private boolean isActive(Replica replica) { return replica.getState() == Replica.State.ACTIVE; } // Collect a list of all the nodes that _can_ host the indicated property. Along the way, also collect any of // the replicas on that node that _already_ host the property as well as any slices that do _not_ have the // property hosted. // // Return true if anything node needs it's property reassigned. False if the property is already balanced for // the collection. private boolean collectCurrentPropStats() { int maxAssigned = 0; // Get a list of potential replicas that can host the property _and_ their counts // Move any obvious entries to a list of replicas to change the property on Set<String> allHosts = new HashSet<>(); for (Slice slice : collection.getSlices()) { boolean sliceHasProp = false; for (Replica replica : slice.getReplicas()) { if (onlyActiveNodes && isActive(replica) == false) { if (StringUtils.isNotBlank(replica.getStr(property))) { removeProp(slice, replica.getName()); // Note, we won't be committing this to ZK until later. 
} continue; } allHosts.add(replica.getNodeName()); String nodeName = replica.getNodeName(); if (StringUtils.isNotBlank(replica.getStr(property))) { if (sliceHasProp) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "'" + BALANCESHARDUNIQUE + "' should only be called for properties that have at most one member " + "in any slice with the property set. No action taken."); } if (nodesHostingProp.containsKey(nodeName) == false) { nodesHostingProp.put(nodeName, new ArrayList<SliceReplica>()); } nodesHostingProp.get(nodeName).add(new SliceReplica(slice, replica)); ++assigned; maxAssigned = Math.max(maxAssigned, nodesHostingProp.get(nodeName).size()); sliceHasProp = true; } if (nodesHostingReplicas.containsKey(nodeName) == false) { nodesHostingReplicas.put(nodeName, new ArrayList<SliceReplica>()); } nodesHostingReplicas.get(nodeName).add(new SliceReplica(slice, replica)); } } // If the total number of already-hosted properties assigned to nodes // that have potential to host leaders is equal to the slice count _AND_ none of the current nodes has more than // the max number of properties, there's nothing to do. origMaxPropPerNode = collection.getSlices().size() / allHosts.size(); // Some nodes can have one more of the proeprty if the numbers aren't exactly even. origModulo = collection.getSlices().size() % allHosts.size(); if (origModulo > 0) { origMaxPropPerNode++; // have to have some nodes with 1 more property. } // We can say for sure that we need to rebalance if we don't have as many assigned properties as slices. if (assigned != collection.getSlices().size()) { return true; } // Make sure there are no more slices at the limit than the "leftovers" // Let's say there's 7 slices and 3 nodes. We need to distribute the property as 3 on node1, 2 on node2 and 2 on node3 // (3, 2, 2) We need to be careful to not distribute them as 3, 3, 1. that's what this check is all about. int counter = origModulo; for (List<SliceReplica> list : nodesHostingProp.values()) { if (list.size() == origMaxPropPerNode) --counter; } if (counter == 0) return false; // nodes with 1 extra leader are exactly the needed number return true; } private void removeSliceAlreadyHostedFromPossibles(String sliceName) { for (Map.Entry<String, List<SliceReplica>> entReplica : nodesHostingReplicas.entrySet()) { ListIterator<SliceReplica> iter = entReplica.getValue().listIterator(); while (iter.hasNext()) { SliceReplica sr = iter.next(); if (sr.slice.getName().equals(sliceName)) iter.remove(); } } } private void balanceUnassignedReplicas() { tmpMaxPropPerNode = origMaxPropPerNode; // A bit clumsy, but don't want to duplicate code. tmpModulo = origModulo; // Get the nodeName and shardName for the node that has the least room for this while (shardsNeedingHosts.size() > 0) { String nodeName = ""; int minSize = Integer.MAX_VALUE; SliceReplica srToChange = null; for (String slice : shardsNeedingHosts) { for (Map.Entry<String, List<SliceReplica>> ent : nodesHostingReplicas.entrySet()) { // A little tricky. If we don't set this to something below, then it means all possible places to // put this property are full up, so just put it somewhere. 
if (srToChange == null && ent.getValue().size() > 0) { srToChange = ent.getValue().get(0); } ListIterator<SliceReplica> iter = ent.getValue().listIterator(); while (iter.hasNext()) { SliceReplica sr = iter.next(); if (StringUtils.equals(slice, sr.slice.getName()) == false) { continue; } if (nodesHostingProp.containsKey(ent.getKey()) == false) { nodesHostingProp.put(ent.getKey(), new ArrayList<SliceReplica>()); } if (minSize > nodesHostingReplicas.get(ent.getKey()).size() && nodesHostingProp.get(ent.getKey()).size() < tmpMaxPropPerNode) { minSize = nodesHostingReplicas.get(ent.getKey()).size(); srToChange = sr; nodeName = ent.getKey(); } } } } // Now, you have a slice and node to put it on shardsNeedingHosts.remove(srToChange.slice.getName()); if (nodesHostingProp.containsKey(nodeName) == false) { nodesHostingProp.put(nodeName, new ArrayList<SliceReplica>()); } nodesHostingProp.get(nodeName).add(srToChange); adjustLimits(nodesHostingProp.get(nodeName)); removeSliceAlreadyHostedFromPossibles(srToChange.slice.getName()); addProp(srToChange.slice, srToChange.replica.getName()); } } // Adjust the min/max counts per allowed per node. Special handling here for dealing with the fact // that no node should have more than 1 more replica with this property than any other. private void adjustLimits(List<SliceReplica> changeList) { if (changeList.size() == tmpMaxPropPerNode) { if (tmpModulo < 0) return; --tmpModulo; if (tmpModulo == 0) { --tmpMaxPropPerNode; --tmpModulo; // Prevent dropping tmpMaxPropPerNode again. } } } // Go through the list of presently-hosted properties and remove any that have too many replicas that host the property private void removeOverallocatedReplicas() { tmpMaxPropPerNode = origMaxPropPerNode; // A bit clumsy, but don't want to duplicate code. tmpModulo = origModulo; for (Map.Entry<String, List<SliceReplica>> ent : nodesHostingProp.entrySet()) { while (ent.getValue().size() > tmpMaxPropPerNode) { // remove delta nodes ent.getValue().remove(rand.nextInt(ent.getValue().size())); } adjustLimits(ent.getValue()); } } private void removeProp(Slice origSlice, String replicaName) { getReplicaFromChanged(origSlice, replicaName).getProperties().remove(property); } private void addProp(Slice origSlice, String replicaName) { getReplicaFromChanged(origSlice, replicaName).getProperties().put(property, "true"); } // Just a place to encapsulate the fact that we need to have new slices (copy) to update before we // put this all in the cluster state. private Replica getReplicaFromChanged(Slice origSlice, String replicaName) { Slice newSlice = changedSlices.get(origSlice.getName()); Replica replica; if (newSlice != null) { replica = newSlice.getReplica(replicaName); } else { newSlice = new Slice(origSlice.getName(), origSlice.getReplicasCopy(), origSlice.shallowCopy()); changedSlices.put(origSlice.getName(), newSlice); replica = newSlice.getReplica(replicaName); } if (replica == null) { throw new SolrException(SolrException.ErrorCode.INVALID_STATE, "Should have been able to find replica '" + replicaName + "' in slice '" + origSlice.getName() + "'. No action taken"); } return replica; } // Main entry point for carrying out the action. Returns "true" if we have actually moved properties around. private boolean balanceProperty() { if (collectCurrentPropStats() == false) { return false; } // we have two lists based on nodeName // 1> all the nodes that _could_ host a property for the slice // 2> all the nodes that _currently_ host a property for the slice. 
// So, remove a replica from the nodes that have too many removeOverallocatedReplicas(); // prune replicas belonging to a slice that have the property currently assigned from the list of replicas // that could host the property. for (Map.Entry<String, List<SliceReplica>> entProp : nodesHostingProp.entrySet()) { for (SliceReplica srHosting : entProp.getValue()) { removeSliceAlreadyHostedFromPossibles(srHosting.slice.getName()); } } // Assemble the list of slices that do not have any replica hosting the property: for (Map.Entry<String, List<SliceReplica>> ent : nodesHostingReplicas.entrySet()) { ListIterator<SliceReplica> iter = ent.getValue().listIterator(); while (iter.hasNext()) { SliceReplica sr = iter.next(); shardsNeedingHosts.add(sr.slice.getName()); } } // At this point, nodesHostingProp should contain _only_ lists of replicas that belong to slices that do _not_ // have any replica hosting the property. So let's assign them. balanceUnassignedReplicas(); for (Slice newSlice : changedSlices.values()) { DocCollection docCollection = CollectionMutator.updateSlice(collectionName, clusterState.getCollection(collectionName), newSlice); clusterState = ClusterStateMutator.newState(clusterState, collectionName, docCollection); } return true; } } private class SliceReplica { private Slice slice; private Replica replica; SliceReplica(Slice slice, Replica replica) { this.slice = slice; this.replica = replica; } } class OverseerThread extends Thread implements Closeable { protected volatile boolean isClosed; private Closeable thread; public OverseerThread(ThreadGroup tg, Closeable thread) { super(tg, (Runnable) thread); this.thread = thread; } public OverseerThread(ThreadGroup ccTg, Closeable thread, String name) { super(ccTg, (Runnable) thread, name); this.thread = thread; } @Override public void close() throws IOException { thread.close(); this.isClosed = true; } public boolean isClosed() { return this.isClosed; } } private OverseerThread ccThread; private OverseerThread updaterThread; private OverseerThread arfoThread; private final ZkStateReader reader; private final ShardHandler shardHandler; private final UpdateShardHandler updateShardHandler; private final String adminPath; private OverseerCollectionConfigSetProcessor overseerCollectionConfigSetProcessor; private ZkController zkController; private Stats stats; private String id; private boolean closed; private CloudConfig config; // overseer not responsible for closing reader public Overseer(ShardHandler shardHandler, UpdateShardHandler updateShardHandler, String adminPath, final ZkStateReader reader, ZkController zkController, CloudConfig config) throws KeeperException, InterruptedException { this.reader = reader; this.shardHandler = shardHandler; this.updateShardHandler = updateShardHandler; this.adminPath = adminPath; this.zkController = zkController; this.stats = new Stats(); this.config = config; } public synchronized void start(String id) { this.id = id; closed = false; doClose(); stats = new Stats(); log.info("Overseer (id=" + id + ") starting"); createOverseerNode(reader.getZkClient()); //launch cluster state updater thread ThreadGroup tg = new ThreadGroup("Overseer state updater."); updaterThread = new OverseerThread(tg, new ClusterStateUpdater(reader, id, stats), "OverseerStateUpdate-" + id); updaterThread.setDaemon(true); ThreadGroup ccTg = new ThreadGroup("Overseer collection creation process."); OverseerNodePrioritizer overseerPrioritizer = new OverseerNodePrioritizer(reader, adminPath, shardHandler.getShardHandlerFactory()); 
overseerCollectionConfigSetProcessor = new OverseerCollectionConfigSetProcessor(reader, id, shardHandler, adminPath, stats, Overseer.this, overseerPrioritizer); ccThread = new OverseerThread(ccTg, overseerCollectionConfigSetProcessor, "OverseerCollectionConfigSetProcessor-" + id); ccThread.setDaemon(true); ThreadGroup ohcfTg = new ThreadGroup("Overseer Hdfs SolrCore Failover Thread."); OverseerAutoReplicaFailoverThread autoReplicaFailoverThread = new OverseerAutoReplicaFailoverThread(config, reader, updateShardHandler); arfoThread = new OverseerThread(ohcfTg, autoReplicaFailoverThread, "OverseerHdfsCoreFailoverThread-" + id); arfoThread.setDaemon(true); updaterThread.start(); ccThread.start(); arfoThread.start(); } public Stats getStats() { return stats; } ZkController getZkController() { return zkController; } /** * For tests. * * @lucene.internal * @return state updater thread */ public synchronized OverseerThread getUpdaterThread() { return updaterThread; } public synchronized void close() { if (closed) return; log.info("Overseer (id=" + id + ") closing"); doClose(); this.closed = true; } private void doClose() { if (updaterThread != null) { IOUtils.closeQuietly(updaterThread); updaterThread.interrupt(); } if (ccThread != null) { IOUtils.closeQuietly(ccThread); ccThread.interrupt(); } if (arfoThread != null) { IOUtils.closeQuietly(arfoThread); arfoThread.interrupt(); } updaterThread = null; ccThread = null; arfoThread = null; } /** * Get queue that can be used to send messages to Overseer. * <p> * Any and all modifications to the cluster state must be sent to * the overseer via this queue. The complete list of overseer actions * supported by this queue are documented inside the {@link OverseerAction} enum. * <p> * Performance statistics on the returned queue * are <em>not</em> tracked by the Overseer Stats API, * see {@link org.apache.solr.common.params.CollectionParams.CollectionAction#OVERSEERSTATUS}. * Therefore, this method should be used only by clients for writing to the overseer queue. * <p> * This method will create the /overseer znode in ZooKeeper if it does not exist already. * * @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue * @return a {@link DistributedQueue} object */ public static DistributedQueue getStateUpdateQueue(final SolrZkClient zkClient) { return getStateUpdateQueue(zkClient, new Stats()); } /** * The overseer uses the returned queue to read any operations submitted by clients. * This method should not be used directly by anyone other than the Overseer itself. * This method will create the /overseer znode in ZooKeeper if it does not exist already. * * @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue * @param zkStats a {@link Overseer.Stats} object which tracks statistics for all zookeeper operations performed by this queue * @return a {@link DistributedQueue} object */ static DistributedQueue getStateUpdateQueue(final SolrZkClient zkClient, Stats zkStats) { createOverseerNode(zkClient); return new DistributedQueue(zkClient, "/overseer/queue", zkStats); } /** * Internal overseer work queue. This should not be used outside of Overseer. * <p> * This queue is used to store overseer operations that have been removed from the * state update queue but are being executed as part of a batch. Once * the result of the batch is persisted to zookeeper, these items are removed from the * work queue. 
If the overseer dies while processing a batch then a new overseer always * operates from the work queue first and only then starts processing operations from the * state update queue. * This method will create the /overseer znode in ZooKeeper if it does not exist already. * * @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue * @param zkStats a {@link Overseer.Stats} object which tracks statistics for all zookeeper operations performed by this queue * @return a {@link DistributedQueue} object */ static DistributedQueue getInternalWorkQueue(final SolrZkClient zkClient, Stats zkStats) { createOverseerNode(zkClient); return new DistributedQueue(zkClient, "/overseer/queue-work", zkStats); } /* Internal map for failed tasks, not to be used outside of the Overseer */ static DistributedMap getRunningMap(final SolrZkClient zkClient) { createOverseerNode(zkClient); return new DistributedMap(zkClient, "/overseer/collection-map-running"); } /* Size-limited map for successfully completed tasks*/ static DistributedMap getCompletedMap(final SolrZkClient zkClient) { createOverseerNode(zkClient); return new SizeLimitedDistributedMap(zkClient, "/overseer/collection-map-completed", NUM_RESPONSES_TO_STORE); } /* Map for failed tasks, not to be used outside of the Overseer */ static DistributedMap getFailureMap(final SolrZkClient zkClient) { createOverseerNode(zkClient); return new SizeLimitedDistributedMap(zkClient, "/overseer/collection-map-failure", NUM_RESPONSES_TO_STORE); } /** * Get queue that can be used to submit collection API tasks to the Overseer. * <p> * This queue is used internally by the {@link CollectionsHandler} to submit collection API * tasks which are executed by the {@link OverseerCollectionMessageHandler}. The actions supported * by this queue are listed in the {@link org.apache.solr.common.params.CollectionParams.CollectionAction} * enum. * <p> * Performance statistics on the returned queue * are <em>not</em> tracked by the Overseer Stats API, * see {@link org.apache.solr.common.params.CollectionParams.CollectionAction#OVERSEERSTATUS}. * * @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue * @return a {@link DistributedQueue} object */ static OverseerTaskQueue getCollectionQueue(final SolrZkClient zkClient) { return getCollectionQueue(zkClient, new Stats()); } /** * Get queue that can be used to read collection API tasks to the Overseer. * <p> * This queue is used internally by the {@link OverseerCollectionMessageHandler} to read collection API * tasks submitted by the {@link CollectionsHandler}. The actions supported * by this queue are listed in the {@link org.apache.solr.common.params.CollectionParams.CollectionAction} * enum. * <p> * Performance statistics on the returned queue are tracked by the Overseer Stats API, * see {@link org.apache.solr.common.params.CollectionParams.CollectionAction#OVERSEERSTATUS}. * * @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue * @return a {@link DistributedQueue} object */ static OverseerTaskQueue getCollectionQueue(final SolrZkClient zkClient, Stats zkStats) { createOverseerNode(zkClient); return new OverseerTaskQueue(zkClient, "/overseer/collection-queue-work", zkStats); } /** * Get queue that can be used to submit configset API tasks to the Overseer. * <p> * This queue is used internally by the {@link org.apache.solr.handler.admin.ConfigSetsHandler} to submit * tasks which are executed by the {@link OverseerConfigSetMessageHandler}. 
The actions supported * by this queue are listed in the {@link org.apache.solr.common.params.ConfigSetParams.ConfigSetAction} * enum. * <p> * Performance statistics on the returned queue * are <em>not</em> tracked by the Overseer Stats API, * see {@link org.apache.solr.common.params.CollectionParams.CollectionAction#OVERSEERSTATUS}. * * @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue * @return a {@link DistributedQueue} object */ static OverseerTaskQueue getConfigSetQueue(final SolrZkClient zkClient) { return getConfigSetQueue(zkClient, new Stats()); } /** * Get queue that can be used to read configset API tasks to the Overseer. * <p> * This queue is used internally by the {@link OverseerConfigSetMessageHandler} to read configset API * tasks submitted by the {@link org.apache.solr.handler.admin.ConfigSetsHandler}. The actions supported * by this queue are listed in the {@link org.apache.solr.common.params.ConfigSetParams.ConfigSetAction} * enum. * <p> * Performance statistics on the returned queue are tracked by the Overseer Stats API, * see {@link org.apache.solr.common.params.CollectionParams.CollectionAction#OVERSEERSTATUS}. * <p> * For now, this internally returns the same queue as {@link #getCollectionQueue(SolrZkClient, Stats)}. * It is the responsibility of the client to ensure that configset API actions are prefixed with * {@link OverseerConfigSetMessageHandler#CONFIGSETS_ACTION_PREFIX} so that it is processed by * {@link OverseerConfigSetMessageHandler}. * * @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue * @return a {@link DistributedQueue} object */ static OverseerTaskQueue getConfigSetQueue(final SolrZkClient zkClient, Stats zkStats) { // For now, we use the same queue as the collection queue, but ensure // that the actions are prefixed with a unique string. createOverseerNode(zkClient); return getCollectionQueue(zkClient, zkStats); } private static void createOverseerNode(final SolrZkClient zkClient) { try { zkClient.create("/overseer", new byte[0], CreateMode.PERSISTENT, true); } catch (KeeperException.NodeExistsException e) { //ok } catch (InterruptedException e) { log.error("Could not create Overseer node", e); Thread.currentThread().interrupt(); throw new RuntimeException(e); } catch (KeeperException e) { log.error("Could not create Overseer node", e); throw new RuntimeException(e); } } public static boolean isLegacy(ZkStateReader stateReader) { String legacyProperty = stateReader.getClusterProperty(ZkStateReader.LEGACY_CLOUD, "true"); return !"false".equals(legacyProperty); } public ZkStateReader getZkStateReader() { return reader; } /** * Used to hold statistics about overseer operations. It will be exposed * to the OverseerCollectionProcessor to return statistics. * * This is experimental API and subject to change. */ public static class Stats { static final int MAX_STORED_FAILURES = 10; final Map<String, Stat> stats = new ConcurrentHashMap<>(); private volatile int queueLength; public Map<String, Stat> getStats() { return stats; } public int getSuccessCount(String operation) { Stat stat = stats.get(operation.toLowerCase(Locale.ROOT)); return stat == null ? 0 : stat.success.get(); } public int getErrorCount(String operation) { Stat stat = stats.get(operation.toLowerCase(Locale.ROOT)); return stat == null ? 
0 : stat.errors.get(); } public void success(String operation) { String op = operation.toLowerCase(Locale.ROOT); Stat stat = stats.get(op); if (stat == null) { stat = new Stat(); stats.put(op, stat); } stat.success.incrementAndGet(); } public void error(String operation) { String op = operation.toLowerCase(Locale.ROOT); Stat stat = stats.get(op); if (stat == null) { stat = new Stat(); stats.put(op, stat); } stat.errors.incrementAndGet(); } public TimerContext time(String operation) { String op = operation.toLowerCase(Locale.ROOT); Stat stat = stats.get(op); if (stat == null) { stat = new Stat(); stats.put(op, stat); } return stat.requestTime.time(); } public void storeFailureDetails(String operation, ZkNodeProps request, SolrResponse resp) { String op = operation.toLowerCase(Locale.ROOT); Stat stat = stats.get(op); if (stat == null) { stat = new Stat(); stats.put(op, stat); } LinkedList<FailedOp> failedOps = stat.failureDetails; synchronized (failedOps) { if (failedOps.size() >= MAX_STORED_FAILURES) { failedOps.removeFirst(); } failedOps.addLast(new FailedOp(request, resp)); } } public List<FailedOp> getFailureDetails(String operation) { Stat stat = stats.get(operation.toLowerCase(Locale.ROOT)); if (stat == null || stat.failureDetails.isEmpty()) return null; LinkedList<FailedOp> failedOps = stat.failureDetails; synchronized (failedOps) { ArrayList<FailedOp> ret = new ArrayList<>(failedOps); return ret; } } public int getQueueLength() { return queueLength; } public void setQueueLength(int queueLength) { this.queueLength = queueLength; } public void clear() { stats.clear(); } } public static class Stat { public final AtomicInteger success; public final AtomicInteger errors; public final Timer requestTime; public final LinkedList<FailedOp> failureDetails; public Stat() { this.success = new AtomicInteger(); this.errors = new AtomicInteger(); this.requestTime = new Timer(TimeUnit.MILLISECONDS, TimeUnit.MINUTES, Clock.defaultClock()); this.failureDetails = new LinkedList<>(); } } public static class FailedOp { public final ZkNodeProps req; public final SolrResponse resp; public FailedOp(ZkNodeProps req, SolrResponse resp) { this.req = req; this.resp = resp; } } }
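
The listing above is the complete class; the rest of this tutorial adds a few small, self-contained sketches for the APIs it exposes. They are illustrative only and not part of the Solr code base; class names such as OverseerQueueExample are invented for this tutorial. The first sketch shows how a message reaches the Overseer at all: a cluster state change is serialized as a ZkNodeProps JSON document and offered to the distributed queue returned by Overseer.getStateUpdateQueue. Here it submits the QUIT message that the ClusterStateUpdater's switch handles above, assuming an already connected SolrZkClient.

import org.apache.solr.cloud.DistributedQueue;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.util.Utils;

public class OverseerQueueExample {
  // Ask the current Overseer (identified by overseerId) to step down. The QUIT
  // branch in ClusterStateUpdater.processMessage only honours the request if
  // the id matches the running Overseer's own id.
  public static void askOverseerToQuit(SolrZkClient zkClient, String overseerId) throws Exception {
    ZkNodeProps message = new ZkNodeProps(
        Overseer.QUEUE_OPERATION, OverseerAction.QUIT.toLower(),
        "id", overseerId);
    DistributedQueue stateQueue = Overseer.getStateUpdateQueue(zkClient); // creates /overseer if needed
    stateQueue.offer(Utils.toJSON(message));                              // queue entries are JSON bytes
  }
}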
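
ClusterStateUpdater.run() re-checks leadership on every pass and treats a ZooKeeper connection loss as "don't know" rather than "no". The sketch below restates that three-valued check in plain Java; LeaderProbe is a hypothetical interface standing in for amILeader(), which compares this node's id with the contents of /overseer_elect/leader.

public class LeaderCheckExample {
  enum LeaderStatus { DONT_KNOW, NO, YES }

  interface LeaderProbe {
    LeaderStatus check(); // stand-in for amILeader(): read the leader znode and compare ids
  }

  // Keep asking while the answer is ambiguous; only a clear NO or YES ends the loop.
  static boolean waitUntilDecided(LeaderProbe probe) {
    LeaderStatus status = probe.check();
    while (status == LeaderStatus.DONT_KNOW) {
      // the real loop simply asks again; the ZooKeeper round trip inside the probe paces the retries
      status = probe.check();
    }
    return status == LeaderStatus.YES;
  }
}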
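
Before touching the main state-update queue, run() drains the internal work queue with a peek/process/poll cycle: an entry is removed only after its effect has been flushed to ZooKeeper, so a crash between the two steps simply means the next Overseer replays it. The same idea in plain Java, with an in-memory Deque standing in for the ZooKeeper-backed DistributedQueue:

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;

public class PeekThenPollExample {
  // Mirror of the work-queue loop: read with peek(), apply the change, and only
  // then poll() the entry away. Entries left behind by a crash are replayed.
  static void drain(Deque<String> workQueue) {
    String item = workQueue.peek();
    while (item != null) {
      apply(item);        // must be idempotent, since it may run twice after a crash
      workQueue.poll();   // remove only after the change has been applied
      item = workQueue.peek();
    }
  }

  static void apply(String item) {
    System.out.println("applying " + item);
  }

  public static void main(String[] args) {
    drain(new ArrayDeque<>(Arrays.asList("downnode:node1", "state:core_node2")));
  }
}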
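
On the main queue the loop batches work: it blocks until a first message arrives (peek(true)), keeps absorbing messages that show up within 100 ms (peek(100)), and then performs a single writePendingUpdates() for the whole batch, while the ZkWriteCallback copies each batched message into the work queue before the ZooKeeper write and clears it afterwards. A rough plain-Java analogy of that batching rhythm, using a BlockingQueue instead of the ZooKeeper queue:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

public class MicroBatchExample {
  // Wait for the first message, then keep adding messages that arrive within
  // 100 ms, and hand the whole batch to the caller for a single flush.
  static List<String> nextBatch(BlockingQueue<String> queue) throws InterruptedException {
    List<String> batch = new ArrayList<>();
    batch.add(queue.take());                                // like peek(true): block for the first item
    String next = queue.poll(100, TimeUnit.MILLISECONDS);   // like peek(100): brief wait for followers
    while (next != null) {
      batch.add(next);
      next = queue.poll(100, TimeUnit.MILLISECONDS);
    }
    return batch;                                           // caller now performs one combined write
  }
}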
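
processQueueItem() wraps every operation in the Stats bookkeeping declared at the bottom of the class: start a timer, record a success or an error, always stop the timer. The helper below shows the same call pattern against Overseer.Stats itself; runTimed and its Runnable argument are invented for the example.

import org.apache.solr.cloud.Overseer;
import org.apache.solr.util.stats.TimerContext;

public class OverseerStatsExample {
  // Same shape as processQueueItem(): time the operation, then count it as a
  // success or an error under the operation's (case-insensitive) name.
  static void runTimed(Overseer.Stats stats, String operation, Runnable work) {
    TimerContext timer = stats.time(operation);
    try {
      work.run();
      stats.success(operation);
    } catch (RuntimeException e) {
      stats.error(operation);
    } finally {
      timer.stop();
    }
  }
}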
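
ExclusiveSliceProperty.collectCurrentPropStats() decides how many property owners each node may hold with simple integer arithmetic: slices divided by hosting nodes, plus one extra allowed on the first slices-modulo-hosts nodes. The comment in the code uses 7 slices on 3 nodes, which must come out as 3/2/2 rather than 3/3/1; the small program below reproduces that calculation.

public class PropertyDistributionExample {
  public static void main(String[] args) {
    int slices = 7;
    int hosts = 3;
    int maxPropPerNode = slices / hosts;   // 2
    int modulo = slices % hosts;           // 1: exactly one node may hold an extra property
    if (modulo > 0) {
      maxPropPerNode++;                    // 3
    }
    // prints: at most 3 per node, 1 node(s) allowed at that maximum
    System.out.printf("at most %d per node, %d node(s) allowed at that maximum%n",
        maxPropPerNode, modulo == 0 ? hosts : modulo);
  }
}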
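
start() wraps each worker (the state updater, the collection/configset processor and the failover thread) in a daemon OverseerThread, and doClose() first closes the worker and then interrupts the thread so any blocking wait ends promptly. A generic, Solr-independent version of that close-then-interrupt pattern:

import java.io.Closeable;
import java.io.IOException;

public class CloseableWorkerExample {
  static class Worker implements Runnable, Closeable {
    private volatile boolean closed;

    @Override public void run() {
      while (!closed) {
        try {
          Thread.sleep(50);                   // stand-in for one unit of queue work
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt(); // interrupted by close: leave the loop
          return;
        }
      }
    }

    @Override public void close() { closed = true; }
  }

  public static void main(String[] args) throws IOException, InterruptedException {
    Worker worker = new Worker();
    Thread thread = new Thread(worker, "worker");
    thread.setDaemon(true);   // as in Overseer.start()
    thread.start();
    worker.close();           // flip the flag first, as doClose() does
    thread.interrupt();       // then break out of any blocking wait
    thread.join();
  }
}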
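
Every queue and map accessor calls createOverseerNode() first, so /overseer is created lazily and a NodeExistsException from a concurrent creator is simply ignored. The same create-if-absent idiom, using the SolrZkClient.create overload that appears in createOverseerNode():

import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;

public class EnsureZnodeExample {
  static void ensureExists(SolrZkClient zkClient, String path)
      throws KeeperException, InterruptedException {
    try {
      zkClient.create(path, new byte[0], CreateMode.PERSISTENT, true); // true = retry on connection loss
    } catch (KeeperException.NodeExistsException e) {
      // another node created it first - that counts as success here
    }
  }
}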
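
The completed and failure maps are SizeLimitedDistributedMap instances capped at NUM_RESPONSES_TO_STORE entries, so old responses fall away as new ones arrive. The ZooKeeper-backed implementation is not shown in this tutorial; purely as a plain-Java analogy, a LinkedHashMap can be given the same keep-only-the-newest behaviour:

import java.util.LinkedHashMap;
import java.util.Map;

public class BoundedResponseMapExample {
  // Keep only the most recent maxEntries responses, dropping the oldest first.
  static <V> Map<String, V> newBoundedMap(int maxEntries) {
    return new LinkedHashMap<String, V>(16, 0.75f, false) {
      @Override protected boolean removeEldestEntry(Map.Entry<String, V> eldest) {
        return size() > maxEntries;
      }
    };
  }

  public static void main(String[] args) {
    Map<String, String> completed = newBoundedMap(3);
    for (int i = 1; i <= 5; i++) {
      completed.put("task-" + i, "done");
    }
    System.out.println(completed.keySet()); // [task-3, task-4, task-5]
  }
}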