Java tutorial
package org.apache.solr.cloud; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Random; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.SynchronousQueue; import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; import org.apache.solr.client.solrj.SolrResponse; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.request.AbstractUpdateRequest; import org.apache.solr.client.solrj.request.CoreAdminRequest; import org.apache.solr.client.solrj.request.UpdateRequest; import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.cloud.Assign.ReplicaCount; import org.apache.solr.cloud.DistributedQueue.QueueEvent; import org.apache.solr.cloud.Overseer.LeaderStatus; import 
org.apache.solr.cloud.overseer.ClusterStateMutator; import org.apache.solr.cloud.overseer.OverseerAction; import org.apache.solr.cloud.rule.ReplicaAssigner; import org.apache.solr.cloud.rule.ReplicaAssigner.Position; import org.apache.solr.cloud.rule.Rule; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.cloud.Aliases; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.CompositeIdRouter; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.DocRouter; import org.apache.solr.common.cloud.ImplicitDocRouter; import org.apache.solr.common.cloud.PlainIdRouter; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.RoutingRule; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkConfigManager; import org.apache.solr.common.cloud.ZkCoreNodeProps; import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.CoreAdminParams.CoreAdminAction; import org.apache.solr.common.params.MapSolrParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.ShardParams; import org.apache.solr.common.util.ExecutorUtil; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.StrUtils; import org.apache.solr.common.util.Utils; import org.apache.solr.handler.admin.ClusterStatus; import org.apache.solr.handler.component.ShardHandler; import org.apache.solr.handler.component.ShardHandlerFactory; import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.handler.component.ShardResponse; import 
org.apache.solr.update.SolrIndexSplitter; import org.apache.solr.util.DefaultSolrThreadFactory; import org.apache.solr.util.stats.Snapshot; import org.apache.solr.util.stats.Timer; import org.apache.solr.util.stats.TimerContext; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.data.Stat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static org.apache.solr.cloud.Assign.getNodesForNewReplicas; import static org.apache.solr.common.cloud.DocCollection.SNITCH; import static org.apache.solr.common.util.Utils.makeMap; import static org.apache.solr.common.cloud.ZkStateReader.BASE_URL_PROP; import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP; import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP; import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE; import static org.apache.solr.common.cloud.ZkStateReader.NODE_NAME_PROP; import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_PROP; import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_VALUE_PROP; import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP; import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP; import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA; import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICAPROP; import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDROLE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.BALANCESHARDUNIQUE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CLUSTERSTATUS; import static 
org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD; import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICAPROP; import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD; import static org.apache.solr.common.params.CollectionParams.CollectionAction.REMOVEROLE; import static org.apache.solr.common.params.CommonAdminParams.ASYNC; import static org.apache.solr.common.params.CommonParams.NAME; import static org.apache.solr.common.util.StrUtils.formatString; public class OverseerCollectionProcessor implements Runnable, Closeable { public static final String NUM_SLICES = "numShards"; static final boolean CREATE_NODE_SET_SHUFFLE_DEFAULT = true; public static final String CREATE_NODE_SET_SHUFFLE = "createNodeSet.shuffle"; public static final String CREATE_NODE_SET_EMPTY = "EMPTY"; public static final String CREATE_NODE_SET = "createNodeSet"; public static final String ROUTER = "router"; public static final String SHARDS_PROP = "shards"; public static final String REQUESTID = "requestid"; public static final String COLL_CONF = "collection.configName"; public static final String COLL_PROP_PREFIX = "property."; public static final String ONLY_IF_DOWN = "onlyIfDown"; public static final String SHARD_UNIQUE = "shardUnique"; public static final String ONLY_ACTIVE_NODES = "onlyactivenodes"; private static final String SKIP_CREATE_REPLICA_IN_CLUSTER_STATE = "skipCreateReplicaInClusterState"; public int maxParallelThreads = 10; public static final Map<String, Object> COLL_PROPS = Collections.unmodifiableMap(makeMap(ROUTER, DocRouter.DEFAULT_NAME, ZkStateReader.REPLICATION_FACTOR, "1", ZkStateReader.MAX_SHARDS_PER_NODE, "1", ZkStateReader.AUTO_ADD_REPLICAS, "false", DocCollection.RULE, null, SNITCH, null)); static final 
Random RANDOM; static { // We try to make things reproducible in the context of our tests by initializing the random instance // based on the current seed String seed = System.getProperty("tests.seed"); if (seed == null) { RANDOM = new Random(); } else { RANDOM = new Random(seed.hashCode()); } } public ExecutorService tpe; private static Logger log = LoggerFactory.getLogger(OverseerCollectionProcessor.class); private DistributedQueue workQueue; private DistributedMap runningMap; private DistributedMap completedMap; private DistributedMap failureMap; // Set that maintains a list of all the tasks that are running. This is keyed on zk id of the task. final private Set runningTasks; // Set that tracks collections that are currently being processed by a running task. // This is used for handling mutual exclusion of the tasks. final private Set collectionWip; // List of completed tasks. This is used to clean up workQueue in zk. final private HashMap<String, QueueEvent> completedTasks; private String myId; private final ShardHandlerFactory shardHandlerFactory; private String adminPath; private ZkStateReader zkStateReader; private boolean isClosed; private Overseer.Stats stats; // Set of tasks that have been picked up for processing but not cleaned up from zk work-queue. // It may contain tasks that have completed execution, have been entered into the completed/failed map in zk but not // deleted from the work-queue as that is a batched operation. 
final private Set<String> runningZKTasks; private final Object waitLock = new Object(); private Overseer overseer; public OverseerCollectionProcessor(ZkStateReader zkStateReader, String myId, final ShardHandler shardHandler, String adminPath, Overseer.Stats stats, Overseer overseer) { this(zkStateReader, myId, shardHandler.getShardHandlerFactory(), adminPath, stats, Overseer.getCollectionQueue(zkStateReader.getZkClient(), stats), Overseer.getRunningMap(zkStateReader.getZkClient()), Overseer.getCompletedMap(zkStateReader.getZkClient()), Overseer.getFailureMap(zkStateReader.getZkClient())); this.overseer = overseer; } protected OverseerCollectionProcessor(ZkStateReader zkStateReader, String myId, final ShardHandlerFactory shardHandlerFactory, String adminPath, Overseer.Stats stats, DistributedQueue workQueue, DistributedMap runningMap, DistributedMap completedMap, DistributedMap failureMap) { this.zkStateReader = zkStateReader; this.myId = myId; this.shardHandlerFactory = shardHandlerFactory; this.adminPath = adminPath; this.workQueue = workQueue; this.runningMap = runningMap; this.completedMap = completedMap; this.failureMap = failureMap; this.stats = stats; this.runningZKTasks = new HashSet<>(); this.runningTasks = new HashSet(); this.collectionWip = new HashSet(); this.completedTasks = new HashMap<>(); } @Override public void run() { log.info("Process current queue of collection creations"); LeaderStatus isLeader = amILeader(); while (isLeader == LeaderStatus.DONT_KNOW) { log.debug("am_i_leader unclear {}", isLeader); isLeader = amILeader(); // not a no, not a yes, try ask again } String oldestItemInWorkQueue = null; // hasLeftOverItems - used for avoiding re-execution of async tasks that were processed by a previous Overseer. // This variable is set in case there's any task found on the workQueue when the OCP starts up and // the id for the queue tail is used as a marker to check for the task in completed/failed map in zk. 
// Beyond the marker, all tasks can safely be assumed to have never been executed. boolean hasLeftOverItems = true; try { oldestItemInWorkQueue = workQueue.getTailId(); } catch (KeeperException e) { // We don't need to handle this. This is just a fail-safe which comes in handy in skipping already processed // async calls. SolrException.log(log, "", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } if (oldestItemInWorkQueue == null) hasLeftOverItems = false; else log.debug("Found already existing elements in the work-queue. Last element: {}", oldestItemInWorkQueue); try { prioritizeOverseerNodes(); } catch (Exception e) { log.error("Unable to prioritize overseer ", e); } // TODO: Make maxThreads configurable. this.tpe = new ExecutorUtil.MDCAwareThreadPoolExecutor(5, 100, 0L, TimeUnit.MILLISECONDS, new SynchronousQueue<Runnable>(), new DefaultSolrThreadFactory("OverseerThreadFactory")); try { while (!this.isClosed) { try { isLeader = amILeader(); if (LeaderStatus.NO == isLeader) { break; } else if (LeaderStatus.YES != isLeader) { log.debug("am_i_leader unclear {}", isLeader); continue; // not a no, not a yes, try asking again } log.debug("Cleaning up work-queue. #Running tasks: {}", runningTasks.size()); cleanUpWorkQueue(); printTrackingMaps(); boolean waited = false; while (runningTasks.size() > maxParallelThreads) { synchronized (waitLock) { waitLock.wait(100);//wait for 100 ms or till a task is complete } waited = true; } if (waited) cleanUpWorkQueue(); List<QueueEvent> heads = workQueue.peekTopN(maxParallelThreads, runningZKTasks, 2000L); if (heads == null) continue; log.debug("Got {} tasks from work-queue : [{}]", heads.size(), heads.toString()); if (isClosed) break; for (QueueEvent head : heads) { final ZkNodeProps message = ZkNodeProps.load(head.getBytes()); String collectionName = message.containsKey(COLLECTION_PROP) ? 
message.getStr(COLLECTION_PROP) : message.getStr(NAME); final String asyncId = message.getStr(ASYNC); if (hasLeftOverItems) { if (head.getId().equals(oldestItemInWorkQueue)) hasLeftOverItems = false; if (asyncId != null && (completedMap.contains(asyncId) || failureMap.contains(asyncId))) { log.debug("Found already processed task in workQueue, cleaning up. AsyncId [{}]", asyncId); workQueue.remove(head); continue; } } if (!checkExclusivity(message, head.getId())) { log.debug("Exclusivity check failed for [{}]", message.toString()); continue; } try { markTaskAsRunning(head, collectionName, asyncId, message); log.debug("Marked task [{}] as running", head.getId()); } catch (KeeperException.NodeExistsException e) { // This should never happen log.error("Tried to pick up task [{}] when it was already running!", head.getId()); } catch (InterruptedException e) { log.error("Thread interrupted while trying to pick task for execution.", head.getId()); Thread.currentThread().interrupt(); } log.info("Overseer Collection Processor: Get the message id:" + head.getId() + " message:" + message.toString()); String operation = message.getStr(Overseer.QUEUE_OPERATION); Runner runner = new Runner(message, operation, head); tpe.execute(runner); } } catch (KeeperException e) { if (e.code() == KeeperException.Code.SESSIONEXPIRED) { log.warn("Overseer cannot talk to ZK"); return; } SolrException.log(log, "", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); return; } catch (Exception e) { SolrException.log(log, "", e); } } } finally { this.close(); } } private boolean checkExclusivity(ZkNodeProps message, String id) throws KeeperException, InterruptedException { String collectionName = message.containsKey(COLLECTION_PROP) ? message.getStr(COLLECTION_PROP) : message.getStr(NAME); if (collectionName == null) return true; // CLUSTERSTATUS is always mutually exclusive //TODO deprecated remove this check . 
if (CLUSTERSTATUS.isEqual(message.getStr(Overseer.QUEUE_OPERATION))) return true; if (collectionWip.contains(collectionName)) return false; if (runningZKTasks.contains(id)) return false; return true; } private void cleanUpWorkQueue() throws KeeperException, InterruptedException { synchronized (completedTasks) { for (String id : completedTasks.keySet()) { workQueue.remove(completedTasks.get(id)); runningZKTasks.remove(id); } completedTasks.clear(); } } public void close() { isClosed = true; if (tpe != null) { if (!tpe.isShutdown()) { tpe.shutdown(); try { tpe.awaitTermination(60, TimeUnit.SECONDS); } catch (InterruptedException e) { log.warn("Thread interrupted while waiting for OCP threadpool close."); Thread.currentThread().interrupt(); } finally { if (!tpe.isShutdown()) tpe.shutdownNow(); } } } } private synchronized void prioritizeOverseerNodes() throws KeeperException, InterruptedException { SolrZkClient zk = zkStateReader.getZkClient(); if (!zk.exists(ZkStateReader.ROLES, true)) return; Map m = (Map) Utils.fromJSON(zk.getData(ZkStateReader.ROLES, null, new Stat(), true)); List overseerDesignates = (List) m.get("overseer"); if (overseerDesignates == null || overseerDesignates.isEmpty()) return; String ldr = getLeaderNode(zk); if (overseerDesignates.contains(ldr)) return; log.info("prioritizing overseer nodes at {} overseer designates are {}", myId, overseerDesignates); List<String> electionNodes = getSortedElectionNodes(zk, OverseerElectionContext.PATH + LeaderElector.ELECTION_NODE); if (electionNodes.size() < 2) return; log.info("sorted nodes {}", electionNodes); String designateNodeId = null; for (String electionNode : electionNodes) { if (overseerDesignates.contains(LeaderElector.getNodeName(electionNode))) { designateNodeId = electionNode; break; } } if (designateNodeId == null) { log.warn("No live overseer designate "); return; } if (!designateNodeId.equals(electionNodes.get(1))) { //checking if it is already at no:1 log.info("asking node {} to come join 
election at head", designateNodeId); invokeOverseerOp(designateNodeId, "rejoinAtHead"); //ask designate to come first log.info("asking the old first in line {} to rejoin election ", electionNodes.get(1)); invokeOverseerOp(electionNodes.get(1), "rejoin");//ask second inline to go behind } //now ask the current leader to QUIT , so that the designate can takeover Overseer.getInQueue(zkStateReader.getZkClient()) .offer(Utils.toJSON(new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.QUIT.toLower(), "id", getLeaderId(zkStateReader.getZkClient())))); } public static List<String> getSortedOverseerNodeNames(SolrZkClient zk) throws KeeperException, InterruptedException { List<String> children = null; try { children = zk.getChildren(OverseerElectionContext.PATH + LeaderElector.ELECTION_NODE, null, true); } catch (Exception e) { log.warn("error ", e); return new ArrayList<>(); } LeaderElector.sortSeqs(children); ArrayList<String> nodeNames = new ArrayList<>(children.size()); for (String c : children) nodeNames.add(LeaderElector.getNodeName(c)); return nodeNames; } public static List<String> getSortedElectionNodes(SolrZkClient zk, String path) throws KeeperException, InterruptedException { List<String> children = null; try { children = zk.getChildren(path, null, true); LeaderElector.sortSeqs(children); return children; } catch (Exception e) { throw e; } } public static String getLeaderNode(SolrZkClient zkClient) throws KeeperException, InterruptedException { String id = getLeaderId(zkClient); return id == null ? 
null : LeaderElector.getNodeName(id); } public static String getLeaderId(SolrZkClient zkClient) throws KeeperException, InterruptedException { byte[] data = null; try { data = zkClient.getData("/overseer_elect/leader", null, new Stat(), true); } catch (KeeperException.NoNodeException e) { return null; } Map m = (Map) Utils.fromJSON(data); return (String) m.get("id"); } private void invokeOverseerOp(String electionNode, String op) { ModifiableSolrParams params = new ModifiableSolrParams(); ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); params.set(CoreAdminParams.ACTION, CoreAdminAction.OVERSEEROP.toString()); params.set("op", op); params.set("qt", adminPath); params.set("electionNode", electionNode); ShardRequest sreq = new ShardRequest(); sreq.purpose = 1; String replica = zkStateReader.getBaseUrlForNodeName(LeaderElector.getNodeName(electionNode)); sreq.shards = new String[] { replica }; sreq.actualShards = sreq.shards; sreq.params = params; shardHandler.submit(sreq, replica, sreq.params); shardHandler.takeCompletedOrError(); } protected LeaderStatus amILeader() { TimerContext timerContext = stats.time("collection_am_i_leader"); boolean success = true; try { ZkNodeProps props = ZkNodeProps .load(zkStateReader.getZkClient().getData("/overseer_elect/leader", null, null, true)); if (myId.equals(props.getStr("id"))) { return LeaderStatus.YES; } } catch (KeeperException e) { success = false; if (e.code() == KeeperException.Code.CONNECTIONLOSS) { log.error("", e); return LeaderStatus.DONT_KNOW; } else if (e.code() == KeeperException.Code.SESSIONEXPIRED) { log.info("", e); } else { log.warn("", e); } } catch (InterruptedException e) { success = false; Thread.currentThread().interrupt(); } finally { timerContext.stop(); if (success) { stats.success("collection_am_i_leader"); } else { stats.error("collection_am_i_leader"); } } log.info("According to ZK I (id=" + myId + ") am no longer a leader."); return LeaderStatus.NO; } @SuppressWarnings("unchecked") 
protected SolrResponse processMessage(ZkNodeProps message, String operation) { log.warn("OverseerCollectionProcessor.processMessage : " + operation + " , " + message.toString()); NamedList results = new NamedList(); try { // force update the cluster state zkStateReader.updateClusterState(); CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(operation); if (action == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:" + operation); } switch (action) { case CREATE: createCollection(zkStateReader.getClusterState(), message, results); break; case DELETE: deleteCollection(message, results); break; case RELOAD: ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.RELOAD.toString()); collectionCmd(zkStateReader.getClusterState(), message, params, results, Replica.State.ACTIVE); break; case CREATEALIAS: createAlias(zkStateReader.getAliases(), message); break; case DELETEALIAS: deleteAlias(zkStateReader.getAliases(), message); break; case SPLITSHARD: splitShard(zkStateReader.getClusterState(), message, results); break; case DELETESHARD: deleteShard(zkStateReader.getClusterState(), message, results); break; case CREATESHARD: createShard(zkStateReader.getClusterState(), message, results); break; case DELETEREPLICA: deleteReplica(zkStateReader.getClusterState(), message, results); break; case MIGRATE: migrate(zkStateReader.getClusterState(), message, results); break; case ADDROLE: processRoleCommand(message, operation); break; case REMOVEROLE: processRoleCommand(message, operation); break; case ADDREPLICA: addReplica(zkStateReader.getClusterState(), message, results); break; case OVERSEERSTATUS: getOverseerStatus(message, results); break; case CLUSTERSTATUS://TODO . deprecated. 
OCP does not need to do it .remove in a later release new ClusterStatus(zkStateReader, message).getClusterStatus(results); break; case ADDREPLICAPROP: processReplicaAddPropertyCommand(message); break; case DELETEREPLICAPROP: processReplicaDeletePropertyCommand(message); break; case BALANCESHARDUNIQUE: balanceProperty(message); break; case REBALANCELEADERS: processRebalanceLeaders(message); break; case MODIFYCOLLECTION: overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message)); break; default: throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:" + operation); } } catch (Exception e) { String collName = message.getStr("collection"); if (collName == null) collName = message.getStr(NAME); if (collName == null) { SolrException.log(log, "Operation " + operation + " failed", e); } else { SolrException.log(log, "Collection: " + collName + " operation: " + operation + " failed", e); } results.add("Operation " + operation + " caused exception:", e); SimpleOrderedMap nl = new SimpleOrderedMap(); nl.add("msg", e.getMessage()); nl.add("rspCode", e instanceof SolrException ? 
((SolrException) e).code() : -1); results.add("exception", nl); } return new OverseerSolrResponse(results); } @SuppressWarnings("unchecked") private void processRebalanceLeaders(ZkNodeProps message) throws KeeperException, InterruptedException { checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, CORE_NAME_PROP, ELECTION_NODE_PROP, NODE_NAME_PROP, BASE_URL_PROP, REJOIN_AT_HEAD_PROP); ModifiableSolrParams params = new ModifiableSolrParams(); params.set(COLLECTION_PROP, message.getStr(COLLECTION_PROP)); params.set(SHARD_ID_PROP, message.getStr(SHARD_ID_PROP)); params.set(REJOIN_AT_HEAD_PROP, message.getStr(REJOIN_AT_HEAD_PROP)); params.set(CoreAdminParams.ACTION, CoreAdminAction.REJOINLEADERELECTION.toString()); params.set(CORE_NAME_PROP, message.getStr(CORE_NAME_PROP)); params.set(NODE_NAME_PROP, message.getStr(NODE_NAME_PROP)); params.set(ELECTION_NODE_PROP, message.getStr(ELECTION_NODE_PROP)); params.set(BASE_URL_PROP, message.getStr(BASE_URL_PROP)); String baseUrl = message.getStr(BASE_URL_PROP); ShardRequest sreq = new ShardRequest(); sreq.nodeName = message.getStr(ZkStateReader.CORE_NAME_PROP); // yes, they must use same admin handler path everywhere... 
params.set("qt", adminPath); sreq.purpose = ShardRequest.PURPOSE_PRIVATE; sreq.shards = new String[] { baseUrl }; sreq.actualShards = sreq.shards; sreq.params = params; ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); shardHandler.submit(sreq, baseUrl, sreq.params); } @SuppressWarnings("unchecked") private void processReplicaAddPropertyCommand(ZkNodeProps message) throws KeeperException, InterruptedException { checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP, PROPERTY_PROP, PROPERTY_VALUE_PROP); SolrZkClient zkClient = zkStateReader.getZkClient(); DistributedQueue inQueue = Overseer.getInQueue(zkClient); Map<String, Object> propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICAPROP.toLower()); propMap.putAll(message.getProperties()); ZkNodeProps m = new ZkNodeProps(propMap); inQueue.offer(Utils.toJSON(m)); } private void processReplicaDeletePropertyCommand(ZkNodeProps message) throws KeeperException, InterruptedException { checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP, PROPERTY_PROP); SolrZkClient zkClient = zkStateReader.getZkClient(); DistributedQueue inQueue = Overseer.getInQueue(zkClient); Map<String, Object> propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, DELETEREPLICAPROP.toLower()); propMap.putAll(message.getProperties()); ZkNodeProps m = new ZkNodeProps(propMap); inQueue.offer(Utils.toJSON(m)); } private void balanceProperty(ZkNodeProps message) throws KeeperException, InterruptedException { if (StringUtils.isBlank(message.getStr(COLLECTION_PROP)) || StringUtils.isBlank(message.getStr(PROPERTY_PROP))) { throw new SolrException(ErrorCode.BAD_REQUEST, "The '" + COLLECTION_PROP + "' and '" + PROPERTY_PROP + "' parameters are required for the BALANCESHARDUNIQUE operation, no action taken"); } SolrZkClient zkClient = zkStateReader.getZkClient(); DistributedQueue inQueue = Overseer.getInQueue(zkClient); Map<String, Object> propMap = new HashMap<>(); 
propMap.put(Overseer.QUEUE_OPERATION, BALANCESHARDUNIQUE.toLower()); propMap.putAll(message.getProperties()); inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap))); } @SuppressWarnings("unchecked") private void getOverseerStatus(ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { String leaderNode = getLeaderNode(zkStateReader.getZkClient()); results.add("leader", leaderNode); Stat stat = new Stat(); zkStateReader.getZkClient().getData("/overseer/queue", null, stat, true); results.add("overseer_queue_size", stat.getNumChildren()); stat = new Stat(); zkStateReader.getZkClient().getData("/overseer/queue-work", null, stat, true); results.add("overseer_work_queue_size", stat.getNumChildren()); stat = new Stat(); zkStateReader.getZkClient().getData("/overseer/collection-queue-work", null, stat, true); results.add("overseer_collection_queue_size", stat.getNumChildren()); NamedList overseerStats = new NamedList(); NamedList collectionStats = new NamedList(); NamedList stateUpdateQueueStats = new NamedList(); NamedList workQueueStats = new NamedList(); NamedList collectionQueueStats = new NamedList(); for (Map.Entry<String, Overseer.Stat> entry : stats.getStats().entrySet()) { String key = entry.getKey(); NamedList<Object> lst = new SimpleOrderedMap<>(); if (key.startsWith("collection_")) { collectionStats.add(key.substring(11), lst); int successes = stats.getSuccessCount(entry.getKey()); int errors = stats.getErrorCount(entry.getKey()); lst.add("requests", successes); lst.add("errors", errors); List<Overseer.FailedOp> failureDetails = stats.getFailureDetails(key); if (failureDetails != null) { List<SimpleOrderedMap<Object>> failures = new ArrayList<>(); for (Overseer.FailedOp failedOp : failureDetails) { SimpleOrderedMap<Object> fail = new SimpleOrderedMap<>(); fail.add("request", failedOp.req.getProperties()); fail.add("response", failedOp.resp.getResponse()); failures.add(fail); } lst.add("recent_failures", failures); } } else if 
(key.startsWith("/overseer/queue_")) { stateUpdateQueueStats.add(key.substring(16), lst); } else if (key.startsWith("/overseer/queue-work_")) { workQueueStats.add(key.substring(21), lst); } else if (key.startsWith("/overseer/collection-queue-work_")) { collectionQueueStats.add(key.substring(32), lst); } else { // overseer stats overseerStats.add(key, lst); int successes = stats.getSuccessCount(entry.getKey()); int errors = stats.getErrorCount(entry.getKey()); lst.add("requests", successes); lst.add("errors", errors); } Timer timer = entry.getValue().requestTime; Snapshot snapshot = timer.getSnapshot(); lst.add("totalTime", timer.getSum()); lst.add("avgRequestsPerMinute", timer.getMeanRate()); lst.add("5minRateRequestsPerMinute", timer.getFiveMinuteRate()); lst.add("15minRateRequestsPerMinute", timer.getFifteenMinuteRate()); lst.add("avgTimePerRequest", timer.getMean()); lst.add("medianRequestTime", snapshot.getMedian()); lst.add("75thPctlRequestTime", snapshot.get75thPercentile()); lst.add("95thPctlRequestTime", snapshot.get95thPercentile()); lst.add("99thPctlRequestTime", snapshot.get99thPercentile()); lst.add("999thPctlRequestTime", snapshot.get999thPercentile()); } results.add("overseer_operations", overseerStats); results.add("collection_operations", collectionStats); results.add("overseer_queue", stateUpdateQueueStats); results.add("overseer_internal_queue", workQueueStats); results.add("collection_queue", collectionQueueStats); } @SuppressWarnings("unchecked") private void getClusterStatus(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { String collection = message.getStr(ZkStateReader.COLLECTION_PROP); // read aliases Aliases aliases = zkStateReader.getAliases(); Map<String, List<String>> collectionVsAliases = new HashMap<>(); Map<String, String> aliasVsCollections = aliases.getCollectionAliasMap(); if (aliasVsCollections != null) { for (Map.Entry<String, String> entry : 
aliasVsCollections.entrySet()) { List<String> colls = StrUtils.splitSmart(entry.getValue(), ','); String alias = entry.getKey(); for (String coll : colls) { if (collection == null || collection.equals(coll)) { List<String> list = collectionVsAliases.get(coll); if (list == null) { list = new ArrayList<>(); collectionVsAliases.put(coll, list); } list.add(alias); } } } } Map roles = null; if (zkStateReader.getZkClient().exists(ZkStateReader.ROLES, true)) { roles = (Map) Utils .fromJSON(zkStateReader.getZkClient().getData(ZkStateReader.ROLES, null, null, true)); } // convert cluster state into a map of writable types byte[] bytes = Utils.toJSON(clusterState); Map<String, Object> stateMap = (Map<String, Object>) Utils.fromJSON(bytes); Set<String> collections = new HashSet<>(); String routeKey = message.getStr(ShardParams._ROUTE_); String shard = message.getStr(ZkStateReader.SHARD_ID_PROP); if (collection == null) { collections = new HashSet<>(clusterState.getCollections()); } else { collections = Collections.singleton(collection); } NamedList<Object> collectionProps = new SimpleOrderedMap<Object>(); for (String name : collections) { Map<String, Object> collectionStatus = null; DocCollection clusterStateCollection = clusterState.getCollection(name); Set<String> requestedShards = new HashSet<>(); if (routeKey != null) { DocRouter router = clusterStateCollection.getRouter(); Collection<Slice> slices = router.getSearchSlices(routeKey, null, clusterStateCollection); for (Slice slice : slices) { requestedShards.add(slice.getName()); } } if (shard != null) { requestedShards.add(shard); } if (clusterStateCollection.getStateFormat() > 1) { bytes = Utils.toJSON(clusterStateCollection); Map<String, Object> docCollection = (Map<String, Object>) Utils.fromJSON(bytes); collectionStatus = getCollectionStatus(docCollection, name, requestedShards); } else { collectionStatus = getCollectionStatus((Map<String, Object>) stateMap.get(name), name, requestedShards); } 
collectionStatus.put("znodeVersion", clusterStateCollection.getZNodeVersion()); if (collectionVsAliases.containsKey(name) && !collectionVsAliases.get(name).isEmpty()) { collectionStatus.put("aliases", collectionVsAliases.get(name)); } String configName = zkStateReader.readConfigName(name); collectionStatus.put("configName", configName); collectionProps.add(name, collectionStatus); } List<String> liveNodes = zkStateReader.getZkClient().getChildren(ZkStateReader.LIVE_NODES_ZKNODE, null, true); // now we need to walk the collectionProps tree to cross-check replica state with live nodes crossCheckReplicaStateWithLiveNodes(liveNodes, collectionProps); NamedList<Object> clusterStatus = new SimpleOrderedMap<>(); clusterStatus.add("collections", collectionProps); // read cluster properties Map clusterProps = zkStateReader.getClusterProps(); if (clusterProps != null && !clusterProps.isEmpty()) { clusterStatus.add("properties", clusterProps); } // add the alias map too if (aliasVsCollections != null && !aliasVsCollections.isEmpty()) { clusterStatus.add("aliases", aliasVsCollections); } // add the roles map if (roles != null) { clusterStatus.add("roles", roles); } // add live_nodes clusterStatus.add("live_nodes", liveNodes); results.add("cluster", clusterStatus); } /** * Walks the tree of collection status to verify that any replicas not reporting a "down" status is * on a live node, if any replicas reporting their status as "active" but the node is not live is * marked as "down"; used by CLUSTERSTATUS. * @param liveNodes List of currently live node names. * @param collectionProps Map of collection status information pulled directly from ZooKeeper. 
*/ @SuppressWarnings("unchecked") protected void crossCheckReplicaStateWithLiveNodes(List<String> liveNodes, NamedList<Object> collectionProps) { Iterator<Map.Entry<String, Object>> colls = collectionProps.iterator(); while (colls.hasNext()) { Map.Entry<String, Object> next = colls.next(); Map<String, Object> collMap = (Map<String, Object>) next.getValue(); Map<String, Object> shards = (Map<String, Object>) collMap.get("shards"); for (Object nextShard : shards.values()) { Map<String, Object> shardMap = (Map<String, Object>) nextShard; Map<String, Object> replicas = (Map<String, Object>) shardMap.get("replicas"); for (Object nextReplica : replicas.values()) { Map<String, Object> replicaMap = (Map<String, Object>) nextReplica; if (Replica.State .getState((String) replicaMap.get(ZkStateReader.STATE_PROP)) != Replica.State.DOWN) { // not down, so verify the node is live String node_name = (String) replicaMap.get(ZkStateReader.NODE_NAME_PROP); if (!liveNodes.contains(node_name)) { // node is not live, so this replica is actually down replicaMap.put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString()); } } } } } } /** * Get collection status from cluster state. * Can return collection status by given shard name. * * * @param collection collection map parsed from JSON-serialized {@link ClusterState} * @param name collection name * @param requestedShards a set of shards to be returned in the status. * An empty or null values indicates <b>all</b> shards. 
* @return map of collection properties */ @SuppressWarnings("unchecked") private Map<String, Object> getCollectionStatus(Map<String, Object> collection, String name, Set<String> requestedShards) { if (collection == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "Collection: " + name + " not found"); } if (requestedShards == null || requestedShards.isEmpty()) { return collection; } else { Map<String, Object> shards = (Map<String, Object>) collection.get("shards"); Map<String, Object> selected = new HashMap<>(); for (String selectedShard : requestedShards) { if (!shards.containsKey(selectedShard)) { throw new SolrException(ErrorCode.BAD_REQUEST, "Collection: " + name + " shard: " + selectedShard + " not found"); } selected.put(selectedShard, shards.get(selectedShard)); collection.put("shards", selected); } return collection; } } @SuppressWarnings("unchecked") private void processRoleCommand(ZkNodeProps message, String operation) throws KeeperException, InterruptedException { SolrZkClient zkClient = zkStateReader.getZkClient(); Map roles = null; String node = message.getStr("node"); String roleName = message.getStr("role"); boolean nodeExists = false; if (nodeExists = zkClient.exists(ZkStateReader.ROLES, true)) { roles = (Map) Utils.fromJSON(zkClient.getData(ZkStateReader.ROLES, null, new Stat(), true)); } else { roles = new LinkedHashMap(1); } List nodeList = (List) roles.get(roleName); if (nodeList == null) roles.put(roleName, nodeList = new ArrayList()); if (ADDROLE.toString().toLowerCase(Locale.ROOT).equals(operation)) { log.info("Overseer role added to {}", node); if (!nodeList.contains(node)) nodeList.add(node); } else if (REMOVEROLE.toString().toLowerCase(Locale.ROOT).equals(operation)) { log.info("Overseer role removed from {}", node); nodeList.remove(node); } if (nodeExists) { zkClient.setData(ZkStateReader.ROLES, Utils.toJSON(roles), true); } else { zkClient.create(ZkStateReader.ROLES, Utils.toJSON(roles), CreateMode.PERSISTENT, true); } //if there 
are too many nodes this command may time out. And most likely dedicated // overseers are created when there are too many nodes . So , do this operation in a separate thread new Thread() { @Override public void run() { try { prioritizeOverseerNodes(); } catch (Exception e) { log.error("Error in prioritizing Overseer", e); } } }.start(); } @SuppressWarnings("unchecked") private void deleteReplica(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP); String collectionName = message.getStr(COLLECTION_PROP); String shard = message.getStr(SHARD_ID_PROP); String replicaName = message.getStr(REPLICA_PROP); DocCollection coll = clusterState.getCollection(collectionName); Slice slice = coll.getSlice(shard); ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); if (slice == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid shard name : " + shard + " in collection : " + collectionName); } Replica replica = slice.getReplica(replicaName); if (replica == null) { ArrayList<String> l = new ArrayList<>(); for (Replica r : slice.getReplicas()) l.add(r.getName()); throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid replica : " + replicaName + " in shard/collection : " + shard + "/" + collectionName + " available replicas are " + StrUtils.join(l, ',')); } // If users are being safe and only want to remove a shard if it is down, they can specify onlyIfDown=true // on the command. 
if (Boolean.parseBoolean(message.getStr(ONLY_IF_DOWN)) && replica.getState() != Replica.State.DOWN) { throw new SolrException(ErrorCode.BAD_REQUEST, "Attempted to remove replica : " + collectionName + "/" + shard + "/" + replicaName + " with onlyIfDown='true', but state is '" + replica.getStr(ZkStateReader.STATE_PROP) + "'"); } String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP); String core = replica.getStr(ZkStateReader.CORE_NAME_PROP); // assume the core exists and try to unload it Map m = makeMap("qt", adminPath, CoreAdminParams.ACTION, CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core, CoreAdminParams.DELETE_INSTANCE_DIR, "true", CoreAdminParams.DELETE_DATA_DIR, "true"); ShardRequest sreq = new ShardRequest(); sreq.purpose = 1; sreq.shards = new String[] { baseUrl }; sreq.actualShards = sreq.shards; sreq.params = new ModifiableSolrParams(new MapSolrParams(m)); try { shardHandler.submit(sreq, baseUrl, sreq.params); } catch (Exception e) { log.warn("Exception trying to unload core " + sreq, e); } collectShardResponses(replica.getState() != Replica.State.ACTIVE ? 
new NamedList() : results, false, null, shardHandler); if (waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return;// check if the core unload removed the // corenode zk enry deleteCoreNode(collectionName, replicaName, replica, core); // try and ensure core info is removed from clusterstate if (waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return; throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : " + collectionName + "/" + shard + "/" + replicaName); } private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName, int timeoutms) throws InterruptedException { long waitUntil = System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeoutms, TimeUnit.MILLISECONDS); boolean deleted = false; while (System.nanoTime() < waitUntil) { Thread.sleep(100); DocCollection docCollection = zkStateReader.getClusterState().getCollection(collectionName); if (docCollection != null) { Slice slice = docCollection.getSlice(shard); if (slice == null || slice.getReplica(replicaName) == null) { deleted = true; } } // Return true if either someone already deleted the collection/slice/replica. if (docCollection == null || deleted) break; } return deleted; } private void deleteCoreNode(String collectionName, String replicaName, Replica replica, String core) throws KeeperException, InterruptedException { ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.DELETECORE.toLower(), ZkStateReader.CORE_NAME_PROP, core, ZkStateReader.NODE_NAME_PROP, replica.getStr(ZkStateReader.NODE_NAME_PROP), ZkStateReader.COLLECTION_PROP, collectionName, ZkStateReader.CORE_NODE_NAME_PROP, replicaName); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m)); } private void checkRequired(ZkNodeProps message, String... 
props) {
    // Fail on the first missing property; the message lists every required property.
    for (String prop : props) {
      if (message.get(prop) == null) {
        throw new SolrException(ErrorCode.BAD_REQUEST,
            StrUtils.join(Arrays.asList(props), ',') + " are required params");
      }
    }
  }

  /**
   * Deletes a collection: unloads its cores (deleting instance and data directories), offers a
   * DELETE message to the overseer queue, and waits up to 30 seconds for the collection to
   * disappear from the cluster state. Whatever the outcome, the collection's znode under
   * {@link ZkStateReader#COLLECTIONS_ZKNODE} is cleaned up afterwards if it still exists.
   *
   * @param message carries the collection name under NAME
   * @param results receives the responses of the unload requests
   * @throws SolrException SERVER_ERROR if the collection is still present after the wait
   */
  private void deleteCollection(ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException {
    final String collection = message.getStr(NAME);
    try {
      ModifiableSolrParams params = new ModifiableSolrParams();
      params.set(CoreAdminParams.ACTION, CoreAdminAction.UNLOAD.toString());
      params.set(CoreAdminParams.DELETE_INSTANCE_DIR, true);
      params.set(CoreAdminParams.DELETE_DATA_DIR, true);
      collectionCmd(zkStateReader.getClusterState(), message, params, results, null);

      ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETE.toLower(), NAME, collection);
      Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));

      // wait for a while until we don't see the collection
      long timeout = System.nanoTime() + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS);
      boolean removed = false;
      while (System.nanoTime() < timeout) {
        Thread.sleep(100);
        removed = !zkStateReader.getClusterState().hasCollection(collection);
        if (removed) {
          // just a bit of time so it's more likely other readers see on return
          Thread.sleep(500);
          break;
        }
      }
      if (!removed) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Could not fully remove collection: " + collection);
      }
    } finally {
      // Remove whatever is left of the collection in ZooKeeper; failures here are only logged.
      try {
        if (zkStateReader.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, true)) {
          zkStateReader.getZkClient().clean(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection);
        }
      } catch (InterruptedException e) {
        SolrException.log(log, "Cleaning up collection in zk was interrupted:" + collection, e);
        Thread.currentThread().interrupt();
      } catch (KeeperException e) {
        SolrException.log(log, "Problem cleaning up collection in zk:" + collection, e);
      }
    }
  }

  /**
   * Creates or overwrites a collection alias: copies the current collection-alias map, sets the
   * alias named by NAME to the value of "collections", writes the result to
   * {@link ZkStateReader#ALIASES} and waits for this node to observe the change.
   *
   * @param aliases the current aliases
   * @param message carries the alias name (NAME) and its target ("collections")
   * @throws SolrException SERVER_ERROR wrapping a ZooKeeper failure or an interruption
   */
  private void createAlias(Aliases aliases, ZkNodeProps message) {
    String aliasName = message.getStr(NAME);
    String collections = message.getStr("collections");

    Map<String, String> newCollectionAliasesMap = new HashMap<>();
    Map<String, String> prevColAliases = aliases.getCollectionAliasMap();
    if (prevColAliases != null) {
      newCollectionAliasesMap.putAll(prevColAliases);
    }
    newCollectionAliasesMap.put(aliasName, collections);

    Map<String, Map<String, String>> newAliasesMap = new HashMap<>();
    newAliasesMap.put("collection", newCollectionAliasesMap);
    Aliases newAliases = new Aliases(newAliasesMap);

    byte[] jsonBytes = null;
    if (newAliases.collectionAliasSize() > 0) { // only sub map right now
      jsonBytes = Utils.toJSON(newAliases.getAliasMap());
    }
    try {
      zkStateReader.getZkClient().setData(ZkStateReader.ALIASES, jsonBytes, true);

      checkForAlias(aliasName, collections);
      // some fudge for other nodes
      Thread.sleep(100);
    } catch (KeeperException e) {
      log.error("", e);
      throw new SolrException(ErrorCode.SERVER_ERROR, e);
    } catch (InterruptedException e) {
      // Restore the interrupt flag before converting to an unchecked exception, so code further
      // up the stack can still observe the interruption.
      Thread.currentThread().interrupt();
      log.warn("", e);
      throw new SolrException(ErrorCode.SERVER_ERROR, e);
    }
  }

  /**
   * Waits up to 30 seconds for the locally visible aliases to map {@code name} to {@code value}.
   * Logs a warning if that is not observed; never throws for a timeout.
   */
  private void checkForAlias(String name, String value) {
    long timeout = System.nanoTime() + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS);
    boolean success = false;
    while (System.nanoTime() < timeout) {
      String collections = zkStateReader.getAliases().getCollectionAlias(name);
      if (collections != null && collections.equals(value)) {
        success = true;
        break;
      }
      // Pause between polls instead of spinning a core at 100% for up to 30 seconds.
      try {
        Thread.sleep(50);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        break;
      }
    }
    if (!success) {
      log.warn("Timeout waiting to be notified of Alias change...");
    }
  }

  /**
   * Waits up to 30 seconds for the locally visible aliases to no longer contain {@code name}.
   * Logs a warning if that is not observed; never throws for a timeout.
   */
  private void checkForAliasAbsence(String name) {
    long timeout = System.nanoTime() + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS);
    boolean success = false;
    while (System.nanoTime() < timeout) {
      String collections = zkStateReader.getAliases().getCollectionAlias(name);
      if (collections == null) {
        success = true;
        break;
      }
      // Pause between polls instead of spinning a core at 100% for up to 30 seconds.
      try {
        Thread.sleep(50);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        break;
      }
    }
    if (!success) {
      log.warn("Timeout waiting to be notified of Alias change...");
    }
  }

  private void
deleteAlias(Aliases aliases, ZkNodeProps message) { String aliasName = message.getStr(NAME); Map<String, Map<String, String>> newAliasesMap = new HashMap<>(); Map<String, String> newCollectionAliasesMap = new HashMap<>(); newCollectionAliasesMap.putAll(aliases.getCollectionAliasMap()); newCollectionAliasesMap.remove(aliasName); newAliasesMap.put("collection", newCollectionAliasesMap); Aliases newAliases = new Aliases(newAliasesMap); byte[] jsonBytes = null; if (newAliases.collectionAliasSize() > 0) { // only sub map right now jsonBytes = Utils.toJSON(newAliases.getAliasMap()); } try { zkStateReader.getZkClient().setData(ZkStateReader.ALIASES, jsonBytes, true); checkForAliasAbsence(aliasName); // some fudge for other nodes Thread.sleep(100); } catch (KeeperException e) { log.error("", e); throw new SolrException(ErrorCode.SERVER_ERROR, e); } catch (InterruptedException e) { log.warn("", e); throw new SolrException(ErrorCode.SERVER_ERROR, e); } } private boolean createShard(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { String collectionName = message.getStr(COLLECTION_PROP); String sliceName = message.getStr(SHARD_ID_PROP); log.info("Create shard invoked: {}", message); if (collectionName == null || sliceName == null) throw new SolrException(ErrorCode.BAD_REQUEST, "'collection' and 'shard' are required parameters"); int numSlices = 1; ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); DocCollection collection = clusterState.getCollection(collectionName); int repFactor = message.getInt(REPLICATION_FACTOR, collection.getInt(REPLICATION_FACTOR, 1)); String createNodeSetStr = message.getStr(CREATE_NODE_SET); List<ReplicaCount> sortedNodeList = getNodesForNewReplicas(clusterState, collectionName, sliceName, repFactor, createNodeSetStr, overseer.getZkController().getCoreContainer()); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message)); // wait for a while until we 
see the shard long waitUntil = System.nanoTime() + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS); boolean created = false; while (System.nanoTime() < waitUntil) { Thread.sleep(100); created = zkStateReader.getClusterState().getCollection(collectionName).getSlice(sliceName) != null; if (created) break; } if (!created) throw new SolrException(ErrorCode.SERVER_ERROR, "Could not fully create shard: " + message.getStr(NAME)); String configName = message.getStr(COLL_CONF); for (int j = 1; j <= repFactor; j++) { String nodeName = sortedNodeList.get(((j - 1)) % sortedNodeList.size()).nodeName; String shardName = collectionName + "_" + sliceName + "_replica" + j; log.info("Creating shard " + shardName + " as part of slice " + sliceName + " of collection " + collectionName + " on " + nodeName); // Need to create new params for each request ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.CREATE.toString()); params.set(CoreAdminParams.NAME, shardName); params.set(COLL_CONF, configName); params.set(CoreAdminParams.COLLECTION, collectionName); params.set(CoreAdminParams.SHARD, sliceName); params.set(ZkStateReader.NUM_SHARDS_PROP, numSlices); addPropertyParams(message, params); ShardRequest sreq = new ShardRequest(); params.set("qt", adminPath); sreq.purpose = 1; String replica = zkStateReader.getBaseUrlForNodeName(nodeName); sreq.shards = new String[] { replica }; sreq.actualShards = sreq.shards; sreq.params = params; shardHandler.submit(sreq, replica, sreq.params); } processResponses(results, shardHandler); log.info("Finished create command on all shards for collection: " + collectionName); return true; } private boolean splitShard(ClusterState clusterState, ZkNodeProps message, NamedList results) { String collectionName = message.getStr("collection"); String slice = message.getStr(ZkStateReader.SHARD_ID_PROP); log.info("Split shard invoked"); String splitKey = message.getStr("split.key"); ShardHandler 
shardHandler = shardHandlerFactory.getShardHandler(); DocCollection collection = clusterState.getCollection(collectionName); DocRouter router = collection.getRouter() != null ? collection.getRouter() : DocRouter.DEFAULT; Slice parentSlice = null; if (slice == null) { if (router instanceof CompositeIdRouter) { Collection<Slice> searchSlices = router.getSearchSlicesSingle(splitKey, new ModifiableSolrParams(), collection); if (searchSlices.isEmpty()) { throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to find an active shard for split.key: " + splitKey); } if (searchSlices.size() > 1) { throw new SolrException(ErrorCode.BAD_REQUEST, "Splitting a split.key: " + splitKey + " which spans multiple shards is not supported"); } parentSlice = searchSlices.iterator().next(); slice = parentSlice.getName(); log.info("Split by route.key: {}, parent shard is: {} ", splitKey, slice); } else { throw new SolrException(ErrorCode.BAD_REQUEST, "Split by route key can only be used with CompositeIdRouter or subclass. 
Found router: " + router.getClass().getName()); } } else { parentSlice = clusterState.getSlice(collectionName, slice); } if (parentSlice == null) { if (clusterState.hasCollection(collectionName)) { throw new SolrException(ErrorCode.BAD_REQUEST, "No shard with the specified name exists: " + slice); } else { throw new SolrException(ErrorCode.BAD_REQUEST, "No collection with the specified name exists: " + collectionName); } } // find the leader for the shard Replica parentShardLeader = null; try { parentShardLeader = zkStateReader.getLeaderRetry(collectionName, slice, 10000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } DocRouter.Range range = parentSlice.getRange(); if (range == null) { range = new PlainIdRouter().fullRange(); } List<DocRouter.Range> subRanges = null; String rangesStr = message.getStr(CoreAdminParams.RANGES); if (rangesStr != null) { String[] ranges = rangesStr.split(","); if (ranges.length == 0 || ranges.length == 1) { throw new SolrException(ErrorCode.BAD_REQUEST, "There must be at least two ranges specified to split a shard"); } else { subRanges = new ArrayList<>(ranges.length); for (int i = 0; i < ranges.length; i++) { String r = ranges[i]; try { subRanges.add(DocRouter.DEFAULT.fromString(r)); } catch (Exception e) { throw new SolrException(ErrorCode.BAD_REQUEST, "Exception in parsing hexadecimal hash range: " + r, e); } if (!subRanges.get(i).isSubsetOf(range)) { throw new SolrException(ErrorCode.BAD_REQUEST, "Specified hash range: " + r + " is not a subset of parent shard's range: " + range.toString()); } } List<DocRouter.Range> temp = new ArrayList<>(subRanges); // copy to preserve original order Collections.sort(temp); if (!range.equals(new DocRouter.Range(temp.get(0).min, temp.get(temp.size() - 1).max))) { throw new SolrException(ErrorCode.BAD_REQUEST, "Specified hash ranges: " + rangesStr + " do not cover the entire range of parent shard: " + range); } for (int i = 1; i < temp.size(); i++) { if (temp.get(i - 
1).max + 1 != temp.get(i).min) { throw new SolrException(ErrorCode.BAD_REQUEST, "Specified hash ranges: " + rangesStr + " either overlap with each other or " + "do not cover the entire range of parent shard: " + range); } } } } else if (splitKey != null) { if (router instanceof CompositeIdRouter) { CompositeIdRouter compositeIdRouter = (CompositeIdRouter) router; subRanges = compositeIdRouter.partitionRangeByKey(splitKey, range); if (subRanges.size() == 1) { throw new SolrException(ErrorCode.BAD_REQUEST, "The split.key: " + splitKey + " has a hash range that is exactly equal to hash range of shard: " + slice); } for (DocRouter.Range subRange : subRanges) { if (subRange.min == subRange.max) { throw new SolrException(ErrorCode.BAD_REQUEST, "The split.key: " + splitKey + " must be a compositeId"); } } log.info("Partitioning parent shard " + slice + " range: " + parentSlice.getRange() + " yields: " + subRanges); rangesStr = ""; for (int i = 0; i < subRanges.size(); i++) { DocRouter.Range subRange = subRanges.get(i); rangesStr += subRange.toString(); if (i < subRanges.size() - 1) rangesStr += ','; } } } else { // todo: fixed to two partitions? subRanges = router.partitionRange(2, range); } try { List<String> subSlices = new ArrayList<>(subRanges.size()); List<String> subShardNames = new ArrayList<>(subRanges.size()); String nodeName = parentShardLeader.getNodeName(); for (int i = 0; i < subRanges.size(); i++) { String subSlice = slice + "_" + i; subSlices.add(subSlice); String subShardName = collectionName + "_" + subSlice + "_replica1"; subShardNames.add(subShardName); Slice oSlice = clusterState.getSlice(collectionName, subSlice); if (oSlice != null) { final Slice.State state = oSlice.getState(); if (state == Slice.State.ACTIVE) { throw new SolrException(ErrorCode.BAD_REQUEST, "Sub-shard: " + subSlice + " exists in active state. 
Aborting split shard."); } else if (state == Slice.State.CONSTRUCTION || state == Slice.State.RECOVERY) { // delete the shards for (String sub : subSlices) { log.info("Sub-shard: {} already exists therefore requesting its deletion", sub); Map<String, Object> propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, "deleteshard"); propMap.put(COLLECTION_PROP, collectionName); propMap.put(SHARD_ID_PROP, sub); ZkNodeProps m = new ZkNodeProps(propMap); try { deleteShard(clusterState, m, new NamedList()); } catch (Exception e) { throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + sub, e); } } } } } // do not abort splitshard if the unloading fails // this can happen because the replicas created previously may be down // the only side effect of this is that the sub shard may end up having more replicas than we want collectShardResponses(results, false, null, shardHandler); final String asyncId = message.getStr(ASYNC); HashMap<String, String> requestMap = new HashMap<>(); for (int i = 0; i < subRanges.size(); i++) { String subSlice = subSlices.get(i); String subShardName = subShardNames.get(i); DocRouter.Range subRange = subRanges.get(i); log.info("Creating slice " + subSlice + " of collection " + collectionName + " on " + nodeName); Map<String, Object> propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, CREATESHARD.toLower()); propMap.put(ZkStateReader.SHARD_ID_PROP, subSlice); propMap.put(ZkStateReader.COLLECTION_PROP, collectionName); propMap.put(ZkStateReader.SHARD_RANGE_PROP, subRange.toString()); propMap.put(ZkStateReader.SHARD_STATE_PROP, Slice.State.CONSTRUCTION.toString()); propMap.put(ZkStateReader.SHARD_PARENT_PROP, parentSlice.getName()); DistributedQueue inQueue = Overseer.getInQueue(zkStateReader.getZkClient()); inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap))); // wait until we are able to see the new shard in cluster state waitForNewShard(collectionName, subSlice); // refresh cluster 
state clusterState = zkStateReader.getClusterState(); log.info("Adding replica " + subShardName + " as part of slice " + subSlice + " of collection " + collectionName + " on " + nodeName); propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower()); propMap.put(COLLECTION_PROP, collectionName); propMap.put(SHARD_ID_PROP, subSlice); propMap.put("node", nodeName); propMap.put(CoreAdminParams.NAME, subShardName); // copy over property params: for (String key : message.keySet()) { if (key.startsWith(COLL_PROP_PREFIX)) { propMap.put(key, message.getStr(key)); } } // add async param if (asyncId != null) { propMap.put(ASYNC, asyncId); } addReplica(clusterState, new ZkNodeProps(propMap), results); } collectShardResponses(results, true, "SPLITSHARD failed to create subshard leaders", shardHandler); completeAsyncRequest(asyncId, requestMap, results); for (String subShardName : subShardNames) { // wait for parent leader to acknowledge the sub-shard core log.info("Asking parent leader to wait for: " + subShardName + " to be alive on: " + nodeName); String coreNodeName = waitForCoreNodeName(collectionName, nodeName, subShardName); CoreAdminRequest.WaitForState cmd = new CoreAdminRequest.WaitForState(); cmd.setCoreName(subShardName); cmd.setNodeName(nodeName); cmd.setCoreNodeName(coreNodeName); cmd.setState(Replica.State.ACTIVE); cmd.setCheckLive(true); cmd.setOnlyIfLeader(true); ModifiableSolrParams p = new ModifiableSolrParams(cmd.getParams()); sendShardRequest(nodeName, p, shardHandler, asyncId, requestMap); } collectShardResponses(results, true, "SPLITSHARD timed out waiting for subshard leaders to come up", shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Successfully created all sub-shards for collection " + collectionName + " parent shard: " + slice + " on: " + parentShardLeader); log.info("Splitting shard " + parentShardLeader.getName() + " as part of slice " + slice + " of collection " + collectionName + " on " + 
parentShardLeader); ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.SPLIT.toString()); params.set(CoreAdminParams.CORE, parentShardLeader.getStr("core")); for (int i = 0; i < subShardNames.size(); i++) { String subShardName = subShardNames.get(i); params.add(CoreAdminParams.TARGET_CORE, subShardName); } params.set(CoreAdminParams.RANGES, rangesStr); sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler, asyncId, requestMap); collectShardResponses(results, true, "SPLITSHARD failed to invoke SPLIT core admin command", shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Index on shard: " + nodeName + " split into two successfully"); // apply buffered updates on sub-shards for (int i = 0; i < subShardNames.size(); i++) { String subShardName = subShardNames.get(i); log.info("Applying buffered updates on : " + subShardName); params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.REQUESTAPPLYUPDATES.toString()); params.set(CoreAdminParams.NAME, subShardName); sendShardRequest(nodeName, params, shardHandler, asyncId, requestMap); } collectShardResponses(results, true, "SPLITSHARD failed while asking sub shard leaders to apply buffered updates", shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Successfully applied buffered updates on : " + subShardNames); // Replica creation for the new Slices // look at the replication factor and see if it matches reality // if it does not, find best nodes to create more cores // TODO: Have replication factor decided in some other way instead of numShards for the parent int repFactor = clusterState.getSlice(collectionName, slice).getReplicas().size(); // we need to look at every node and see how many cores it serves // add our new cores to existing nodes serving the least number of cores // but (for now) require that each core goes on a distinct node. 
// TODO: add smarter options that look at the current number of cores per // node? // for now we just go random Set<String> nodes = clusterState.getLiveNodes(); List<String> nodeList = new ArrayList<>(nodes.size()); nodeList.addAll(nodes); // TODO: Have maxShardsPerNode param for this operation? // Remove the node that hosts the parent shard for replica creation. nodeList.remove(nodeName); // TODO: change this to handle sharding a slice into > 2 sub-shards. List<Map<String, Object>> replicas = new ArrayList<>((repFactor - 1) * 2); for (int i = 1; i <= subSlices.size(); i++) { Collections.shuffle(nodeList, RANDOM); String sliceName = subSlices.get(i - 1); for (int j = 2; j <= repFactor; j++) { String subShardNodeName = nodeList.get((repFactor * (i - 1) + (j - 2)) % nodeList.size()); String shardName = collectionName + "_" + sliceName + "_replica" + (j); log.info("Creating replica shard " + shardName + " as part of slice " + sliceName + " of collection " + collectionName + " on " + subShardNodeName); ZkNodeProps props = new ZkNodeProps(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower(), ZkStateReader.COLLECTION_PROP, collectionName, ZkStateReader.SHARD_ID_PROP, sliceName, ZkStateReader.CORE_NAME_PROP, shardName, ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(), ZkStateReader.BASE_URL_PROP, zkStateReader.getBaseUrlForNodeName(subShardNodeName), ZkStateReader.NODE_NAME_PROP, subShardNodeName); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props)); HashMap<String, Object> propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower()); propMap.put(COLLECTION_PROP, collectionName); propMap.put(SHARD_ID_PROP, sliceName); propMap.put("node", subShardNodeName); propMap.put(CoreAdminParams.NAME, shardName); // copy over property params: for (String key : message.keySet()) { if (key.startsWith(COLL_PROP_PREFIX)) { propMap.put(key, message.getStr(key)); } } // add async param if (asyncId != null) { propMap.put(ASYNC, asyncId); } 
// special flag param to instruct addReplica not to create the replica in cluster state again propMap.put(SKIP_CREATE_REPLICA_IN_CLUSTER_STATE, "true"); replicas.add(propMap); } } // we must set the slice state into recovery before actually creating the replica cores // this ensures that the logic inside Overseer to update sub-shard state to 'active' // always gets a chance to execute. See SOLR-7673 if (repFactor == 1) { // switch sub shard states to 'active' log.info("Replication factor is 1 so switching shard states"); DistributedQueue inQueue = Overseer.getInQueue(zkStateReader.getZkClient()); Map<String, Object> propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower()); propMap.put(slice, Slice.State.INACTIVE.toString()); for (String subSlice : subSlices) { propMap.put(subSlice, Slice.State.ACTIVE.toString()); } propMap.put(ZkStateReader.COLLECTION_PROP, collectionName); ZkNodeProps m = new ZkNodeProps(propMap); inQueue.offer(Utils.toJSON(m)); } else { log.info("Requesting shard state be set to 'recovery'"); DistributedQueue inQueue = Overseer.getInQueue(zkStateReader.getZkClient()); Map<String, Object> propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower()); for (String subSlice : subSlices) { propMap.put(subSlice, Slice.State.RECOVERY.toString()); } propMap.put(ZkStateReader.COLLECTION_PROP, collectionName); ZkNodeProps m = new ZkNodeProps(propMap); inQueue.offer(Utils.toJSON(m)); } // now actually create replica cores on sub shard nodes for (Map<String, Object> replica : replicas) { addReplica(clusterState, new ZkNodeProps(replica), results); } collectShardResponses(results, true, "SPLITSHARD failed to create subshard replicas", shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Successfully created all replica shards for all sub-slices " + subSlices); commit(results, slice, parentShardLeader); return true; } catch (SolrException e) 
{ throw e; } catch (Exception e) { log.error( "Error executing split operation for collection: " + collectionName + " parent shard: " + slice, e); throw new SolrException(ErrorCode.SERVER_ERROR, null, e); } } private void commit(NamedList results, String slice, Replica parentShardLeader) { log.info("Calling soft commit to make sub shard updates visible"); String coreUrl = new ZkCoreNodeProps(parentShardLeader).getCoreUrl(); // HttpShardHandler is hard coded to send a QueryRequest hence we go direct // and we force open a searcher so that we have documents to show upon switching states UpdateResponse updateResponse = null; try { updateResponse = softCommit(coreUrl); processResponse(results, null, coreUrl, updateResponse, slice); } catch (Exception e) { processResponse(results, e, coreUrl, updateResponse, slice); throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to call distrib softCommit on: " + coreUrl, e); } } static UpdateResponse softCommit(String url) throws SolrServerException, IOException { try (HttpSolrClient client = new HttpSolrClient(url)) { client.setConnectionTimeout(30000); client.setSoTimeout(120000); UpdateRequest ureq = new UpdateRequest(); ureq.setParams(new ModifiableSolrParams()); ureq.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, true, true); return ureq.process(client); } } private String waitForCoreNodeName(String collectionName, String msgNodeName, String msgCore) { int retryCount = 320; while (retryCount-- > 0) { Map<String, Slice> slicesMap = zkStateReader.getClusterState().getSlicesMap(collectionName); if (slicesMap != null) { for (Slice slice : slicesMap.values()) { for (Replica replica : slice.getReplicas()) { // TODO: for really large clusters, we could 'index' on this String nodeName = replica.getStr(ZkStateReader.NODE_NAME_PROP); String core = replica.getStr(ZkStateReader.CORE_NAME_PROP); if (nodeName.equals(msgNodeName) && core.equals(msgCore)) { return replica.getName(); } } } } try { Thread.sleep(1000); } catch 
(InterruptedException e) { Thread.currentThread().interrupt(); } } throw new SolrException(ErrorCode.SERVER_ERROR, "Could not find coreNodeName"); } private void waitForNewShard(String collectionName, String sliceName) throws KeeperException, InterruptedException { log.info("Waiting for slice {} of collection {} to be available", sliceName, collectionName); long startTime = System.currentTimeMillis(); int retryCount = 320; while (retryCount-- > 0) { DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); if (collection == null) { throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to find collection: " + collectionName + " in clusterstate"); } Slice slice = collection.getSlice(sliceName); if (slice != null) { log.info("Waited for {} seconds for slice {} of collection {} to be available", (System.currentTimeMillis() - startTime) / 1000, sliceName, collectionName); return; } Thread.sleep(1000); zkStateReader.updateClusterState(); } throw new SolrException(ErrorCode.SERVER_ERROR, "Could not find new slice " + sliceName + " in collection " + collectionName + " even after waiting for " + (System.currentTimeMillis() - startTime) / 1000 + " seconds"); } private void collectShardResponses(NamedList results, boolean abortOnError, String msgOnError, ShardHandler shardHandler) { ShardResponse srsp; do { srsp = shardHandler.takeCompletedOrError(); if (srsp != null) { processResponse(results, srsp); Throwable exception = srsp.getException(); if (abortOnError && exception != null) { // drain pending requests while (srsp != null) { srsp = shardHandler.takeCompletedOrError(); } throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError, exception); } } } while (srsp != null); } private void deleteShard(ClusterState clusterState, ZkNodeProps message, NamedList results) { String collection = message.getStr(ZkStateReader.COLLECTION_PROP); String sliceId = message.getStr(ZkStateReader.SHARD_ID_PROP); log.info("Delete shard invoked"); Slice slice = 
clusterState.getSlice(collection, sliceId); if (slice == null) { if (clusterState.hasCollection(collection)) { throw new SolrException(ErrorCode.BAD_REQUEST, "No shard with name " + sliceId + " exists for collection " + collection); } else { throw new SolrException(ErrorCode.BAD_REQUEST, "No collection with the specified name exists: " + collection); } } // For now, only allow for deletions of Inactive slices or custom hashes (range==null). // TODO: Add check for range gaps on Slice deletion final Slice.State state = slice.getState(); if (!(slice.getRange() == null || state == Slice.State.INACTIVE || state == Slice.State.RECOVERY || state == Slice.State.CONSTRUCTION)) { throw new SolrException(ErrorCode.BAD_REQUEST, "The slice: " + slice.getName() + " is currently " + state + ". Only non-active (or custom-hashed) slices can be deleted."); } ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); try { ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.UNLOAD.toString()); params.set(CoreAdminParams.DELETE_INDEX, "true"); sliceCmd(clusterState, params, null, slice, shardHandler); processResponses(results, shardHandler); ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP, collection, ZkStateReader.SHARD_ID_PROP, sliceId); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m)); // wait for a while until we don't see the shard long now = System.nanoTime(); long timeout = now + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS); boolean removed = false; while (System.nanoTime() < timeout) { Thread.sleep(100); removed = zkStateReader.getClusterState().getSlice(collection, sliceId) == null; if (removed) { Thread.sleep(100); // just a bit of time so it's more likely other readers see on return break; } } if (!removed) { throw new SolrException(ErrorCode.SERVER_ERROR, "Could not fully remove collection: " + collection + " shard: " 
+ sliceId); } log.info("Successfully deleted collection: " + collection + ", shard: " + sliceId); } catch (SolrException e) { throw e; } catch (Exception e) { throw new SolrException(ErrorCode.SERVER_ERROR, "Error executing delete operation for collection: " + collection + " shard: " + sliceId, e); } } private void migrate(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { String sourceCollectionName = message.getStr("collection"); String splitKey = message.getStr("split.key"); String targetCollectionName = message.getStr("target.collection"); int timeout = message.getInt("forward.timeout", 10 * 60) * 1000; DocCollection sourceCollection = clusterState.getCollection(sourceCollectionName); if (sourceCollection == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown source collection: " + sourceCollectionName); } DocCollection targetCollection = clusterState.getCollection(targetCollectionName); if (targetCollection == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown target collection: " + sourceCollectionName); } if (!(sourceCollection.getRouter() instanceof CompositeIdRouter)) { throw new SolrException(ErrorCode.BAD_REQUEST, "Source collection must use a compositeId router"); } if (!(targetCollection.getRouter() instanceof CompositeIdRouter)) { throw new SolrException(ErrorCode.BAD_REQUEST, "Target collection must use a compositeId router"); } CompositeIdRouter sourceRouter = (CompositeIdRouter) sourceCollection.getRouter(); CompositeIdRouter targetRouter = (CompositeIdRouter) targetCollection.getRouter(); Collection<Slice> sourceSlices = sourceRouter.getSearchSlicesSingle(splitKey, null, sourceCollection); if (sourceSlices.isEmpty()) { throw new SolrException(ErrorCode.BAD_REQUEST, "No active slices available in source collection: " + sourceCollection + "for given split.key: " + splitKey); } Collection<Slice> targetSlices = targetRouter.getSearchSlicesSingle(splitKey, null, 
targetCollection); if (targetSlices.isEmpty()) { throw new SolrException(ErrorCode.BAD_REQUEST, "No active slices available in target collection: " + targetCollection + "for given split.key: " + splitKey); } String asyncId = null; if (message.containsKey(ASYNC) && message.get(ASYNC) != null) asyncId = message.getStr(ASYNC); for (Slice sourceSlice : sourceSlices) { for (Slice targetSlice : targetSlices) { log.info("Migrating source shard: {} to target shard: {} for split.key = " + splitKey, sourceSlice, targetSlice); migrateKey(clusterState, sourceCollection, sourceSlice, targetCollection, targetSlice, splitKey, timeout, results, asyncId, message); } } } private void migrateKey(ClusterState clusterState, DocCollection sourceCollection, Slice sourceSlice, DocCollection targetCollection, Slice targetSlice, String splitKey, int timeout, NamedList results, String asyncId, ZkNodeProps message) throws KeeperException, InterruptedException { String tempSourceCollectionName = "split_" + sourceSlice.getName() + "_temp_" + targetSlice.getName(); if (clusterState.hasCollection(tempSourceCollectionName)) { log.info("Deleting temporary collection: " + tempSourceCollectionName); Map<String, Object> props = makeMap(Overseer.QUEUE_OPERATION, DELETE.toLower(), NAME, tempSourceCollectionName); try { deleteCollection(new ZkNodeProps(props), results); clusterState = zkStateReader.getClusterState(); } catch (Exception e) { log.warn("Unable to clean up existing temporary collection: " + tempSourceCollectionName, e); } } CompositeIdRouter sourceRouter = (CompositeIdRouter) sourceCollection.getRouter(); DocRouter.Range keyHashRange = sourceRouter.keyHashRange(splitKey); ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); log.info("Hash range for split.key: {} is: {}", splitKey, keyHashRange); // intersect source range, keyHashRange and target range // this is the range that has to be split from source and transferred to target DocRouter.Range splitRange = 
intersect(targetSlice.getRange(), intersect(sourceSlice.getRange(), keyHashRange)); if (splitRange == null) { log.info("No common hashes between source shard: {} and target shard: {}", sourceSlice.getName(), targetSlice.getName()); return; } log.info("Common hash range between source shard: {} and target shard: {} = " + splitRange, sourceSlice.getName(), targetSlice.getName()); Replica targetLeader = zkStateReader.getLeaderRetry(targetCollection.getName(), targetSlice.getName(), 10000); // For tracking async calls. HashMap<String, String> requestMap = new HashMap<String, String>(); log.info("Asking target leader node: " + targetLeader.getNodeName() + " core: " + targetLeader.getStr("core") + " to buffer updates"); ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.REQUESTBUFFERUPDATES.toString()); params.set(CoreAdminParams.NAME, targetLeader.getStr("core")); String nodeName = targetLeader.getNodeName(); sendShardRequest(targetLeader.getNodeName(), params, shardHandler, asyncId, requestMap); collectShardResponses(results, true, "MIGRATE failed to request node to buffer updates", shardHandler); completeAsyncRequest(asyncId, requestMap, results); ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.ADDROUTINGRULE.toLower(), COLLECTION_PROP, sourceCollection.getName(), SHARD_ID_PROP, sourceSlice.getName(), "routeKey", SolrIndexSplitter.getRouteKey(splitKey) + "!", "range", splitRange.toString(), "targetCollection", targetCollection.getName(), // TODO: look at using nanoTime here? 
"expireAt", String.valueOf(System.currentTimeMillis() + timeout)); log.info("Adding routing rule: " + m); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m)); // wait for a while until we see the new rule log.info("Waiting to see routing rule updated in clusterstate"); long waitUntil = System.nanoTime() + TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); boolean added = false; while (System.nanoTime() < waitUntil) { Thread.sleep(100); Map<String, RoutingRule> rules = zkStateReader.getClusterState() .getSlice(sourceCollection.getName(), sourceSlice.getName()).getRoutingRules(); if (rules != null) { RoutingRule rule = rules.get(SolrIndexSplitter.getRouteKey(splitKey) + "!"); if (rule != null && rule.getRouteRanges().contains(splitRange)) { added = true; break; } } } if (!added) { throw new SolrException(ErrorCode.SERVER_ERROR, "Could not add routing rule: " + m); } log.info("Routing rule added successfully"); // Create temp core on source shard Replica sourceLeader = zkStateReader.getLeaderRetry(sourceCollection.getName(), sourceSlice.getName(), 10000); // create a temporary collection with just one node on the shard leader String configName = zkStateReader.readConfigName(sourceCollection.getName()); Map<String, Object> props = makeMap(Overseer.QUEUE_OPERATION, CREATE.toLower(), NAME, tempSourceCollectionName, REPLICATION_FACTOR, 1, NUM_SLICES, 1, COLL_CONF, configName, CREATE_NODE_SET, sourceLeader.getNodeName()); if (asyncId != null) { String internalAsyncId = asyncId + Math.abs(System.nanoTime()); props.put(ASYNC, internalAsyncId); } log.info("Creating temporary collection: " + props); createCollection(clusterState, new ZkNodeProps(props), results); // refresh cluster state clusterState = zkStateReader.getClusterState(); Slice tempSourceSlice = clusterState.getCollection(tempSourceCollectionName).getSlices().iterator().next(); Replica tempSourceLeader = zkStateReader.getLeaderRetry(tempSourceCollectionName, tempSourceSlice.getName(), 120000); 
String tempCollectionReplica1 = tempSourceCollectionName + "_" + tempSourceSlice.getName() + "_replica1"; String coreNodeName = waitForCoreNodeName(tempSourceCollectionName, sourceLeader.getNodeName(), tempCollectionReplica1); // wait for the replicas to be seen as active on temp source leader log.info("Asking source leader to wait for: " + tempCollectionReplica1 + " to be alive on: " + sourceLeader.getNodeName()); CoreAdminRequest.WaitForState cmd = new CoreAdminRequest.WaitForState(); cmd.setCoreName(tempCollectionReplica1); cmd.setNodeName(sourceLeader.getNodeName()); cmd.setCoreNodeName(coreNodeName); cmd.setState(Replica.State.ACTIVE); cmd.setCheckLive(true); cmd.setOnlyIfLeader(true); // we don't want this to happen asynchronously sendShardRequest(tempSourceLeader.getNodeName(), new ModifiableSolrParams(cmd.getParams()), shardHandler, null, null); collectShardResponses(results, true, "MIGRATE failed to create temp collection leader or timed out waiting for it to come up", shardHandler); log.info("Asking source leader to split index"); params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.SPLIT.toString()); params.set(CoreAdminParams.CORE, sourceLeader.getStr("core")); params.add(CoreAdminParams.TARGET_CORE, tempSourceLeader.getStr("core")); params.set(CoreAdminParams.RANGES, splitRange.toString()); params.set("split.key", splitKey); String tempNodeName = sourceLeader.getNodeName(); sendShardRequest(tempNodeName, params, shardHandler, asyncId, requestMap); collectShardResponses(results, true, "MIGRATE failed to invoke SPLIT core admin command", shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Creating a replica of temporary collection: {} on the target leader node: {}", tempSourceCollectionName, targetLeader.getNodeName()); String tempCollectionReplica2 = tempSourceCollectionName + "_" + tempSourceSlice.getName() + "_replica2"; props = new HashMap<>(); props.put(Overseer.QUEUE_OPERATION, 
ADDREPLICA.toLower()); props.put(COLLECTION_PROP, tempSourceCollectionName); props.put(SHARD_ID_PROP, tempSourceSlice.getName()); props.put("node", targetLeader.getNodeName()); props.put(CoreAdminParams.NAME, tempCollectionReplica2); // copy over property params: for (String key : message.keySet()) { if (key.startsWith(COLL_PROP_PREFIX)) { props.put(key, message.getStr(key)); } } // add async param if (asyncId != null) { props.put(ASYNC, asyncId); } addReplica(clusterState, new ZkNodeProps(props), results); collectShardResponses(results, true, "MIGRATE failed to create replica of temporary collection in target leader node.", shardHandler); completeAsyncRequest(asyncId, requestMap, results); coreNodeName = waitForCoreNodeName(tempSourceCollectionName, targetLeader.getNodeName(), tempCollectionReplica2); // wait for the replicas to be seen as active on temp source leader log.info("Asking temp source leader to wait for: " + tempCollectionReplica2 + " to be alive on: " + targetLeader.getNodeName()); cmd = new CoreAdminRequest.WaitForState(); cmd.setCoreName(tempSourceLeader.getStr("core")); cmd.setNodeName(targetLeader.getNodeName()); cmd.setCoreNodeName(coreNodeName); cmd.setState(Replica.State.ACTIVE); cmd.setCheckLive(true); cmd.setOnlyIfLeader(true); params = new ModifiableSolrParams(cmd.getParams()); sendShardRequest(tempSourceLeader.getNodeName(), params, shardHandler, asyncId, requestMap); collectShardResponses(results, true, "MIGRATE failed to create temp collection replica or timed out waiting for them to come up", shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Successfully created replica of temp source collection on target leader node"); log.info("Requesting merge of temp source collection replica to target leader"); params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.MERGEINDEXES.toString()); params.set(CoreAdminParams.CORE, targetLeader.getStr("core")); params.set(CoreAdminParams.SRC_CORE, 
tempCollectionReplica2); sendShardRequest(targetLeader.getNodeName(), params, shardHandler, asyncId, requestMap); collectShardResponses(results, true, "MIGRATE failed to merge " + tempCollectionReplica2 + " to " + targetLeader.getStr("core") + " on node: " + targetLeader.getNodeName(), shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Asking target leader to apply buffered updates"); params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.REQUESTAPPLYUPDATES.toString()); params.set(CoreAdminParams.NAME, targetLeader.getStr("core")); sendShardRequest(targetLeader.getNodeName(), params, shardHandler, asyncId, requestMap); collectShardResponses(results, true, "MIGRATE failed to request node to apply buffered updates", shardHandler); completeAsyncRequest(asyncId, requestMap, results); try { log.info("Deleting temporary collection: " + tempSourceCollectionName); props = makeMap(Overseer.QUEUE_OPERATION, DELETE.toLower(), NAME, tempSourceCollectionName); deleteCollection(new ZkNodeProps(props), results); } catch (Exception e) { log.error("Unable to delete temporary collection: " + tempSourceCollectionName + ". 
Please remove it manually", e); } } private void completeAsyncRequest(String asyncId, HashMap<String, String> requestMap, NamedList results) { if (asyncId != null) { waitForAsyncCallsToComplete(requestMap, results); requestMap.clear(); } } private DocRouter.Range intersect(DocRouter.Range a, DocRouter.Range b) { if (a == null || b == null || !a.overlaps(b)) { return null; } else if (a.isSubsetOf(b)) return a; else if (b.isSubsetOf(a)) return b; else if (b.includes(a.max)) { return new DocRouter.Range(b.min, a.max); } else { return new DocRouter.Range(a.min, b.max); } } private void sendShardRequest(String nodeName, ModifiableSolrParams params, ShardHandler shardHandler, String asyncId, Map<String, String> requestMap) { if (asyncId != null) { String coreAdminAsyncId = asyncId + Math.abs(System.nanoTime()); params.set(ASYNC, coreAdminAsyncId); requestMap.put(nodeName, coreAdminAsyncId); } ShardRequest sreq = new ShardRequest(); params.set("qt", adminPath); sreq.purpose = 1; String replica = zkStateReader.getBaseUrlForNodeName(nodeName); sreq.shards = new String[] { replica }; sreq.actualShards = sreq.shards; sreq.params = params; shardHandler.submit(sreq, replica, sreq.params); } private void addPropertyParams(ZkNodeProps message, ModifiableSolrParams params) { // Now add the property.key=value pairs for (String key : message.keySet()) { if (key.startsWith(COLL_PROP_PREFIX)) { params.set(key, message.getStr(key)); } } } private static List<String> getLiveOrLiveAndCreateNodeSetList(final Set<String> liveNodes, final ZkNodeProps message, final Random random) { // TODO: add smarter options that look at the current number of cores per // node? // for now we just go random (except when createNodeSet and createNodeSet.shuffle=false are passed in) List<String> nodeList; final String createNodeSetStr = message.getStr(CREATE_NODE_SET); final List<String> createNodeList = (createNodeSetStr == null) ? null : StrUtils.splitSmart((CREATE_NODE_SET_EMPTY.equals(createNodeSetStr) ? 
"" : createNodeSetStr), ",", true); if (createNodeList != null) { nodeList = new ArrayList<>(createNodeList); nodeList.retainAll(liveNodes); if (message.getBool(CREATE_NODE_SET_SHUFFLE, CREATE_NODE_SET_SHUFFLE_DEFAULT)) { Collections.shuffle(nodeList, random); } } else { nodeList = new ArrayList<>(liveNodes); Collections.shuffle(nodeList, random); } return nodeList; } private void createCollection(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { final String collectionName = message.getStr(NAME); if (clusterState.hasCollection(collectionName)) { throw new SolrException(ErrorCode.BAD_REQUEST, "collection already exists: " + collectionName); } String configName = getConfigName(collectionName, message); if (configName == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "No config set found to associate with the collection."); } else if (!validateConfig(configName)) { throw new SolrException(ErrorCode.BAD_REQUEST, "Can not find the specified config set: " + configName); } try { // look at the replication factor and see if it matches reality // if it does not, find best nodes to create more cores int repFactor = message.getInt(REPLICATION_FACTOR, 1); ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); final String async = message.getStr(ASYNC); Integer numSlices = message.getInt(NUM_SLICES, null); String router = message.getStr("router.name", DocRouter.DEFAULT_NAME); List<String> shardNames = new ArrayList<>(); if (ImplicitDocRouter.NAME.equals(router)) { ClusterStateMutator.getShardNames(shardNames, message.getStr("shards", null)); numSlices = shardNames.size(); } else { if (numSlices == null) { throw new SolrException(ErrorCode.BAD_REQUEST, NUM_SLICES + " is a required param (when using CompositeId router)."); } ClusterStateMutator.getShardNames(numSlices, shardNames); } int maxShardsPerNode = message.getInt(MAX_SHARDS_PER_NODE, 1); if (repFactor <= 0) { throw new 
SolrException(ErrorCode.BAD_REQUEST, REPLICATION_FACTOR + " must be greater than 0"); } if (numSlices <= 0) { throw new SolrException(ErrorCode.BAD_REQUEST, NUM_SLICES + " must be > 0"); } // we need to look at every node and see how many cores it serves // add our new cores to existing nodes serving the least number of cores // but (for now) require that each core goes on a distinct node. final List<String> nodeList = getLiveOrLiveAndCreateNodeSetList(clusterState.getLiveNodes(), message, RANDOM); Map<Position, String> positionVsNodes; if (nodeList.isEmpty()) { log.warn("It is unusual to create a collection (" + collectionName + ") without cores."); positionVsNodes = new HashMap<>(); } else { if (repFactor > nodeList.size()) { log.warn("Specified " + REPLICATION_FACTOR + " of " + repFactor + " on collection " + collectionName + " is higher than or equal to the number of Solr instances currently live or live and part of your " + CREATE_NODE_SET + "(" + nodeList.size() + "). It's unusual to run two replica of the same slice on the same Solr-instance."); } int maxShardsAllowedToCreate = maxShardsPerNode * nodeList.size(); int requestedShardsToCreate = numSlices * repFactor; if (maxShardsAllowedToCreate < requestedShardsToCreate) { throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot create collection " + collectionName + ". Value of " + MAX_SHARDS_PER_NODE + " is " + maxShardsPerNode + ", and the number of nodes currently live or live and part of your " + CREATE_NODE_SET + " is " + nodeList.size() + ". This allows a maximum of " + maxShardsAllowedToCreate + " to be created. Value of " + NUM_SLICES + " is " + numSlices + " and value of " + REPLICATION_FACTOR + " is " + repFactor + ". 
This requires " + requestedShardsToCreate + " shards to be created (higher than the allowed number)"); } positionVsNodes = identifyNodes(clusterState, nodeList, message, shardNames, repFactor); } boolean isLegacyCloud = Overseer.isLegacy(zkStateReader.getClusterProps()); createConfNode(configName, collectionName, isLegacyCloud); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message)); // wait for a while until we do see the collection long waitUntil = System.nanoTime() + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS); boolean created = false; while (System.nanoTime() < waitUntil) { Thread.sleep(100); created = zkStateReader.getClusterState().getCollections().contains(collectionName); if (created) break; } if (!created) throw new SolrException(ErrorCode.SERVER_ERROR, "Could not fully create collection: " + collectionName); if (nodeList.isEmpty()) { log.info("Finished create command for collection: {}", collectionName); return; } // For tracking async calls. HashMap<String, String> requestMap = new HashMap<String, String>(); log.info(formatString( "Creating SolrCores for new collection {0}, shardNames {1} , replicationFactor : {2}", collectionName, shardNames, repFactor)); Map<String, ShardRequest> coresToCreate = new LinkedHashMap<>(); for (Map.Entry<Position, String> e : positionVsNodes.entrySet()) { Position position = e.getKey(); String nodeName = e.getValue(); String coreName = collectionName + "_" + position.shard + "_replica" + (position.index + 1); log.info(formatString("Creating core {0} as part of shard {1} of collection {2} on {3}", coreName, position.shard, collectionName, nodeName)); String baseUrl = zkStateReader.getBaseUrlForNodeName(nodeName); //in the new mode, create the replica in clusterstate prior to creating the core. 
// Otherwise the core creation fails if (!isLegacyCloud) { ZkNodeProps props = new ZkNodeProps(Overseer.QUEUE_OPERATION, ADDREPLICA.toString(), ZkStateReader.COLLECTION_PROP, collectionName, ZkStateReader.SHARD_ID_PROP, position.shard, ZkStateReader.CORE_NAME_PROP, coreName, ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(), ZkStateReader.BASE_URL_PROP, baseUrl); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props)); } // Need to create new params for each request ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.CREATE.toString()); params.set(CoreAdminParams.NAME, coreName); params.set(COLL_CONF, configName); params.set(CoreAdminParams.COLLECTION, collectionName); params.set(CoreAdminParams.SHARD, position.shard); params.set(ZkStateReader.NUM_SHARDS_PROP, numSlices); if (async != null) { String coreAdminAsyncId = async + Math.abs(System.nanoTime()); params.add(ASYNC, coreAdminAsyncId); requestMap.put(nodeName, coreAdminAsyncId); } addPropertyParams(message, params); ShardRequest sreq = new ShardRequest(); params.set("qt", adminPath); sreq.purpose = 1; sreq.shards = new String[] { baseUrl }; sreq.actualShards = sreq.shards; sreq.params = params; if (isLegacyCloud) { shardHandler.submit(sreq, sreq.shards[0], sreq.params); } else { coresToCreate.put(coreName, sreq); } } if (!isLegacyCloud) { // wait for all replica entries to be created Map<String, Replica> replicas = waitToSeeReplicasInState(collectionName, coresToCreate.keySet()); for (Map.Entry<String, ShardRequest> e : coresToCreate.entrySet()) { ShardRequest sreq = e.getValue(); sreq.params.set(CoreAdminParams.CORE_NODE_NAME, replicas.get(e.getKey()).getName()); shardHandler.submit(sreq, sreq.shards[0], sreq.params); } } processResponses(results, shardHandler); completeAsyncRequest(async, requestMap, results); log.info("Finished create command on all shards for collection: " + collectionName); } catch (SolrException ex) { 
throw ex; } catch (Exception ex) { throw new SolrException(ErrorCode.SERVER_ERROR, null, ex); } } private Map<Position, String> identifyNodes(ClusterState clusterState, List<String> nodeList, ZkNodeProps message, List<String> shardNames, int repFactor) throws IOException { List<Map> maps = (List) message.get("rule"); if (maps == null) { int i = 0; Map<Position, String> result = new HashMap<>(); for (String aShard : shardNames) { for (int j = 0; j < repFactor; j++) { result.put(new Position(aShard, j), nodeList.get(i % nodeList.size())); i++; } } return result; } List<Rule> rules = new ArrayList<>(); for (Object map : maps) rules.add(new Rule((Map) map)); Map<String, Integer> sharVsReplicaCount = new HashMap<>(); for (String shard : shardNames) sharVsReplicaCount.put(shard, repFactor); ReplicaAssigner replicaAssigner = new ReplicaAssigner(rules, sharVsReplicaCount, (List<Map>) message.get(SNITCH), new HashMap<String, Map<String, Integer>>(), //this is a new collection. So, there are no nodes in any shard nodeList, overseer.getZkController().getCoreContainer(), clusterState); return replicaAssigner.getNodeMappings(); } private Map<String, Replica> waitToSeeReplicasInState(String collectionName, Collection<String> coreNames) throws InterruptedException { Map<String, Replica> result = new HashMap<>(); long endTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS); while (true) { DocCollection coll = zkStateReader.getClusterState().getCollection(collectionName); for (String coreName : coreNames) { if (result.containsKey(coreName)) continue; for (Slice slice : coll.getSlices()) { for (Replica replica : slice.getReplicas()) { if (coreName.equals(replica.getStr(ZkStateReader.CORE_NAME_PROP))) { result.put(coreName, replica); break; } } } } if (result.size() == coreNames.size()) { return result; } if (System.nanoTime() > endTime) { throw new SolrException(ErrorCode.SERVER_ERROR, "Timed out waiting to see all replicas: " + coreNames + " in cluster 
state."); } Thread.sleep(100); } } private void addReplica(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { String collection = message.getStr(COLLECTION_PROP); String node = message.getStr(CoreAdminParams.NODE); String shard = message.getStr(SHARD_ID_PROP); String coreName = message.getStr(CoreAdminParams.NAME); if (StringUtils.isBlank(coreName)) { coreName = message.getStr(CoreAdminParams.PROPERTY_PREFIX + CoreAdminParams.NAME); } final String asyncId = message.getStr(ASYNC); DocCollection coll = clusterState.getCollection(collection); if (coll == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "Collection: " + collection + " does not exist"); } if (coll.getSlice(shard) == null) { throw new SolrException(ErrorCode.BAD_REQUEST, "Collection: " + collection + " shard: " + shard + " does not exist"); } ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); // Kind of unnecessary, but it does put the logic of whether to override maxShardsPerNode in one place. 
node = getNodesForNewReplicas(clusterState, collection, shard, 1, node, overseer.getZkController().getCoreContainer()).get(0).nodeName; log.info("Node not provided, Identified {} for creating new replica", node); if (!clusterState.liveNodesContain(node)) { throw new SolrException(ErrorCode.BAD_REQUEST, "Node: " + node + " is not live"); } if (coreName == null) { coreName = Assign.buildCoreName(coll, shard); } else if (!message.getBool(SKIP_CREATE_REPLICA_IN_CLUSTER_STATE, false)) { //Validate that the core name is unique in that collection for (Slice slice : coll.getSlices()) { for (Replica replica : slice.getReplicas()) { String replicaCoreName = replica.getStr(CORE_NAME_PROP); if (coreName.equals(replicaCoreName)) { throw new SolrException(ErrorCode.BAD_REQUEST, "Another replica with the same core name already exists" + " for this collection"); } } } } ModifiableSolrParams params = new ModifiableSolrParams(); if (!Overseer.isLegacy(zkStateReader.getClusterProps())) { if (!message.getBool(SKIP_CREATE_REPLICA_IN_CLUSTER_STATE, false)) { ZkNodeProps props = new ZkNodeProps(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower(), ZkStateReader.COLLECTION_PROP, collection, ZkStateReader.SHARD_ID_PROP, shard, ZkStateReader.CORE_NAME_PROP, coreName, ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(), ZkStateReader.BASE_URL_PROP, zkStateReader.getBaseUrlForNodeName(node), ZkStateReader.NODE_NAME_PROP, node); Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props)); } params.set(CoreAdminParams.CORE_NODE_NAME, waitToSeeReplicasInState(collection, Collections.singletonList(coreName)).get(coreName) .getName()); } String configName = zkStateReader.readConfigName(collection); String routeKey = message.getStr(ShardParams._ROUTE_); String dataDir = message.getStr(CoreAdminParams.DATA_DIR); String instanceDir = message.getStr(CoreAdminParams.INSTANCE_DIR); params.set(CoreAdminParams.ACTION, CoreAdminAction.CREATE.toString()); params.set(CoreAdminParams.NAME, 
coreName); params.set(COLL_CONF, configName); params.set(CoreAdminParams.COLLECTION, collection); if (shard != null) { params.set(CoreAdminParams.SHARD, shard); } else if (routeKey != null) { Collection<Slice> slices = coll.getRouter().getSearchSlicesSingle(routeKey, null, coll); if (slices.isEmpty()) { throw new SolrException(ErrorCode.BAD_REQUEST, "No active shard serving _route_=" + routeKey + " found"); } else { params.set(CoreAdminParams.SHARD, slices.iterator().next().getName()); } } else { throw new SolrException(ErrorCode.BAD_REQUEST, "Specify either 'shard' or _route_ param"); } if (dataDir != null) { params.set(CoreAdminParams.DATA_DIR, dataDir); } if (instanceDir != null) { params.set(CoreAdminParams.INSTANCE_DIR, instanceDir); } addPropertyParams(message, params); // For tracking async calls. HashMap<String, String> requestMap = new HashMap<>(); sendShardRequest(node, params, shardHandler, asyncId, requestMap); collectShardResponses(results, true, "ADDREPLICA failed to create replica", shardHandler); completeAsyncRequest(asyncId, requestMap, results); } private void processResponses(NamedList results, ShardHandler shardHandler) { ShardResponse srsp; do { srsp = shardHandler.takeCompletedOrError(); if (srsp != null) { processResponse(results, srsp); } } while (srsp != null); } private String getConfigName(String coll, ZkNodeProps message) throws KeeperException, InterruptedException { String configName = message.getStr(OverseerCollectionProcessor.COLL_CONF); if (configName == null) { // if there is only one conf, use that List<String> configNames = null; try { configNames = zkStateReader.getZkClient().getChildren(ZkConfigManager.CONFIGS_ZKNODE, null, true); if (configNames != null && configNames.size() == 1) { configName = configNames.get(0); // no config set named, but there is only 1 - use it log.info("Only one config set found in zk - using it:" + configName); } else if (configNames.contains(coll)) { configName = coll; } } catch 
(KeeperException.NoNodeException e) { } } return configName; } private boolean validateConfig(String configName) throws KeeperException, InterruptedException { return zkStateReader.getZkClient().exists(ZkConfigManager.CONFIGS_ZKNODE + "/" + configName, true); } /** * This doesn't validate the config (path) itself and is just responsible for creating the confNode. * That check should be done before the config node is created. */ private void createConfNode(String configName, String coll, boolean isLegacyCloud) throws KeeperException, InterruptedException { if (configName != null) { String collDir = ZkStateReader.COLLECTIONS_ZKNODE + "/" + coll; log.info("creating collections conf node {} ", collDir); byte[] data = Utils.toJSON(makeMap(ZkController.CONFIGNAME_PROP, configName)); if (zkStateReader.getZkClient().exists(collDir, true)) { zkStateReader.getZkClient().setData(collDir, data, true); } else { zkStateReader.getZkClient().makePath(collDir, data, true); } } else { if (isLegacyCloud) { log.warn("Could not obtain config name"); } else { throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to get config name"); } } } private void collectionCmd(ClusterState clusterState, ZkNodeProps message, ModifiableSolrParams params, NamedList results, Replica.State stateMatcher) { log.info("Executing Collection Cmd : " + params); String collectionName = message.getStr(NAME); ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); DocCollection coll = clusterState.getCollection(collectionName); for (Map.Entry<String, Slice> entry : coll.getSlicesMap().entrySet()) { Slice slice = entry.getValue(); sliceCmd(clusterState, params, stateMatcher, slice, shardHandler); } processResponses(results, shardHandler); } private void sliceCmd(ClusterState clusterState, ModifiableSolrParams params, Replica.State stateMatcher, Slice slice, ShardHandler shardHandler) { Map<String, Replica> shards = slice.getReplicasMap(); Set<Map.Entry<String, Replica>> shardEntries = 
shards.entrySet(); for (Map.Entry<String, Replica> shardEntry : shardEntries) { final ZkNodeProps node = shardEntry.getValue(); if (clusterState.liveNodesContain(node.getStr(ZkStateReader.NODE_NAME_PROP)) && (stateMatcher == null || Replica.State.getState(node.getStr(ZkStateReader.STATE_PROP)) == stateMatcher)) { // For thread safety, only simple clone the ModifiableSolrParams ModifiableSolrParams cloneParams = new ModifiableSolrParams(); cloneParams.add(params); cloneParams.set(CoreAdminParams.CORE, node.getStr(ZkStateReader.CORE_NAME_PROP)); String replica = node.getStr(ZkStateReader.BASE_URL_PROP); ShardRequest sreq = new ShardRequest(); sreq.nodeName = node.getStr(ZkStateReader.NODE_NAME_PROP); // yes, they must use same admin handler path everywhere... cloneParams.set("qt", adminPath); sreq.purpose = 1; sreq.shards = new String[] { replica }; sreq.actualShards = sreq.shards; sreq.params = cloneParams; log.info("Collection Admin sending CoreAdmin cmd to " + replica + " params:" + sreq.params); shardHandler.submit(sreq, replica, sreq.params); } } } private void processResponse(NamedList results, ShardResponse srsp) { Throwable e = srsp.getException(); String nodeName = srsp.getNodeName(); SolrResponse solrResponse = srsp.getSolrResponse(); String shard = srsp.getShard(); processResponse(results, e, nodeName, solrResponse, shard); } @SuppressWarnings("unchecked") private void processResponse(NamedList results, Throwable e, String nodeName, SolrResponse solrResponse, String shard) { if (e != null) { log.error("Error from shard: " + shard, e); SimpleOrderedMap failure = (SimpleOrderedMap) results.get("failure"); if (failure == null) { failure = new SimpleOrderedMap(); results.add("failure", failure); } failure.add(nodeName, e.getClass().getName() + ":" + e.getMessage()); } else { SimpleOrderedMap success = (SimpleOrderedMap) results.get("success"); if (success == null) { success = new SimpleOrderedMap(); results.add("success", success); } success.add(nodeName, 
solrResponse.getResponse()); } } public boolean isClosed() { return isClosed; } @SuppressWarnings("unchecked") private void waitForAsyncCallsToComplete(Map<String, String> requestMap, NamedList results) { for (String k : requestMap.keySet()) { log.debug("I am Waiting for :{}/{}", k, requestMap.get(k)); results.add(requestMap.get(k), waitForCoreAdminAsyncCallToComplete(k, requestMap.get(k))); } } private NamedList waitForCoreAdminAsyncCallToComplete(String nodeName, String requestId) { ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.REQUESTSTATUS.toString()); params.set(CoreAdminParams.REQUESTID, requestId); int counter = 0; ShardRequest sreq; do { sreq = new ShardRequest(); params.set("qt", adminPath); sreq.purpose = 1; String replica = zkStateReader.getBaseUrlForNodeName(nodeName); sreq.shards = new String[] { replica }; sreq.actualShards = sreq.shards; sreq.params = params; shardHandler.submit(sreq, replica, sreq.params); ShardResponse srsp; do { srsp = shardHandler.takeCompletedOrError(); if (srsp != null) { NamedList results = new NamedList(); processResponse(results, srsp); String r = (String) srsp.getSolrResponse().getResponse().get("STATUS"); if (r.equals("running")) { log.debug("The task is still RUNNING, continuing to wait."); try { Thread.sleep(1000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } continue; } else if (r.equals("completed")) { log.debug("The task is COMPLETED, returning"); return srsp.getSolrResponse().getResponse(); } else if (r.equals("failed")) { // TODO: Improve this. Get more information. 
log.debug("The task is FAILED, returning"); return srsp.getSolrResponse().getResponse(); } else if (r.equals("notfound")) { log.debug("The task is notfound, retry"); if (counter++ < 5) { try { Thread.sleep(1000); } catch (InterruptedException e) { } break; } throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid status request for requestId: " + requestId + "" + srsp.getSolrResponse().getResponse().get("STATUS") + "retried " + counter + "times"); } else { throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid status request " + srsp.getSolrResponse().getResponse().get("STATUS")); } } } while (srsp != null); } while (true); } @SuppressWarnings("unchecked") private void markTaskAsRunning(QueueEvent head, String collectionName, String asyncId, ZkNodeProps message) throws KeeperException, InterruptedException { synchronized (runningZKTasks) { runningZKTasks.add(head.getId()); } synchronized (runningTasks) { runningTasks.add(head.getId()); } //TODO deprecated remove this check . if (!CLUSTERSTATUS.isEqual(message.getStr(Overseer.QUEUE_OPERATION)) && collectionName != null) { synchronized (collectionWip) { collectionWip.add(collectionName); } } if (asyncId != null) runningMap.put(asyncId, null); } protected class Runner implements Runnable { ZkNodeProps message; String operation; SolrResponse response; QueueEvent head; public Runner(ZkNodeProps message, String operation, QueueEvent head) { this.message = message; this.operation = operation; this.head = head; response = null; } @Override public void run() { final TimerContext timerContext = stats.time("collection_" + operation); boolean success = false; final String asyncId = message.getStr(ASYNC); String collectionName = message.containsKey(COLLECTION_PROP) ? 
message.getStr(COLLECTION_PROP) : message.getStr(NAME); try { try { log.debug("Runner processing {}", head.getId()); response = processMessage(message, operation); } finally { timerContext.stop(); updateStats(); } if (asyncId != null) { if (response != null && (response.getResponse().get("failure") != null || response.getResponse().get("exception") != null)) { failureMap.put(asyncId, SolrResponse.serializable(response)); log.debug("Updated failed map for task with zkid:[{}]", head.getId()); } else { completedMap.put(asyncId, SolrResponse.serializable(response)); log.debug("Updated completed map for task with zkid:[{}]", head.getId()); } } else { head.setBytes(SolrResponse.serializable(response)); log.debug("Completed task:[{}]", head.getId()); } markTaskComplete(head.getId(), asyncId, collectionName); log.debug("Marked task [{}] as completed.", head.getId()); printTrackingMaps(); log.info("Overseer Collection Processor: Message id:" + head.getId() + " complete, response:" + response.getResponse().toString()); success = true; } catch (KeeperException e) { SolrException.log(log, "", e); } catch (InterruptedException e) { // Reset task from tracking data structures so that it can be retried. resetTaskWithException(head.getId(), asyncId, collectionName); log.warn("Resetting task {} as the thread was interrupted.", head.getId()); Thread.currentThread().interrupt(); } finally { if (!success) { // Reset task from tracking data structures so that it can be retried. 
resetTaskWithException(head.getId(), asyncId, collectionName); } synchronized (waitLock) { waitLock.notifyAll(); } } } private void markTaskComplete(String id, String asyncId, String collectionName) throws KeeperException, InterruptedException { synchronized (completedTasks) { completedTasks.put(id, head); } synchronized (runningTasks) { runningTasks.remove(id); } if (asyncId != null) runningMap.remove(asyncId); if (!CLUSTERSTATUS.isEqual(operation) && collectionName != null) { synchronized (collectionWip) { collectionWip.remove(collectionName); } } } private void resetTaskWithException(String id, String asyncId, String collectionName) { log.warn("Resetting task: {}, requestid: {}, collectionName: {}", id, asyncId, collectionName); try { if (asyncId != null) runningMap.remove(asyncId); synchronized (runningTasks) { runningTasks.remove(id); } if (!CLUSTERSTATUS.isEqual(operation) && collectionName != null) { synchronized (collectionWip) { collectionWip.remove(collectionName); } } } catch (KeeperException e) { SolrException.log(log, "", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } private void updateStats() { if (isSuccessful()) { stats.success("collection_" + operation); } else { stats.error("collection_" + operation); stats.storeFailureDetails("collection_" + operation, message, response); } } private boolean isSuccessful() { if (response == null) return false; return !(response.getResponse().get("failure") != null || response.getResponse().get("exception") != null); } } private void printTrackingMaps() { if (log.isDebugEnabled()) { synchronized (runningTasks) { log.debug("RunningTasks: {}", runningTasks.toString()); } synchronized (completedTasks) { log.debug("CompletedTasks: {}", completedTasks.keySet().toString()); } synchronized (runningZKTasks) { log.debug("RunningZKTasks: {}", runningZKTasks.toString()); } } } String getId() { return myId; } }