Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.service; import java.util.*; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.io.*; import java.lang.management.ManagementFactory; import java.math.BigInteger; import java.net.UnknownHostException; import javax.management.MBeanServer; import javax.management.ObjectName; import org.apache.cassandra.analytics.AnalyticsContext; import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor; import org.apache.cassandra.concurrent.MultiThreadedStage; import org.apache.cassandra.concurrent.SingleThreadedStage; import org.apache.cassandra.concurrent.StageManager; import org.apache.cassandra.concurrent.ThreadFactoryImpl; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.BinaryVerbHandler; import org.apache.cassandra.db.CalloutDeployVerbHandler; import org.apache.cassandra.db.DBManager; import org.apache.cassandra.db.DataFileVerbHandler; import org.apache.cassandra.db.HintedHandOffManager; import org.apache.cassandra.db.LoadVerbHandler; import org.apache.cassandra.db.Memtable; import org.apache.cassandra.db.ReadRepairVerbHandler; import org.apache.cassandra.db.ReadVerbHandler; import org.apache.cassandra.db.Row; import org.apache.cassandra.db.RowMutationVerbHandler; import org.apache.cassandra.db.SystemTable; import org.apache.cassandra.db.Table; import org.apache.cassandra.db.TouchVerbHandler; import org.apache.cassandra.dht.BootStrapper; import org.apache.cassandra.dht.BootstrapInitiateMessage; import org.apache.cassandra.dht.BootstrapMetadataVerbHandler; import org.apache.cassandra.dht.Range; import org.apache.cassandra.gms.ApplicationState; import org.apache.cassandra.gms.EndPointState; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.gms.IEndPointStateChangeSubscriber; import org.apache.cassandra.io.DataInputBuffer; import org.apache.cassandra.io.ICompactSerializer; import org.apache.cassandra.locator.EndPointSnitch; import org.apache.cassandra.locator.IEndPointSnitch; import org.apache.cassandra.locator.IReplicaPlacementStrategy; import org.apache.cassandra.locator.RackAwareStrategy; import org.apache.cassandra.locator.RackUnawareStrategy; import org.apache.cassandra.locator.TokenMetadata; import org.apache.cassandra.net.CompactEndPointSerializationHelper; import org.apache.cassandra.net.EndPoint; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.http.HttpConnection; import org.apache.cassandra.net.io.StreamContextManager; import org.apache.cassandra.tools.MembershipCleanerVerbHandler; import org.apache.cassandra.tools.TokenUpdateVerbHandler; import org.apache.cassandra.utils.FileUtils; import org.apache.cassandra.utils.LogUtil; import org.apache.commons.math.linear.RealMatrix; import org.apache.commons.math.linear.RealMatrixImpl; import org.apache.log4j.Logger; import org.apache.cassandra.concurrent.*; import org.apache.cassandra.db.*; import org.apache.cassandra.net.io.*; import org.apache.cassandra.gms.*; import org.apache.cassandra.utils.*; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.ZooDefs.Ids; import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.proto.WatcherEvent; /* * This abstraction contains the token/identifier of this node * on the identifier space. This token gets gossiped around. * This class will also maintain histograms of the load information * of other nodes in the cluster. * Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com ) */ public final class StorageService implements IEndPointStateChangeSubscriber, StorageServiceMBean { private static Logger logger_ = Logger.getLogger(StorageService.class); private final static String nodeId_ = "NODE-IDENTIFIER"; private final static String loadAll_ = "LOAD-ALL"; /* Gossip load after every 5 mins. */ private static final long threshold_ = 5 * 60 * 1000L; /* All stage identifiers */ public final static String mutationStage_ = "ROW-MUTATION-STAGE"; public final static String readStage_ = "ROW-READ-STAGE"; public final static String mrStage_ = "MAP-REDUCE-STAGE"; /* All verb handler identifiers */ public final static String mutationVerbHandler_ = "ROW-MUTATION-VERB-HANDLER"; public final static String tokenVerbHandler_ = "TOKEN-VERB-HANDLER"; public final static String loadVerbHandler_ = "LOAD-VERB-HANDLER"; public final static String binaryVerbHandler_ = "BINARY-VERB-HANDLER"; public final static String readRepairVerbHandler_ = "READ-REPAIR-VERB-HANDLER"; public final static String readVerbHandler_ = "ROW-READ-VERB-HANDLER"; public final static String bootStrapInitiateVerbHandler_ = "BOOTSTRAP-INITIATE-VERB-HANDLER"; public final static String bootStrapInitiateDoneVerbHandler_ = "BOOTSTRAP-INITIATE-DONE-VERB-HANDLER"; public final static String bootStrapTerminateVerbHandler_ = "BOOTSTRAP-TERMINATE-VERB-HANDLER"; public final static String dataFileVerbHandler_ = "DATA-FILE-VERB-HANDLER"; public final static String mbrshipCleanerVerbHandler_ = "MBRSHIP-CLEANER-VERB-HANDLER"; public final static String bsMetadataVerbHandler_ = "BS-METADATA-VERB-HANDLER"; public final static String calloutDeployVerbHandler_ = "CALLOUT-DEPLOY-VERB-HANDLER"; public final static String touchVerbHandler_ = "TOUCH-VERB-HANDLER"; public static enum ConsistencyLevel { WEAK, STRONG } private static StorageService instance_; /* Used to lock the factory for creation of StorageService instance */ private static Lock createLock_ = new ReentrantLock(); private static EndPoint tcpAddr_; private static EndPoint udpAddr_; private static IPartitioner partitioner_; public static EndPoint getLocalStorageEndPoint() { return tcpAddr_; } public static EndPoint getLocalControlEndPoint() { return udpAddr_; } public static String getHostUrl() { return "http://" + tcpAddr_.getHost() + ":" + DatabaseDescriptor.getHttpPort(); } /** * This is a facade for the hashing * function used by the system for * partitioning. */ public static BigInteger hash(String key) { return partitioner_.hash(key); } public static IPartitioner getPartitioner() { return partitioner_; } public static enum BootstrapMode { HINT, FULL } public static class BootstrapInitiateDoneVerbHandler implements IVerbHandler { private static Logger logger_ = Logger.getLogger(BootstrapInitiateDoneVerbHandler.class); public void doVerb(Message message) { logger_.debug("Received a bootstrap initiate done message ..."); /* Let the Stream Manager do his thing. */ StreamManager.instance(message.getFrom()).start(); } } private class ShutdownTimerTask extends TimerTask { public void run() { StorageService.instance().shutdown(); } } /* * Factory method that gets an instance of the StorageService * class. */ public static StorageService instance() { if (instance_ == null) { StorageService.createLock_.lock(); try { if (instance_ == null) { try { instance_ = new StorageService(); } catch (Throwable th) { logger_.error(LogUtil.throwableToString(th)); System.exit(1); } } } finally { createLock_.unlock(); } } return instance_; } /* * This is the endpoint snitch which depends on the network architecture. We * need to keep this information for each endpoint so that we make decisions * while doing things like replication etc. * */ private IEndPointSnitch endPointSnitch_; /* Uptime of this node - we use this to determine if a bootstrap can be performed by this node */ private long uptime_ = 0L; /* This abstraction maintains the token/endpoint metadata information */ private TokenMetadata tokenMetadata_ = new TokenMetadata(); private DBManager.StorageMetadata storageMetadata_; /* * Maintains a list of all components that need to be shutdown * for a clean exit. */ private Set<IComponentShutdown> components_ = new HashSet<IComponentShutdown>(); /* * This boolean indicates if we are in loading state. If we are then we do not want any * distributed algorithms w.r.t change in token state to kick in. */ private boolean isLoadState_ = false; /* Timer is used to disseminate load information */ private Timer loadTimer_ = new Timer(false); /* * This variable indicates if the local storage instance * has been shutdown. */ private AtomicBoolean isShutdown_ = new AtomicBoolean(false); /* This thread pool is used to do the bootstrap for a new node */ private ExecutorService bootStrapper_ = new DebuggableThreadPoolExecutor(1, 1, Integer.MAX_VALUE, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new ThreadFactoryImpl("BOOT-STRAPPER")); /* This thread pool does consistency checks when the client doesn't care about consistency */ private ExecutorService consistencyManager_; /* This is the entity that tracks load information of all nodes in the cluster */ private StorageLoadBalancer storageLoadBalancer_; /* We use this interface to determine where replicas need to be placed */ private IReplicaPlacementStrategy nodePicker_; /* Handle to a ZooKeeper instance */ private ZooKeeper zk_; /* * Registers with Management Server */ private void init() { // Register this instance with JMX try { MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); mbs.registerMBean(this, new ObjectName("org.apache.cassandra.service:type=StorageService")); } catch (Exception e) { logger_.error(LogUtil.throwableToString(e)); } } public StorageService() { init(); uptime_ = System.currentTimeMillis(); storageLoadBalancer_ = new StorageLoadBalancer(this); endPointSnitch_ = new EndPointSnitch(); /* register the verb handlers */ MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.tokenVerbHandler_, new TokenUpdateVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.binaryVerbHandler_, new BinaryVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.loadVerbHandler_, new LoadVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.mutationVerbHandler_, new RowMutationVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.readRepairVerbHandler_, new ReadRepairVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.readVerbHandler_, new ReadVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.bootStrapInitiateVerbHandler_, new Table.BootStrapInitiateVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers( StorageService.bootStrapInitiateDoneVerbHandler_, new StorageService.BootstrapInitiateDoneVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.bootStrapTerminateVerbHandler_, new StreamManager.BootstrapTerminateVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(HttpConnection.httpRequestVerbHandler_, new HttpRequestVerbHandler(this)); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.dataFileVerbHandler_, new DataFileVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.mbrshipCleanerVerbHandler_, new MembershipCleanerVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.bsMetadataVerbHandler_, new BootstrapMetadataVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.calloutDeployVerbHandler_, new CalloutDeployVerbHandler()); MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.touchVerbHandler_, new TouchVerbHandler()); /* register the stage for the mutations */ int threadCount = DatabaseDescriptor.getThreadsPerPool(); consistencyManager_ = new DebuggableThreadPoolExecutor(threadCount, threadCount, Integer.MAX_VALUE, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new ThreadFactoryImpl("CONSISTENCY-MANAGER")); StageManager.registerStage(StorageService.mutationStage_, new MultiThreadedStage(StorageService.mutationStage_, threadCount)); StageManager.registerStage(StorageService.readStage_, new MultiThreadedStage(StorageService.readStage_, 2 * threadCount)); StageManager.registerStage(StorageService.mrStage_, new MultiThreadedStage(StorageService.mrStage_, threadCount)); /* Stage for handling the HTTP messages. */ StageManager.registerStage(HttpConnection.httpStage_, new SingleThreadedStage("HTTP-REQUEST")); if (DatabaseDescriptor.isRackAware()) nodePicker_ = new RackAwareStrategy(tokenMetadata_); else nodePicker_ = new RackUnawareStrategy(tokenMetadata_); } private void reportToZookeeper() throws Throwable { try { zk_ = new ZooKeeper(DatabaseDescriptor.getZkAddress(), DatabaseDescriptor.getZkSessionTimeout(), new Watcher() { public void process(WatchedEvent we) { String path = "/Cassandra/" + DatabaseDescriptor.getClusterName() + "/Leader"; String eventPath = we.getPath(); logger_.debug("PROCESS EVENT : " + eventPath); if (eventPath != null && (eventPath.contains(path))) { logger_.debug("Signalling the leader instance ..."); LeaderElector.instance().signal(); } } }); Stat stat = zk_.exists("/", false); if (stat != null) { stat = zk_.exists("/Cassandra", false); if (stat == null) { logger_.debug("Creating the Cassandra znode ..."); zk_.create("/Cassandra", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } String path = "/Cassandra/" + DatabaseDescriptor.getClusterName(); stat = zk_.exists(path, false); if (stat == null) { logger_.debug("Creating the cluster znode " + path); zk_.create(path, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } /* Create the Leader, Locks and Misc znode */ stat = zk_.exists(path + "/Leader", false); if (stat == null) { logger_.debug("Creating the leader znode " + path); zk_.create(path + "/Leader", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } stat = zk_.exists(path + "/Locks", false); if (stat == null) { logger_.debug("Creating the locks znode " + path); zk_.create(path + "/Locks", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } stat = zk_.exists(path + "/Misc", false); if (stat == null) { logger_.debug("Creating the misc znode " + path); zk_.create(path + "/Misc", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } } } catch (KeeperException ke) { LogUtil.throwableToString(ke); /* do the re-initialize again. */ reportToZookeeper(); } } protected ZooKeeper getZooKeeperHandle() { return zk_; } public boolean isLeader(EndPoint endpoint) { EndPoint leader = getLeader(); return leader.equals(endpoint); } public EndPoint getLeader() { return LeaderElector.instance().getLeader(); } public void registerComponentForShutdown(IComponentShutdown component) { components_.add(component); } static { String hashingStrategy = DatabaseDescriptor.getHashingStrategy(); if (DatabaseDescriptor.ophf_.equalsIgnoreCase(hashingStrategy)) { partitioner_ = new OrderPreservingHashPartitioner(); } else { partitioner_ = new RandomPartitioner(); } } public void start() throws IOException { /* Start the DB */ storageMetadata_ = DBManager.instance().start(); /* Set up TCP endpoint */ tcpAddr_ = new EndPoint(DatabaseDescriptor.getStoragePort()); /* Set up UDP endpoint */ udpAddr_ = new EndPoint(DatabaseDescriptor.getControlPort()); /* Listen for application messages */ MessagingService.getMessagingInstance().listen(tcpAddr_, false); /* Listen for control messages */ MessagingService.getMessagingInstance().listenUDP(udpAddr_); /* Listen for HTTP messages */ MessagingService.getMessagingInstance().listen(new EndPoint(DatabaseDescriptor.getHttpPort()), true); /* start the analytics context package */ AnalyticsContext.instance().start(); /* starts a load timer thread */ loadTimer_.schedule(new LoadDisseminator(), StorageService.threshold_, StorageService.threshold_); /* report our existence to ZooKeeper instance and start the leader election service */ //reportToZookeeper(); /* start the leader election algorithm */ //LeaderElector.instance().start(); /* start the map reduce framework */ //startMapReduceFramework(); /* Start the storage load balancer */ storageLoadBalancer_.start(); /* Register with the Gossiper for EndPointState notifications */ Gossiper.instance().register(this); /* * Start the gossiper with the generation # retrieved from the System * table */ Gossiper.instance().start(udpAddr_, storageMetadata_.getGeneration()); /* Make sure this token gets gossiped around. */ tokenMetadata_.update(storageMetadata_.getStorageId(), StorageService.tcpAddr_); Gossiper.instance().addApplicationState(StorageService.nodeId_, new ApplicationState(storageMetadata_.getStorageId().toString())); } public void killMe() throws Throwable { isShutdown_.set(true); /* * Shutdown the Gossiper to stop responding/sending Gossip messages. * This causes other nodes to detect you as dead and starting hinting * data for the local endpoint. */ Gossiper.instance().shutdown(); final long nodeDeadDetectionTime = 25000L; Thread.sleep(nodeDeadDetectionTime); /* Now perform a force flush of the table */ String table = DatabaseDescriptor.getTables().get(0); Table.open(table).flush(false); /* Now wait for the flush to complete */ Thread.sleep(nodeDeadDetectionTime); /* Shutdown all other components */ StorageService.instance().shutdown(); } public boolean isShutdown() { return isShutdown_.get(); } public void shutdown() { bootStrapper_.shutdownNow(); /* shut down all stages */ StageManager.shutdown(); /* shut down the messaging service */ MessagingService.shutdown(); /* shut down all memtables */ Memtable.shutdown(); /* shut down the load disseminator */ loadTimer_.cancel(); /* shut down the cleaner thread in FileUtils */ FileUtils.shutdown(); /* shut down all registered components */ for (IComponentShutdown component : components_) { component.shutdown(); } } public TokenMetadata getTokenMetadata() { return tokenMetadata_.cloneMe(); } /* TODO: remove later */ public void updateTokenMetadata(BigInteger token, EndPoint endpoint) { tokenMetadata_.update(token, endpoint); } public IEndPointSnitch getEndPointSnitch() { return endPointSnitch_; } /* * Given an EndPoint this method will report if the * endpoint is in the same data center as the local * storage endpoint. */ public boolean isInSameDataCenter(EndPoint endpoint) throws IOException { return endPointSnitch_.isInSameDataCenter(StorageService.tcpAddr_, endpoint); } /* * This method performs the requisite operations to make * sure that the N replicas are in sync. We do this in the * background when we do not care much about consistency. */ public void doConsistencyCheck(Row row, List<EndPoint> endpoints, ReadCommand message) { Runnable consistencySentinel = new ConsistencyManager(row.cloneMe(), endpoints, message.columnFamilyColumn, message.start, message.count, message.sinceTimestamp, message.columnNames); consistencyManager_.submit(consistencySentinel); } @Deprecated public void doConsistencyCheck(Row row, List<EndPoint> endpoints, String columnFamily, int start, int count) { Runnable consistencySentinel = new ConsistencyManager(row.cloneMe(), endpoints, columnFamily, start, count); consistencyManager_.submit(consistencySentinel); } @Deprecated public void doConsistencyCheck(Row row, List<EndPoint> endpoints, String columnFamily, long sinceTimestamp) { Runnable consistencySentinel = new ConsistencyManager(row.cloneMe(), endpoints, columnFamily, sinceTimestamp); consistencyManager_.submit(consistencySentinel); } @Deprecated public void doConsistencyCheck(Row row, List<EndPoint> endpoints, String columnFamily, List<String> columns) { Runnable consistencySentinel = new ConsistencyManager(row.cloneMe(), endpoints, columnFamily, columns); consistencyManager_.submit(consistencySentinel); } /* * This method displays all the ranges and the replicas * that are responsible for the individual ranges. The * format of this string is the following: * * R1 : A B C * R2 : D E F * R3 : G H I */ public String showTheRing() { StringBuilder sb = new StringBuilder(); /* Get the token to endpoint map. */ Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap(); Set<BigInteger> tokens = tokenToEndPointMap.keySet(); /* All the ranges for the tokens */ Range[] ranges = getAllRanges(tokens); Map<Range, List<EndPoint>> oldRangeToEndPointMap = constructRangeToEndPointMap(ranges); Set<Range> rangeSet = oldRangeToEndPointMap.keySet(); for (Range range : rangeSet) { sb.append(range); sb.append(" : "); List<EndPoint> replicas = oldRangeToEndPointMap.get(range); for (EndPoint replica : replicas) { sb.append(replica); sb.append(" "); } sb.append(System.getProperty("line.separator")); } return sb.toString(); } public Map<Range, List<EndPoint>> getRangeToEndPointMap() { /* Get the token to endpoint map. */ Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap(); Set<BigInteger> tokens = tokenToEndPointMap.keySet(); /* All the ranges for the tokens */ Range[] ranges = getAllRanges(tokens); return constructRangeToEndPointMap(ranges); } /** * Construct the range to endpoint mapping based on the true view * of the world. * @param ranges * @return mapping of ranges to the replicas responsible for them. */ public Map<Range, List<EndPoint>> constructRangeToEndPointMap(Range[] ranges) { logger_.debug("Constructing range to endpoint map ..."); Map<Range, List<EndPoint>> rangeToEndPointMap = new HashMap<Range, List<EndPoint>>(); for (Range range : ranges) { EndPoint[] endpoints = getNStorageEndPoint(range.right()); rangeToEndPointMap.put(range, new ArrayList<EndPoint>(Arrays.asList(endpoints))); } logger_.debug("Done constructing range to endpoint map ..."); return rangeToEndPointMap; } /** * Construct the range to endpoint mapping based on the view as dictated * by the mapping of token to endpoints passed in. * @param ranges * @param tokenToEndPointMap mapping of token to endpoints. * @return mapping of ranges to the replicas responsible for them. */ public Map<Range, List<EndPoint>> constructRangeToEndPointMap(Range[] ranges, Map<BigInteger, EndPoint> tokenToEndPointMap) { logger_.debug("Constructing range to endpoint map ..."); Map<Range, List<EndPoint>> rangeToEndPointMap = new HashMap<Range, List<EndPoint>>(); for (Range range : ranges) { EndPoint[] endpoints = getNStorageEndPoint(range.right(), tokenToEndPointMap); rangeToEndPointMap.put(range, new ArrayList<EndPoint>(Arrays.asList(endpoints))); } logger_.debug("Done constructing range to endpoint map ..."); return rangeToEndPointMap; } /** * Construct a mapping from endpoint to ranges that endpoint is * responsible for. * @return the mapping from endpoint to the ranges it is responsible * for. */ public Map<EndPoint, List<Range>> constructEndPointToRangesMap() { Map<EndPoint, List<Range>> endPointToRangesMap = new HashMap<EndPoint, List<Range>>(); Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap(); Collection<EndPoint> mbrs = tokenToEndPointMap.values(); for (EndPoint mbr : mbrs) { endPointToRangesMap.put(mbr, getRangesForEndPoint(mbr)); } return endPointToRangesMap; } /** * Called when there is a change in application state. In particular * we are interested in new tokens as a result of a new node or an * existing node moving to a new location on the ring. */ public void onChange(EndPoint endpoint, EndPointState epState) { EndPoint ep = new EndPoint(endpoint.getHost(), DatabaseDescriptor.getStoragePort()); /* node identifier for this endpoint on the identifier space */ ApplicationState nodeIdState = epState.getApplicationState(StorageService.nodeId_); if (nodeIdState != null) { BigInteger newToken = new BigInteger(nodeIdState.getState()); logger_.debug("CHANGE IN STATE FOR " + endpoint + " - has token " + nodeIdState.getState()); BigInteger oldToken = tokenMetadata_.getToken(ep); if (oldToken != null) { /* * If oldToken equals the newToken then the node had crashed * and is coming back up again. If oldToken is not equal to * the newToken this means that the node is being relocated * to another position in the ring. */ if (!oldToken.equals(newToken)) { logger_.debug("Relocation for endpoint " + ep); tokenMetadata_.update(newToken, ep); } else { /* * This means the node crashed and is coming back up. * Deliver the hints that we have for this endpoint. */ logger_.debug("Sending hinted data to " + ep); doBootstrap(endpoint, BootstrapMode.HINT); } } else { /* * This is a new node and we just update the token map. */ tokenMetadata_.update(newToken, ep); } } else { /* * If we are here and if this node is UP and already has an entry * in the token map. It means that the node was behind a network partition. */ if (epState.isAlive() && tokenMetadata_.isKnownEndPoint(endpoint)) { logger_.debug("EndPoint " + ep + " just recovered from a partition. Sending hinted data."); doBootstrap(ep, BootstrapMode.HINT); } } /* Check if a bootstrap is in order */ ApplicationState loadAllState = epState.getApplicationState(StorageService.loadAll_); if (loadAllState != null) { String nodes = loadAllState.getState(); if (nodes != null) { doBootstrap(ep, BootstrapMode.FULL); } } } /** * Get the count of primary keys from the sampler. */ public String getLoadInfo() { long diskSpace = FileUtils.getUsedDiskSpace(); return FileUtils.stringifyFileSize(diskSpace); } /** * Get the primary count info for this endpoint. * This is gossiped around and cached in the * StorageLoadBalancer. */ public String getLoadInfo(EndPoint ep) { LoadInfo li = storageLoadBalancer_.getLoad(ep); return (li == null) ? "N/A" : li.toString(); } /* * This method updates the token on disk and modifies the cached * StorageMetadata instance. This is only for the local endpoint. */ public void updateToken(BigInteger token) throws IOException { /* update the token on disk */ SystemTable.openSystemTable(SystemTable.name_).updateToken(token); /* Update the storageMetadata cache */ storageMetadata_.setStorageId(token); /* Update the token maps */ /* Get the old token. This needs to be removed. */ tokenMetadata_.update(token, StorageService.tcpAddr_); /* Gossip this new token for the local storage instance */ Gossiper.instance().addApplicationState(StorageService.nodeId_, new ApplicationState(token.toString())); } /* * This method removes the state associated with this endpoint * from the TokenMetadata instance. * * param@ endpoint remove the token state associated with this * endpoint. */ public void removeTokenState(EndPoint endpoint) { tokenMetadata_.remove(endpoint); /* Remove the state from the Gossiper */ Gossiper.instance().removeFromMembership(endpoint); } /* * This method is invoked by the Loader process to force the * node to move from its current position on the token ring, to * a position to be determined based on the keys. This will help * all nodes to start off perfectly load balanced. The array passed * in is evaluated as follows by the loader process: * If there are 10 keys in the system and a totality of 5 nodes * then each node needs to have 2 keys i.e the array is made up * of every 2nd key in the total list of keys. */ public void relocate(String[] keys) throws IOException { if (keys.length > 0) { isLoadState_ = true; BigInteger token = tokenMetadata_.getToken(StorageService.tcpAddr_); Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap(); BigInteger[] tokens = tokenToEndPointMap.keySet().toArray(new BigInteger[0]); Arrays.sort(tokens); int index = Arrays.binarySearch(tokens, token) * (keys.length / tokens.length); BigInteger newToken = hash(keys[index]); /* update the token */ updateToken(newToken); } } /* * This is used to indicate that this node is done * with the loading of data. */ public void resetLoadState() { isLoadState_ = false; } /** * This method takes a colon separated string of nodes that need * to be bootstrapped. It is also used to filter some source of * data. Suppose the nodes to be bootstrapped are A, B and C. Then * <i>allNodes</i> must be specified as A:B:C. * */ private void doBootstrap(String nodes) { String[] allNodesAndFilter = nodes.split("-"); String nodesToLoad; String filterSources = null; if (allNodesAndFilter.length == 2) { nodesToLoad = allNodesAndFilter[0]; filterSources = allNodesAndFilter[1]; } else { nodesToLoad = allNodesAndFilter[0]; } String[] allNodes = nodesToLoad.split(":"); EndPoint[] endpoints = new EndPoint[allNodes.length]; BigInteger[] tokens = new BigInteger[allNodes.length]; for (int i = 0; i < allNodes.length; ++i) { endpoints[i] = new EndPoint(allNodes[i].trim(), DatabaseDescriptor.getStoragePort()); tokens[i] = tokenMetadata_.getToken(endpoints[i]); } /* Start the bootstrap algorithm */ if (filterSources == null) bootStrapper_.submit(new BootStrapper(endpoints, tokens)); else { String[] allFilters = filterSources.split(":"); EndPoint[] filters = new EndPoint[allFilters.length]; for (int i = 0; i < allFilters.length; ++i) { filters[i] = new EndPoint(allFilters[i].trim(), DatabaseDescriptor.getStoragePort()); } bootStrapper_.submit(new BootStrapper(endpoints, tokens, filters)); } } /** * Starts the bootstrap operations for the specified endpoint. * The name of this method is however a misnomer since it does * handoff of data to the specified node when it has crashed * and come back up, marked as alive after a network partition * and also when it joins the ring either as an old node being * relocated or as a brand new node. */ public final void doBootstrap(EndPoint endpoint, BootstrapMode mode) { switch (mode) { case FULL: BigInteger token = tokenMetadata_.getToken(endpoint); bootStrapper_.submit(new BootStrapper(new EndPoint[] { endpoint }, new BigInteger[] { token })); break; case HINT: /* Deliver the hinted data to this endpoint. */ HintedHandOffManager.instance().deliverHints(endpoint); break; default: break; } } /* This methods belong to the MBean interface */ public String getToken(EndPoint ep) { EndPoint ep2 = new EndPoint(ep.getHost(), DatabaseDescriptor.getStoragePort()); BigInteger token = tokenMetadata_.getToken(ep2); return (token == null) ? BigInteger.ZERO.toString() : token.toString(); } public String getToken() { return tokenMetadata_.getToken(StorageService.tcpAddr_).toString(); } public void updateToken(String token) { try { updateToken(new BigInteger(token)); } catch (IOException ex) { logger_.debug(LogUtil.throwableToString(ex)); } } public String getLiveNodes() { return stringify(Gossiper.instance().getLiveMembers()); } public String getUnreachableNodes() { return stringify(Gossiper.instance().getUnreachableMembers()); } /* Helper for the MBean interface */ private String stringify(Set<EndPoint> eps) { StringBuilder sb = new StringBuilder(""); for (EndPoint ep : eps) { sb.append(ep); sb.append(" "); } return sb.toString(); } public void loadAll(String nodes) { doBootstrap(nodes); } public void doGC() { List<String> tables = DatabaseDescriptor.getTables(); for (String tName : tables) { Table table = Table.open(tName); table.doGC(); } } public void forceHandoff(String directories, String host) throws IOException { List<File> filesList = new ArrayList<File>(); String[] sources = directories.split(":"); for (String source : sources) { File directory = new File(source); Collections.addAll(filesList, directory.listFiles()); } File[] files = filesList.toArray(new File[0]); StreamContextManager.StreamContext[] streamContexts = new StreamContextManager.StreamContext[files.length]; int i = 0; for (File file : files) { streamContexts[i] = new StreamContextManager.StreamContext(file.getAbsolutePath(), file.length()); logger_.debug("Stream context metadata " + streamContexts[i]); ++i; } if (files.length > 0) { EndPoint target = new EndPoint(host, DatabaseDescriptor.getStoragePort()); /* Set up the stream manager with the files that need to streamed */ StreamManager.instance(target).addFilesToStream(streamContexts); /* Send the bootstrap initiate message */ BootstrapInitiateMessage biMessage = new BootstrapInitiateMessage(streamContexts); Message message = BootstrapInitiateMessage.makeBootstrapInitiateMessage(biMessage); logger_.debug("Sending a bootstrap initiate message to " + target + " ..."); MessagingService.getMessagingInstance().sendOneWay(message, target); logger_.debug("Waiting for transfer to " + target + " to complete"); StreamManager.instance(target).waitForStreamCompletion(); logger_.debug("Done with transfer to " + target); } } /* End of MBean interface methods */ /** * This method returns the predecessor of the endpoint ep on the identifier * space. */ EndPoint getPredecessor(EndPoint ep) { BigInteger token = tokenMetadata_.getToken(ep); Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap(); List<BigInteger> tokens = new ArrayList<BigInteger>(tokenToEndPointMap.keySet()); Collections.sort(tokens); int index = Collections.binarySearch(tokens, token); return (index == 0) ? tokenToEndPointMap.get(tokens.get(tokens.size() - 1)) : tokenToEndPointMap.get(tokens.get(--index)); } /* * This method returns the successor of the endpoint ep on the identifier * space. */ public EndPoint getSuccessor(EndPoint ep) { BigInteger token = tokenMetadata_.getToken(ep); Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap(); List<BigInteger> tokens = new ArrayList<BigInteger>(tokenToEndPointMap.keySet()); Collections.sort(tokens); int index = Collections.binarySearch(tokens, token); return (index == (tokens.size() - 1)) ? tokenToEndPointMap.get(tokens.get(0)) : tokenToEndPointMap.get(tokens.get(++index)); } /** * Get the primary range for the specified endpoint. * @param ep endpoint we are interested in. * @return range for the specified endpoint. */ public Range getPrimaryRangeForEndPoint(EndPoint ep) { BigInteger right = tokenMetadata_.getToken(ep); EndPoint predecessor = getPredecessor(ep); BigInteger left = tokenMetadata_.getToken(predecessor); return new Range(left, right); } /** * Get all ranges an endpoint is responsible for. * @param ep endpoint we are interested in. * @return ranges for the specified endpoint. */ List<Range> getRangesForEndPoint(EndPoint ep) { List<Range> ranges = new ArrayList<Range>(); ranges.add(getPrimaryRangeForEndPoint(ep)); EndPoint predecessor = ep; int count = DatabaseDescriptor.getReplicationFactor() - 1; for (int i = 0; i < count; ++i) { predecessor = getPredecessor(predecessor); ranges.add(getPrimaryRangeForEndPoint(predecessor)); } return ranges; } /** * Get all ranges that span the ring as per * current snapshot of the token distribution. * @return all ranges in sorted order. */ public Range[] getAllRanges() { Set<BigInteger> allTokens = tokenMetadata_.cloneTokenEndPointMap().keySet(); return getAllRanges(allTokens); } /** * Get all ranges that span the ring given a set * of tokens. All ranges are in sorted order of * ranges. * @return ranges in sorted order */ public Range[] getAllRanges(Set<BigInteger> tokens) { List<Range> ranges = new ArrayList<Range>(); List<BigInteger> allTokens = new ArrayList<BigInteger>(tokens); Collections.sort(allTokens); int size = allTokens.size(); for (int i = 1; i < size; ++i) { Range range = new Range(allTokens.get(i - 1), allTokens.get(i)); ranges.add(range); } Range range = new Range(allTokens.get(size - 1), allTokens.get(0)); ranges.add(range); return ranges.toArray(new Range[0]); } /** * This method returns the endpoint that is responsible for storing the * specified key. * * param @ key - key for which we need to find the endpoint * return value - the endpoint responsible for this key */ public EndPoint getPrimary(String key) { EndPoint endpoint = StorageService.tcpAddr_; BigInteger token = hash(key); Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap(); List<BigInteger> tokens = new ArrayList<BigInteger>(tokenToEndPointMap.keySet()); if (tokens.size() > 0) { Collections.sort(tokens); int index = Collections.binarySearch(tokens, token); if (index >= 0) { /* * retrieve the endpoint based on the token at this index in the * tokens list */ endpoint = tokenToEndPointMap.get(tokens.get(index)); } else { index = (index + 1) * (-1); if (index < tokens.size()) endpoint = tokenToEndPointMap.get(tokens.get(index)); else endpoint = tokenToEndPointMap.get(tokens.get(0)); } } return endpoint; } /** * This method determines whether the local endpoint is the * primary for the given key. * @param key * @return true if the local endpoint is the primary replica. */ public boolean isPrimary(String key) { EndPoint endpoint = getPrimary(key); return StorageService.tcpAddr_.equals(endpoint); } /** * This method returns the N endpoints that are responsible for storing the * specified key i.e for replication. * * param @ key - key for which we need to find the endpoint return value - * the endpoint responsible for this key */ public EndPoint[] getNStorageEndPoint(String key) { BigInteger token = hash(key); return nodePicker_.getStorageEndPoints(token); } private Map<String, EndPoint[]> getNStorageEndPoints(String[] keys) { return nodePicker_.getStorageEndPoints(keys); } /** * This method attempts to return N endpoints that are responsible for storing the * specified key i.e for replication. * * param @ key - key for which we need to find the endpoint return value - * the endpoint responsible for this key */ public List<EndPoint> getNLiveStorageEndPoint(String key) { List<EndPoint> liveEps = new ArrayList<EndPoint>(); EndPoint[] endpoints = getNStorageEndPoint(key); for (EndPoint endpoint : endpoints) { if (FailureDetector.instance().isAlive(endpoint)) liveEps.add(endpoint); } return liveEps; } /** * This method returns the N endpoints that are responsible for storing the * specified key i.e for replication. * * param @ key - key for which we need to find the endpoint return value - * the endpoint responsible for this key */ public Map<EndPoint, EndPoint> getNStorageEndPointMap(String key) { BigInteger token = hash(key); return nodePicker_.getHintedStorageEndPoints(token); } /** * This method returns the N endpoints that are responsible for storing the * specified token i.e for replication. * * param @ token - position on the ring */ public EndPoint[] getNStorageEndPoint(BigInteger token) { return nodePicker_.getStorageEndPoints(token); } /** * This method returns the N endpoints that are responsible for storing the * specified token i.e for replication and are based on the token to endpoint * mapping that is passed in. * * param @ token - position on the ring * param @ tokens - w/o the following tokens in the token list */ protected EndPoint[] getNStorageEndPoint(BigInteger token, Map<BigInteger, EndPoint> tokenToEndPointMap) { return nodePicker_.getStorageEndPoints(token, tokenToEndPointMap); } /** * This function finds the most suitable endpoint given a key. * It checks for loclity and alive test. */ public EndPoint findSuitableEndPoint(String key) throws IOException { EndPoint[] endpoints = getNStorageEndPoint(key); for (EndPoint endPoint : endpoints) { if (endPoint.equals(StorageService.getLocalStorageEndPoint())) { return endPoint; } } int j = 0; for (; j < endpoints.length; ++j) { if (StorageService.instance().isInSameDataCenter(endpoints[j]) && FailureDetector.instance().isAlive(endpoints[j])) { logger_.debug( "EndPoint " + endpoints[j] + " is in the same data center as local storage endpoint."); return endpoints[j]; } } // We have tried to be really nice but looks like theer are no servers // in the local data center that are alive and can service this request so // just send it to teh first alive guy and see if we get anything. j = 0; for (; j < endpoints.length; ++j) { if (FailureDetector.instance().isAlive(endpoints[j])) { logger_.debug("EndPoint " + endpoints[j] + " is alive so get data from it."); return endpoints[j]; } } return null; } public Map<String, EndPoint> findSuitableEndPoints(String[] keys) throws IOException { Map<String, EndPoint> suitableEndPoints = new HashMap<String, EndPoint>(); Map<String, EndPoint[]> results = getNStorageEndPoints(keys); for (String key : keys) { EndPoint[] endpoints = results.get(key); /* indicates if we have to move on to the next key */ boolean moveOn = false; for (EndPoint endPoint : endpoints) { if (endPoint.equals(StorageService.getLocalStorageEndPoint())) { suitableEndPoints.put(key, endPoint); moveOn = true; break; } } if (moveOn) continue; int j = 0; for (; j < endpoints.length; ++j) { if (StorageService.instance().isInSameDataCenter(endpoints[j]) && FailureDetector.instance().isAlive(endpoints[j])) { logger_.debug( "EndPoint " + endpoints[j] + " is in the same data center as local storage endpoint."); suitableEndPoints.put(key, endpoints[j]); moveOn = true; break; } } if (moveOn) continue; // We have tried to be really nice but looks like theer are no servers // in the local data center that are alive and can service this request so // just send it to the first alive guy and see if we get anything. j = 0; for (; j < endpoints.length; ++j) { if (FailureDetector.instance().isAlive(endpoints[j])) { logger_.debug("EndPoint " + endpoints[j] + " is alive so get data from it."); suitableEndPoints.put(key, endpoints[j]); break; } } } return suitableEndPoints; } }