Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.net; import java.nio.channels.ClosedChannelException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import io.netty.util.concurrent.Future; import org.apache.cassandra.concurrent.ScheduledExecutors; import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.service.AbstractWriteResponseHandler; import org.apache.cassandra.utils.ExecutorUtils; import org.apache.cassandra.utils.FBUtilities; import static java.util.Collections.synchronizedList; import static java.util.concurrent.TimeUnit.MINUTES; import static java.util.concurrent.TimeUnit.NANOSECONDS; import static 
org.apache.cassandra.concurrent.Stage.MUTATION; import static org.apache.cassandra.utils.Throwables.maybeFail; /** * MessagingService implements all internode communication - with the exception of SSTable streaming (for now). * * Specifically, it's responsible for dispatch of outbound messages to other nodes and routing of inbound messages * to their appropriate {@link IVerbHandler}. * * <h2>Using MessagingService: sending requests and responses</h2> * * There are two ways to send a {@link Message}, and you should pick one depending on the desired behaviour: * 1. To send a request that expects a response back, use * {@link #sendWithCallback(Message, InetAddressAndPort, RequestCallback)} method. Once a response * message is received, {@link RequestCallback#onResponse(Message)} method will be invoked on the * provided callback - in case of a success response. In case of a failure response (see {@link Verb#FAILURE_RSP}), * or if a response doesn't arrive within verb's configured expiry time, * {@link RequestCallback#onFailure(InetAddressAndPort, RequestFailureReason)} will be invoked instead. * 2. To send a response back, or a message that expects no response, use {@link #send(Message, InetAddressAndPort)} * method. * * See also: {@link Message#out(Verb, Object)}, {@link Message#responseWith(Object)}, * and {@link Message#failureResponse(RequestFailureReason)}. * * <h2>Using MessagingService: handling a request</h2> * * As described in the previous section, to handle responses you only need to implement {@link RequestCallback} * interface - so long as your response verb handler is the default {@link ResponseVerbHandler}. * * There are two steps you need to perform to implement request handling: * 1. Create an {@link IVerbHandler} to process incoming requests and responses for the new type (if applicable). * 2. Add a new {@link Verb} to the enum for the new request type, and, if applicable, one for the response message.
* * MessagingService will now automatically invoke your handler whenever a {@link Message} with this verb arrives. * * <h1>Architecture of MessagingService</h1> * * <h2>QOS</h2> * * Since our messaging protocol is TCP-based, and also doesn't yet support interleaving messages with each other, * we need a way to prevent head-of-line blocking adversely affecting all messages - in particular, large messages * being in the way of smaller ones. To achieve that (somewhat), we maintain three messaging connections to and * from each peer: * - one for large messages - defined as being larger than {@link OutboundConnections#LARGE_MESSAGE_THRESHOLD} * (65KiB by default) * - one for small messages - defined as smaller than that threshold * - and finally, a connection for urgent messages - usually small and/or that are important to arrive * promptly, e.g. gossip-related ones * * <h2>Wire format and framing</h2> * * Small messages are grouped together into frames, and large messages are split over multiple frames. * Framing provides application-level integrity protection to otherwise raw streams of data - we use * CRC24 for frame headers and CRC32 for the entire payload. LZ4 is optionally used for compression. * * You can find the on-wire format description of individual messages in the comments for * {@link Message.Serializer}, along with format evolution notes. * For the list and descriptions of available frame decoders see {@link FrameDecoder} comments. You can * find wire format documented in the javadoc of {@link FrameDecoder} implementations: * see {@link FrameDecoderCrc} and {@link FrameDecoderLZ4} in particular. * * <h2>Architecture of outbound messaging</h2> * * {@link OutboundConnection} is the core class implementing outbound connection logic, with * {@link OutboundConnection#enqueue(Message)} being its main entry point. The connections are initiated * by {@link OutboundConnectionInitiator}.
* * Netty pipeline for outbound messaging connections generally consists of the following handlers: * * [(optional) SslHandler] <- [FrameEncoder] * * {@link OutboundConnection} handles the entire lifetime of a connection: from the very first handshake * to any necessary reconnects. * * Message-delivery flow varies depending on the connection type. * * For {@link ConnectionType#SMALL_MESSAGES} and {@link ConnectionType#URGENT_MESSAGES}, * {@link Message} serialization and delivery occurs directly on the event loop. * See {@link OutboundConnection.EventLoopDelivery} for details. * * For {@link ConnectionType#LARGE_MESSAGES}, to ensure that servicing large messages doesn't block * timely service of other requests, message serialization is offloaded to a companion thread pool * ({@link SocketFactory#synchronousWorkExecutor}). Most of the work will be performed by * {@link AsyncChannelOutputPlus}. Please see {@link OutboundConnection.LargeMessageDelivery} * for details. * * To prevent fast clients, or slow nodes on the other end of the connection from overwhelming * a host with enqueued, unsent messages on heap, we impose strict limits on how much memory enqueued, * undelivered messages can claim. * * Every individual connection gets an exclusive permit quota to use - 4MiB by default; every endpoint * (group of large, small, and urgent connections) is capped, by default, at 128MiB of undelivered messages, * and a global limit of 512MiB is imposed on all endpoints combined. * * On an attempt to {@link OutboundConnection#enqueue(Message)}, the connection will attempt to allocate * permits for message-size number of bytes from its exclusive quota; if successful, it will add the * message to the queue; if unsuccessful, it will need to allocate remainder from both endpoint and global * reserves, and if it fails to do so, the message will be rejected, and its callbacks, if any, * immediately expired.
* * For a more detailed description please see the docs and comments of {@link OutboundConnection}. * * <h2>Architecture of inbound messaging</h2> * * {@link InboundMessageHandler} is the core class implementing inbound connection logic, paired * with {@link FrameDecoder}. Inbound connections are initiated by {@link InboundConnectionInitiator}. * The primary entry points to these classes are {@link FrameDecoder#channelRead(ShareableBytes)} * and {@link InboundMessageHandler#process(FrameDecoder.Frame)}. * * Netty pipeline for inbound messaging connections generally consists of the following handlers: * * [(optional) SslHandler] -> [FrameDecoder] -> [InboundMessageHandler] * * {@link FrameDecoder} is responsible for decoding incoming frames and work stashing; {@link InboundMessageHandler} * then takes decoded frames from the decoder and processes the messages contained in them. * * The flow differs between small and large messages. Small ones are deserialized immediately, and only * then scheduled on the right thread pool for the {@link Verb} for execution. Large messages, OTOH, * aren't deserialized until they are just about to be executed on the appropriate {@link Stage}. * * Similarly to outbound handling, inbound messaging imposes strict memory utilisation limits on individual * endpoints and on global aggregate consumption, and implements simple flow control, to prevent a single * fast endpoint from overwhelming a host. * * Every individual connection gets an exclusive permit quota to use - 4MiB by default; every endpoint * (group of large, small, and urgent connections) is capped, by default, at 128MiB of unprocessed messages, * and a global limit of 512MiB is imposed on all endpoints combined. * * On arrival of a message header, the handler will attempt to allocate permits for message-size number * of bytes from its exclusive quota; if successful, it will proceed to deserializing and processing the message.
* If unsuccessful, the handler will attempt to allocate the remainder from its endpoint and global reserve; * if either allocation is unsuccessful, the handler will cease any further frame processing, and tell * {@link FrameDecoder} to stop reading from the network; subsequently, it will put itself on a special * {@link org.apache.cassandra.net.InboundMessageHandler.WaitQueue}, to be reactivated once more permits * become available. * * For a more detailed description please see the docs and comments of {@link InboundMessageHandler} and * {@link FrameDecoder}. * * <h2>Observability</h2> * * MessagingService exposes diagnostic counters for both outbound and inbound directions - received and sent * bytes and message counts, overload bytes and message count, error bytes and error counts, and many more. * * See {@link org.apache.cassandra.metrics.InternodeInboundMetrics} and * {@link org.apache.cassandra.metrics.InternodeOutboundMetrics} for JMX-exposed counters. * * We also provide {@code system_views.internode_inbound} and {@code system_views.internode_outbound} virtual tables - * implemented in {@link org.apache.cassandra.db.virtual.InternodeInboundTable} and * {@link org.apache.cassandra.db.virtual.InternodeOutboundTable} respectively. 
*/ public final class MessagingService extends MessagingServiceMBeanImpl { private static final Logger logger = LoggerFactory.getLogger(MessagingService.class); // 8 bits version, so don't waste versions public static final int VERSION_30 = 10; public static final int VERSION_3014 = 11; public static final int VERSION_40 = 12; public static final int minimum_version = VERSION_30; public static final int current_version = VERSION_40; static AcceptVersions accept_messaging = new AcceptVersions(minimum_version, current_version); static AcceptVersions accept_streaming = new AcceptVersions(current_version, current_version); private static class MSHandle { public static final MessagingService instance = new MessagingService(false); } public static MessagingService instance() { return MSHandle.instance; } public final SocketFactory socketFactory = new SocketFactory(); public final LatencySubscribers latencySubscribers = new LatencySubscribers(); public final RequestCallbacks callbacks = new RequestCallbacks(this); // a public hook for filtering messages intended for delivery to this node public final InboundSink inboundSink = new InboundSink(this); // the inbound global reserve limits and associated wait queue private final InboundMessageHandlers.GlobalResourceLimits inboundGlobalReserveLimits = new InboundMessageHandlers.GlobalResourceLimits( new ResourceLimits.Concurrent( DatabaseDescriptor.getInternodeApplicationReceiveQueueReserveGlobalCapacityInBytes())); // the socket bindings we accept incoming connections on private final InboundSockets inboundSockets = new InboundSockets( new InboundConnectionSettings().withHandlers(this::getInbound).withSocketFactory(socketFactory)); // a public hook for filtering messages intended for delivery to another node public final OutboundSink outboundSink = new OutboundSink(this::doSend); final ResourceLimits.Limit outboundGlobalReserveLimit = new ResourceLimits.Concurrent( 
DatabaseDescriptor.getInternodeApplicationSendQueueReserveGlobalCapacityInBytes()); // back-pressure implementation private final BackPressureStrategy backPressure = DatabaseDescriptor.getBackPressureStrategy(); private volatile boolean isShuttingDown; @VisibleForTesting MessagingService(boolean testOnly) { super(testOnly); OutboundConnections.scheduleUnusedConnectionMonitoring(this, ScheduledExecutors.scheduledTasks, 1L, TimeUnit.HOURS); } /** * Send a non-mutation message to a given endpoint. This method specifies a callback * which is invoked with the actual response. * * @param message message to be sent. * @param to endpoint to which the message needs to be sent * @param cb callback interface which is used to pass the responses or * suggest that a timeout occurred to the invoker of the send(). */ public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb) { sendWithCallback(message, to, cb, null); } public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb, ConnectionType specifyConnection) { callbacks.addWithExpiration(cb, message, to); updateBackPressureOnSend(to, cb, message); if (cb.invokeOnFailure() && !message.callBackOnFailure()) message = message.withCallBackOnFailure(); send(message, to, specifyConnection); } /** * Send a mutation message or a Paxos Commit to a given endpoint. This method specifies a callback * which is invoked with the actual response. * Also holds the message (only mutation messages) to determine if it * needs to trigger a hint (uses StorageProxy for that). * * @param message message to be sent. * @param to endpoint to which the message needs to be sent * @param handler callback interface which is used to pass the responses or * suggest that a timeout occurred to the invoker of the send(). 
*/ public void sendWriteWithCallback(Message message, Replica to, AbstractWriteResponseHandler<?> handler, boolean allowHints) { assert message.callBackOnFailure(); callbacks.addWithExpiration(handler, message, to, handler.consistencyLevel(), allowHints); updateBackPressureOnSend(to.endpoint(), handler, message); send(message, to.endpoint(), null); } /** * Send a message to a given endpoint. This method adheres to the fire and forget * style messaging. * * @param message messages to be sent. * @param to endpoint to which the message needs to be sent */ public void send(Message message, InetAddressAndPort to) { send(message, to, null); } public void send(Message message, InetAddressAndPort to, ConnectionType specifyConnection) { if (logger.isTraceEnabled()) { logger.trace("{} sending {} to {}@{}", FBUtilities.getBroadcastAddressAndPort(), message.verb(), message.id(), to); if (to.equals(FBUtilities.getBroadcastAddressAndPort())) logger.trace("Message-to-self {} going over MessagingService", message); } outboundSink.accept(message, to, specifyConnection); } private void doSend(Message message, InetAddressAndPort to, ConnectionType specifyConnection) { // expire the callback if the message failed to enqueue (failed to establish a connection or exceeded queue capacity) while (true) { OutboundConnections connections = getOutbound(to); try { connections.enqueue(message, specifyConnection); return; } catch (ClosedChannelException e) { if (isShuttingDown) return; // just drop the message, and let others clean up // remove the connection and try again channelManagers.remove(to, connections); } } } /** * Updates the back-pressure state on sending to the given host if enabled and the given message callback supports it. * * @param host The replica host the back-pressure state refers to. * @param callback The message callback. * @param message The actual message. 
*/ void updateBackPressureOnSend(InetAddressAndPort host, RequestCallback callback, Message<?> message) { if (DatabaseDescriptor.backPressureEnabled() && callback.supportsBackPressure()) { BackPressureState backPressureState = getBackPressureState(host); if (backPressureState != null) backPressureState.onMessageSent(message); } } /** * Updates the back-pressure state on reception from the given host if enabled and the given message callback supports it. * * @param host The replica host the back-pressure state refers to. * @param callback The message callback. * @param timeout True if updated following a timeout, false otherwise. */ void updateBackPressureOnReceive(InetAddressAndPort host, RequestCallback callback, boolean timeout) { if (DatabaseDescriptor.backPressureEnabled() && callback.supportsBackPressure()) { BackPressureState backPressureState = getBackPressureState(host); if (backPressureState == null) return; if (!timeout) backPressureState.onResponseReceived(); else backPressureState.onResponseTimeout(); } } /** * Applies back-pressure for the given hosts, according to the configured strategy. * * If the local host is present, it is removed from the pool, as back-pressure is only applied * to remote hosts. * * @param hosts The hosts to apply back-pressure to. * @param timeoutInNanos The max back-pressure timeout. 
*/ public void applyBackPressure(Iterable<InetAddressAndPort> hosts, long timeoutInNanos) { if (DatabaseDescriptor.backPressureEnabled()) { Set<BackPressureState> states = new HashSet<>(); for (InetAddressAndPort host : hosts) { if (host.equals(FBUtilities.getBroadcastAddressAndPort())) continue; states.add(getOutbound(host).getBackPressureState()); } //noinspection unchecked backPressure.apply(states, timeoutInNanos, NANOSECONDS); } } BackPressureState getBackPressureState(InetAddressAndPort host) { return getOutbound(host).getBackPressureState(); } void markExpiredCallback(InetAddressAndPort addr) { OutboundConnections conn = channelManagers.get(addr); if (conn != null) conn.incrementExpiredCallbackCount(); } /** * Only to be invoked once we believe the endpoint will never be contacted again. * * We close the connection after a five minute delay, to give asynchronous operations a chance to terminate */ public void closeOutbound(InetAddressAndPort to) { OutboundConnections pool = channelManagers.get(to); if (pool != null) pool.scheduleClose(5L, MINUTES, true).addListener(future -> channelManagers.remove(to, pool)); } /** * Only to be invoked once we believe the connections will never be used again. */ void closeOutboundNow(OutboundConnections connections) { connections.close(true) .addListener(future -> channelManagers.remove(connections.template().to, connections)); } /** * Only to be invoked once we believe the connections will never be used again. 
*/ public void removeInbound(InetAddressAndPort from) { InboundMessageHandlers handlers = messageHandlers.remove(from); if (null != handlers) handlers.releaseMetrics(); } /** * Closes any current open channel/connection to the endpoint, but does not cause any message loss, and we will * try to re-establish connections immediately */ public void interruptOutbound(InetAddressAndPort to) { OutboundConnections pool = channelManagers.get(to); if (pool != null) pool.interrupt(); } /** * Reconnect to the peer using the given {@code addr}. Outstanding messages in each channel will be sent on the * current channel. Typically this function is used for something like EC2 public IP addresses which need to be used * for communication between EC2 regions. * * @param address IP Address to identify the peer * @param preferredAddress IP Address to use (and prefer) going forward for connecting to the peer */ @SuppressWarnings("UnusedReturnValue") public Future<Void> maybeReconnectWithNewIp(InetAddressAndPort address, InetAddressAndPort preferredAddress) { if (!SystemKeyspace.updatePreferredIP(address, preferredAddress)) return null; OutboundConnections messagingPool = channelManagers.get(address); if (messagingPool != null) return messagingPool.reconnectWithNewIp(preferredAddress); return null; } /** * Wait for callbacks and don't allow any more to be created (since they could require writing hints) */ public void shutdown() { shutdown(1L, MINUTES, true, true); } public void shutdown(long timeout, TimeUnit units, boolean shutdownGracefully, boolean shutdownExecutors) { isShuttingDown = true; logger.info("Waiting for messaging service to quiesce"); // We may need to schedule hints on the mutation stage, so it's erroneous to shut down the mutation stage first assert !MUTATION.executor().isShutdown(); if (shutdownGracefully) { callbacks.shutdownGracefully(); List<Future<Void>> closing = new ArrayList<>(); for (OutboundConnections pool : channelManagers.values()) 
closing.add(pool.close(true)); long deadline = System.nanoTime() + units.toNanos(timeout); maybeFail(() -> new FutureCombiner(closing).get(timeout, units), () -> { List<ExecutorService> inboundExecutors = new ArrayList<>(); inboundSockets.close(synchronizedList(inboundExecutors)::add).get(); ExecutorUtils.awaitTermination(1L, TimeUnit.MINUTES, inboundExecutors); }, () -> { if (shutdownExecutors) shutdownExecutors(deadline); }, () -> callbacks.awaitTerminationUntil(deadline), inboundSink::clear, outboundSink::clear); } else { callbacks.shutdownNow(false); List<Future<Void>> closing = new ArrayList<>(); List<ExecutorService> inboundExecutors = synchronizedList(new ArrayList<ExecutorService>()); closing.add(inboundSockets.close(inboundExecutors::add)); for (OutboundConnections pool : channelManagers.values()) closing.add(pool.close(false)); long deadline = System.nanoTime() + units.toNanos(timeout); maybeFail(() -> new FutureCombiner(closing).get(timeout, units), () -> { if (shutdownExecutors) shutdownExecutors(deadline); }, () -> ExecutorUtils.awaitTermination(timeout, units, inboundExecutors), () -> callbacks.awaitTerminationUntil(deadline), inboundSink::clear, outboundSink::clear); } } private void shutdownExecutors(long deadlineNanos) throws TimeoutException, InterruptedException { socketFactory.shutdownNow(); socketFactory.awaitTerminationUntil(deadlineNanos); } private OutboundConnections getOutbound(InetAddressAndPort to) { OutboundConnections connections = channelManagers.get(to); if (connections == null) connections = OutboundConnections.tryRegister(channelManagers, to, new OutboundConnectionSettings(to).withDefaults(ConnectionCategory.MESSAGING), backPressure.newState(to)); return connections; } InboundMessageHandlers getInbound(InetAddressAndPort from) { InboundMessageHandlers handlers = messageHandlers.get(from); if (null != handlers) return handlers; return messageHandlers.computeIfAbsent(from, addr -> new 
InboundMessageHandlers(FBUtilities.getLocalAddressAndPort(), addr, DatabaseDescriptor.getInternodeApplicationReceiveQueueCapacityInBytes(), DatabaseDescriptor.getInternodeApplicationReceiveQueueReserveEndpointCapacityInBytes(), inboundGlobalReserveLimits, metrics, inboundSink)); } @VisibleForTesting boolean isConnected(InetAddressAndPort address, Message<?> messageOut) { OutboundConnections pool = channelManagers.get(address); if (pool == null) return false; return pool.connectionFor(messageOut).isConnected(); } public void listen() { inboundSockets.open(); } public void waitUntilListening() throws InterruptedException { inboundSockets.open().await(); } }