Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.service;

import java.io.IOException;
import java.net.UnknownHostException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;

import com.google.common.base.Preconditions;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multimap;
import com.google.common.util.concurrent.AbstractFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;

import org.apache.cassandra.locator.EndpointsByRange;
import org.apache.cassandra.locator.EndpointsForRange;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
import org.apache.cassandra.concurrent.NamedThreadFactory;
import org.apache.cassandra.concurrent.ScheduledExecutors;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.exceptions.RequestFailureReason;
import org.apache.cassandra.gms.ApplicationState;
import org.apache.cassandra.gms.EndpointState;
import org.apache.cassandra.gms.FailureDetector;
import org.apache.cassandra.gms.Gossiper;
import org.apache.cassandra.gms.IFailureDetector;
import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
import org.apache.cassandra.gms.IFailureDetectionEventListener;
import org.apache.cassandra.gms.VersionedValue;
import org.apache.cassandra.locator.InetAddressAndPort;
import org.apache.cassandra.locator.TokenMetadata;
import org.apache.cassandra.net.RequestCallback;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.repair.CommonRange;
import org.apache.cassandra.streaming.PreviewKind;
import org.apache.cassandra.repair.RepairJobDesc;
import org.apache.cassandra.repair.RepairParallelism;
import org.apache.cassandra.repair.RepairSession;
import org.apache.cassandra.repair.consistent.CoordinatorSessions;
import org.apache.cassandra.repair.consistent.LocalSessions;
import org.apache.cassandra.repair.messages.*;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.utils.CassandraVersion;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.MBeanWrapper;
import org.apache.cassandra.utils.Pair;
import org.apache.cassandra.utils.UUIDGen;

import static com.google.common.collect.Iterables.concat;
import static com.google.common.collect.Iterables.transform;
import static org.apache.cassandra.net.Verb.PREPARE_MSG;

/**
 * ActiveRepairService is the starting point for manual "active" repairs.
 *
 * Each user triggered repair will correspond to one or multiple repair session,
 * one for each token range to repair. One repair session might repair multiple
 * column families. For each of those column families, the repair session will
 * request merkle trees for each replica of the range being repaired, diff those
 * trees upon receiving them, schedule the streaming of the parts to repair (based on
 * the tree diffs) and wait for all those operation. See RepairSession for more
 * details.
 *
 * The creation of a repair session is done through the submitRepairSession that
 * returns a future on the completion of that session.
 */
public class ActiveRepairService implements IEndpointStateChangeSubscriber, IFailureDetectionEventListener, ActiveRepairServiceMBean
{
    /** Status recorded per repair command via {@link #recordRepairStatus}. */
    public enum ParentRepairStatus
    {
        IN_PROGRESS, COMPLETED, FAILED
    }

    /** Groups the local and coordinator-side consistent repair session managers. */
    public static class ConsistentSessions
    {
        public final LocalSessions local = new LocalSessions();
        public final CoordinatorSessions coordinated = new CoordinatorSessions();
    }

    public final ConsistentSessions consistent = new ConsistentSessions();

    // only read and written inside the synchronized registerParentRepairSession
    private boolean registeredForEndpointChanges = false;

    private static final Logger logger = LoggerFactory.getLogger(ActiveRepairService.class);

    // singleton enforcement
    public static final ActiveRepairService instance = new ActiveRepairService(FailureDetector.instance, Gossiper.instance);

    public static final long UNREPAIRED_SSTABLE = 0;
    public static final UUID NO_PENDING_REPAIR = null;

    /**
     * A map of active coordinator session.
     */
    private final ConcurrentMap<UUID, RepairSession> sessions = new ConcurrentHashMap<>();

    /** Parent repair sessions currently registered on this node, keyed by parent session id. */
    private final ConcurrentMap<UUID, ParentRepairSession> parentRepairSessions = new ConcurrentHashMap<>();

    public final static ExecutorService repairCommandExecutor;

    static
    {
        Config.RepairCommandPoolFullStrategy strategy = DatabaseDescriptor.getRepairCommandPoolFullStrategy();
        BlockingQueue<Runnable> queue;
        // with the "reject" strategy a SynchronousQueue is used, otherwise tasks queue up in a LinkedBlockingQueue
        if (strategy == Config.RepairCommandPoolFullStrategy.reject)
            queue = new SynchronousQueue<>();
        else
            queue = new LinkedBlockingQueue<>();

        repairCommandExecutor = new JMXEnabledThreadPoolExecutor(1,
                                                                 DatabaseDescriptor.getRepairCommandPoolSize(),
                                                                 1, TimeUnit.HOURS,
                                                                 queue,
                                                                 new NamedThreadFactory("Repair-Task"),
                                                                 "internal",
                                                                 new ThreadPoolExecutor.AbortPolicy());
    }

    private final IFailureDetector failureDetector;
    private final Gossiper gossiper;

    /** Status and messages of repair commands, keyed by command number; entries expire and the size is bounded. */
    private final Cache<Integer, Pair<ParentRepairStatus, List<String>>> repairStatusByCmd;

    /**
     * Creates the service, builds the repair status cache and registers this object as an MBean under MBEAN_NAME.
     *
     * @param failureDetector failure detector that repair sessions are registered with
     * @param gossiper gossiper that repair sessions are registered with
     */
    public ActiveRepairService(IFailureDetector failureDetector, Gossiper gossiper)
    {
        this.failureDetector = failureDetector;
        this.gossiper = gossiper;
        this.repairStatusByCmd = CacheBuilder.newBuilder()
                                             .expireAfterWrite(Long.getLong("cassandra.parent_repair_status_expiry_seconds",
                                                                            TimeUnit.SECONDS.convert(1, TimeUnit.DAYS)),
                                                               TimeUnit.SECONDS)
                                             // using weight wouldn't work so well, since it doesn't reflect mutation of cached data
                                             // see https://github.com/google/guava/wiki/CachesExplained
                                             // We assume each entry is unlikely to be much more than 100 bytes, so bounding the size should be sufficient.
                                             .maximumSize(Long.getLong("cassandra.parent_repair_status_cache_size", 100_000))
                                             .build();

        MBeanWrapper.instance.registerMBean(this, MBEAN_NAME);
    }

    /**
     * Starts the local consistent sessions and schedules their periodic cleanup
     * every {@code LocalSessions.CLEANUP_INTERVAL} seconds.
     */
    public void start()
    {
        consistent.local.start();
        ScheduledExecutors.optionalTasks.scheduleAtFixedRate(consistent.local::cleanup, 0,
                                                             LocalSessions.CLEANUP_INTERVAL,
                                                             TimeUnit.SECONDS);
    }

    @Override
    public List<Map<String, String>> getSessions(boolean all)
    {
        return consistent.local.sessionInfo(all);
    }

    @Override
    public void failSession(String session, boolean force)
    {
        UUID sessionID = UUID.fromString(session);
        consistent.local.cancelSession(sessionID, force);
    }

    @Override
    public void setRepairSessionSpaceInMegabytes(int sizeInMegabytes)
    {
        DatabaseDescriptor.setRepairSessionSpaceInMegabytes(sizeInMegabytes);
    }

    @Override
    public int getRepairSessionSpaceInMegabytes()
    {
        return DatabaseDescriptor.getRepairSessionSpaceInMegabytes();
    }

    /**
     * Requests repairs for the given keyspace and column families.
     *
     * The created session is tracked in the active session map and registered with the
     * failure detector and gossiper until it completes.
     *
     * @return Future for asynchronous call or null if there is no need to repair
     *         (no endpoints in the range, or no column families given)
     */
    public RepairSession submitRepairSession(UUID parentRepairSession,
                                             CommonRange range,
                                             String keyspace,
                                             RepairParallelism parallelismDegree,
                                             boolean isIncremental,
                                             boolean pullRepair,
                                             boolean force,
                                             PreviewKind previewKind,
                                             boolean optimiseStreams,
                                             ListeningExecutorService executor,
                                             String... cfnames)
    {
        if (range.endpoints.isEmpty())
            return null;

        if (cfnames.length == 0)
            return null;

        final RepairSession session = new RepairSession(parentRepairSession, UUIDGen.getTimeUUID(), range, keyspace,
                                                        parallelismDegree, isIncremental, pullRepair, force,
                                                        previewKind, optimiseStreams, cfnames);

        sessions.put(session.getId(), session);
        // register listeners
        registerOnFdAndGossip(session);

        // remove session at completion
        session.addListener(() -> sessions.remove(session.getId()), MoreExecutors.directExecutor());
        session.start(executor);
        return session;
    }

    public boolean getUseOffheapMerkleTrees()
    {
        return DatabaseDescriptor.useOffheapMerkleTrees();
    }

    public void setUseOffheapMerkleTrees(boolean value)
    {
        DatabaseDescriptor.useOffheapMerkleTrees(value);
    }

    /**
     * Registers the task with the gossiper and the failure detector, and unregisters it
     * from both once the task completes.
     */
    private <T extends AbstractFuture &
               IEndpointStateChangeSubscriber &
               IFailureDetectionEventListener> void registerOnFdAndGossip(final T task)
    {
        gossiper.register(task);
        failureDetector.registerFailureDetectionEventListener(task);

        // unregister listeners at completion
        task.addListener(() -> {
            failureDetector.unregisterFailureDetectionEventListener(task);
            gossiper.unregister(task);
        }, MoreExecutors.directExecutor());
    }

    /**
     * Force-shuts down every active repair session and forgets all parent repair sessions.
     */
    public synchronized void terminateSessions()
    {
        Throwable cause = new IOException("Terminate session is called");
        for (RepairSession session : sessions.values())
        {
            session.forceShutdown(cause);
        }
        parentRepairSessions.clear();
    }

    /** Stores the status and messages of the given repair command in the status cache. */
    public void recordRepairStatus(int cmd, ParentRepairStatus parentRepairStatus, List<String> messages)
    {
        repairStatusByCmd.put(cmd, Pair.create(parentRepairStatus, messages));
    }

    /** @return the cached status for the given repair command, or null if none is present */
    Pair<ParentRepairStatus, List<String>> getRepairStatus(Integer cmd)
    {
        return repairStatusByCmd.getIfPresent(cmd);
    }

    /**
     * Return all of the neighbors with whom we share the provided range.
     *
     * @param keyspaceName keyspace to repair
     * @param keyspaceLocalRanges local-range for given keyspaceName
     * @param toRepair token to repair
     * @param dataCenters the data centers to involve in the repair
     * @param hosts the hosts to involve in the repair; only consulted when {@code dataCenters} is null or empty.
     *              Must include the current host and at least one neighbor sharing the range.
     *
     * @return neighbors with whom we share the provided range
     * @throws IllegalArgumentException if the requested range intersects a local range without being fully
     *         contained in it, if a host cannot be resolved, if the current host is not among {@code hosts},
     *         or if none of the other specified hosts share the range
     */
    public static EndpointsForRange getNeighbors(String keyspaceName,
                                                 Iterable<Range<Token>> keyspaceLocalRanges,
                                                 Range<Token> toRepair,
                                                 Collection<String> dataCenters,
                                                 Collection<String> hosts)
    {
        StorageService ss = StorageService.instance;
        EndpointsByRange replicaSets = ss.getRangeToAddressMap(keyspaceName);
        Range<Token> rangeSuperSet = null;
        for (Range<Token> range : keyspaceLocalRanges)
        {
            if (range.contains(toRepair))
            {
                rangeSuperSet = range;
                break;
            }
            else if (range.intersects(toRepair))
            {
                throw new IllegalArgumentException(String.format("Requested range %s intersects a local range (%s) " +
                                                                 "but is not fully contained in one; this would lead to " +
                                                                 "imprecise repair. keyspace: %s",
                                                                 toRepair.toString(), range.toString(), keyspaceName));
            }
        }
        if (rangeSuperSet == null || !replicaSets.containsKey(rangeSuperSet))
            return EndpointsForRange.empty(toRepair);

        EndpointsForRange neighbors = replicaSets.get(rangeSuperSet).withoutSelf();

        if (dataCenters != null && !dataCenters.isEmpty())
        {
            TokenMetadata.Topology topology = ss.getTokenMetadata().cloneOnlyTokenMap().getTopology();
            Multimap<String, InetAddressAndPort> dcEndpointsMap = topology.getDatacenterEndpoints();
            Iterable<InetAddressAndPort> dcEndpoints = concat(transform(dataCenters, dcEndpointsMap::get));
            return neighbors.select(dcEndpoints, true);
        }
        else if (hosts != null && !hosts.isEmpty())
        {
            Set<InetAddressAndPort> specifiedHost = new HashSet<>();
            for (final String host : hosts)
            {
                try
                {
                    final InetAddressAndPort endpoint = InetAddressAndPort.getByName(host.trim());
                    // keep only this node itself and hosts that actually replicate the range
                    if (endpoint.equals(FBUtilities.getBroadcastAddressAndPort()) || neighbors.endpoints().contains(endpoint))
                        specifiedHost.add(endpoint);
                }
                catch (UnknownHostException e)
                {
                    throw new IllegalArgumentException("Unknown host specified " + host, e);
                }
            }

            if (!specifiedHost.contains(FBUtilities.getBroadcastAddressAndPort()))
                throw new IllegalArgumentException("The current host must be part of the repair");

            // only the current host is left: none of the other specified hosts share the range
            if (specifiedHost.size() <= 1)
            {
                String msg = "Specified hosts %s do not share range %s needed for repair. Either restrict repair ranges " +
                             "with -st/-et options, or specify one of the neighbors that share this range with " +
                             "this node: %s.";
                throw new IllegalArgumentException(String.format(msg, hosts, toRepair, neighbors));
            }

            specifiedHost.remove(FBUtilities.getBroadcastAddressAndPort());
            return neighbors.keep(specifiedHost);
        }

        return neighbors;
    }

    /**
     * we only want to set repairedAt for incremental repairs including all replicas for a token range. For non-global
     * incremental repairs, forced incremental repairs, and full repairs, the UNREPAIRED_SSTABLE value will prevent
     * sstables from being promoted to repaired or preserve the repairedAt/pendingRepair values, respectively.
     */
    static long getRepairedAt(RepairOption options, boolean force)
    {
        // we only want to set repairedAt for incremental repairs including all replicas for a token range. For non-global incremental repairs, full repairs, the UNREPAIRED_SSTABLE value will prevent
        // sstables from being promoted to repaired or preserve the repairedAt/pendingRepair values, respectively. For forced repairs, repairedAt time is only set to UNREPAIRED_SSTABLE if we actually
        // end up skipping replicas
        if (options.isIncremental() && options.isGlobal() && !force)
        {
            return System.currentTimeMillis();
        }
        else
        {
            return ActiveRepairService.UNREPAIRED_SSTABLE;
        }
    }

    /**
     * Registers the parent repair session locally, sends a prepare message to every live endpoint
     * and waits (up to one hour) for all of them to answer.
     *
     * If an endpoint is down (except for forced non-incremental repairs), replies are missing or negative,
     * or the wait is interrupted, the parent repair session is removed again and a RuntimeException is thrown.
     * On interruption the thread's interrupt flag is restored before the exception propagates.
     *
     * @return the given parent repair session id
     */
    public UUID prepareForRepair(UUID parentRepairSession,
                                 InetAddressAndPort coordinator,
                                 Set<InetAddressAndPort> endpoints,
                                 RepairOption options,
                                 boolean isForcedRepair,
                                 List<ColumnFamilyStore> columnFamilyStores)
    {
        long repairedAt = getRepairedAt(options, isForcedRepair);
        registerParentRepairSession(parentRepairSession, coordinator, columnFamilyStores, options.getRanges(),
                                    options.isIncremental(), repairedAt, options.isGlobal(), options.getPreviewKind());
        final CountDownLatch prepareLatch = new CountDownLatch(endpoints.size());
        final AtomicBoolean status = new AtomicBoolean(true);
        final Set<String> failedNodes = Collections.synchronizedSet(new HashSet<>());
        RequestCallback callback = new RequestCallback()
        {
            @Override
            public void onResponse(Message msg)
            {
                prepareLatch.countDown();
            }

            @Override
            public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason)
            {
                status.set(false);
                failedNodes.add(from.toString());
                prepareLatch.countDown();
            }

            @Override
            public boolean invokeOnFailure()
            {
                return true;
            }
        };

        List<TableId> tableIds = new ArrayList<>(columnFamilyStores.size());
        for (ColumnFamilyStore cfs : columnFamilyStores)
            tableIds.add(cfs.metadata.id);

        for (InetAddressAndPort neighbour : endpoints)
        {
            if (FailureDetector.instance.isAlive(neighbour))
            {
                PrepareMessage message = new PrepareMessage(parentRepairSession, tableIds, options.getRanges(),
                                                            options.isIncremental(), repairedAt, options.isGlobal(),
                                                            options.getPreviewKind());
                Message<RepairMessage> msg = Message.out(PREPARE_MSG, message);
                MessagingService.instance().sendWithCallback(msg, neighbour, callback);
            }
            else
            {
                // we pre-filter the endpoints we want to repair for forced incremental repairs. So if any of the
                // remaining ones go down, we still want to fail so we don't create repair sessions that can't complete
                if (isForcedRepair && !options.isIncremental())
                {
                    prepareLatch.countDown();
                }
                else
                {
                    // bailout early to avoid potentially waiting for a long time.
                    failRepair(parentRepairSession, "Endpoint not alive: " + neighbour);
                }
            }
        }

        try
        {
            // Failed repair is expensive so we wait for longer time.
            if (!prepareLatch.await(1, TimeUnit.HOURS))
            {
                failRepair(parentRepairSession, "Did not get replies from all endpoints.");
            }
        }
        catch (InterruptedException e)
        {
            try
            {
                failRepair(parentRepairSession, "Interrupted while waiting for prepare repair response.");
            }
            finally
            {
                // restore the interrupt flag only after the cleanup in failRepair has run,
                // so callers further up the stack can still observe the interruption
                Thread.currentThread().interrupt();
            }
        }

        if (!status.get())
        {
            failRepair(parentRepairSession, "Got negative replies from endpoints " + failedNodes);
        }

        return parentRepairSession;
    }

    /**
     * Removes the parent repair session (if still registered) and always throws.
     *
     * @throws RuntimeException carrying {@code errorMsg}
     */
    private void failRepair(UUID parentRepairSession, String errorMsg)
    {
        removeParentRepairSession(parentRepairSession);
        throw new RuntimeException(errorMsg);
    }

    /**
     * Registers a parent repair session under the given id unless one is already registered.
     * On first use this also subscribes the service to gossip and failure detection events.
     */
    public synchronized void registerParentRepairSession(UUID parentRepairSession,
                                                         InetAddressAndPort coordinator,
                                                         List<ColumnFamilyStore> columnFamilyStores,
                                                         Collection<Range<Token>> ranges,
                                                         boolean isIncremental,
                                                         long repairedAt,
                                                         boolean isGlobal,
                                                         PreviewKind previewKind)
    {
        assert isIncremental || repairedAt == ActiveRepairService.UNREPAIRED_SSTABLE;
        if (!registeredForEndpointChanges)
        {
            Gossiper.instance.register(this);
            FailureDetector.instance.registerFailureDetectionEventListener(this);
            registeredForEndpointChanges = true;
        }

        if (!parentRepairSessions.containsKey(parentRepairSession))
        {
            parentRepairSessions.put(parentRepairSession,
                                     new ParentRepairSession(coordinator, columnFamilyStores, ranges, isIncremental,
                                                             repairedAt, isGlobal, previewKind));
        }
    }

    /**
     * @return the parent repair session registered under the given id
     * @throws RuntimeException if no such session is registered
     */
    public ParentRepairSession getParentRepairSession(UUID parentSessionId)
    {
        ParentRepairSession session = parentRepairSessions.get(parentSessionId);
        // this can happen if a node thinks that the coordinator was down, but that coordinator got back before noticing
        // that it was down itself.
        if (session == null)
            throw new RuntimeException("Parent repair session with id = " + parentSessionId + " has failed.");

        return session;
    }

    /**
     * called when the repair session is done - either failed or anticompaction has completed
     *
     * clears out any snapshots created by this repair
     *
     * Removing an id that is not (or no longer) registered is a no-op, so that cleanup paths
     * such as a failing prepare or a convicted coordinator do not raise a misleading error.
     *
     * @param parentSessionId id of the parent repair session to remove
     * @return the removed session, or null if no session was registered under that id
     */
    public synchronized ParentRepairSession removeParentRepairSession(UUID parentSessionId)
    {
        ParentRepairSession session = parentRepairSessions.get(parentSessionId);
        if (session == null)
            return null;

        String snapshotName = parentSessionId.toString();
        for (ColumnFamilyStore cfs : session.columnFamilyStores.values())
        {
            if (cfs.snapshotExists(snapshotName))
                cfs.clearSnapshot(snapshotName);
        }
        return parentRepairSessions.remove(parentSessionId);
    }

    /**
     * Dispatches a validation or sync response to the active repair session it belongs to.
     * Messages for unknown sessions and messages with any other verb are ignored.
     */
    public void handleMessage(Message<? extends RepairMessage> message)
    {
        RepairJobDesc desc = message.payload.desc;
        RepairSession session = sessions.get(desc.sessionId);
        if (session == null)
            return;
        switch (message.verb())
        {
            case VALIDATION_RSP:
                ValidationResponse validation = (ValidationResponse) message.payload;
                session.validationComplete(desc, message.from(), validation.trees);
                break;
            case SYNC_RSP:
                // one of replica is synced.
                SyncResponse sync = (SyncResponse) message.payload;
                session.syncComplete(desc, sync.nodes, sync.success, sync.summaries);
                break;
            default:
                break;
        }
    }

    /**
     * We keep a ParentRepairSession around for the duration of the entire repair, for example, on a 256 token vnode rf=3 cluster
     * we would have 768 RepairSession but only one ParentRepairSession. We use the PRS to avoid anticompacting the sstables
     * 768 times, instead we take all repaired ranges at the end of the repair and anticompact once.
     */
    public static class ParentRepairSession
    {
        private final Keyspace keyspace;
        private final Map<TableId, ColumnFamilyStore> columnFamilyStores = new HashMap<>();
        private final Collection<Range<Token>> ranges;
        public final boolean isIncremental;
        public final boolean isGlobal;
        public final long repairedAt;
        public final InetAddressAndPort coordinator;
        public final PreviewKind previewKind;

        /**
         * @throws IllegalArgumentException if the given column family stores do not all belong to exactly one keyspace
         */
        public ParentRepairSession(InetAddressAndPort coordinator,
                                   List<ColumnFamilyStore> columnFamilyStores,
                                   Collection<Range<Token>> ranges,
                                   boolean isIncremental,
                                   long repairedAt,
                                   boolean isGlobal,
                                   PreviewKind previewKind)
        {
            this.coordinator = coordinator;
            Set<Keyspace> keyspaces = new HashSet<>();
            for (ColumnFamilyStore cfs : columnFamilyStores)
            {
                keyspaces.add(cfs.keyspace);
                this.columnFamilyStores.put(cfs.metadata.id, cfs);
            }

            Preconditions.checkArgument(keyspaces.size() == 1, "repair sessions cannot operate on multiple keyspaces");
            this.keyspace = Iterables.getOnlyElement(keyspaces);

            this.ranges = ranges;
            this.repairedAt = repairedAt;
            this.isIncremental = isIncremental;
            this.isGlobal = isGlobal;
            this.previewKind = previewKind;
        }

        public boolean isPreview()
        {
            return previewKind != PreviewKind.NONE;
        }

        /** @return an immutable snapshot of the column family stores of this session */
        public Collection<ColumnFamilyStore> getColumnFamilyStores()
        {
            return ImmutableSet.<ColumnFamilyStore>builder().addAll(columnFamilyStores.values()).build();
        }

        public Keyspace getKeyspace()
        {
            return keyspace;
        }

        public Set<TableId> getTableIds()
        {
            return ImmutableSet.copyOf(transform(getColumnFamilyStores(), cfs -> cfs.metadata.id));
        }

        public Set<Range<Token>> getRanges()
        {
            return ImmutableSet.copyOf(ranges);
        }

        @Override
        public String toString()
        {
            return "ParentRepairSession{" +
                   "columnFamilyStores=" + columnFamilyStores +
                   ", ranges=" + ranges +
                   ", repairedAt=" + repairedAt +
                   '}';
        }
    }

    /*
    If the coordinator node dies we should remove the parent repair session from the other nodes.
    This uses the same notifications as we get in RepairSession
     */
    public void onJoin(InetAddressAndPort endpoint, EndpointState epState)
    {
    }

    public void beforeChange(InetAddressAndPort endpoint, EndpointState currentState, ApplicationState newStateKey, VersionedValue newValue)
    {
    }

    public void onChange(InetAddressAndPort endpoint, ApplicationState state, VersionedValue value)
    {
    }

    public void onAlive(InetAddressAndPort endpoint, EndpointState state)
    {
    }

    public void onDead(InetAddressAndPort endpoint, EndpointState state)
    {
    }

    public void onRemove(InetAddressAndPort endpoint)
    {
        // a removed node is convicted unconditionally
        convict(endpoint, Double.MAX_VALUE);
    }

    public void onRestart(InetAddressAndPort endpoint, EndpointState state)
    {
        // a restarted node is convicted unconditionally
        convict(endpoint, Double.MAX_VALUE);
    }

    /**
     * Something has happened to a remote node - if that node is the coordinator of any parent repair session
     * registered here, those parent repair sessions are removed (which also clears their snapshots).
     *
     * Nothing happens when phi is below twice the configured conviction threshold or when no parent
     * repair session is registered.
     *
     * @param ep  endpoint to be convicted
     * @param phi the value of phi with which ep was convicted
     */
    public void convict(InetAddressAndPort ep, double phi)
    {
        // We want a higher confidence in the failure detection than usual because failing a repair wrongly has a high cost.
        if (phi < 2 * DatabaseDescriptor.getPhiConvictThreshold() || parentRepairSessions.isEmpty())
            return;

        Set<UUID> toRemove = new HashSet<>();

        for (Map.Entry<UUID, ParentRepairSession> repairSessionEntry : parentRepairSessions.entrySet())
        {
            if (repairSessionEntry.getValue().coordinator.equals(ep))
            {
                toRemove.add(repairSessionEntry.getKey());
            }
        }

        if (!toRemove.isEmpty())
        {
            logger.debug("Removing {} in parent repair sessions", toRemove);
            for (UUID id : toRemove)
                removeParentRepairSession(id);
        }
    }
}