Java tutorial
/** * Copyright 2015 Palantir Technologies * * Licensed under the BSD-3 License (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://opensource.org/licenses/BSD-3-Clause * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.palantir.atlasdb.keyvalue.cassandra; import java.net.InetSocketAddress; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.cassandra.thrift.Cassandra; import org.apache.cassandra.thrift.CfDef; import org.apache.cassandra.thrift.EndpointDetails; import org.apache.cassandra.thrift.InvalidRequestException; import org.apache.cassandra.thrift.KsDef; import org.apache.cassandra.thrift.NotFoundException; import org.apache.cassandra.thrift.SchemaDisagreementException; import org.apache.cassandra.thrift.TokenRange; import org.apache.commons.lang.Validate; import org.apache.thrift.TException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; import com.google.common.collect.Sets; import com.palantir.atlasdb.AtlasDbConstants; import com.palantir.common.collect.Maps2; public class CassandraVerifier { private static final Logger log = LoggerFactory.getLogger(CassandraVerifier.class); // This method exists to verify a particularly nasty bug where cassandra doesn't have a // consistent ring across all of it's nodes. One node will think it owns more than the others // think it does and they will not send writes to it, but it will respond to requests // acting like it does. protected static void sanityCheckRingConsistency(Set<InetSocketAddress> currentAddrs, String keyspace, boolean isSsl, boolean safetyDisabled, int socketTimeoutMillis, int socketQueryTimeoutMillis) { Multimap<Set<TokenRange>, InetSocketAddress> tokenRangesToHost = HashMultimap.create(); for (InetSocketAddress addr : currentAddrs) { Cassandra.Client client = null; try { client = CassandraClientFactory.getClientInternal(addr, isSsl, socketTimeoutMillis, socketQueryTimeoutMillis); try { client.describe_keyspace(keyspace); } catch (NotFoundException e) { log.info( "Tried to check ring consistency for node {} before keyspace was fully setup; aborting check for now.", addr, e); return; } tokenRangesToHost.put(ImmutableSet.copyOf(client.describe_ring(keyspace)), addr); } catch (Exception e) { log.warn("failed to get ring info from host: {}", addr, e); } finally { if (client != null) { client.getOutputProtocol().getTransport().close(); } } } if (tokenRangesToHost.isEmpty()) { log.error( "Failed to get ring info for entire Cassandra cluster ({}); ring could not be checked for consistency.", keyspace); return; } if (tokenRangesToHost.keySet().size() == 1) { return; } RuntimeException e = new IllegalStateException( "Hosts have differing ring descriptions. This can lead to inconsistent reads and lost data. "); log.error("QA-86204 " + e.getMessage() + tokenRangesToHost, e); if (tokenRangesToHost.size() > 2) { for (Entry<Set<TokenRange>, Collection<InetSocketAddress>> entry : tokenRangesToHost.asMap() .entrySet()) { if (entry.getValue().size() == 1) { log.error("Host: " + entry.getValue().iterator().next() + " disagrees with the other nodes about the ring state."); } } } if (tokenRangesToHost.keySet().size() == 2) { ImmutableList<Set<TokenRange>> sets = ImmutableList.copyOf(tokenRangesToHost.keySet()); Set<TokenRange> set1 = sets.get(0); Set<TokenRange> set2 = sets.get(1); log.error("Hosts are split. group1: " + tokenRangesToHost.get(set1) + " group2: " + tokenRangesToHost.get(set2)); } logErrorOrThrow(e.getMessage(), safetyDisabled); } static Set<String> sanityCheckDatacenters(Cassandra.Client client, int desiredRf, boolean safetyDisabled) throws InvalidRequestException, TException { ensureTestKeyspaceExists(client); Set<String> hosts = Sets.newHashSet(); Multimap<String, String> dataCenterToRack = HashMultimap.create(); List<TokenRange> ring = client.describe_ring(CassandraConstants.SIMPLE_RF_TEST_KEYSPACE); for (TokenRange tokenRange : ring) { for (EndpointDetails details : tokenRange.getEndpoint_details()) { dataCenterToRack.put(details.datacenter, details.rack); hosts.add(details.host); } } if (dataCenterToRack.size() == 1) { String dc = dataCenterToRack.keySet().iterator().next(); String rack = dataCenterToRack.values().iterator().next(); if (dc.equals(CassandraConstants.DEFAULT_DC) && rack.equals(CassandraConstants.DEFAULT_RACK) && desiredRf > 1) { // We don't allow greater than RF=1 because they didn't set up their network. logErrorOrThrow( "The cassandra cluster is not set up to be datacenter and rack aware. " + "Please set this up before running with a replication factor higher than 1.", safetyDisabled); } if (dataCenterToRack.values().size() < desiredRf && hosts.size() > desiredRf) { logErrorOrThrow("The cassandra cluster only has one DC, " + "and is set up with less racks than the desired number of replicas, " + "and there are more hosts than the replication factor. " + "It is very likely that your rack configuration is incorrect and replicas would not be placed correctly for the failure tolerance you want.", safetyDisabled); } } return dataCenterToRack.keySet(); } static void sanityCheckTableName(String table) { Validate.isTrue( !(table.startsWith("_") && table.contains(".")) || AtlasDbConstants.hiddenTables.contains(table) || table.startsWith(AtlasDbConstants.NAMESPACE_PREFIX), "invalid tableName: " + table); } private static void logErrorOrThrow(String errorMessage, boolean safetyDisabled) { String safetyMessage = " This would have normally resulted in Palantir exiting, however safety checks have been disabled."; if (safetyDisabled) { log.error(errorMessage + safetyMessage); } else { throw new IllegalStateException(errorMessage); } } /* * This keyspace exists because we need a way to pull the datacenter information and they only * way to do it is if you have a valid keyspace set up. We will pull the info from here * so we can accurately create the actually NetworkTopologyStrategy keyspace. */ private static void ensureTestKeyspaceExists(Cassandra.Client client) { try { try { client.describe_keyspace(CassandraConstants.SIMPLE_RF_TEST_KEYSPACE); return; } catch (NotFoundException e) { // need to create key space } KsDef testKs = new KsDef(CassandraConstants.SIMPLE_RF_TEST_KEYSPACE, CassandraConstants.SIMPLE_STRATEGY, ImmutableList.<CfDef>of()); testKs.setStrategy_options(ImmutableMap.of(CassandraConstants.REPLICATION_FACTOR_OPTION, "1")); client.system_add_keyspace(testKs); } catch (Exception e) { log.warn(e.getMessage(), e); } } static void checkAndSetReplicationFactor(Cassandra.Client client, KsDef ks, boolean freshInstance, int desiredRf, boolean safetyDisabled) throws InvalidRequestException, SchemaDisagreementException, TException { if (freshInstance) { Set<String> dcs = CassandraVerifier.sanityCheckDatacenters(client, desiredRf, safetyDisabled); // If RF exceeds # hosts, then Cassandra will reject writes ks.setStrategy_options(Maps2.createConstantValueMap(dcs, String.valueOf(desiredRf))); return; } final Set<String> dcs; if (CassandraConstants.SIMPLE_STRATEGY.equals(ks.getStrategy_class())) { int currentRF = Integer .parseInt(ks.getStrategy_options().get(CassandraConstants.REPLICATION_FACTOR_OPTION)); String errorMessage = "This cassandra cluster is running using the simple partitioning strategy. " + "This partitioner is not rack aware and is not intended for use on prod. " + "This will have to be fixed by manually configuring to the network partitioner " + "and running the appropriate repairs. " + "Contact the AtlasDB team to perform these steps."; if (currentRF != 1) { logErrorOrThrow(errorMessage, safetyDisabled); } // Automatically convert RF=1 to look like network partitioner. dcs = CassandraVerifier.sanityCheckDatacenters(client, desiredRf, safetyDisabled); if (dcs.size() > 1) { logErrorOrThrow(errorMessage, safetyDisabled); } if (!safetyDisabled) { ks.setStrategy_class(CassandraConstants.NETWORK_STRATEGY); ks.setStrategy_options(ImmutableMap.of(dcs.iterator().next(), "1")); } } else { dcs = CassandraVerifier.sanityCheckDatacenters(client, desiredRf, safetyDisabled); } Map<String, String> strategyOptions = Maps.newHashMap(ks.getStrategy_options()); for (String dc : dcs) { if (strategyOptions.get(dc) == null) { logErrorOrThrow("The datacenter for this cassandra cluster is invalid. " + " failed dc: " + dc + " strategyOptions: " + strategyOptions, safetyDisabled); } } String dc = dcs.iterator().next(); int currentRF = Integer.parseInt(strategyOptions.get(dc)); // We need to worry about user not running repair and user skipping replication levels. if (currentRF == 1 && desiredRf == 2) { log.error("Upping AtlasDB replication factor from 1 to 2. User should run " + "`nodetool repair` on cluster if they have not already!"); strategyOptions.put(dc, String.valueOf(desiredRf)); ks.setStrategy_options(strategyOptions); } else if (currentRF == 1 && desiredRf == CassandraConstants.DEFAULT_REPLICATION_FACTOR) { log.error("Upping AtlasDB replication factor from 1 " + "to 3 is NOT allowed directly.\n" + "Increase replication factor to 2 first, then run `nodetool repair`. If it succeeds, increase replication factor to 3, and run `nodetool repair`"); } else if (currentRF == 2 && desiredRf == CassandraConstants.DEFAULT_REPLICATION_FACTOR) { strategyOptions.put(dc, String.valueOf(desiredRf)); ks.setStrategy_options(strategyOptions); ks.setCf_defs(ImmutableList.<CfDef>of()); client.system_update_keyspace(ks); log.warn("Updating AtlasDB replication factor from " + currentRF + " to " + desiredRf + " process are NOT completed!" + " User may want to run `nodetool repair` to make all replicas consistent."); } else if (currentRF > desiredRf) { // We are moving to a lower RF, this should be always safe from a consistency rule standpoint log.error("Reducing AtlasDB replication factor from " + currentRF + " to " + desiredRf + ". User may want to run `nodetool cleanup` to remove excess replication."); strategyOptions.put(dc, String.valueOf(desiredRf)); ks.setStrategy_options(strategyOptions); ks.setCf_defs(ImmutableList.<CfDef>of()); client.system_update_keyspace(ks); } else if (currentRF == desiredRf) { log.info("Did not change AtlasDB replication factor."); } else { logErrorOrThrow("We only support replication up to 3. Attempted to go from " + currentRF + " to " + desiredRf + ".", safetyDisabled); } } }