com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueService.java Source code

Java tutorial

Introduction

Here is the source code for com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueService.java

Source

/**
 * Copyright 2015 Palantir Technologies
 *
 * Licensed under the BSD-3 License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://opensource.org/licenses/BSD-3-Clause
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.keyvalue.cassandra;

import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.cassandra.thrift.CASResult;
import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.Cassandra.Client;
import org.apache.cassandra.thrift.CfDef;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.Deletion;
import org.apache.cassandra.thrift.InvalidRequestException;
import org.apache.cassandra.thrift.KeyRange;
import org.apache.cassandra.thrift.KeySlice;
import org.apache.cassandra.thrift.KsDef;
import org.apache.cassandra.thrift.Mutation;
import org.apache.cassandra.thrift.NotFoundException;
import org.apache.cassandra.thrift.SchemaDisagreementException;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.TimedOutException;
import org.apache.cassandra.thrift.UnavailableException;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.Validate;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
import com.google.common.base.Stopwatch;
import com.google.common.base.Strings;
import com.google.common.base.Supplier;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMap.Builder;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import com.google.common.primitives.UnsignedBytes;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfig;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfigManager;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.InsufficientConsistencyException;
import com.palantir.atlasdb.keyvalue.api.KeyAlreadyExistsException;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.RowResult;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.AllTimestampsCollector;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.StartTsResultsCollector;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.ThreadSafeResultVisitor;
import com.palantir.atlasdb.keyvalue.impl.AbstractKeyValueService;
import com.palantir.atlasdb.keyvalue.impl.Cells;
import com.palantir.atlasdb.keyvalue.impl.KeyValueServices;
import com.palantir.atlasdb.schema.UpgradeFailedException;
import com.palantir.atlasdb.table.description.TableMetadata;
import com.palantir.common.annotation.Idempotent;
import com.palantir.common.base.ClosableIterator;
import com.palantir.common.base.ClosableIterators;
import com.palantir.common.base.FunctionCheckedException;
import com.palantir.common.base.Throwables;
import com.palantir.common.concurrent.PTExecutors;
import com.palantir.common.exception.PalantirRuntimeException;
import com.palantir.common.pooling.PoolingContainer;
import com.palantir.util.paging.AbstractPagingIterable;
import com.palantir.util.paging.TokenBackedBasicResultsPage;

/**
 * Each C* KVS maintains a list of active nodes and the client connections attached to each node:
 *
 * n1->c1, c2, c3
 * n2->c5, c4, c9
 * n3->[N C* thrift client connections]
 *
 * Here {n1, n2, n3} are the active nodes in the C* cluster, and each node
 * holds the clients that are attached to it.
 * If some nodes go down, the change is detected through the list of active hosts,
 * and the inactive nodes are removed afterwards.
 */
public class CassandraKeyValueService extends AbstractKeyValueService {

    // Class-wide SLF4J logger.
    private static final Logger log = LoggerFactory.getLogger(CassandraKeyValueService.class);

    // Estimates the size of one (cell, value) entry as: value content length + a constant 4
    // + the approximate size of the cell. Passed to partitionByCountAndBytes in putInternal
    // when splitting writes into batches.
    // NOTE(review): the meaning of the constant 4 is not documented here -- confirm.
    private static final Function<Entry<Cell, Value>, Long> ENTRY_SIZING_FUNCTION = new Function<Entry<Cell, Value>, Long>() {
        @Override
        public Long apply(Entry<Cell, Value> input) {
            return input.getValue().getContents().length + 4L + Cells.getApproxSizeOfCell(input.getKey());
        }
    };

    // Tables that upgradeFromOlderInternalSchema accepts even when they have no persisted
    // AtlasDB metadata.
    private static final Set<String> HIDDEN_TABLES = ImmutableSet.of(CassandraConstants.METADATA_TABLE,
            CassandraTimestampBoundStore.TIMESTAMP_TABLE);

    // Source of the current CassandraKeyValueServiceConfig; methods re-read it via getConfig().
    private final CassandraKeyValueServiceConfigManager configManager;
    // Built in the constructor from the two pools below and the config manager.
    private final CassandraClientPoolingManager cassandraClientPoolingManager;
    // Built in the constructor from the config; not otherwise used in the part of the class shown here.
    private final CassandraJMXCompactionManager compactionManager;
    // Underlying multi-host client pool; its current hosts seed initialization in create().
    protected final ManyClientPoolingContainer containerPoolToUpdate;
    // RetriablePoolingContainer wrapped around containerPoolToUpdate; all reads and writes
    // in this class borrow their Thrift client from it.
    protected final PoolingContainer<Client> clientPool;
    // Fair lock (constructed with fairness = true).
    // presumably serializes schema changes such as truncateTables -- confirm against trySchemaMutationLock.
    private final ReentrantLock schemaMutationLock = new ReentrantLock(true);

    // Consistency level used for reads. Not final: lowerConsistencyWhenSafe may lower it to ONE.
    private ConsistencyLevel readConsistency = ConsistencyLevel.LOCAL_QUORUM;
    // Consistency level passed to batch_mutate by putInternal.
    private final ConsistencyLevel writeConsistency = ConsistencyLevel.EACH_QUORUM;
    // presumably the consistency level for deletes; no use is visible in this part of the file -- confirm.
    private final ConsistencyLevel deleteConsistency = ConsistencyLevel.ALL;

    // NOTE(review): neither constant is used in the visible part of the class; their
    // meaning should be confirmed where they are referenced.
    private static final long TRANSACTION_TS = 0L;
    private static final int OVERSIZE_ROW_CUTOFF = 1000;

    /**
     * Builds a {@link CassandraKeyValueService}, initializes it against the hosts currently
     * known to its client pool, and then submits the host refresh task.
     *
     * @param configManager supplies the Cassandra configuration, including the replication factor
     * @return the initialized service
     */
    public static CassandraKeyValueService create(CassandraKeyValueServiceConfigManager configManager) {
        final CassandraKeyValueService service = new CassandraKeyValueService(configManager);
        try {
            List<String> initialHosts = service.containerPoolToUpdate.getCurrentHosts();
            int replicationFactor = configManager.getConfig().replicationFactor();
            service.initializeFromFreshInstance(initialHosts, replicationFactor);
            service.getPoolingManager().submitHostRefreshTask();
        } catch (Exception failure) {
            // Any failure during setup is rethrown unchecked to the caller.
            throw Throwables.throwUncheckedException(failure);
        }
        return service;
    }

    /**
     * Wires up the executor, client pools, pooling manager and compaction manager.
     * Does not contact the cluster for schema setup; that happens in
     * {@code initializeFromFreshInstance(List, int)}, which the {@code create} factory calls.
     *
     * @param configManager supplies the Cassandra configuration
     */
    protected CassandraKeyValueService(CassandraKeyValueServiceConfigManager configManager) {
        // The executor is sized at twice the configured pool size.
        super(PTExecutors.newFixedThreadPool(
                2 * configManager.getConfig().poolSize(),
                PTExecutors.newNamedThreadFactory(false)));
        this.configManager = configManager;
        this.containerPoolToUpdate = ManyClientPoolingContainer.create(configManager.getConfig());
        this.clientPool = new RetriablePoolingContainer(this.containerPoolToUpdate);
        this.cassandraClientPoolingManager = new CassandraClientPoolingManager(
                this.containerPoolToUpdate,
                this.clientPool,
                configManager);
        this.compactionManager = CassandraJMXCompactionManager.newInstance(configManager.getConfig());
    }

    /**
     * @return the pooling manager created in the constructor for this service's client pools
     */
    public CassandraClientPoolingManager getPoolingManager() {
        return this.cassandraClientPoolingManager;
    }

    /**
     * Intentionally a no-op: instances built through {@code create} have already run
     * {@code initializeFromFreshInstance(List, int)} inside the factory method.
     */
    @Override
    public void initializeFromFreshInstance() {
        // we already did our init in our factory method
    }

    /**
     * Performs one-time initialization of the keyspace, trying each host in {@code addrList}
     * in order and returning as soon as one host completes the whole sequence.
     *
     * <p>For the first host that works, this validates the partitioner, refreshes the known
     * host list, creates or updates the keyspace with the requested replication factor,
     * creates the metadata table and runs the ring consistency check. For an existing
     * keyspace it also upgrades tables whose stored schema no longer matches.
     *
     * <p>A {@link TException} from a host is logged and recorded, and the next host is tried.
     * Any other exception (for example a failed partitioner validation) propagates immediately.
     * The Thrift transport opened for each attempt is always closed.
     *
     * @param addrList hosts to try, in order
     * @param replicationFactor desired replication factor for the keyspace
     * @throws IllegalStateException if no host could be used; the message lists the per-host errors
     * @throws IllegalArgumentException if safety checks are enabled and the cluster uses an
     *         unsupported partitioner
     */
    protected void initializeFromFreshInstance(List<String> addrList, int replicationFactor) {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        Map<String, Throwable> errorsByHost = Maps.newHashMap();

        int port = config.port();
        String keyspace = config.keyspace();
        boolean ssl = config.ssl();
        boolean safetyDisabled = config.safetyDisabled();
        int socketTimeoutMillis = config.socketTimeoutMillis();
        int socketQueryTimeoutMillis = config.socketQueryTimeoutMillis();
        for (String addr : addrList) {
            Cassandra.Client client = null;
            try {
                client = CassandraClientPoolingContainer.getClientInternal(addr, port, ssl, socketTimeoutMillis,
                        socketQueryTimeoutMillis);
                String partitioner = client.describe_partitioner();
                // The partitioner check is a safety check: enforce it unless safety has been
                // explicitly disabled (the original condition was inverted).
                if (!safetyDisabled) {
                    Validate.isTrue(
                            CassandraConstants.PARTITIONER.equals(partitioner)
                                    || CassandraConstants.PARTITIONER2.equals(partitioner),
                            "partitioner is: " + partitioner);
                }
                KsDef ks = null;
                try {
                    ks = client.describe_keyspace(keyspace);
                } catch (NotFoundException e) {
                    // need to create key space; ks stays null and is handled below
                }

                Set<String> currentHosts = cassandraClientPoolingManager.getCurrentHostsFromServer(client);
                cassandraClientPoolingManager.setHostsToCurrentHostNames(currentHosts);
                if (ks != null) { // ks already exists
                    CassandraVerifier.checkAndSetReplicationFactor(client, ks, false, replicationFactor,
                            safetyDisabled);
                    lowerConsistencyWhenSafe(client, ks, replicationFactor);
                    // Can't call system_update_keyspace to update replication factor if CfDefs are set
                    ks.setCf_defs(ImmutableList.<CfDef>of());
                    client.system_update_keyspace(ks);
                    CassandraKeyValueServices.waitForSchemaVersions(client, "(updating the existing keyspace)");
                    client.set_keyspace(keyspace);
                    createTableInternal(client, CassandraConstants.METADATA_TABLE);
                    CassandraVerifier.sanityCheckRingConsistency(currentHosts, port, keyspace, ssl,
                            safetyDisabled, socketTimeoutMillis, socketQueryTimeoutMillis);

                    upgradeFromOlderInternalSchema(client);
                    return;
                }

                // Keyspace does not exist yet: create it empty, then add the metadata table.
                ks = new KsDef(keyspace, CassandraConstants.NETWORK_STRATEGY, ImmutableList.<CfDef>of());
                CassandraVerifier.checkAndSetReplicationFactor(client, ks, true, replicationFactor, safetyDisabled);
                lowerConsistencyWhenSafe(client, ks, replicationFactor);
                ks.setDurable_writes(true);
                client.system_add_keyspace(ks);
                CassandraKeyValueServices.waitForSchemaVersions(client, "(adding the initial empty keyspace)");
                client.set_keyspace(keyspace);
                createTableInternal(client, CassandraConstants.METADATA_TABLE);
                CassandraVerifier.sanityCheckRingConsistency(currentHosts, port, keyspace, ssl,
                        safetyDisabled, socketTimeoutMillis, socketQueryTimeoutMillis);
                CassandraKeyValueServices.failQuickInInitializationIfClusterAlreadyInInconsistentState(client,
                        safetyDisabled);
                return;
            } catch (TException e) {
                // Remember the failure and fall through to the next host.
                log.warn("failed to connect to host: " + addr, e);
                errorsByHost.put(addr, e);
            } finally {
                if (client != null) {
                    client.getOutputProtocol().getTransport().close();
                }
            }
        }
        throw new IllegalStateException(CassandraKeyValueServices
                .buildErrorMessage("Could not connect to any Cassandra hosts", errorsByHost));
    }

    /**
     * Compares every column family in the keyspace with the definition this client would
     * generate from the persisted table metadata, and re-persists the metadata of tables
     * whose definitions no longer match.
     *
     * <p>If nothing needs upgrading, the cluster-consistency quick-fail check is run instead.
     *
     * @param client Thrift client already connected to the cluster
     * @throws RuntimeException wrapping an {@link UpgradeFailedException} if a table outside
     *         {@code HIDDEN_TABLES} has no persisted metadata
     */
    private void upgradeFromOlderInternalSchema(Client client)
            throws NotFoundException, InvalidRequestException, TException {
        Map<String, byte[]> persistedMetadata = getMetadataForTables();
        Map<String, byte[]> staleTables = Maps.newHashMapWithExpectedSize(persistedMetadata.size());

        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        String keyspace = config.keyspace();
        boolean safetyDisabled = config.safetyDisabled();
        List<CfDef> clusterSideCfs = client.describe_keyspace(keyspace).getCf_defs();
        for (CfDef clusterSideCf : clusterSideCfs) {
            String tableName = fromInternalTableName(clusterSideCf.getName());

            if (!persistedMetadata.containsKey(tableName)) {
                if (HIDDEN_TABLES.contains(tableName)) {
                    // only expected cases
                    continue;
                }
                // Possible to get here from a race condition with another server starting up and performing schema upgrades concurrent with us doing this check
                throw new RuntimeException(new UpgradeFailedException("Found a table " + tableName
                        + " that did not have persisted AtlasDB metadata. If you recently did a Palantir update, try waiting until schema upgrades are completed on all backend servers and restarting this service."));
            }

            byte[] clusterSideMetadata = persistedMetadata.get(tableName);
            CfDef clientSideCf = getCfForTable(tableName, clusterSideMetadata);
            if (CassandraKeyValueServices.isMatchingCf(clientSideCf, clusterSideCf)) {
                continue;
            }
            // mismatch; we have changed how we generate schema since we last persisted
            log.warn("Upgrading table {} to new internal Cassandra schema", tableName);
            staleTables.put(tableName, clusterSideMetadata);
        }

        // we are racing another server to do these same operations here, but they are idempotent / safe
        if (staleTables.isEmpty()) {
            CassandraKeyValueServices.failQuickInInitializationIfClusterAlreadyInInconsistentState(client,
                    safetyDisabled);
        } else {
            putMetadataForTables(staleTables);
        }
    }

    /**
     * Lowers {@code readConsistency} to {@link ConsistencyLevel#ONE} when the cluster has
     * exactly one datacenter whose configured replication factor equals {@code desiredRf}
     * and that factor is 2. In every other case the read consistency is left unchanged.
     *
     * <p>This is best-effort: if the datacenter check fails with a Thrift error, the failure
     * is logged and the method returns without changing anything.
     *
     * @param client Thrift client used for the datacenter check
     * @param ks keyspace definition whose strategy options hold the per-datacenter replication factors
     * @param desiredRf replication factor the caller wants
     */
    private void lowerConsistencyWhenSafe(Client client, KsDef ks, int desiredRf) {
        CassandraKeyValueServiceConfig config = configManager.getConfig();
        boolean safetyDisabled = config.safetyDisabled();

        Set<String> dcs;
        try {
            dcs = CassandraVerifier.sanityCheckDatacenters(client, desiredRf, safetyDisabled);
        } catch (InvalidRequestException e) {
            // Deliberately non-fatal, but no longer silent.
            log.warn("Could not check datacenters; leaving read consistency at " + readConsistency, e);
            return;
        } catch (TException e) {
            log.warn("Could not check datacenters; leaving read consistency at " + readConsistency, e);
            return;
        }

        if (dcs.size() != 1) {
            return;
        }

        // Read the options directly; a defensive copy is unnecessary because nothing is modified.
        Map<String, String> strategyOptions = ks.getStrategy_options();
        if (strategyOptions == null) {
            // No per-datacenter replication factors are set, so there is nothing to compare.
            return;
        }

        String configuredRf = strategyOptions.get(dcs.iterator().next());
        if (configuredRf == null) {
            return;
        }

        int currentRF = Integer.parseInt(configuredRf);
        if (currentRF == desiredRf && currentRF == 2) {
            log.info("Setting Read Consistency to ONE, as cluster has only one datacenter at RF2.");
            readConsistency = ConsistencyLevel.ONE;
        }
    }

    /**
     * Reads the requested rows at {@code startTs}.
     *
     * <p>When specific columns are selected the call is delegated to
     * {@code getRowsForSpecificColumns}. Otherwise the rows are fetched with all columns,
     * in batches of the configured fetch batch count, one multiget per batch.
     *
     * @param tableName table to read from
     * @param rows row names to fetch
     * @param selection which columns to return
     * @param startTs timestamp handed to the value extractor
     * @return the extracted cells and values from every batch
     */
    @Override
    public Map<Cell, Value> getRows(final String tableName, final Iterable<byte[]> rows, ColumnSelection selection,
            final long startTs) {
        if (!selection.allColumnsSelected()) {
            return getRowsForSpecificColumns(tableName, rows, selection, startTs);
        }

        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        int fetchBatchCount = config.fetchBatchCount();
        try {
            int totalRows = 0;
            ImmutableMap.Builder<Cell, Value> merged = ImmutableMap.<Cell, Value>builder();
            for (final List<byte[]> batch : Iterables.partition(rows, fetchBatchCount)) {
                totalRows += batch.size();

                FunctionCheckedException<Client, Map<Cell, Value>, Exception> fetchBatch =
                        new FunctionCheckedException<Client, Map<Cell, Value>, Exception>() {
                            @Override
                            public Map<Cell, Value> apply(Client client) throws Exception {
                                // Empty start and end bounds ask for every column of each row.
                                ByteBuffer unboundedStart = ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY);
                                ByteBuffer unboundedEnd = ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY);
                                SlicePredicate allColumns = new SlicePredicate();
                                allColumns.setSlice_range(
                                        new SliceRange(unboundedStart, unboundedEnd, false, Integer.MAX_VALUE));

                                List<ByteBuffer> rowKeys = Lists.newArrayListWithCapacity(batch.size());
                                for (byte[] row : batch) {
                                    rowKeys.add(ByteBuffer.wrap(row));
                                }
                                ColumnParent columnFamily = new ColumnParent(internalTableName(tableName));
                                Map<ByteBuffer, List<ColumnOrSuperColumn>> raw = multigetInternal(client,
                                        tableName, rowKeys, columnFamily, allColumns, readConsistency);
                                Map<Cell, Value> extracted = Maps.newHashMap();
                                new ValueExtractor(extracted).extractResults(raw, startTs, ColumnSelection.all());
                                return extracted;
                            }

                            @Override
                            public String toString() {
                                return "multiget_slice(" + tableName + ", " + batch.size() + " rows" + ")";
                            }
                        };

                Map<Cell, Value> batchResult = clientPool.runWithPooledResource(fetchBatch);
                merged.putAll(batchResult);
            }
            // More rows than one batch holds means the request had to be split above.
            if (totalRows > fetchBatchCount) {
                log.warn("Rebatched in getRows a call to " + tableName + " that attempted to multiget " + totalRows
                        + " rows; this may indicate overly-large batching on a higher level.\n"
                        + CassandraKeyValueServices.getFilteredStackTrace("com.palantir"));
            }
            return merged.build();
        } catch (Exception failure) {
            throw Throwables.throwUncheckedException(failure);
        }
    }

    /**
     * Reads only the selected columns of the given rows by building the full set of
     * (row, column) cells and loading them at {@code startTs}.
     *
     * @param tableName table to read from
     * @param rows row names to fetch
     * @param selection column selection; must name specific columns
     * @param startTs timestamp passed to the loader and the result collector
     * @return the results gathered by the collector
     * @throws IllegalArgumentException if {@code selection} selects all columns
     */
    private Map<Cell, Value> getRowsForSpecificColumns(final String tableName, final Iterable<byte[]> rows,
            ColumnSelection selection, final long startTs) {
        Preconditions.checkArgument(!selection.allColumnsSelected(), "Must select specific columns");

        Collection<byte[]> columns = selection.getSelectedColumns();
        int expectedCellCount = columns.size() * Iterables.size(rows);
        Set<Cell> wantedCells = Sets.newHashSetWithExpectedSize(expectedCellCount);
        // Cross product of rows and selected columns.
        for (byte[] rowName : rows) {
            for (byte[] columnName : columns) {
                wantedCells.add(Cell.create(rowName, columnName));
            }
        }

        try {
            StartTsResultsCollector results = new StartTsResultsCollector(startTs);
            loadWithTs(tableName, wantedCells, startTs, results, readConsistency);
            return results.collectedResults;
        } catch (Exception failure) {
            throw Throwables.throwUncheckedException(failure);
        }
    }

    /**
     * Loads the given cells, each at its own timestamp.
     *
     * <p>If every cell shares one timestamp, a single load is issued. Otherwise the cells are
     * grouped by timestamp and one load is issued per distinct timestamp.
     *
     * @param tableName table to read from
     * @param timestampByCell timestamp to read each cell at
     * @return the collected values; an empty map if {@code timestampByCell} is empty
     */
    @Override
    public Map<Cell, Value> get(String tableName, Map<Cell, Long> timestampByCell) {
        if (timestampByCell.isEmpty()) {
            log.info("Attempted get on '{}' table with empty cells", tableName);
            return ImmutableMap.of();
        }

        try {
            Collection<Long> timestamps = timestampByCell.values();
            Long firstTs = timestamps.iterator().next();
            boolean singleTimestamp = Iterables.all(timestamps, Predicates.equalTo(firstTs));
            if (singleTimestamp) {
                StartTsResultsCollector onlyCollector = new StartTsResultsCollector(firstTs);
                loadWithTs(tableName, timestampByCell.keySet(), firstTs, onlyCollector, readConsistency);
                return onlyCollector.collectedResults;
            }

            // Several timestamps: invert the map so each timestamp maps to its cells.
            SetMultimap<Long, Cell> cellsByTimestamp = Multimaps.invertFrom(
                    Multimaps.forMap(timestampByCell),
                    HashMultimap.<Long, Cell>create());
            Builder<Cell, Value> combined = ImmutableMap.builder();
            for (long timestamp : cellsByTimestamp.keySet()) {
                StartTsResultsCollector perTsCollector = new StartTsResultsCollector(timestamp);
                loadWithTs(tableName, cellsByTimestamp.get(timestamp), timestamp, perTsCollector,
                        readConsistency);
                combined.putAll(perTsCollector.collectedResults);
            }
            return combined.build();
        } catch (Exception failure) {
            throw Throwables.throwUncheckedException(failure);
        }
    }

    /**
     * Loads the given cells at {@code startTs} and feeds every raw multiget result to {@code v}.
     *
     * Cells are grouped by column name. Each column's cells are split into partitions of at
     * most the configured fetch batch count, and each partition is submitted to the executor
     * as its own multiget against a pooled client. The method then waits for every task.
     *
     * @param tableName table to read from
     * @param cells cells to load
     * @param startTs timestamp used to build the column slice bounds
     * @param v visitor that receives each partition's results; called from executor threads
     * @param consistency consistency level passed to the multiget
     * @throws Exception if waiting on a task fails
     */
    // NOTE: This code makes a call to cassandra for each column concurrently
    // TODO: after cassandra api change: handle different column select per row
    private void loadWithTs(final String tableName, final Set<Cell> cells, final long startTs,
            final ThreadSafeResultVisitor v, final ConsistencyLevel consistency) throws Exception {
        final ColumnParent colFam = new ColumnParent(internalTableName(tableName));
        // Group cells by column name; keys are ordered by unsigned lexicographic byte comparison.
        final Multimap<byte[], Cell> cellsByCol = TreeMultimap.create(UnsignedBytes.lexicographicalComparator(),
                Ordering.natural());
        for (Cell cell : cells) {
            cellsByCol.put(cell.getColumnName(), cell);
        }
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        int fetchBatchCount = config.fetchBatchCount();
        List<Future<?>> futures = Lists.newArrayListWithCapacity(cellsByCol.size());
        for (final byte[] col : cellsByCol.keySet()) {
            // Warn when one column has more cells than fit in a single batch.
            if (cellsByCol.get(col).size() > fetchBatchCount) {
                log.warn("Re-batching in loadWithTs a call to " + tableName + " that attempted to multiget "
                        + cellsByCol.get(col).size()
                        + " rows; this may indicate overly-large batching on a higher level.\n"
                        + CassandraKeyValueServices.getFilteredStackTrace("com.palantir"));
            }
            // One executor task per partition of this column's cells.
            for (final List<Cell> partition : Iterables.partition(cellsByCol.get(col), fetchBatchCount)) {
                futures.add(executor.submit(new Callable<Void>() {
                    @Override
                    public Void call() throws Exception {
                        return clientPool
                                .runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                                    @Override
                                    public Void apply(Client client) throws Exception {
                                        // Slice from (col, startTs - 1) to (col, -1), limited to one column per row.
                                        // presumably this picks the newest version written before startTs --
                                        // confirm against makeCompositeBuffer's encoding.
                                        ByteBuffer start = CassandraKeyValueServices.makeCompositeBuffer(col,
                                                startTs - 1);
                                        ByteBuffer end = CassandraKeyValueServices.makeCompositeBuffer(col, -1);
                                        SliceRange slice = new SliceRange(start, end, false, 1);
                                        SlicePredicate pred = new SlicePredicate();
                                        pred.setSlice_range(slice);

                                        List<ByteBuffer> rowNames = Lists
                                                .newArrayListWithCapacity(partition.size());
                                        for (Cell c : partition) {
                                            rowNames.add(ByteBuffer.wrap(c.getRowName()));
                                        }
                                        Map<ByteBuffer, List<ColumnOrSuperColumn>> results = multigetInternal(
                                                client, tableName, rowNames, colFam, pred, consistency);
                                        v.visit(results);
                                        return null;
                                    }

                                    @Override
                                    public String toString() {
                                        return "multiget_slice(" + colFam + ", " + partition.size() + " rows" + ")";
                                    }
                                });
                    }
                }));
            }
        }
        // Wait for every task, in submission order; the first failure ends the wait.
        for (Future<?> f : futures) {
            try {
                f.get();
            } catch (InterruptedException e) {
                throw Throwables.throwUncheckedException(e);
            } catch (ExecutionException e) {
                // NOTE(review): these checks are applied to the ExecutionException itself, not to
                // e.getCause(). Assuming throwIfInstance rethrows its argument when it matches the
                // class, the Error check can never match and the Exception check always rethrows e,
                // so the final throw looks unreachable. Confirm whether e.getCause() was intended.
                Throwables.throwIfInstance(e, Error.class);
                Throwables.throwIfInstance(e, Exception.class);
                throw Throwables.throwUncheckedException(e);
            }
        }
    }

    /**
     * Delegates to the superclass implementation without any Cassandra-specific handling.
     *
     * @param tableName table to read from
     * @param timestampByCell cells to look up, each with its timestamp
     * @return whatever the superclass implementation returns
     */
    @Override
    public Map<Cell, Long> getLatestTimestamps(String tableName, Map<Cell, Long> timestampByCell) {
        // TODO: optimize by only getting column name after cassandra api change
        Map<Cell, Long> latest = super.getLatestTimestamps(tableName, timestampByCell);
        return latest;
    }

    /**
     * Writes every value at the same {@code timestamp}.
     *
     * @param tableName table to write to
     * @param values contents to write, keyed by cell
     * @param timestamp timestamp applied to every value
     */
    @Override
    public void put(final String tableName, final Map<Cell, byte[]> values, final long timestamp) {
        try {
            Iterable<Map.Entry<Cell, Value>> stampedValues =
                    KeyValueServices.toConstantTimestampValues(values.entrySet(), timestamp);
            putInternal(tableName, stampedValues);
        } catch (Exception failure) {
            throw Throwables.throwUncheckedException(failure);
        }
    }

    /**
     * Writes values that each already carry their own timestamp.
     *
     * @param tableName table to write to
     * @param values values to write, keyed by cell
     */
    @Override
    public void putWithTimestamps(String tableName, Multimap<Cell, Value> values) {
        try {
            Collection<Map.Entry<Cell, Value>> allEntries = values.entries();
            putInternal(tableName, allEntries);
        } catch (Exception failure) {
            throw Throwables.throwUncheckedException(failure);
        }
    }

    /**
     * @return the mutation batch count from the current configuration
     */
    @Override
    protected int getMultiPutBatchCount() {
        CassandraKeyValueServiceConfig currentConfig = configManager.getConfig();
        return currentConfig.mutationBatchCount();
    }

    /**
     * Writes the values without an explicit TTL by passing {@code -1} as the TTL to the
     * three-argument overload.
     *
     * @param tableName table to write to
     * @param values entries to write
     */
    private void putInternal(final String tableName, final Iterable<Map.Entry<Cell, Value>> values)
            throws Exception {
        final int noExplicitTtl = -1;
        putInternal(tableName, values, noExplicitTtl);
    }

    /**
     * Writes the values on one pooled client, split into batches bounded by the configured
     * mutation batch count and batch size in bytes. Each batch is sent as one batch mutation
     * at the write consistency level.
     *
     * @param tableName table to write to
     * @param values entries to write
     * @param ttl TTL passed to {@code createColumn} for every column; values that are not
     *        positive leave the column's TTL untouched there
     */
    protected void putInternal(final String tableName, final Iterable<Map.Entry<Cell, Value>> values, final int ttl)
            throws Exception {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
            @Override
            public Void apply(Client client) throws Exception {
                int maxEntriesPerBatch = config.mutationBatchCount();
                int maxBytesPerBatch = config.mutationBatchSizeBytes();
                for (List<Entry<Cell, Value>> batch : partitionByCountAndBytes(values, maxEntriesPerBatch,
                        maxBytesPerBatch, tableName, ENTRY_SIZING_FUNCTION)) {
                    // Shape required by batch_mutate: row key -> column family -> mutations.
                    Map<ByteBuffer, Map<String, List<Mutation>>> mutationsByRow = Maps.newHashMap();
                    for (Map.Entry<Cell, Value> entry : batch) {
                        Cell cell = entry.getKey();
                        Column column = createColumn(cell, entry.getValue(), ttl);

                        ColumnOrSuperColumn wrapper = new ColumnOrSuperColumn();
                        wrapper.setColumn(column);
                        Mutation mutation = new Mutation();
                        mutation.setColumn_or_supercolumn(wrapper);

                        ByteBuffer rowKey = ByteBuffer.wrap(cell.getRowName());

                        Map<String, List<Mutation>> mutationsByTable = mutationsByRow.get(rowKey);
                        if (mutationsByTable == null) {
                            mutationsByTable = Maps.<String, List<Mutation>>newHashMap();
                            mutationsByRow.put(rowKey, mutationsByTable);
                        }

                        List<Mutation> rowMutations = mutationsByTable.get(internalTableName(tableName));
                        if (rowMutations == null) {
                            rowMutations = Lists.<Mutation>newArrayList();
                            mutationsByTable.put(internalTableName(tableName), rowMutations);
                        }

                        rowMutations.add(mutation);
                    }
                    batchMutateInternal(client, tableName, mutationsByRow, writeConsistency);
                }
                return null;
            }

            @Override
            public String toString() {
                return "batch_mutate(" + tableName + ", " + Iterables.size(values) + " values, " + ttl
                        + " ttl sec)";
            }
        });
    }

    /**
     * Builds the Thrift column for one cell/value pair. The column name is the composite of
     * the cell's column name and the value's timestamp, and the column timestamp is that
     * same value timestamp.
     *
     * <p>TTL precedence: the cell's own TTL is applied first when positive; an explicit
     * positive {@code ttl} then overrides it.
     *
     * @param cell cell being written
     * @param value contents and timestamp to store
     * @param ttl explicit TTL; ignored unless positive
     * @return the populated column
     */
    private Column createColumn(Cell cell, Value value, final int ttl) {
        byte[] payload = value.getContents();
        long valueTimestamp = value.getTimestamp();
        ByteBuffer compositeName = CassandraKeyValueServices.makeCompositeBuffer(cell.getColumnName(),
                valueTimestamp);

        Column column = new Column();
        column.setName(compositeName);
        column.setValue(payload);
        column.setTimestamp(valueTimestamp);

        if (cell.getTtlDurationMillis() > 0) {
            int cellTtl = CassandraKeyValueServices.convertTtl(cell.getTtlDurationMillis(),
                    TimeUnit.MILLISECONDS);
            column.setTtl(cellTtl);
        }

        // An explicit TTL wins over the cell's TTL.
        if (ttl > 0) {
            column.setTtl(ttl);
        }

        return column;
    }

    /**
     * Sends one batch mutation. When tracing is enabled for the table, the call is timed and
     * its trace id is logged if the call took longer than the minimum trace duration.
     *
     * @param client Thrift client to use
     * @param tableName table name, used for the tracing decision and the log message
     * @param map mutations keyed by row, then by column family
     * @param consistency consistency level for the write
     */
    private void batchMutateInternal(Client client, String tableName,
            Map<ByteBuffer, Map<String, List<Mutation>>> map, ConsistencyLevel consistency)
            throws TException, InvalidRequestException, UnavailableException, TimedOutException {
        if (!shouldTraceQuery(tableName)) {
            client.batch_mutate(map, consistency);
            return;
        }

        // Ask for tracing before the call so the trace id refers to this mutation.
        ByteBuffer traceId = client.trace_next_query();
        Stopwatch timer = Stopwatch.createStarted();
        client.batch_mutate(map, consistency);
        long elapsedMillis = timer.elapsed(TimeUnit.MILLISECONDS);
        if (elapsedMillis > getMinimumDurationToTraceMillis()) {
            log.error("Traced a call to " + tableName + " that took " + elapsedMillis + " ms."
                    + " It will appear in system_traces with UUID="
                    + CassandraKeyValueServices.convertCassandraByteBufferUUIDtoString(traceId));
        }
    }

    /**
     * Runs one multiget slice. When tracing is enabled for the table, the call is timed and
     * its trace id is logged if the call took longer than the minimum trace duration.
     *
     * @param client Thrift client to use
     * @param tableName table name, used for the tracing decision and the log message
     * @param rowNames row keys to fetch
     * @param colFam column family to read from
     * @param pred predicate selecting the columns
     * @param consistency consistency level for the read
     * @return the raw multiget result, keyed by row
     */
    private Map<ByteBuffer, List<ColumnOrSuperColumn>> multigetInternal(Client client, String tableName,
            List<ByteBuffer> rowNames, ColumnParent colFam, SlicePredicate pred, ConsistencyLevel consistency)
            throws TException, InvalidRequestException, UnavailableException, TimedOutException {
        if (!shouldTraceQuery(tableName)) {
            return client.multiget_slice(rowNames, colFam, pred, consistency);
        }

        // Ask for tracing before the call so the trace id refers to this read.
        ByteBuffer traceId = client.trace_next_query();
        Stopwatch timer = Stopwatch.createStarted();
        Map<ByteBuffer, List<ColumnOrSuperColumn>> fetched = client.multiget_slice(rowNames, colFam, pred,
                consistency);
        long elapsedMillis = timer.elapsed(TimeUnit.MILLISECONDS);
        if (elapsedMillis > getMinimumDurationToTraceMillis()) {
            log.error("Traced a call to " + tableName + " that took " + elapsedMillis + " ms."
                    + " It will appear in system_traces with UUID="
                    + CassandraKeyValueServices.convertCassandraByteBufferUUIDtoString(traceId));
        }
        return fetched;
    }

    /**
     * Truncates a single table by delegating to {@code truncateTables} with a one-element set.
     *
     * @param tableName name of the table to truncate
     */
    @Override
    public void truncateTable(final String tableName) {
        final Set<String> singleTable = ImmutableSet.of(tableName);
        truncateTables(singleTable);
    }

    /**
     * Truncates every requested table whose internal name matches (case-insensitively) a column family in
     * the configured keyspace, then waits for schema versions via
     * {@code CassandraKeyValueServices.waitForSchemaVersions}.
     * <p>
     * The whole operation runs on one pooled client while holding the schema mutation lock. Individual
     * truncate calls are traced when {@code shouldTraceQuery} says so.
     *
     * @param tableNames names of the tables to truncate; names with no matching column family are skipped
     * @throws PalantirRuntimeException if Cassandra reports an {@code UnavailableException}
     */
    @Override
    public void truncateTables(final Set<String> tableNames) {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        // Track whether the lock was actually acquired: unlocking a lock we do not hold would throw from the
        // finally block and mask the original failure (same pattern as the other schema-mutating methods).
        boolean locked = false;
        try {
            trySchemaMutationLock();
            locked = true;
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    KsDef ks = client.describe_keyspace(config.keyspace());

                    for (String tableName : tableNames) {
                        for (CfDef cf : ks.getCf_defs()) {
                            if (cf.getName().equalsIgnoreCase(internalTableName(tableName))) {
                                if (shouldTraceQuery(tableName)) {
                                    ByteBuffer recv_trace = client.trace_next_query();
                                    Stopwatch stopwatch = Stopwatch.createStarted();
                                    client.truncate(internalTableName(tableName));
                                    long duration = stopwatch.elapsed(TimeUnit.MILLISECONDS);
                                    if (duration > getMinimumDurationToTraceMillis()) {
                                        log.error("Traced a call to " + tableName + " that took " + duration
                                                + " ms." + " It will appear in system_traces with UUID="
                                                + CassandraKeyValueServices
                                                        .convertCassandraByteBufferUUIDtoString(recv_trace));
                                    }
                                } else {
                                    client.truncate(internalTableName(tableName));
                                }
                            }
                        }
                    }

                    CassandraKeyValueServices.waitForSchemaVersions(client,
                            "(" + tableNames.size() + " tables in a call to truncateTables)");
                    return null;
                }

                @Override
                public String toString() {
                    return "truncateTables(" + tableNames.size() + " tables)";
                }
            });
        } catch (UnavailableException e) {
            throw new PalantirRuntimeException(
                    "Truncating tables requires all Cassandra nodes to be up and available.");
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (locked) {
                schemaMutationLock.unlock();
            }
        }
    }

    /**
     * Deletes the given (cell, timestamp) pairs from a table.
     * <p>
     * Timestamps of each cell are sorted and spread over successive batches: the n-th smallest timestamp of
     * every cell goes into the n-th batch. Batches are then sent one after another, in index order, using
     * {@code deleteConsistency}.
     *
     * @param tableName table to delete from
     * @param keys      cells mapped to the timestamps that should be deleted for each cell
     */
    @Override
    public void delete(final String tableName, final Multimap<Cell, Long> keys) {
        try {
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    // Deletes have to be applied in timestamp order, and batch_mutate is not trusted to do
                    // that atomically, so a cell with many timestamps is spread over many batches.
                    Map<Integer, Map<ByteBuffer, Map<String, List<Mutation>>>> batchesByRound = Maps.newTreeMap();
                    for (Cell cell : keys.keySet()) {
                        int round = 0;
                        for (long ts : Ordering.natural().immutableSortedCopy(keys.get(cell))) {
                            Map<ByteBuffer, Map<String, List<Mutation>>> batch = batchesByRound.get(round);
                            if (batch == null) {
                                batch = Maps.newHashMap();
                                batchesByRound.put(round, batch);
                            }

                            // Build a deletion targeting exactly the (column, timestamp) composite column.
                            ByteBuffer compositeColumn = CassandraKeyValueServices
                                    .makeCompositeBuffer(cell.getColumnName(), ts);
                            SlicePredicate columnPredicate = new SlicePredicate();
                            columnPredicate.setColumn_names(Arrays.asList(compositeColumn));
                            Deletion deletion = new Deletion();
                            deletion.setPredicate(columnPredicate);
                            deletion.setTimestamp(Long.MAX_VALUE);
                            Mutation mutation = new Mutation();
                            mutation.setDeletion(deletion);

                            ByteBuffer rowKey = ByteBuffer.wrap(cell.getRowName());
                            Map<String, List<Mutation>> mutationsByCf = batch.get(rowKey);
                            if (mutationsByCf == null) {
                                mutationsByCf = Maps.newHashMap();
                                batch.put(rowKey, mutationsByCf);
                            }

                            String cfName = internalTableName(tableName);
                            List<Mutation> cfMutations = mutationsByCf.get(cfName);
                            if (cfMutations == null) {
                                cfMutations = Lists.newArrayList();
                                mutationsByCf.put(cfName, cfMutations);
                            }
                            cfMutations.add(mutation);
                            round++;
                        }
                    }
                    for (Map<ByteBuffer, Map<String, List<Mutation>>> batch : batchesByRound.values()) {
                        // NOTE: this runs with ConsistencyLevel.ALL rather than ConsistencyLevel.QUORUM
                        // because every copy of the data has to be removed
                        batchMutateInternal(client, tableName, batch, deleteConsistency);
                    }
                    return null;
                }

                @Override
                public String toString() {
                    return "batch_mutate(" + tableName + ", " + keys.size() + " keys" + ")";
                }
            });
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    // update CKVS.isMatchingCf if you update this method
    /**
     * Builds the column family definition for a table, starting from the standard definition and applying
     * the bloom filter and compression settings derived from the table's metadata.
     *
     * @param tableName   table whose column family definition is built
     * @param rawMetadata serialized table metadata; {@code null} or empty means defaults are used
     * @return the column family definition with bloom filter chance and compression options set
     */
    private CfDef getCfForTable(String tableName, byte[] rawMetadata) {
        Map<String, String> compression = Maps.newHashMap();
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        CfDef cfDef = CassandraConstants.getStandardCfDef(config.keyspace(), internalTableName(tableName));

        boolean hasNegativeLookups = false;
        int requestedBlockSizeKB = 0;
        boolean metadataPresent = rawMetadata != null && rawMetadata.length != 0;
        if (metadataPresent) {
            TableMetadata metadata = TableMetadata.BYTES_HYDRATOR.hydrateFromBytes(rawMetadata);
            hasNegativeLookups = metadata.hasNegativeLookups();
            requestedBlockSizeKB = metadata.getExplicitCompressionBlockSizeKB();
        }

        // Compression is always enabled. Without an explicit block size we fall back to the minimum chunk
        // size: no real compression gain is expected, but this is how we get "free" CRC checking.
        int chunkLengthKB = requestedBlockSizeKB != 0
                ? requestedBlockSizeKB
                : AtlasDbConstants.MINIMUM_COMPRESSION_BLOCK_SIZE_KB;
        compression.put(CassandraConstants.CFDEF_COMPRESSION_TYPE_KEY,
                CassandraConstants.DEFAULT_COMPRESSION_TYPE);
        compression.put(CassandraConstants.CFDEF_COMPRESSION_CHUNK_LENGTH_KEY,
                Integer.toString(chunkLengthKB));

        double bloomFilterFpChance = hasNegativeLookups
                ? CassandraConstants.NEGATIVE_LOOKUPS_BLOOM_FILTER_FP_CHANCE
                : CassandraConstants.DEFAULT_LEVELED_COMPACTION_BLOOM_FILTER_FP_CHANCE;

        cfDef.setBloom_filter_fp_chance(bloomFilterFpChance);
        cfDef.setCompression_options(compression);
        return cfDef;
    }

    //TODO: after cassandra change: handle multiRanges
    /**
     * Fetches the first page of results for each range request by delegating to
     * {@code KeyValueServices.getFirstBatchForRangesUsingGetRangeConcurrent}, using the configured
     * {@code rangesConcurrency}.
     *
     * @param tableName     table to read from
     * @param rangeRequests ranges to fetch the first batch for
     * @param timestamp     timestamp passed through to the underlying range reads
     * @return the first page of results for each range request
     */
    @Override
    @Idempotent
    public Map<RangeRequest, TokenBackedBasicResultsPage<RowResult<Value>, byte[]>> getFirstBatchForRanges(
            String tableName, Iterable<RangeRequest> rangeRequests, long timestamp) {
        int rangesConcurrency = configManager.getConfig().rangesConcurrency();
        return KeyValueServices.getFirstBatchForRangesUsingGetRangeConcurrent(
                executor, this, tableName, rangeRequests, timestamp, rangesConcurrency);
    }

    // TODO: after cassandra change: handle reverse ranges
    // TODO: after cassandra change: handle column filtering
    /**
     * Iterates over the rows of a range, reading with {@code readConsistency} and extracting values with
     * {@code ValueExtractor.SUPPLIER}.
     *
     * @param tableName    table to read from
     * @param rangeRequest range of rows to read
     * @param timestamp    timestamp passed to the results extractor
     * @return a closable iterator over the row results
     */
    @Override
    @Idempotent
    public ClosableIterator<RowResult<Value>> getRange(String tableName, final RangeRequest rangeRequest,
            final long timestamp) {
        ClosableIterator<RowResult<Value>> rows = getRangeWithPageCreator(
                tableName, rangeRequest, timestamp, readConsistency, ValueExtractor.SUPPLIER);
        return rows;
    }

    /**
     * Iterates over the rows of a range, reading with {@code deleteConsistency} and extracting timestamps
     * with {@code TimestampExtractor.SUPPLIER}.
     *
     * @param tableName    table to read from
     * @param rangeRequest range of rows to read
     * @param timestamp    timestamp passed to the results extractor
     * @return a closable iterator over row results holding sets of timestamps
     */
    @Override
    @Idempotent
    public ClosableIterator<RowResult<Set<Long>>> getRangeOfTimestamps(String tableName, RangeRequest rangeRequest,
            long timestamp) {
        ClosableIterator<RowResult<Set<Long>>> timestampRows = getRangeWithPageCreator(
                tableName, rangeRequest, timestamp, deleteConsistency, TimestampExtractor.SUPPLIER);
        return timestampRows;
    }

    /**
     * Iterates over the rows of a range, reading with {@code deleteConsistency} and extracting sets of values
     * with {@code HistoryExtractor.SUPPLIER}.
     *
     * @param tableName    table to read from
     * @param rangeRequest range of rows to read
     * @param timestamp    timestamp passed to the results extractor
     * @return a closable iterator over row results holding sets of values
     */
    @Override
    @Idempotent
    public ClosableIterator<RowResult<Set<Value>>> getRangeWithHistory(String tableName, RangeRequest rangeRequest,
            long timestamp) {
        ClosableIterator<RowResult<Set<Value>>> historyRows = getRangeWithPageCreator(
                tableName, rangeRequest, timestamp, deleteConsistency, HistoryExtractor.SUPPLIER);
        return historyRows;
    }

    /**
     * Builds a paging iterator over a row range, fetching one page per {@code get_range_slices} call and
     * converting each page with an extractor obtained from {@code resultsExtractor}.
     * <p>
     * The first page starts at {@code rangeRequest.getStartInclusive()}; each later page starts at the
     * previous page's {@code getTokenForNextPage()}. Page size is the request's batch hint, or 100 when no
     * hint is set.
     *
     * @param tableName        table to read from
     * @param rangeRequest     range to read; reverse ranges are not supported
     * @param timestamp        timestamp handed to the results extractor
     * @param consistency      consistency level used for each {@code get_range_slices} call
     * @param resultsExtractor supplies the extractor that turns raw columns into a results page
     * @return a closable iterator over the extracted row results
     * @throws UnsupportedOperationException if {@code rangeRequest.isReverse()} is true
     */
    public <T, U> ClosableIterator<RowResult<U>> getRangeWithPageCreator(final String tableName,
            final RangeRequest rangeRequest, final long timestamp, final ConsistencyLevel consistency,
            final Supplier<ResultsExtractor<T, U>> resultsExtractor) {
        if (rangeRequest.isReverse()) {
            throw new UnsupportedOperationException();
        }
        final int batchHint = rangeRequest.getBatchHint() == null ? 100 : rangeRequest.getBatchHint();
        // Slice with empty start and finish bounds and a count of Integer.MAX_VALUE
        // (presumably selects every column of each row -- confirm against the Thrift SliceRange contract).
        SliceRange slice = new SliceRange(ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY),
                ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY), false, Integer.MAX_VALUE);
        final SlicePredicate pred = new SlicePredicate();
        pred.setSlice_range(slice);

        final ColumnParent colFam = new ColumnParent(internalTableName(tableName));
        // A request that names no columns is treated as a request for all columns.
        final ColumnSelection selection = rangeRequest.getColumnNames().isEmpty() ? ColumnSelection.all()
                : ColumnSelection.create(rangeRequest.getColumnNames());
        return ClosableIterators.wrap(
                new AbstractPagingIterable<RowResult<U>, TokenBackedBasicResultsPage<RowResult<U>, byte[]>>() {
                    @Override
                    protected TokenBackedBasicResultsPage<RowResult<U>, byte[]> getFirstPage() throws Exception {
                        return getPage(rangeRequest.getStartInclusive());
                    }

                    @Override
                    protected TokenBackedBasicResultsPage<RowResult<U>, byte[]> getNextPage(
                            TokenBackedBasicResultsPage<RowResult<U>, byte[]> previous) throws Exception {
                        return getPage(previous.getTokenForNextPage());
                    }

                    // Fetches a single page starting at startKey using a client from the pool.
                    TokenBackedBasicResultsPage<RowResult<U>, byte[]> getPage(final byte[] startKey)
                            throws Exception {
                        return clientPool.runWithPooledResource(
                                new FunctionCheckedException<Client, TokenBackedBasicResultsPage<RowResult<U>, byte[]>, Exception>() {
                                    @Override
                                    public TokenBackedBasicResultsPage<RowResult<U>, byte[]> apply(Client client)
                                            throws Exception {
                                        final byte[] endExclusive = rangeRequest.getEndExclusive();

                                        KeyRange keyRange = new KeyRange(batchHint);
                                        keyRange.setStart_key(startKey);
                                        if (endExclusive.length == 0) {
                                            // An empty end bound is passed through unchanged.
                                            keyRange.setEnd_key(endExclusive);
                                        } else {
                                            // We need the previous name because this is inclusive, not exclusive
                                            keyRange.setEnd_key(
                                                    RangeRequests.previousLexicographicName(endExclusive));
                                        }

                                        List<KeySlice> firstPage;

                                        try {
                                            if (shouldTraceQuery(tableName)) {
                                                // Request a trace for the next query, time it, and log the
                                                // trace UUID only if the call exceeded the minimum duration.
                                                ByteBuffer recv_trace = client.trace_next_query();
                                                Stopwatch stopwatch = Stopwatch.createStarted();
                                                firstPage = client.get_range_slices(colFam, pred, keyRange,
                                                        consistency);
                                                long duration = stopwatch.elapsed(TimeUnit.MILLISECONDS);
                                                if (duration > getMinimumDurationToTraceMillis()) {
                                                    log.error("Traced a call to " + tableName + " that took "
                                                            + duration + " ms."
                                                            + " It will appear in system_traces with UUID="
                                                            + CassandraKeyValueServices
                                                                    .convertCassandraByteBufferUUIDtoString(
                                                                            recv_trace));
                                                }
                                            } else {
                                                firstPage = client.get_range_slices(colFam, pred, keyRange,
                                                        consistency);
                                            }
                                        } catch (UnavailableException e) {
                                            // At ConsistencyLevel.ALL an unavailable node is reported as
                                            // insufficient consistency; otherwise the exception is rethrown.
                                            if (consistency.equals(ConsistencyLevel.ALL)) {
                                                throw new InsufficientConsistencyException(
                                                        "This operation requires all Cassandra nodes to be up and available.",
                                                        e);
                                            } else {
                                                throw e;
                                            }
                                        }

                                        Map<ByteBuffer, List<ColumnOrSuperColumn>> colsByKey = CassandraKeyValueServices
                                                .getColsByKey(firstPage);
                                        return resultsExtractor.get().getPageFromRangeResults(colsByKey, timestamp,
                                                selection, endExclusive);
                                    }

                                    @Override
                                    public String toString() {
                                        return "get_range_slices(" + colFam + ")";
                                    }
                                });
                    }

                }.iterator());
    }

    /**
     * Drops a single table if a column family with its internal name exists (case-insensitive match) in the
     * configured keyspace. After dropping, the table's metadata entry is overwritten with an empty byte
     * array and the method waits for schema versions. The whole operation runs under the schema mutation
     * lock.
     *
     * @param tableName name of the table to drop; nothing happens if no matching column family exists
     * @throws InsufficientConsistencyException if Cassandra reports an {@code UnavailableException}
     */
    @Override
    public void dropTable(final String tableName) {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        boolean lockHeld = false;
        try {
            trySchemaMutationLock();
            lockHeld = true;
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    boolean tableExists = false;
                    for (CfDef existingCf : client.describe_keyspace(config.keyspace()).getCf_defs()) {
                        if (existingCf.getName().equalsIgnoreCase(internalTableName(tableName))) {
                            tableExists = true;
                            break;
                        }
                    }

                    if (tableExists) {
                        client.system_drop_column_family(internalTableName(tableName));
                        putMetadataWithoutChangingSettings(tableName, PtBytes.EMPTY_BYTE_ARRAY);
                        CassandraKeyValueServices.waitForSchemaVersions(client, tableName);
                    }
                    return null;
                }
            });
        } catch (UnavailableException e) {
            throw new InsufficientConsistencyException(
                    "Drop table requires all Cassandra nodes to be up and available.", e);
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (lockHeld) {
                schemaMutationLock.unlock();
            }
        }
    }

    /**
     * Converts a table name to its internal form: names starting with an underscore are returned unchanged,
     * otherwise the first {@code '.'} is replaced by {@code "__"}.
     *
     * @param tableName table name to convert
     * @return the internal form of the name
     */
    private static String internalTableName(String tableName) {
        return tableName.startsWith("_") ? tableName : tableName.replaceFirst("\\.", "__");
    }

    /**
     * Converts an internal table name back: names starting with an underscore are returned unchanged,
     * otherwise the first {@code "__"} is replaced by {@code '.'}.
     *
     * @param tableName internal table name to convert
     * @return the converted name
     */
    private String fromInternalTableName(String tableName) {
        boolean leadingUnderscore = tableName.startsWith("_");
        return leadingUnderscore ? tableName : tableName.replaceFirst("__", ".");
    }

    /**
     * Validates a table name. A name is rejected only when it both starts with an underscore and contains a
     * dot, and is neither listed in {@code AtlasDbConstants.hiddenTables} nor prefixed with
     * {@code AtlasDbConstants.TEMP_TABLE_PREFIX} or {@code AtlasDbConstants.NAMESPACE_PREFIX}.
     *
     * @param table table name to validate
     */
    protected void sanityCheckTableName(String table) {
        boolean acceptable = !table.startsWith("_")
                || !table.contains(".")
                || AtlasDbConstants.hiddenTables.contains(table)
                || table.startsWith(AtlasDbConstants.TEMP_TABLE_PREFIX)
                || table.startsWith(AtlasDbConstants.NAMESPACE_PREFIX);
        Validate.isTrue(acceptable, "invalid tableName: " + table);
    }

    /**
     * Drops every requested table that currently exists in the configured keyspace, clears its metadata
     * entry, and then waits once for schema versions. Requests for tables that do not exist are logged and
     * ignored.
     * <p>
     * Main gains here vs. dropTable:
     *    - problems excepting, we will basically be serializing a rapid series of schema changes
     *      through a single host checked out from the client pool, so reduced chance of schema disagreement issues
     *    - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot
     *    - one less round trip
     *
     * @param tablesToDrop names of the tables to drop
     * @throws PalantirRuntimeException if Cassandra reports an {@code UnavailableException}
     */
    @Override
    public void dropTables(final Set<String> tablesToDrop) {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        boolean locked = false;
        try {
            trySchemaMutationLock();
            locked = true;
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    KsDef ks = client.describe_keyspace(config.keyspace());
                    Set<String> existingTables = Sets.newHashSet();

                    // Column family names reported by Cassandra are internal names, lower-cased here for
                    // case-insensitive matching.
                    for (CfDef cf : ks.getCf_defs()) {
                        existingTables.add(cf.getName().toLowerCase());
                    }

                    for (String table : tablesToDrop) {
                        sanityCheckTableName(table);
                        // Match and drop by the internal column family name, as dropTable and createTables
                        // do; comparing the raw name would never match a table whose name contains a dot.
                        String caseInsensitiveCf = internalTableName(table).toLowerCase();

                        if (existingTables.contains(caseInsensitiveCf)) {
                            client.system_drop_column_family(caseInsensitiveCf);
                            // Metadata is keyed by the external (non-internal) table name.
                            putMetadataWithoutChangingSettings(table.toLowerCase(), PtBytes.EMPTY_BYTE_ARRAY);
                        } else {
                            log.warn(String.format("Ignored call to drop a table (%s) that did not exist.",
                                    table));
                        }
                    }
                    CassandraKeyValueServices.waitForSchemaVersions(client, "(all tables in a call to dropTables)");
                    return null;
                }
            });
        } catch (UnavailableException e) {
            throw new PalantirRuntimeException(
                    "Dropping tables requires all Cassandra nodes to be up and available.");
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (locked) {
                schemaMutationLock.unlock();
            }
        }
    }

    /**
     * Creates a single table after validating its name, holding the schema mutation lock while
     * {@code createTableInternal} runs on a pooled client.
     *
     * @param tableName           name of the table to create
     * @param maxValueSizeInBytes not used by this implementation
     * @throws InsufficientConsistencyException if Cassandra reports an {@code UnavailableException}
     */
    @Override
    public void createTable(final String tableName, final int maxValueSizeInBytes) {
        sanityCheckTableName(tableName);
        boolean lockHeld = false;
        try {
            trySchemaMutationLock();
            lockHeld = true;
            FunctionCheckedException<Client, Void, Exception> createTask =
                    new FunctionCheckedException<Client, Void, Exception>() {
                        @Override
                        public Void apply(Client client) throws Exception {
                            createTableInternal(client, tableName);
                            return null;
                        }
                    };
            clientPool.runWithPooledResource(createTask);
        } catch (UnavailableException e) {
            throw new InsufficientConsistencyException(
                    "Create table requires all Cassandra nodes to be up and available.", e);
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (lockHeld) {
                schemaMutationLock.unlock();
            }
        }
    }

    /**
     * Adds a column family for the table unless one with the same internal name (case-insensitive) already
     * exists in the configured keyspace. After adding, waits for schema versions.
     *
     * @param client    client used for the schema calls
     * @param tableName name of the table to create
     */
    private void createTableInternal(Client client, final String tableName)
            throws InvalidRequestException, SchemaDisagreementException, TException, NotFoundException {
        String keyspace = configManager.getConfig().keyspace();
        for (CfDef existingCf : client.describe_keyspace(keyspace).getCf_defs()) {
            if (existingCf.getName().equalsIgnoreCase(internalTableName(tableName))) {
                // Already present: nothing to create.
                return;
            }
        }
        CfDef newCf = CassandraConstants.getStandardCfDef(keyspace, internalTableName(tableName));
        client.system_add_column_family(newCf);
        CassandraKeyValueServices.waitForSchemaVersions(client, tableName);
    }

    /**
     * Creates every requested table that does not already exist in the configured keyspace, then waits once
     * for schema versions. Requests for tables that already exist are logged and ignored.
     * <p>
     * Main gains here vs. createTable:
     *    - problems excepting, we will basically be serializing a rapid series of schema changes
     *      through a single host checked out from the client pool, so reduced chance of schema disagreement issues
     *    - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot
     *    - one less round trip
     *
     * @param tableNamesToMaxValueSizeInBytes tables to create; only the keys (table names) are used here
     * @throws PalantirRuntimeException if Cassandra reports an {@code UnavailableException}
     */
    @Override
    public void createTables(final Map<String, Integer> tableNamesToMaxValueSizeInBytes) {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        boolean lockHeld = false;
        try {
            trySchemaMutationLock();
            lockHeld = true;
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    // Lower-cased names of the column families that already exist.
                    Set<String> existingCfNames = Sets.newHashSet();
                    for (CfDef existingCf : client.describe_keyspace(config.keyspace()).getCf_defs()) {
                        existingCfNames.add(existingCf.getName().toLowerCase());
                    }

                    for (String tableName : tableNamesToMaxValueSizeInBytes.keySet()) {
                        sanityCheckTableName(tableName);

                        if (existingCfNames.contains(internalTableName(tableName.toLowerCase()))) {
                            log.warn(String.format("Ignored call to create a table (%s) that already existed.",
                                    tableName));
                            continue;
                        }
                        CfDef cfToAdd = CassandraConstants.getStandardCfDef(config.keyspace(),
                                internalTableName(tableName));
                        client.system_add_column_family(cfToAdd);
                    }
                    CassandraKeyValueServices.waitForSchemaVersions(client,
                            "(all tables in a call to createTables)");
                    return null;
                }
            });
        } catch (UnavailableException e) {
            throw new PalantirRuntimeException(
                    "Creating tables requires all Cassandra nodes to be up and available.");
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (lockHeld) {
                schemaMutationLock.unlock();
            }
        }
    }

    /**
     * Lists the tables in the configured keyspace, skipping column families named in {@code HIDDEN_TABLES}
     * and converting the remaining names with {@code fromInternalTableName}.
     *
     * @return the set of converted table names
     */
    @Override
    public Set<String> getAllTableNames() {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        try {
            return clientPool.runWithPooledResource(new FunctionCheckedException<Client, Set<String>, Exception>() {
                @Override
                public Set<String> apply(Client client) throws Exception {
                    Set<String> visibleTables = Sets.newHashSet();
                    for (CfDef cf : client.describe_keyspace(config.keyspace()).getCf_defs()) {
                        String cfName = cf.getName();
                        if (HIDDEN_TABLES.contains(cfName)) {
                            continue;
                        }
                        visibleTables.add(fromInternalTableName(cfName));
                    }
                    return visibleTables;
                }

                @Override
                public String toString() {
                    return "describe_keyspace(" + config.keyspace() + ")";
                }
            });
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    /**
     * Reads the metadata stored for a table from {@code CassandraConstants.METADATA_TABLE}.
     *
     * @param tableName table whose metadata is requested
     * @return the stored metadata bytes, or an empty byte array if no value was found
     */
    @Override
    public byte[] getMetadataForTable(String tableName) {
        final Cell metadataCell = getMetadataCell(tableName);
        Map<Cell, Value> found = get(CassandraConstants.METADATA_TABLE,
                ImmutableMap.of(metadataCell, Long.MAX_VALUE));
        Value stored = found.get(metadataCell);
        return stored == null ? PtBytes.EMPTY_BYTE_ARRAY : stored.getContents();
    }

    /**
     * Reads the metadata of all currently existing tables by scanning
     * {@code CassandraConstants.METADATA_TABLE}. Entries whose table name is not returned by
     * {@code getAllTableNames()} are logged and left out of the result.
     *
     * @return metadata bytes keyed by table name; a {@code null} value is reported as an empty byte array
     */
    @Override
    public Map<String, byte[]> getMetadataForTables() {
        Map<String, byte[]> metadataByTable = Maps.newHashMap();
        ClosableIterator<RowResult<Value>> metadataRows = getRange(CassandraConstants.METADATA_TABLE,
                RangeRequest.all(), Long.MAX_VALUE);
        try {
            Set<String> liveTables = getAllTableNames();
            while (metadataRows.hasNext()) {
                Iterable<Entry<Cell, Value>> rowCells = metadataRows.next().getCells();
                for (Entry<Cell, Value> cellEntry : rowCells) {
                    Value value = cellEntry.getValue();
                    // The row name of each metadata cell is the table name.
                    String tableName = new String(cellEntry.getKey().getRowName());
                    if (!liveTables.contains(tableName)) {
                        log.info("Non-existing table {}: {}", tableName, value);
                        continue;
                    }
                    byte[] contents = value == null ? PtBytes.EMPTY_BYTE_ARRAY : value.getContents();
                    metadataByTable.put(tableName, contents);
                }
            }
        } finally {
            metadataRows.close();
        }
        return metadataByTable;
    }

    /**
     * Builds the metadata-table cell for a table: the row is the table name and the column is {@code "m"}.
     * An explicit charset would have been preferred, but thrift uses the default charset internally.
     *
     * @param tableName table whose metadata cell is built
     * @return the cell addressing that table's metadata
     */
    private Cell getMetadataCell(String tableName) {
        byte[] rowName = tableName.getBytes(Charset.defaultCharset());
        byte[] columnName = "m".getBytes();
        return Cell.create(rowName, columnName);
    }

    /**
     * Stores metadata for a single table by delegating to {@code putMetadataForTables} with a one-entry map.
     *
     * @param tableName table whose metadata is stored
     * @param meta      metadata bytes to store
     */
    @Override
    public void putMetadataForTable(final String tableName, final byte[] meta) {
        final Map<String, byte[]> singleEntry = ImmutableMap.of(tableName, meta);
        putMetadataForTables(singleEntry);
    }

    /**
     * Stores metadata for several tables. Only tables whose requested metadata differs from what is
     * currently persisted (or that have nothing persisted) are touched: their column family definitions are
     * updated under the schema mutation lock, and the new metadata is then written to
     * {@code CassandraConstants.METADATA_TABLE}.
     *
     * @param tableNameToMetadata metadata bytes keyed by table name
     */
    @Override
    public void putMetadataForTables(final Map<String, byte[]> tableNameToMetadata) {
        final Map<Cell, byte[]> requestedByCell = Maps
                .newHashMapWithExpectedSize(tableNameToMetadata.size());
        for (Entry<String, byte[]> requested : tableNameToMetadata.entrySet()) {
            requestedByCell.put(getMetadataCell(requested.getKey()), requested.getValue());
        }

        Map<Cell, Long> latestPersistedRequest = Maps.transformValues(requestedByCell,
                Functions.constant(Long.MAX_VALUE));

        // From here on we technically race other services during an update period, but the only cost of
        // ignoring that is some superfluous schema mutations and a few dead rows in the metadata table.
        Map<Cell, Value> persistedByCell = get(CassandraConstants.METADATA_TABLE, latestPersistedRequest);
        final Map<Cell, byte[]> changedMetadata = Maps.newHashMap();
        final Collection<CfDef> changedCfs = Lists.newArrayList();
        for (Entry<Cell, byte[]> requested : requestedByCell.entrySet()) {
            Cell cell = requested.getKey();
            byte[] requestedBytes = requested.getValue();
            Value persisted = persistedByCell.get(cell);
            boolean unchanged = persisted != null && Arrays.equals(persisted.getContents(), requestedBytes);
            if (unchanged) {
                continue;
            }
            changedMetadata.put(cell, requestedBytes);
            changedCfs.add(getCfForTable(new String(cell.getRowName()), requestedBytes));
        }

        if (changedMetadata.isEmpty()) {
            return;
        }

        boolean lockHeld = false;
        try {
            trySchemaMutationLock();
            lockHeld = true;
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    for (CfDef changedCf : changedCfs) {
                        client.system_update_column_family(changedCf);
                    }

                    CassandraKeyValueServices.waitForSchemaVersions(client,
                            "(all tables in a call to putMetadataForTables)");
                    // Schema mutation is done; now persist the metadata itself.
                    put(CassandraConstants.METADATA_TABLE, changedMetadata, System.currentTimeMillis());
                    return null;
                }
            });
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (lockHeld) {
                schemaMutationLock.unlock();
            }
        }
    }

    /**
     * Writes a metadata entry for a table directly to {@code CassandraConstants.METADATA_TABLE}, stamped with
     * the current wall-clock time, without issuing any column family update.
     *
     * @param tableName table whose metadata entry is written
     * @param meta      metadata bytes to store
     */
    private void putMetadataWithoutChangingSettings(final String tableName, final byte[] meta) {
        final Map<Cell, byte[]> metadataEntry = ImmutableMap.of(getMetadataCell(tableName), meta);
        put(CassandraConstants.METADATA_TABLE, metadataEntry, System.currentTimeMillis());
    }

    /**
     * Shuts this service down: stops client pooling, shuts down the config manager, closes the compaction
     * manager if one is set, and finally delegates to {@code super.close()}.
     */
    @Override
    public void close() {
        clientPool.shutdownPooling();
        configManager.shutdown();
        // The compaction manager may be absent, so guard against null before closing it.
        if (compactionManager != null) {
            compactionManager.close();
        }
        super.close();
    }

    /**
     * Tears this service down; this simply delegates to {@code close()}.
     */
    @Override
    public void teardown() {
        close();
    }

    /**
     * Writes a sentinel value for each given cell: an empty byte array at
     * {@code Value.INVALID_VALUE_TIMESTAMP}.
     *
     * @param tableName table to write the sentinels to
     * @param cells     cells that should receive a sentinel value
     */
    @Override
    public void addGarbageCollectionSentinelValues(String tableName, Set<Cell> cells) {
        try {
            final Value sentinel = Value.create(PtBytes.EMPTY_BYTE_ARRAY, Value.INVALID_VALUE_TIMESTAMP);
            // Pair every cell with the same shared sentinel value.
            Function<Cell, Map.Entry<Cell, Value>> toSentinelEntry =
                    new Function<Cell, Map.Entry<Cell, Value>>() {
                        @Override
                        public Entry<Cell, Value> apply(Cell cell) {
                            return Maps.immutableEntry(cell, sentinel);
                        }
                    };
            putInternal(tableName, Iterables.transform(cells, toSentinelEntry));
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    /**
     * Collects timestamps for the given cells by running {@code loadWithTs} with an
     * {@code AllTimestampsCollector} and the {@code deleteConsistency} consistency level, then
     * returns whatever the collector gathered.
     *
     * @param tableName table to read from
     * @param cells     cells whose timestamps are requested
     * @param ts        timestamp argument forwarded unchanged to {@code loadWithTs}
     * @return the collector's {@code collectedResults} multimap
     * @throws InsufficientConsistencyException if the load fails with an {@code UnavailableException}
     */
    @Override
    public Multimap<Cell, Long> getAllTimestamps(String tableName, Set<Cell> cells, long ts) {
        final AllTimestampsCollector timestamps = new AllTimestampsCollector();
        try {
            loadWithTs(tableName, cells, ts, timestamps, deleteConsistency);
            return timestamps.collectedResults;
        } catch (UnavailableException unavailable) {
            throw new InsufficientConsistencyException(
                    "Get all timestamps requires all Cassandra nodes to be up and available.", unavailable);
        } catch (Exception failure) {
            throw Throwables.throwUncheckedException(failure);
        }
    }

    /**
     * Writes each entry of {@code values} with a Cassandra compare-and-set whose expected column
     * list is empty, so a write only succeeds when the CAS reports success. Entries are processed
     * one at a time in the map's iteration order, each with its own CAS call; the first unsuccessful
     * CAS aborts the loop, and entries written before it are not undone by this method.
     *
     * <p>Every column is written at {@code TRANSACTION_TS}, which is used both inside the composite
     * column name and as the column timestamp.
     *
     * @param tableName column family to write to
     * @param values    cells and the contents to store for them
     * @throws KeyAlreadyExistsException if a CAS reports failure; the exception carries the
     *         offending cell
     */
    @Override
    public void putUnlessExists(final String tableName, final Map<Cell, byte[]> values)
            throws KeyAlreadyExistsException {
        final FunctionCheckedException<Client, Void, Exception> casEveryEntry =
                new FunctionCheckedException<Client, Void, Exception>() {
                    @Override
                    public Void apply(Client client) throws Exception {
                        for (Map.Entry<Cell, byte[]> entry : values.entrySet()) {
                            final Cell cell = entry.getKey();
                            final ByteBuffer rowKey = ByteBuffer.wrap(cell.getRowName());
                            final byte[] payload = entry.getValue();
                            final long casTimestamp = TRANSACTION_TS;
                            final byte[] compositeName = CassandraKeyValueServices
                                    .makeCompositeBuffer(cell.getColumnName(), casTimestamp).array();

                            final Column column = new Column();
                            column.setName(compositeName);
                            column.setValue(payload);
                            column.setTimestamp(casTimestamp);

                            // Empty "expected" list: the write is conditional on the CAS succeeding.
                            final CASResult outcome = client.cas(rowKey, tableName, ImmutableList.<Column>of(),
                                    ImmutableList.of(column), ConsistencyLevel.SERIAL, writeConsistency);
                            if (outcome.isSuccess()) {
                                continue;
                            }
                            throw new KeyAlreadyExistsException("This transaction row already exists.",
                                    ImmutableList.of(cell));
                        }
                        return null;
                    }
                };

        try {
            clientPool.runWithPooledResource(casEveryEntry);
        } catch (Exception failure) {
            throw Throwables.throwUncheckedException(failure);
        }
    }

    /**
     * Attempts to take {@code schemaMutationLock}, waiting at most
     * {@code CassandraConstants.SECONDS_TO_WAIT_FOR_SCHEMA_MUTATION_LOCK} seconds. On normal return
     * the lock is held by the calling thread and the caller is responsible for unlocking it.
     *
     * @throws InterruptedException if the thread is interrupted while waiting for the lock
     * @throws TimeoutException     if the lock could not be obtained within the wait period
     */
    private void trySchemaMutationLock() throws InterruptedException, TimeoutException {
        final boolean acquired = schemaMutationLock.tryLock(
                CassandraConstants.SECONDS_TO_WAIT_FOR_SCHEMA_MUTATION_LOCK, TimeUnit.SECONDS);
        if (acquired) {
            return;
        }
        throw new TimeoutException(
                "AtlasDB was unable to get a lock on Cassandra system schema mutations for your cluster. Likely cause: performing heavy schema mutations in parallel, or extremely heavy Cassandra cluster load.");
    }

    /**
     * Forces a compaction of {@code tableName} in the configured keyspace.
     *
     * <p>Before compacting, the table's gc_grace_seconds and tombstone threshold are both set to
     * zero via {@code alterGcAndTombstone}; afterwards they are always set back to
     * {@code CassandraConstants.GC_GRACE_SECONDS} and
     * {@code CassandraConstants.TOMBSTONE_THRESHOLD_RATIO}, whether or not the compaction finished.
     * A compaction that exceeds the configured timeout is logged together with the pending
     * compaction status rather than propagated.
     *
     * <p>If no compaction manager is available the call is logged and skipped before any table
     * setting is touched, so the table is never left with gc_grace_seconds lowered to zero for a
     * compaction that cannot run.
     *
     * @param tableName table to compact; must not be null or empty
     * @throws IllegalArgumentException if {@code tableName} is null or empty
     */
    @Override
    public void compactInternally(String tableName) {
        Preconditions.checkArgument(!Strings.isNullOrEmpty(tableName), "tableName:[%s] should not be null or empty",
                tableName);
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        String keyspace = config.keyspace();

        // The compaction manager is optional (close() null-checks it as well). Bail out before
        // altering the schema instead of failing with a NullPointerException half-way through.
        if (compactionManager == null) {
            log.error("No compaction manager is available; skipping compaction of {}.{}", keyspace, tableName);
            return;
        }

        long compactionTimeoutSeconds = config.compactionTimeoutSeconds();

        try {
            alterGcAndTombstone(keyspace, tableName, 0, 0.0f);
            compactionManager.forceTableCompaction(compactionTimeoutSeconds, keyspace, tableName);
        } catch (TimeoutException e) {
            log.error("Compaction could not finish in {} seconds!", compactionTimeoutSeconds, e);
            log.error(compactionManager.getPendingCompactionStatus());
        } finally {
            // Always restore the regular settings, even when the compaction failed or timed out.
            alterGcAndTombstone(keyspace, tableName, CassandraConstants.GC_GRACE_SECONDS,
                    CassandraConstants.TOMBSTONE_THRESHOLD_RATIO);
        }
    }

    /**
     * Sets gc_grace_seconds and the {@code tombstone_threshold} compaction option on every column
     * family of {@code keyspace} whose name matches {@code tableName} (case-insensitively), then
     * waits for schema versions via {@code CassandraKeyValueServices.waitForSchemaVersions}.
     *
     * <p>The update runs while holding the schema mutation lock. This method is best-effort: any
     * exception (including failure to obtain the lock) is logged and not propagated. If the calling
     * thread was interrupted, its interrupt status is restored before returning so that callers can
     * still observe the interruption.
     *
     * <p>NOTE(review): the compaction strategy options are set to a single-entry map, so any other
     * option previously present on the column family would presumably be dropped - confirm this is
     * intended.
     *
     * @param keyspace                  keyspace containing the table; must not be null or empty
     * @param tableName                 column family to alter; must not be null or empty
     * @param gcGraceSeconds            new gc_grace_seconds value; must not be negative
     * @param tombstone_threshold_ratio new tombstone threshold; must lie within [0.0, 1.0]
     * @throws IllegalArgumentException if any argument violates the constraints above
     */
    private void alterGcAndTombstone(final String keyspace, final String tableName, final int gcGraceSeconds,
            final float tombstone_threshold_ratio) {
        Preconditions.checkArgument(!Strings.isNullOrEmpty(keyspace), "keyspace:[%s] should not be null or empty",
                keyspace);
        Preconditions.checkArgument(!Strings.isNullOrEmpty(tableName), "tableName:[%s] should not be null or empty",
                tableName);
        Preconditions.checkArgument(gcGraceSeconds >= 0, "gcGraceSeconds:[%s] should not be negative",
                gcGraceSeconds);
        Preconditions.checkArgument(tombstone_threshold_ratio >= 0.0f && tombstone_threshold_ratio <= 1.0f,
                "tombstone_threshold_ratio:[%s] should be between [0.0, 1.0]", tombstone_threshold_ratio);

        // Tracks whether the lock was actually obtained, so the finally block only unlocks a held lock.
        boolean locked = false;
        try {
            trySchemaMutationLock();
            locked = true;
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws NotFoundException, InvalidRequestException, TException {
                    KsDef ks = client.describe_keyspace(keyspace);
                    List<CfDef> cfs = ks.getCf_defs();
                    for (CfDef cf : cfs) {
                        if (cf.getName().equalsIgnoreCase(tableName)) {
                            cf.setGc_grace_seconds(gcGraceSeconds);
                            cf.setCompaction_strategy_options(ImmutableMap.of("tombstone_threshold",
                                    String.valueOf(tombstone_threshold_ratio)));
                            client.system_update_column_family(cf);
                            CassandraKeyValueServices.waitForSchemaVersions(client, tableName);
                            log.trace("gc_grace_seconds is set to {} for {}.{}", gcGraceSeconds, keyspace,
                                    tableName);
                            log.trace("tombstone_threshold is set to {} for {}.{}", tombstone_threshold_ratio,
                                    keyspace, tableName);
                        }
                    }
                    return null;
                }
            });
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // The exception is swallowed below, so put the interrupt flag back; otherwise the
                // interruption request would be lost to everything further up the call stack.
                Thread.currentThread().interrupt();
            }
            log.error("Exception encountered while setting {}.{} gc_grace_seconds to {}", keyspace, tableName,
                    gcGraceSeconds, e);
        } finally {
            if (locked) {
                schemaMutationLock.unlock();
            }
        }
    }
}