com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueService.java Source code

Introduction

Here is the source code for com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueService.java
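This class implements AtlasDB's key value service on top of Cassandra's Thrift API: reads and
writes are partitioned by host and executed in parallel on an internal thread pool, with
consistency levels chosen per operation (quorum reads, EACH_QUORUM writes, ALL for deletes).

A minimal usage sketch (hedged: the config manager below is a placeholder that would come from
your AtlasDB Cassandra configuration, and the table name, row key, and timestamp are made up;
only create(...) and getRows(...) are taken from this file):

    // Assumption: configManager is wired up elsewhere from AtlasDB configuration.
    CassandraKeyValueServiceConfigManager configManager = /* obtained elsewhere */ null;
    CassandraKeyValueService kvs = CassandraKeyValueService.create(configManager);

    // Read the most recent value below a start timestamp for one row, all columns.
    Map<Cell, Value> latest = kvs.getRows(
            "my_table",                              // hypothetical table name
            ImmutableList.of("row1".getBytes()),     // hypothetical row key
            ColumnSelection.all(),
            100L);                                   // hypothetical start timestamp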

Source

/**
 * Copyright 2015 Palantir Technologies
 *
 * Licensed under the BSD-3 License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://opensource.org/licenses/BSD-3-Clause
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.keyvalue.cassandra;

import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.cassandra.thrift.CASResult;
import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.Cassandra.Client;
import org.apache.cassandra.thrift.CfDef;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.Deletion;
import org.apache.cassandra.thrift.InvalidRequestException;
import org.apache.cassandra.thrift.KeyRange;
import org.apache.cassandra.thrift.KeySlice;
import org.apache.cassandra.thrift.KsDef;
import org.apache.cassandra.thrift.Mutation;
import org.apache.cassandra.thrift.NotFoundException;
import org.apache.cassandra.thrift.SchemaDisagreementException;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.UnavailableException;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
import com.google.common.base.Stopwatch;
import com.google.common.base.Strings;
import com.google.common.base.Supplier;
import com.google.common.base.Verify;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMap.Builder;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import com.google.common.primitives.UnsignedBytes;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfig;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfigManager;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.InsufficientConsistencyException;
import com.palantir.atlasdb.keyvalue.api.KeyAlreadyExistsException;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.RowResult;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.AllTimestampsCollector;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.StartTsResultsCollector;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.ThreadSafeResultVisitor;
import com.palantir.atlasdb.keyvalue.cassandra.jmx.CassandraJmxCompaction;
import com.palantir.atlasdb.keyvalue.cassandra.jmx.CassandraJmxCompactionManager;
import com.palantir.atlasdb.keyvalue.impl.AbstractKeyValueService;
import com.palantir.atlasdb.keyvalue.impl.Cells;
import com.palantir.atlasdb.keyvalue.impl.KeyValueServices;
import com.palantir.atlasdb.protos.generated.TableMetadataPersistence;
import com.palantir.atlasdb.table.description.TableMetadata;
import com.palantir.common.annotation.Idempotent;
import com.palantir.common.base.ClosableIterator;
import com.palantir.common.base.ClosableIterators;
import com.palantir.common.base.FunctionCheckedException;
import com.palantir.common.base.Throwables;
import com.palantir.common.concurrent.PTExecutors;
import com.palantir.common.exception.PalantirRuntimeException;
import com.palantir.common.pooling.PoolingContainer;
import com.palantir.util.paging.AbstractPagingIterable;
import com.palantir.util.paging.SimpleTokenBackedResultsPage;
import com.palantir.util.paging.TokenBackedBasicResultsPage;

/**
 * Each service can have one or many C* KVS instances.
 * For each C* KVS, this class maintains the list of active nodes and the client connections
 * attached to each node, e.g.:
 *
 * n1->c1, c2, c3
 * n2->c5, c4, c9
 * n3->[N C* thrift client connections]
 *
 * where {n1, n2, n3} are the active nodes in the C* cluster and each node holds the clients
 * attached to it. If some nodes go down, the change is detected through the set of active
 * hosts, and the inactive nodes are removed afterwards.
 */
public class CassandraKeyValueService extends AbstractKeyValueService {

    private static final Logger log = LoggerFactory.getLogger(CassandraKeyValueService.class);

    private static final Function<Entry<Cell, Value>, Long> ENTRY_SIZING_FUNCTION = new Function<Entry<Cell, Value>, Long>() {
        @Override
        public Long apply(Entry<Cell, Value> input) {
            return input.getValue().getContents().length + 4L + Cells.getApproxSizeOfCell(input.getKey());
        }
    };

    private static final int MAX_REQUEST_RETRIES = 10;
    private static final int REQUEST_TRIES_BEFORE_RANDOMIZING = 2;

    private final CassandraKeyValueServiceConfigManager configManager;
    private final CassandraClientPoolingManager cassandraClientPoolingManager;
    private final Optional<CassandraJmxCompactionManager> compactionManager;
    protected final ManyClientPoolingContainer containerPoolToUpdate;
    protected final ManyHostPoolingContainer<Client> clientPool;
    private final ScheduledExecutorService hostRefreshExecutor = PTExecutors.newScheduledThreadPool(1);
    private final ReentrantLock schemaMutationLock = new ReentrantLock(true);

    private ConsistencyLevel readConsistency = ConsistencyLevel.LOCAL_QUORUM;
    private final ConsistencyLevel writeConsistency = ConsistencyLevel.EACH_QUORUM;
    private final ConsistencyLevel deleteConsistency = ConsistencyLevel.ALL;

    private TokenAwareMapper tokenAwareMapper;

    public static CassandraKeyValueService create(CassandraKeyValueServiceConfigManager configManager) {
        Optional<CassandraJmxCompactionManager> compactionManager = CassandraJmxCompaction
                .createJmxCompactionManager(configManager);
        CassandraKeyValueService ret = new CassandraKeyValueService(configManager, compactionManager);
        ret.init();
        return ret;
    }

    protected CassandraKeyValueService(CassandraKeyValueServiceConfigManager configManager,
            Optional<CassandraJmxCompactionManager> compactionManager) {
        super(AbstractKeyValueService.createFixedThreadPool("Atlas Cassandra KVS",
                configManager.getConfig().poolSize() * configManager.getConfig().servers().size()));
        this.configManager = configManager;
        this.containerPoolToUpdate = ManyClientPoolingContainer.create(configManager.getConfig());
        this.clientPool = RetriableManyHostPoolingContainer.create(MAX_REQUEST_RETRIES,
                REQUEST_TRIES_BEFORE_RANDOMIZING, containerPoolToUpdate);
        this.cassandraClientPoolingManager = new PoolResizingCassandraClientPoolingManager(containerPoolToUpdate,
                clientPool, configManager);
        this.compactionManager = compactionManager;
    }

    protected void init() {
        int replicationFactor = configManager.getConfig().replicationFactor();
        initializeFromFreshInstance(containerPoolToUpdate.getCurrentHosts(), replicationFactor);
        poolingManager().submitHostRefreshTask();
    }

    public CassandraClientPoolingManager poolingManager() {
        return cassandraClientPoolingManager;
    }

    // Resizes the thread pool when hosts are updated, since the number of hosts may have changed
    private class PoolResizingCassandraClientPoolingManager extends CassandraClientPoolingManager {
        public PoolResizingCassandraClientPoolingManager(ManyClientPoolingContainer containerPoolToUpdate,
                PoolingContainer<Client> clientPool, CassandraKeyValueServiceConfigManager configManager) {
            super(containerPoolToUpdate, clientPool, configManager);
        }

        @Override
        public void setHostsToCurrentHostNames() throws TException {
            super.setHostsToCurrentHostNames();
            resizeThreadPool();
        }

        private void resizeThreadPool() {
            ThreadPoolExecutor threadPool = (ThreadPoolExecutor) executor;
            CassandraKeyValueServiceConfig config = configManager.getConfig();
            int threadPoolSize = config.poolSize() * config.servers().size();
            threadPool.setCorePoolSize(threadPoolSize);
            threadPool.setMaximumPoolSize(threadPoolSize);
        }
    }

    @Override
    public void initializeFromFreshInstance() {
        // we already did our init in our factory method
    }

    protected void initializeFromFreshInstance(List<InetSocketAddress> addrList, int replicationFactor) {
        Map<String, Throwable> errorsByHost = Maps.newHashMap();

        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        boolean safetyDisabled = config.safetyDisabled();
        String keyspace = config.keyspace();
        boolean ssl = config.ssl();
        int socketTimeoutMillis = config.socketTimeoutMillis();
        int socketQueryTimeoutMillis = config.socketQueryTimeoutMillis();

        for (InetSocketAddress addr : addrList) {
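            // Attempt full initialization against this host (partitioner check, keyspace setup,
            // ring sanity check); the first host that succeeds ends the loop.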
            Cassandra.Client client = null;
            try {
                client = CassandraClientFactory.getClientInternal(addr, ssl, socketTimeoutMillis,
                        socketQueryTimeoutMillis);

                validatePartitioner(client);

                Set<InetSocketAddress> currentHosts = cassandraClientPoolingManager
                        .getCurrentHostsFromServer(client);
                cassandraClientPoolingManager.setHostsToCurrentHostNames(currentHosts);

                ensureKeyspaceExistsAndIsUpToDate(replicationFactor, safetyDisabled, keyspace, client);
                client.set_keyspace(keyspace);

                tokenAwareMapper = TokenAwareMapper.create(configManager, clientPool);
                createTableInternal(client, CassandraConstants.METADATA_TABLE);
                CassandraVerifier.sanityCheckRingConsistency(currentHosts, keyspace, ssl, safetyDisabled,
                        socketTimeoutMillis, socketQueryTimeoutMillis);
                upgradeFromOlderInternalSchema(client);
                CassandraKeyValueServices.failQuickInInitializationIfClusterAlreadyInInconsistentState(client,
                        config.safetyDisabled(), configManager.getConfig().schemaMutationTimeoutMillis());
                return;
            } catch (TException e) {
                log.warn("failed to connect to host: " + addr, e);
                errorsByHost.put(addr.toString(), e);
            } finally {
                if (client != null) {
                    client.getOutputProtocol().getTransport().close();
                }
            }
        }
        throw new IllegalStateException(CassandraKeyValueServices.buildErrorMessage(
                "Could not connect to any Cassandra hosts. Check the status of your cluster, and check your host list and SSL settings to make sure they match.",
                errorsByHost));
    }

    private void validatePartitioner(Cassandra.Client client) throws TException {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        String partitioner = client.describe_partitioner();
        if (!config.safetyDisabled()) {
            Verify.verify(CassandraConstants.ALLOWED_PARTITIONERS.contains(partitioner),
                    "Invalid partitioner. Allowed: %s, but partitioner is: %s",
                    CassandraConstants.ALLOWED_PARTITIONERS, partitioner);
        }
    }

    private void ensureKeyspaceExistsAndIsUpToDate(int replicationFactor, boolean safetyDisabled, String keyspace,
            Cassandra.Client client) throws InvalidRequestException, TException, SchemaDisagreementException {
        try {
            KsDef originalKsDef = client.describe_keyspace(configManager.getConfig().keyspace());
            KsDef modifiedKsDef = originalKsDef.deepCopy();
            CassandraVerifier.checkAndSetReplicationFactor(client, modifiedKsDef, false, replicationFactor,
                    safetyDisabled);
            lowerConsistencyWhenSafe(client, modifiedKsDef, replicationFactor);

            if (!modifiedKsDef.equals(originalKsDef)) {
                modifiedKsDef.setCf_defs(ImmutableList.<CfDef>of()); // Can't call system_update_keyspace to update replication factor if CfDefs are set
                client.system_update_keyspace(modifiedKsDef);
                CassandraKeyValueServices.waitForSchemaVersions(client, "(updating the existing keyspace)",
                        configManager.getConfig().schemaMutationTimeoutMillis());
            }
        } catch (NotFoundException e) {
            createKeyspace(replicationFactor, safetyDisabled, keyspace, client);
        }
    }

    private void createKeyspace(int replicationFactor, boolean safetyDisabled, String keyspace,
            Cassandra.Client client) throws InvalidRequestException, SchemaDisagreementException, TException {
        KsDef ks = new KsDef(keyspace, CassandraConstants.NETWORK_STRATEGY, ImmutableList.<CfDef>of());
        CassandraVerifier.checkAndSetReplicationFactor(client, ks, true, replicationFactor, safetyDisabled);
        lowerConsistencyWhenSafe(client, ks, replicationFactor);
        ks.setDurable_writes(true);
        client.system_add_keyspace(ks);
        CassandraKeyValueServices.waitForSchemaVersions(client, "(adding the initial empty keyspace)",
                configManager.getConfig().schemaMutationTimeoutMillis());
    }

    private void upgradeFromOlderInternalSchema(Client client)
            throws NotFoundException, InvalidRequestException, TException {
        Map<String, byte[]> metadataForTables = getMetadataForTables();
        Map<String, byte[]> tablesToUpgrade = Maps.newHashMapWithExpectedSize(metadataForTables.size());
        String keyspace = configManager.getConfig().keyspace();

        for (CfDef clusterSideCf : client.describe_keyspace(keyspace).getCf_defs()) {
            String tableName = internalTableName(clusterSideCf.getName());
            if (metadataForTables.containsKey(tableName)) {
                byte[] clusterSideMetadata = metadataForTables.get(tableName);
                CfDef clientSideCf = getCfForTable(clusterSideCf.getName(), clusterSideMetadata);
                if (!CassandraKeyValueServices.isMatchingCf(clientSideCf, clusterSideCf)) { // mismatch; we have changed how we generate schema since we last persisted
                    log.warn("Upgrading table {} to new internal Cassandra schema", tableName);
                    tablesToUpgrade.put(tableName, clusterSideMetadata);
                }
            } else if (!tableName.equals(CassandraConstants.METADATA_TABLE)) { // only expected case
                // Possible to get here from a race condition with another service starting up and performing schema upgrades concurrent with us doing this check
                log.error("Found a table " + tableName + " that did not have persisted Atlas metadata."
                        + "If you recently did a Palantir update, try waiting until schema upgrades are completed on all backend CLIs/services etc and restarting this service."
                        + "If this error re-occurs on subsequent attempted startups, please contact Palantir support.");
            }
        }

        // we are racing another service to do these same operations here, but they are idempotent / safe
        if (!tablesToUpgrade.isEmpty()) {
            putMetadataForTables(tablesToUpgrade);
        }
    }

    private void lowerConsistencyWhenSafe(Client client, KsDef ks, int desiredRf) {
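        // With a single datacenter at RF 2, a quorum is both replicas, so quorum reads fail as
        // soon as one node is down; in that case reads are lowered to ConsistencyLevel.ONE below.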
        Set<String> dcs;
        try {
            dcs = CassandraVerifier.sanityCheckDatacenters(client, desiredRf,
                    configManager.getConfig().safetyDisabled());
        } catch (InvalidRequestException e) {
            return;
        } catch (TException e) {
            return;
        }

        Map<String, String> strategyOptions = Maps.newHashMap(ks.getStrategy_options());
        if (dcs.size() == 1) {
            String dc = dcs.iterator().next();
            if (strategyOptions.get(dc) != null) {
                int currentRF = Integer.parseInt(strategyOptions.get(dc));
                if (currentRF == desiredRf) {
                    if (currentRF == 2) {
                        log.info("Setting Read Consistency to ONE, as cluster has only one datacenter at RF2.");
                        readConsistency = ConsistencyLevel.ONE;
                    }
                }
            }
        }
    }

    @Override
    public Map<Cell, Value> getRows(final String tableName, final Iterable<byte[]> rows, ColumnSelection selection,
            final long startTs) {
        if (!selection.allColumnsSelected()) {
            return getRowsForSpecificColumns(tableName, rows, selection, startTs);
        }

        Set<Entry<InetAddress, List<byte[]>>> rowsByHost = partitionByHost(rows, Functions.<byte[]>identity())
                .entrySet();
        List<Callable<Map<Cell, Value>>> tasks = Lists.newArrayListWithCapacity(rowsByHost.size());
        for (final Map.Entry<InetAddress, List<byte[]>> hostAndRows : rowsByHost) {
            tasks.add(new Callable<Map<Cell, Value>>() {
                @Override
                public Map<Cell, Value> call() {
                    return getRowsForSingleHost(hostAndRows.getKey(), tableName, hostAndRows.getValue(), startTs);
                }
            });
        }
        List<Map<Cell, Value>> perHostResults = runAllTasksCancelOnFailure(tasks);
        Map<Cell, Value> result = Maps.newHashMapWithExpectedSize(Iterables.size(rows));
        for (Map<Cell, Value> perHostResult : perHostResults) {
            result.putAll(perHostResult);
        }
        return result;
    }

    private Map<Cell, Value> getRowsForSingleHost(final InetAddress host, final String tableName,
            final List<byte[]> rows, final long startTs) {
        try {
            int rowCount = 0;
            final Map<Cell, Value> result = Maps.newHashMap();
            int fetchBatchCount = configManager.getConfig().fetchBatchCount();
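            // Fetch rows in batches of fetchBatchCount per multiget_slice call so a single
            // request to this host never grows unboundedly large.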
            for (final List<byte[]> batch : Lists.partition(rows, fetchBatchCount)) {
                rowCount += batch.size();
                result.putAll(clientPool.runWithPooledResourceOnHost(host,
                        new FunctionCheckedException<Client, Map<Cell, Value>, Exception>() {
                            @Override
                            public Map<Cell, Value> apply(Client client) throws Exception {
                                // We want to get all the columns in the row so set start and end to empty.
                                SliceRange slice = new SliceRange(ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY),
                                        ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY), false, Integer.MAX_VALUE);
                                SlicePredicate pred = new SlicePredicate();
                                pred.setSlice_range(slice);

                                List<ByteBuffer> rowNames = Lists.newArrayListWithCapacity(batch.size());
                                for (byte[] r : batch) {
                                    rowNames.add(ByteBuffer.wrap(r));
                                }

                                ColumnParent colFam = new ColumnParent(internalTableName(tableName));
                                Map<ByteBuffer, List<ColumnOrSuperColumn>> results = multigetInternal(client,
                                        tableName, rowNames, colFam, pred, readConsistency);
                                Map<Cell, Value> ret = Maps.newHashMap();
                                new ValueExtractor(ret).extractResults(results, startTs, ColumnSelection.all());
                                return ret;
                            }

                            @Override
                            public String toString() {
                                return "multiget_slice(" + tableName + ", " + batch.size() + " rows" + ")";
                            }
                        }));
            }
            if (rowCount > fetchBatchCount) {
                log.warn("Rebatched in getRows a call to " + tableName + " that attempted to multiget " + rowCount
                        + " rows; this may indicate overly-large batching on a higher level.\n"
                        + CassandraKeyValueServices.getFilteredStackTrace("com.palantir"));
            }
            return ImmutableMap.copyOf(result);
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    private Map<Cell, Value> getRowsForSpecificColumns(final String tableName, final Iterable<byte[]> rows,
            ColumnSelection selection, final long startTs) {
        Preconditions.checkArgument(!selection.allColumnsSelected(), "Must select specific columns");

        Collection<byte[]> selectedColumns = selection.getSelectedColumns();
        Set<Cell> cells = Sets.newHashSetWithExpectedSize(selectedColumns.size() * Iterables.size(rows));
        for (byte[] row : rows) {
            for (byte[] col : selectedColumns) {
                cells.add(Cell.create(row, col));
            }
        }

        try {
            StartTsResultsCollector collector = new StartTsResultsCollector(startTs);
            loadWithTs(tableName, cells, startTs, false, collector, readConsistency);
            return collector.collectedResults;
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    @Override
    public Map<Cell, Value> get(String tableName, Map<Cell, Long> timestampByCell) {
        if (timestampByCell.isEmpty()) {
            log.info("Attempted get on '{}' table with empty cells", tableName);
            return ImmutableMap.of();
        }

        try {
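            // Fast path: when every cell is requested at the same timestamp, a single batched
            // load suffices; otherwise cells are grouped by timestamp and loaded per group below.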
            Long firstTs = timestampByCell.values().iterator().next();
            if (Iterables.all(timestampByCell.values(), Predicates.equalTo(firstTs))) {
                StartTsResultsCollector collector = new StartTsResultsCollector(firstTs);
                loadWithTs(tableName, timestampByCell.keySet(), firstTs, false, collector, readConsistency);
                return collector.collectedResults;
            }

            SetMultimap<Long, Cell> cellsByTs = Multimaps.invertFrom(Multimaps.forMap(timestampByCell),
                    HashMultimap.<Long, Cell>create());
            Builder<Cell, Value> builder = ImmutableMap.builder();
            for (long ts : cellsByTs.keySet()) {
                StartTsResultsCollector collector = new StartTsResultsCollector(ts);
                loadWithTs(tableName, cellsByTs.get(ts), ts, false, collector, readConsistency);
                builder.putAll(collector.collectedResults);
            }
            return builder.build();
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    private void loadWithTs(String tableName, Set<Cell> cells, long startTs, boolean loadAllTs,
            ThreadSafeResultVisitor v, ConsistencyLevel consistency) throws Exception {
        List<Callable<Void>> tasks = Lists.newArrayList();
        for (Map.Entry<InetAddress, List<Cell>> hostAndCells : partitionByHost(cells, Cells.getRowFunction())
                .entrySet()) {
            tasks.addAll(getLoadWithTsTasksForSingleHost(hostAndCells.getKey(), tableName, hostAndCells.getValue(),
                    startTs, loadAllTs, v, consistency));
        }
        runAllTasksCancelOnFailure(tasks);
    }

    // TODO: after cassandra api change: handle different column select per row
    private List<Callable<Void>> getLoadWithTsTasksForSingleHost(final InetAddress host, final String tableName,
            Collection<Cell> cells, final long startTs, final boolean loadAllTs, final ThreadSafeResultVisitor v,
            final ConsistencyLevel consistency) throws Exception {
        final ColumnParent colFam = new ColumnParent(internalTableName(tableName));
        TreeMultimap<byte[], Cell> cellsByCol = TreeMultimap.create(UnsignedBytes.lexicographicalComparator(),
                Ordering.natural());
        for (Cell cell : cells) {
            cellsByCol.put(cell.getColumnName(), cell);
        }
        List<Callable<Void>> tasks = Lists.newArrayList();
        int fetchBatchCount = configManager.getConfig().fetchBatchCount();
        for (final byte[] col : cellsByCol.keySet()) {
            if (cellsByCol.get(col).size() > fetchBatchCount) {
                log.warn(
                        "Re-batching in getLoadWithTsTasksForSingleHost a call to {} for table {} that attempted to "
                                + "multiget {} rows; this may indicate overly-large batching on a higher level.\n{}",
                        host, tableName, cellsByCol.get(col).size(),
                        CassandraKeyValueServices.getFilteredStackTrace("com.palantir"));
            }
            for (final List<Cell> partition : Lists.partition(ImmutableList.copyOf(cellsByCol.get(col)),
                    fetchBatchCount)) {
                tasks.add(new Callable<Void>() {
                    @Override
                    public Void call() throws Exception {
                        return clientPool.runWithPooledResourceOnHost(host,
                                new FunctionCheckedException<Client, Void, Exception>() {
                                    @Override
                                    public Void apply(Client client) throws Exception {
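                                        // The composite column name encodes (column name, timestamp);
                                        // this slice asks for the newest value written below startTs,
                                        // or every version when loadAllTs is set.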
                                        ByteBuffer start = CassandraKeyValueServices.makeCompositeBuffer(col,
                                                startTs - 1);
                                        ByteBuffer end = CassandraKeyValueServices.makeCompositeBuffer(col, -1);
                                        SliceRange slice = new SliceRange(start, end, false,
                                                loadAllTs ? Integer.MAX_VALUE : 1);
                                        SlicePredicate pred = new SlicePredicate();
                                        pred.setSlice_range(slice);

                                        List<ByteBuffer> rowNames = Lists
                                                .newArrayListWithCapacity(partition.size());
                                        for (Cell c : partition) {
                                            rowNames.add(ByteBuffer.wrap(c.getRowName()));
                                        }
                                        Map<ByteBuffer, List<ColumnOrSuperColumn>> results = multigetInternal(
                                                client, tableName, rowNames, colFam, pred, consistency);
                                        v.visit(results);
                                        return null;
                                    }

                                    @Override
                                    public String toString() {
                                        return "multiget_slice(" + host + ", " + colFam + ", " + partition.size()
                                                + " rows" + ")";
                                    }
                                });
                    }
                });
            }
        }
        return tasks;
    }

    @Override
    public Map<Cell, Long> getLatestTimestamps(String tableName, Map<Cell, Long> timestampByCell) {
        // TODO: optimize by only getting column name after cassandra api change
        return super.getLatestTimestamps(tableName, timestampByCell);
    }

    @Override
    public void put(final String tableName, final Map<Cell, byte[]> values, final long timestamp) {
        try {
            putInternal(tableName, KeyValueServices.toConstantTimestampValues(values.entrySet(), timestamp));
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    @Override
    public void putWithTimestamps(String tableName, Multimap<Cell, Value> values) {
        try {
            putInternal(tableName, values.entries());
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    @Override
    protected int getMultiPutBatchCount() {
        return configManager.getConfig().mutationBatchCount();
    }

    private void putInternal(final String tableName, final Iterable<Map.Entry<Cell, Value>> values)
            throws Exception {
        putInternal(tableName, values, CassandraConstants.NO_TTL);
    }

    protected void putInternal(final String tableName, Iterable<Map.Entry<Cell, Value>> values, final int ttl)
            throws Exception {
        Map<InetAddress, Map<Cell, Value>> cellsByHost = partitionMapByHost(values);
        List<Callable<Void>> tasks = Lists.newArrayListWithCapacity(cellsByHost.size());
        for (final Map.Entry<InetAddress, Map<Cell, Value>> entry : cellsByHost.entrySet()) {
            tasks.add(new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    putForSingleHostInternal(entry.getKey(), tableName, entry.getValue().entrySet(), ttl);
                    return null;
                }
            });
        }
        runAllTasksCancelOnFailure(tasks);
    }

    private void putForSingleHostInternal(final InetAddress host, final String tableName,
            final Iterable<Map.Entry<Cell, Value>> values, final int ttl) throws Exception {
        clientPool.runWithPooledResourceOnHost(host, new FunctionCheckedException<Client, Void, Exception>() {
            @Override
            public Void apply(Client client) throws Exception {
                final CassandraKeyValueServiceConfig config = configManager.getConfig();
                int mutationBatchCount = config.mutationBatchCount();
                int mutationBatchSizeBytes = config.mutationBatchSizeBytes();
                for (List<Entry<Cell, Value>> partition : partitionByCountAndBytes(values, mutationBatchCount,
                        mutationBatchSizeBytes, tableName, ENTRY_SIZING_FUNCTION)) {
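                    // batch_mutate expects mutations grouped as row key -> column family -> mutations,
                    // which is what this nested map builds up.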
                    Map<ByteBuffer, Map<String, List<Mutation>>> map = Maps.newHashMap();
                    for (Map.Entry<Cell, Value> e : partition) {
                        Cell cell = e.getKey();
                        Column col = createColumn(cell, e.getValue(), ttl);

                        ColumnOrSuperColumn colOrSup = new ColumnOrSuperColumn();
                        colOrSup.setColumn(col);
                        Mutation m = new Mutation();
                        m.setColumn_or_supercolumn(colOrSup);

                        ByteBuffer rowName = ByteBuffer.wrap(cell.getRowName());

                        Map<String, List<Mutation>> rowPuts = map.get(rowName);
                        if (rowPuts == null) {
                            rowPuts = Maps.<String, List<Mutation>>newHashMap();
                            map.put(rowName, rowPuts);
                        }

                        List<Mutation> tableMutations = rowPuts.get(internalTableName(tableName));
                        if (tableMutations == null) {
                            tableMutations = Lists.<Mutation>newArrayList();
                            rowPuts.put(internalTableName(tableName), tableMutations);
                        }

                        tableMutations.add(m);
                    }
                    batchMutateInternal(client, tableName, map, writeConsistency);
                }
                return null;
            }

            @Override
            public String toString() {
                return "batch_mutate(" + host + ", " + tableName + ", " + Iterables.size(values) + " values, " + ttl
                        + " ttl sec)";
            }
        });
    }

    // Overridden to batch more intelligently than the default implementation.
    @Override
    public void multiPut(Map<String, ? extends Map<Cell, byte[]>> valuesByTable, final long timestamp)
            throws KeyAlreadyExistsException {
        List<TableCellAndValue> flattened = Lists.newArrayList();
        for (Map.Entry<String, ? extends Map<Cell, byte[]>> tableAndValues : valuesByTable.entrySet()) {
            for (Map.Entry<Cell, byte[]> entry : tableAndValues.getValue().entrySet()) {
                flattened.add(new TableCellAndValue(tableAndValues.getKey(), entry.getKey(), entry.getValue()));
            }
        }
        Map<InetAddress, List<TableCellAndValue>> partitionedByHost = partitionByHost(flattened,
                TableCellAndValue.EXTRACT_ROW_NAME_FUNCTION);

        List<Callable<Void>> callables = Lists.newArrayList();
        for (Map.Entry<InetAddress, List<TableCellAndValue>> entry : partitionedByHost.entrySet()) {
            callables.addAll(getMultiPutTasksForSingleHost(entry.getKey(), entry.getValue(), timestamp));
        }
        runAllTasksCancelOnFailure(callables);
    }

    private List<Callable<Void>> getMultiPutTasksForSingleHost(final InetAddress host,
            Collection<TableCellAndValue> values, final long timestamp) {
        Iterable<List<TableCellAndValue>> partitioned = partitionByCountAndBytes(values, getMultiPutBatchCount(),
                getMultiPutBatchSizeBytes(), extractTableNames(values).toString(),
                TableCellAndValue.SIZING_FUNCTION);
        List<Callable<Void>> tasks = Lists.newArrayList();
        for (final List<TableCellAndValue> batch : partitioned) {
            final Set<String> tableNames = extractTableNames(batch);
            tasks.add(new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    String originalName = Thread.currentThread().getName();
                    Thread.currentThread().setName(
                            "Atlas multiPut of " + batch.size() + " cells into " + tableNames + " on " + host);
                    try {
                        multiPutForSingleHostInternal(host, tableNames, batch, timestamp);
                        return null;
                    } finally {
                        Thread.currentThread().setName(originalName);
                    }
                }
            });
        }
        return tasks;
    }

    private Set<String> extractTableNames(Iterable<TableCellAndValue> tableCellAndValues) {
        Set<String> tableNames = Sets.newHashSet();
        for (TableCellAndValue tableCellAndValue : tableCellAndValues) {
            tableNames.add(tableCellAndValue.tableName);
        }
        return tableNames;
    }

    private void multiPutForSingleHostInternal(final InetAddress host, final Set<String> tableNames,
            final List<TableCellAndValue> batch, long timestamp) throws Exception {
        final Map<ByteBuffer, Map<String, List<Mutation>>> map = convertToMutations(batch, timestamp);
        clientPool.runWithPooledResourceOnHost(host, new FunctionCheckedException<Client, Void, Exception>() {
            @Override
            public Void apply(Client client) throws Exception {
                batchMutateInternal(client, tableNames, map, writeConsistency);
                return null;
            }

            @Override
            public String toString() {
                return "batch_mutate(" + host + ", " + tableNames + ", " + batch.size() + " values)";
            }
        });
    }

    private Map<ByteBuffer, Map<String, List<Mutation>>> convertToMutations(List<TableCellAndValue> batch,
            long timestamp) {
        Map<ByteBuffer, Map<String, List<Mutation>>> map = Maps.newHashMap();
        for (TableCellAndValue tableCellAndValue : batch) {
            Cell cell = tableCellAndValue.cell;
            Column col = createColumn(cell, Value.create(tableCellAndValue.value, timestamp),
                    CassandraConstants.NO_TTL);

            ColumnOrSuperColumn colOrSup = new ColumnOrSuperColumn();
            colOrSup.setColumn(col);
            Mutation m = new Mutation();
            m.setColumn_or_supercolumn(colOrSup);

            ByteBuffer rowName = ByteBuffer.wrap(cell.getRowName());

            Map<String, List<Mutation>> rowPuts = map.get(rowName);
            if (rowPuts == null) {
                rowPuts = Maps.<String, List<Mutation>>newHashMap();
                map.put(rowName, rowPuts);
            }

            List<Mutation> tableMutations = rowPuts.get(internalTableName(tableCellAndValue.tableName));
            if (tableMutations == null) {
                tableMutations = Lists.<Mutation>newArrayList();
                rowPuts.put(internalTableName(tableCellAndValue.tableName), tableMutations);
            }

            tableMutations.add(m);
        }
        return map;
    }

    private Column createColumn(Cell cell, Value value, final int ttl) {
        byte[] contents = value.getContents();
        long timestamp = value.getTimestamp();
        ByteBuffer colName = CassandraKeyValueServices.makeCompositeBuffer(cell.getColumnName(), timestamp);
        Column col = new Column();
        col.setName(colName);
        col.setValue(contents);
        col.setTimestamp(timestamp);

        if (cell.getTtlDurationMillis() > 0) {
            col.setTtl(CassandraKeyValueServices.convertTtl(cell.getTtlDurationMillis(), TimeUnit.MILLISECONDS));
        }

        if (ttl > 0) {
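            // An explicit, positive table-level ttl takes precedence over any per-cell TTL set above.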
            col.setTtl(ttl);
        }

        return col;
    }

    private void batchMutateInternal(Client client, String tableName,
            Map<ByteBuffer, Map<String, List<Mutation>>> map, ConsistencyLevel consistency) throws TException {
        batchMutateInternal(client, ImmutableSet.of(tableName), map, consistency);
    }

    private void batchMutateInternal(Client client, Set<String> tableNames,
            Map<ByteBuffer, Map<String, List<Mutation>>> map, ConsistencyLevel consistency) throws TException {
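        // When tracing is enabled for any of these tables, ask Cassandra to trace the next query
        // and log slow calls together with the trace UUID so they can be found in system_traces.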
        if (shouldTraceQuery(tableNames)) {
            ByteBuffer recv_trace = client.trace_next_query();
            Stopwatch stopwatch = Stopwatch.createStarted();
            client.batch_mutate(map, consistency);
            long duration = stopwatch.elapsed(TimeUnit.MILLISECONDS);
            if (duration > getMinimumDurationToTraceMillis()) {
                log.error("Traced a call to " + tableNames + " that took " + duration + " ms."
                        + " It will appear in system_traces with UUID="
                        + CassandraKeyValueServices.convertCassandraByteBufferUUIDtoString(recv_trace));
            }
        } else {
            client.batch_mutate(map, consistency);
        }
    }

    private boolean shouldTraceQuery(Set<String> tableNames) {
        for (String tableName : tableNames) {
            if (shouldTraceQuery(tableName)) {
                return true;
            }
        }
        return false;
    }

    private Map<ByteBuffer, List<ColumnOrSuperColumn>> multigetInternal(Client client, String tableName,
            List<ByteBuffer> rowNames, ColumnParent colFam, SlicePredicate pred, ConsistencyLevel consistency)
            throws TException {
        Map<ByteBuffer, List<ColumnOrSuperColumn>> results;
        if (shouldTraceQuery(tableName)) {
            ByteBuffer recv_trace = client.trace_next_query();
            Stopwatch stopwatch = Stopwatch.createStarted();
            results = client.multiget_slice(rowNames, colFam, pred, consistency);
            long duration = stopwatch.elapsed(TimeUnit.MILLISECONDS);
            if (duration > getMinimumDurationToTraceMillis()) {
                log.error("Traced a call to " + tableName + " that took " + duration + " ms."
                        + " It will appear in system_traces with UUID="
                        + CassandraKeyValueServices.convertCassandraByteBufferUUIDtoString(recv_trace));
            }
        } else {
            results = client.multiget_slice(rowNames, colFam, pred, consistency);
        }
        return results;
    }

    @Override
    public void truncateTable(final String tableName) {
        truncateTables(ImmutableSet.of(tableName));
    }

    @Override
    public void truncateTables(final Set<String> tableNames) {
        final Set<String> tablesToTruncate = filterOutTrulyEmptyTables(tableNames);
        if (!tablesToTruncate.isEmpty()) {
            try {
                trySchemaMutationLock();
                clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                    @Override
                    public Void apply(Client client) throws Exception {
                        for (String tableName : tablesToTruncate) {
                            if (shouldTraceQuery(tableName)) {
                                ByteBuffer recv_trace = client.trace_next_query();
                                Stopwatch stopwatch = Stopwatch.createStarted();
                                client.truncate(internalTableName(tableName));
                                long duration = stopwatch.elapsed(TimeUnit.MILLISECONDS);
                                if (duration > getMinimumDurationToTraceMillis()) {
                                    log.error("Traced a call to " + tableName + " that took " + duration + " ms."
                                            + " It will appear in system_traces with UUID="
                                            + CassandraKeyValueServices
                                                    .convertCassandraByteBufferUUIDtoString(recv_trace));
                                }
                            } else {
                                client.truncate(internalTableName(tableName));
                            }
                        }
                        CassandraKeyValueServices.waitForSchemaVersions(client,
                                "(" + tablesToTruncate.size() + " tables in a call to truncateTables)",
                                configManager.getConfig().schemaMutationTimeoutMillis());
                        return null;
                    }

                    @Override
                    public String toString() {
                        return "truncateTables(" + tablesToTruncate.size() + " tables)";
                    }
                });
            } catch (UnavailableException e) {
                throw new PalantirRuntimeException(
                        "Truncating tables requires all Cassandra nodes to be up and available.");
            } catch (Exception e) {
                throw Throwables.throwUncheckedException(e);
            } finally {
                schemaMutationLock.unlock();
            }
        }
    }

    private Set<String> filterOutTrulyEmptyTables(Set<String> tableNames) {
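        // Ask each table for at most one row (KeyRange(1)); tables that return nothing are treated
        // as empty so the caller can skip them. On error we conservatively assume the table is non-empty.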
        final Set<String> nonEmptyTables = Sets.newHashSet();
        SliceRange slice = new SliceRange(ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY),
                ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY), false, Integer.MAX_VALUE);
        final SlicePredicate predicate = new SlicePredicate();
        predicate.setSlice_range(slice);

        for (final String tableName : tableNames) {
            int results = 1;
            try {
                results = clientPool
                        .runWithPooledResource(new FunctionCheckedException<Client, Integer, Exception>() {
                            @Override
                            public Integer apply(Client client) throws Exception {
                                List<KeySlice> range_slices = client.get_range_slices(
                                        new ColumnParent(internalTableName(tableName)), predicate, new KeyRange(1),
                                        deleteConsistency);
                                return range_slices.size();
                            }
                        });
            } catch (Exception e) {
                log.error("Table " + tableName
                        + " could not be checked for emptiness. Proceeding with requested operation without optimization.");
            }

            if (results != 0) {
                nonEmptyTables.add(tableName);
            } else {
                log.info("Table " + tableName + " is empty and the requested operation will be skipped.");
            }
        }

        return nonEmptyTables;
    }

    @Override
    public void delete(String tableName, Multimap<Cell, Long> keys) {
        Map<InetAddress, Map<Cell, Collection<Long>>> keysByHost = partitionMapByHost(keys.asMap().entrySet());
        for (Map.Entry<InetAddress, Map<Cell, Collection<Long>>> entry : keysByHost.entrySet()) {
            deleteOnSingleHost(entry.getKey(), tableName, entry.getValue());
        }
    }

    private void deleteOnSingleHost(final InetAddress host, final String tableName,
            final Map<Cell, Collection<Long>> keys) {
        try {
            clientPool.runWithPooledResourceOnHost(host, new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    // Delete must delete in the order of timestamp and we don't trust batch_mutate to do it
                    // atomically so we have to potentially do many deletes if there are many timestamps for the
                    // same key.
                    Map<Integer, Map<ByteBuffer, Map<String, List<Mutation>>>> maps = Maps.newTreeMap();
                    for (Cell key : keys.keySet()) {
                        int mapIndex = 0;
                        for (long ts : Ordering.natural().immutableSortedCopy(keys.get(key))) {
                            if (!maps.containsKey(mapIndex)) {
                                maps.put(mapIndex, Maps.<ByteBuffer, Map<String, List<Mutation>>>newHashMap());
                            }
                            Map<ByteBuffer, Map<String, List<Mutation>>> map = maps.get(mapIndex);
                            ByteBuffer colName = CassandraKeyValueServices.makeCompositeBuffer(key.getColumnName(),
                                    ts);
                            SlicePredicate pred = new SlicePredicate();
                            pred.setColumn_names(Arrays.asList(colName));
                            Deletion del = new Deletion();
                            del.setPredicate(pred);
                            del.setTimestamp(Long.MAX_VALUE);
                            Mutation m = new Mutation();
                            m.setDeletion(del);
                            ByteBuffer rowName = ByteBuffer.wrap(key.getRowName());
                            if (!map.containsKey(rowName)) {
                                map.put(rowName, Maps.<String, List<Mutation>>newHashMap());
                            }
                            Map<String, List<Mutation>> rowPuts = map.get(rowName);
                            if (!rowPuts.containsKey(internalTableName(tableName))) {
                                rowPuts.put(internalTableName(tableName), Lists.<Mutation>newArrayList());
                            }
                            rowPuts.get(internalTableName(tableName)).add(m);
                            mapIndex++;
                        }
                    }
                    for (Map<ByteBuffer, Map<String, List<Mutation>>> map : maps.values()) {
                        // NOTE: we run with ConsistencyLevel.ALL here instead of ConsistencyLevel.QUORUM
                        // because we want to remove all copies of this data
                        batchMutateInternal(client, tableName, map, deleteConsistency);
                    }
                    return null;
                }

                @Override
                public String toString() {
                    return "batch_mutate(" + host + ", " + tableName + ", " + keys.size() + " keys" + ")";
                }
            });
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    // update CKVS.isMatchingCf if you update this method
    private CfDef getCfForTable(String tableName, byte[] rawMetadata) {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        Map<String, String> compressionOptions = Maps.newHashMap();
        CfDef cf = CassandraConstants.getStandardCfDef(config.keyspace(), internalTableName(tableName));

        boolean negativeLookups = false;
        double falsePositiveChance = CassandraConstants.DEFAULT_LEVELED_COMPACTION_BLOOM_FILTER_FP_CHANCE;
        int explicitCompressionBlockSizeKB = 0;
        boolean appendHeavyAndReadLight = false;
        TableMetadataPersistence.CachePriority cachePriority = TableMetadataPersistence.CachePriority.WARM;

        if (!CassandraKeyValueServices.isEmptyOrInvalidMetadata(rawMetadata)) {
            TableMetadata tableMetadata = TableMetadata.BYTES_HYDRATOR.hydrateFromBytes(rawMetadata);
            negativeLookups = tableMetadata.hasNegativeLookups();
            explicitCompressionBlockSizeKB = tableMetadata.getExplicitCompressionBlockSizeKB();
            appendHeavyAndReadLight = tableMetadata.isAppendHeavyAndReadLight();
            cachePriority = tableMetadata.getCachePriority();
        }

        if (explicitCompressionBlockSizeKB != 0) {
            compressionOptions.put(CassandraConstants.CFDEF_COMPRESSION_TYPE_KEY,
                    CassandraConstants.DEFAULT_COMPRESSION_TYPE);
            compressionOptions.put(CassandraConstants.CFDEF_COMPRESSION_CHUNK_LENGTH_KEY,
                    Integer.toString(explicitCompressionBlockSizeKB));
        } else {
            // We don't really need compression here nor anticipate it will garner us any gains
            // (which is why we're doing such a small chunk size), but this is how we can get "free" CRC checking.
            compressionOptions.put(CassandraConstants.CFDEF_COMPRESSION_TYPE_KEY,
                    CassandraConstants.DEFAULT_COMPRESSION_TYPE);
            compressionOptions.put(CassandraConstants.CFDEF_COMPRESSION_CHUNK_LENGTH_KEY,
                    Integer.toString(AtlasDbConstants.MINIMUM_COMPRESSION_BLOCK_SIZE_KB));
        }

        if (negativeLookups) {
            falsePositiveChance = CassandraConstants.NEGATIVE_LOOKUPS_BLOOM_FILTER_FP_CHANCE;
        }

        if (appendHeavyAndReadLight) {
            cf.setCompaction_strategy(CassandraConstants.SIZE_TIERED_COMPACTION_STRATEGY);
            cf.setCompaction_strategy_optionsIsSet(false); // clear out the now nonsensical "keep it at 80MB per sstable" option from LCS
            if (!negativeLookups) {
                falsePositiveChance = CassandraConstants.DEFAULT_SIZE_TIERED_COMPACTION_BLOOM_FILTER_FP_CHANCE;
            } else {
                falsePositiveChance = CassandraConstants.NEGATIVE_LOOKUPS_SIZE_TIERED_BLOOM_FILTER_FP_CHANCE;
            }
        }

        switch (cachePriority) {
        case COLDEST:
            break;
        case COLD:
            break;
        case WARM:
            break;
        case HOT:
            break;
        case HOTTEST:
            cf.setPopulate_io_cache_on_flushIsSet(true);
        }

        cf.setBloom_filter_fp_chance(falsePositiveChance);
        cf.setCompression_options(compressionOptions);
        return cf;
    }

    //TODO: after cassandra change: handle multiRanges
    @Override
    @Idempotent
    public Map<RangeRequest, TokenBackedBasicResultsPage<RowResult<Value>, byte[]>> getFirstBatchForRanges(
            String tableName, Iterable<RangeRequest> rangeRequests, long timestamp) {
        int concurrency = configManager.getConfig().rangesConcurrency();
        return KeyValueServices.getFirstBatchForRangesUsingGetRangeConcurrent(executor, this, tableName,
                rangeRequests, timestamp, concurrency);
    }

    // TODO: after cassandra change: handle reverse ranges
    // TODO: after cassandra change: handle column filtering
    @Override
    @Idempotent
    public ClosableIterator<RowResult<Value>> getRange(String tableName, final RangeRequest rangeRequest,
            final long timestamp) {
        return getRangeWithPageCreator(tableName, rangeRequest, timestamp, readConsistency,
                ValueExtractor.SUPPLIER);
    }

    @Override
    @Idempotent
    public ClosableIterator<RowResult<Set<Long>>> getRangeOfTimestamps(String tableName, RangeRequest rangeRequest,
            long timestamp) {
        return getRangeWithPageCreator(tableName, rangeRequest, timestamp, deleteConsistency,
                TimestampExtractor.SUPPLIER);
    }

    @Override
    @Idempotent
    public ClosableIterator<RowResult<Set<Value>>> getRangeWithHistory(String tableName, RangeRequest rangeRequest,
            long timestamp) {
        return getRangeWithPageCreator(tableName, rangeRequest, timestamp, deleteConsistency,
                HistoryExtractor.SUPPLIER);
    }

    public <T, U> ClosableIterator<RowResult<U>> getRangeWithPageCreator(final String tableName,
            final RangeRequest rangeRequest, final long timestamp, final ConsistencyLevel consistency,
            final Supplier<ResultsExtractor<T, U>> resultsExtractor) {
        if (rangeRequest.isReverse()) {
            throw new UnsupportedOperationException();
        }
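        // Rows are paged with get_range_slices; each page's token is fed back in as the start key
        // of the next page, and the end key is adjusted because Thrift KeyRange ends are inclusive.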
        final int batchHint = rangeRequest.getBatchHint() == null ? 100 : rangeRequest.getBatchHint();
        SliceRange slice = new SliceRange(ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY),
                ByteBuffer.wrap(PtBytes.EMPTY_BYTE_ARRAY), false, Integer.MAX_VALUE);
        final SlicePredicate pred = new SlicePredicate();
        pred.setSlice_range(slice);

        final ColumnParent colFam = new ColumnParent(internalTableName(tableName));
        final ColumnSelection selection = rangeRequest.getColumnNames().isEmpty() ? ColumnSelection.all()
                : ColumnSelection.create(rangeRequest.getColumnNames());
        return ClosableIterators.wrap(
                new AbstractPagingIterable<RowResult<U>, TokenBackedBasicResultsPage<RowResult<U>, byte[]>>() {
                    @Override
                    protected TokenBackedBasicResultsPage<RowResult<U>, byte[]> getFirstPage() throws Exception {
                        return page(rangeRequest.getStartInclusive());
                    }

                    @Override
                    protected TokenBackedBasicResultsPage<RowResult<U>, byte[]> getNextPage(
                            TokenBackedBasicResultsPage<RowResult<U>, byte[]> previous) throws Exception {
                        return page(previous.getTokenForNextPage());
                    }

                    TokenBackedBasicResultsPage<RowResult<U>, byte[]> page(final byte[] startKey) throws Exception {
                        InetAddress host = tokenAwareMapper.getRandomHostForKey(startKey);
                        return clientPool.runWithPooledResourceOnHost(host,
                                new FunctionCheckedException<Client, TokenBackedBasicResultsPage<RowResult<U>, byte[]>, Exception>() {
                                    @Override
                                    public TokenBackedBasicResultsPage<RowResult<U>, byte[]> apply(Client client)
                                            throws Exception {
                                        final byte[] endExclusive = rangeRequest.getEndExclusive();

                                        KeyRange keyRange = new KeyRange(batchHint);
                                        keyRange.setStart_key(startKey);
                                        if (endExclusive.length == 0) {
                                            keyRange.setEnd_key(endExclusive);
                                        } else {
                                            // We need the previous name because this is inclusive, not exclusive
                                            keyRange.setEnd_key(
                                                    RangeRequests.previousLexicographicName(endExclusive));
                                        }

                                        List<KeySlice> firstPage;

                                        try {
                                            if (shouldTraceQuery(tableName)) {
                                                ByteBuffer recv_trace = client.trace_next_query();
                                                Stopwatch stopwatch = Stopwatch.createStarted();
                                                firstPage = client.get_range_slices(colFam, pred, keyRange,
                                                        consistency);
                                                long duration = stopwatch.elapsed(TimeUnit.MILLISECONDS);
                                                if (duration > getMinimumDurationToTraceMillis()) {
                                                    log.error("Traced a call to " + tableName + " that took "
                                                            + duration + " ms."
                                                            + " It will appear in system_traces with UUID="
                                                            + CassandraKeyValueServices
                                                                    .convertCassandraByteBufferUUIDtoString(
                                                                            recv_trace));
                                                }
                                            } else {
                                                firstPage = client.get_range_slices(colFam, pred, keyRange,
                                                        consistency);
                                            }
                                        } catch (UnavailableException e) {
                                            if (consistency.equals(ConsistencyLevel.ALL)) {
                                                throw new InsufficientConsistencyException(
                                                        "This operation requires all Cassandra nodes to be up and available.",
                                                        e);
                                            } else {
                                                throw e;
                                            }
                                        }

                                        Map<ByteBuffer, List<ColumnOrSuperColumn>> colsByKey = CassandraKeyValueServices
                                                .getColsByKey(firstPage);
                                        TokenBackedBasicResultsPage<RowResult<U>, byte[]> page = resultsExtractor
                                                .get().getPageFromRangeResults(colsByKey, timestamp, selection,
                                                        endExclusive);
                                        if (page.moreResultsAvailable() && firstPage.size() < batchHint) {
                                            // If get_range_slices didn't return the full number of results,
                                            // there's no point in trying to get another page.
                                            page = SimpleTokenBackedResultsPage.create(endExclusive,
                                                    page.getResults(), false);
                                        }
                                        return page;
                                    }

                                    @Override
                                    public String toString() {
                                        return "get_range_slices(" + colFam + ")";
                                    }
                                });
                    }

                }.iterator());
    }
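
    /*
     * A minimal usage sketch for the range methods above (hypothetical caller; "kvs" and "ts" are
     * assumed to exist). The iterator pages lazily through get_range_slices, so close it when done,
     * mirroring the try/finally pattern used in getMetadataForTables below.
     *
     *   ClosableIterator<RowResult<Value>> rows = kvs.getRange("ns.my_table", RangeRequest.all(), ts);
     *   try {
     *       while (rows.hasNext()) {
     *           RowResult<Value> row = rows.next();
     *           // consume row here
     *       }
     *   } finally {
     *       rows.close();
     *   }
     */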

    @Override
    public void dropTable(final String tableName) {
        try {
            trySchemaMutationLock();
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    String keyspace = configManager.getConfig().keyspace();
                    KsDef ks = client.describe_keyspace(keyspace);

                    for (CfDef cf : ks.getCf_defs()) {
                        if (cf.getName().equalsIgnoreCase(internalTableName(tableName))) {
                            client.system_drop_column_family(internalTableName(tableName));
                            putMetadataWithoutChangingSettings(tableName, PtBytes.EMPTY_BYTE_ARRAY);
                            CassandraKeyValueServices.waitForSchemaVersions(client, tableName,
                                    configManager.getConfig().schemaMutationTimeoutMillis());
                            return null;
                        }
                    }
                    return null;
                }
            });
        } catch (UnavailableException e) {
            throw new InsufficientConsistencyException(
                    "Drop table requires all Cassandra nodes to be up and available.", e);
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (schemaMutationLock.isHeldByCurrentThread()) {
                schemaMutationLock.unlock();
            }
        }
    }

    /**
     * Main gains here vs. dropTable:
     *    - barring problems, we will basically be serializing a rapid series of schema changes
     *      through a single host checked out from the client pool, reducing the chance of schema disagreement issues
     *    - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot
     *    - one less round trip
     */
    @Override
    public void dropTables(final Set<String> tablesToDrop) {
        try {
            trySchemaMutationLock();
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    KsDef ks = client.describe_keyspace(configManager.getConfig().keyspace());
                    Set<String> existingTables = Sets.newHashSet();

                    for (CfDef cf : ks.getCf_defs()) {
                        existingTables.add(cf.getName().toLowerCase());
                    }

                    for (String table : tablesToDrop) {
                        CassandraVerifier.sanityCheckTableName(table);
                        String caseInsensitiveTable = table.toLowerCase();

                        if (existingTables.contains(caseInsensitiveTable)) {
                            client.system_drop_column_family(caseInsensitiveTable);
                            putMetadataWithoutChangingSettings(caseInsensitiveTable, PtBytes.EMPTY_BYTE_ARRAY);
                        } else {
                            log.warn(String.format("Ignored call to drop a table (%s) that already existed.",
                                    table));
                        }
                    }
                    CassandraKeyValueServices.waitForSchemaVersions(client, "(all tables in a call to dropTables)",
                            configManager.getConfig().schemaMutationTimeoutMillis());
                    return null;
                }
            });
        } catch (UnavailableException e) {
            throw new InsufficientConsistencyException(
                    "Dropping tables requires all Cassandra nodes to be up and available.", e);
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (schemaMutationLock.isHeldByCurrentThread()) {
                schemaMutationLock.unlock();
            }
        }
    }

    @Override
    public void createTable(final String tableName, final byte[] tableMetadata) {
        createTables(ImmutableMap.of(tableName, tableMetadata));
    }

    // For tables internal / implementation-specific to this KVS; these don't get metadata in the metadata
    // table, nor do they show up in getAllTableNames().
    private void createTableInternal(Client client, final String tableName)
            throws InvalidRequestException, SchemaDisagreementException, TException, NotFoundException {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        KsDef ks = client.describe_keyspace(config.keyspace());
        for (CfDef cf : ks.getCf_defs()) {
            if (cf.getName().equalsIgnoreCase(internalTableName(tableName))) {
                return;
            }
        }
        CfDef cf = CassandraConstants.getStandardCfDef(config.keyspace(), internalTableName(tableName));
        client.system_add_column_family(cf);
        CassandraKeyValueServices.waitForSchemaVersions(client, tableName,
                configManager.getConfig().schemaMutationTimeoutMillis());
    }

    /**
     * Main gains here vs. createTable:
     *    - barring problems, we will basically be serializing a rapid series of schema changes
     *      through a single host checked out from the client pool, reducing the chance of schema disagreement issues
     *    - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot
     *    - one less round trip
     */
    @Override
    public void createTables(final Map<String, byte[]> tableNamesToTableMetadata) {
        try {
            trySchemaMutationLock();
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    KsDef ks = client.describe_keyspace(configManager.getConfig().keyspace());
                    Set<String> tablesToCreate = tableNamesToTableMetadata.keySet();
                    Set<String> existingTables = Sets.newHashSet();

                    for (CfDef cf : ks.getCf_defs()) {
                        existingTables.add(cf.getName().toLowerCase());
                    }

                    for (String table : tablesToCreate) {
                        CassandraVerifier.sanityCheckTableName(table);

                        if (!existingTables.contains(internalTableName(table.toLowerCase()))) {
                            client.system_add_column_family(
                                    getCfForTable(table, tableNamesToTableMetadata.get(table)));
                        } else {
                            log.warn(String.format("Ignored call to create a table (%s) that already existed.",
                                    table));
                        }
                    }
                    if (!tablesToCreate.isEmpty()) {
                        CassandraKeyValueServices.waitForSchemaVersions(client,
                                "(all tables in a call to createTables)",
                                configManager.getConfig().schemaMutationTimeoutMillis());
                    }
                    return null;
                }
            });
        } catch (UnavailableException e) {
            throw new InsufficientConsistencyException(
                    "Creating tables requires all Cassandra nodes to be up and available.", e);
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            if (schemaMutationLock.isHeldByCurrentThread()) {
                schemaMutationLock.unlock();
            }
        }

        internalPutMetadataForTables(tableNamesToTableMetadata, false);
    }
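
    /*
     * Sketch of a hypothetical caller creating several tables in one schema round trip (the table
     * names are illustrative; real metadata bytes normally come from AtlasDB table definitions
     * rather than the empty placeholder used here).
     *
     *   kvs.createTables(ImmutableMap.of(
     *           "ns.table_a", AtlasDbConstants.EMPTY_TABLE_METADATA,
     *           "ns.table_b", AtlasDbConstants.EMPTY_TABLE_METADATA));
     */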

    @Override
    public Set<String> getAllTableNames() {
        final CassandraKeyValueServiceConfig config = configManager.getConfig();
        try {
            return clientPool.runWithPooledResource(new FunctionCheckedException<Client, Set<String>, Exception>() {
                @Override
                public Set<String> apply(Client client) throws Exception {
                    KsDef ks = client.describe_keyspace(config.keyspace());

                    Set<String> ret = Sets.newHashSet();
                    for (CfDef cf : ks.getCf_defs()) {
                        if (!CassandraConstants.HIDDEN_TABLES.contains(cf.getName())) {
                            ret.add(fromInternalTableName(cf.getName()));
                        }
                    }
                    return ret;
                }

                @Override
                public String toString() {
                    return "describe_keyspace(" + config.keyspace() + ")";
                }
            });
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    @Override
    public byte[] getMetadataForTable(String tableName) {
        Cell cell = getMetadataCell(tableName);
        Value v = get(CassandraConstants.METADATA_TABLE, ImmutableMap.of(cell, Long.MAX_VALUE)).get(cell);
        if (v == null) {
            return AtlasDbConstants.EMPTY_TABLE_METADATA;
        } else {
            return v.getContents();
        }
    }

    @Override
    public Map<String, byte[]> getMetadataForTables() {
        Map<String, byte[]> tableToMetadataContents = Maps.newHashMap();
        ClosableIterator<RowResult<Value>> range = getRange(CassandraConstants.METADATA_TABLE, RangeRequest.all(),
                Long.MAX_VALUE);
        try {
            Set<String> currentlyExistingTables = getAllTableNames();
            while (range.hasNext()) {
                RowResult<Value> valueRow = range.next();
                Iterable<Entry<Cell, Value>> cells = valueRow.getCells();

                for (Entry<Cell, Value> entry : cells) {
                    Value value = entry.getValue();
                    String tableName = new String(entry.getKey().getRowName());
                    if (currentlyExistingTables.contains(tableName)) {
                        byte[] contents;
                        if (value == null) {
                            contents = AtlasDbConstants.EMPTY_TABLE_METADATA;
                        } else {
                            contents = value.getContents();
                        }
                        tableToMetadataContents.put(tableName, contents);
                    } else {
                        log.info("Non-existing table {}: {}", tableName, value);
                    }
                }
            }
        } finally {
            range.close();
        }
        return tableToMetadataContents;
    }

    // We would have preferred an explicit charset here, but thrift uses the default charset internally.
    private Cell getMetadataCell(String tableName) {
        return Cell.create(tableName.getBytes(Charset.defaultCharset()), "m".getBytes());
    }
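
    /*
     * Layout sketch: table metadata lives in CassandraConstants.METADATA_TABLE, keyed by a cell whose
     * row name is the (default-charset) bytes of the AtlasDB table name and whose column name is "m".
     * For a hypothetical table "ns.my_table" the metadata cell is therefore:
     *
     *   Cell metadataCell = Cell.create("ns.my_table".getBytes(Charset.defaultCharset()), "m".getBytes());
     */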

    @Override
    public void putMetadataForTable(final String tableName, final byte[] meta) {
        putMetadataForTables(ImmutableMap.of(tableName, meta));
    }

    @Override
    public void putMetadataForTables(final Map<String, byte[]> tableNameToMetadata) {
        internalPutMetadataForTables(tableNameToMetadata, true);
    }

    private void internalPutMetadataForTables(final Map<String, byte[]> tableNameToMetadata,
            final boolean possiblyNeedToPerformSettingsChanges) {
        final Map<Cell, byte[]> metadataRequestedForUpdate = Maps
                .newHashMapWithExpectedSize(tableNameToMetadata.size());
        for (Entry<String, byte[]> tableEntry : tableNameToMetadata.entrySet()) {
            metadataRequestedForUpdate.put(getMetadataCell(tableEntry.getKey()), tableEntry.getValue());
        }

        Map<Cell, Long> requestForLatestDbSideMetadata = Maps.transformValues(metadataRequestedForUpdate,
                Functions.constant(Long.MAX_VALUE));

        // technically we're racing other services from here on, during an update period,
        // but the penalty for not caring is just some superfluous schema mutations and a few dead rows in the metadata table.
        Map<Cell, Value> persistedMetadata = get(CassandraConstants.METADATA_TABLE, requestForLatestDbSideMetadata);
        final Map<Cell, byte[]> newMetadata = Maps.newHashMap();
        final Collection<CfDef> updatedCfs = Lists.newArrayList();
        for (Entry<Cell, byte[]> entry : metadataRequestedForUpdate.entrySet()) {
            Value val = persistedMetadata.get(entry.getKey());
            if (val == null || !Arrays.equals(val.getContents(), entry.getValue())) {
                newMetadata.put(entry.getKey(), entry.getValue());
                updatedCfs.add(getCfForTable(new String(entry.getKey().getRowName()), entry.getValue()));
            }
        }

        if (!newMetadata.isEmpty()) {
            try {
                if (possiblyNeedToPerformSettingsChanges) {
                    trySchemaMutationLock();
                }
                clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                    @Override
                    public Void apply(Client client) throws Exception {
                        if (possiblyNeedToPerformSettingsChanges) {
                            for (CfDef cf : updatedCfs) {
                                client.system_update_column_family(cf);
                            }

                            CassandraKeyValueServices.waitForSchemaVersions(client,
                                    "(all tables in a call to putMetadataForTables)",
                                    configManager.getConfig().schemaMutationTimeoutMillis());
                        }
                        // Done with actual schema mutation, push the metadata
                        put(CassandraConstants.METADATA_TABLE, newMetadata, System.currentTimeMillis());
                        return null;
                    }
                });
            } catch (Exception e) {
                throw Throwables.throwUncheckedException(e);
            } finally {
                if (possiblyNeedToPerformSettingsChanges) {
                    if (schemaMutationLock.isHeldByCurrentThread()) {
                        schemaMutationLock.unlock();
                    }
                }
            }
        }
    }

    private void putMetadataWithoutChangingSettings(final String tableName, final byte[] meta) {
        put(CassandraConstants.METADATA_TABLE, ImmutableMap.of(getMetadataCell(tableName), meta),
                System.currentTimeMillis());
    }

    @Override
    public void close() {
        clientPool.shutdownPooling();
        hostRefreshExecutor.shutdown();
        if (compactionManager.isPresent()) {
            compactionManager.get().close();
        }
        tokenAwareMapper.shutdown();
        super.close();
    }

    @Override
    public void addGarbageCollectionSentinelValues(String tableName, Set<Cell> cells) {
        try {
            final Value value = Value.create(PtBytes.EMPTY_BYTE_ARRAY, Value.INVALID_VALUE_TIMESTAMP);
            putInternal(tableName, Iterables.transform(cells, new Function<Cell, Map.Entry<Cell, Value>>() {
                @Override
                public Entry<Cell, Value> apply(Cell cell) {
                    return Maps.immutableEntry(cell, value);
                }
            }));
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }

    @Override
    public Multimap<Cell, Long> getAllTimestamps(String tableName, Set<Cell> cells, long ts) {
        AllTimestampsCollector collector = new AllTimestampsCollector();
        try {
            loadWithTs(tableName, cells, ts, true, collector, deleteConsistency);
        } catch (UnavailableException e) {
            throw new InsufficientConsistencyException(
                    "Get all timestamps requires all Cassandra nodes to be up and available.", e);
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
        return collector.collectedResults;
    }

    @Override
    public void putUnlessExists(final String tableName, final Map<Cell, byte[]> values)
            throws KeyAlreadyExistsException {
        try {
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws Exception {
                    for (Map.Entry<Cell, byte[]> e : values.entrySet()) {
                        ByteBuffer rowName = ByteBuffer.wrap(e.getKey().getRowName());
                        byte[] contents = e.getValue();
                        long timestamp = AtlasDbConstants.TRANSACTION_TS;
                        byte[] colName = CassandraKeyValueServices
                                .makeCompositeBuffer(e.getKey().getColumnName(), timestamp).array();
                        Column col = new Column();
                        col.setName(colName);
                        col.setValue(contents);
                        col.setTimestamp(timestamp);
                        CASResult casResult = client.cas(rowName, tableName, ImmutableList.<Column>of(),
                                ImmutableList.of(col), ConsistencyLevel.SERIAL, writeConsistency);
                        if (!casResult.isSuccess()) {
                            throw new KeyAlreadyExistsException("This transaction row already exists.",
                                    ImmutableList.of(e.getKey()));
                        }
                    }
                    return null;
                }
            });
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        }
    }
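
    /*
     * Note on the CAS call above: an empty "expected" column list makes client.cas behave roughly like
     * INSERT ... IF NOT EXISTS, i.e. the write only applies when the row has no live columns yet. A
     * hypothetical caller writing a commit entry would look like this ("commitCell" and
     * "commitValueBytes" are illustrative names):
     *
     *   try {
     *       kvs.putUnlessExists("_transactions", ImmutableMap.of(commitCell, commitValueBytes));
     *   } catch (KeyAlreadyExistsException e) {
     *       // another writer got there first; fall back to reading the persisted value
     *   }
     */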

    private static String internalTableName(String tableName) {
        if (tableName.startsWith("_")) {
            return tableName;
        }
        return tableName.replaceFirst("\\.", "__");
    }

    private String fromInternalTableName(String tableName) {
        if (tableName.startsWith("_")) {
            return tableName;
        }
        return tableName.replaceFirst("__", ".");
    }
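
    /*
     * Worked examples of the mapping above (illustrative): AtlasDB table names use a dot to separate
     * namespace from table, which is not a legal character in a Cassandra column family name, so the
     * first dot is swapped for a double underscore on the way in and back again on the way out.
     *
     *   internalTableName("ns.my_table")       -> "ns__my_table"
     *   internalTableName("_transactions")     -> "_transactions"   (underscore-prefixed names pass through)
     *   fromInternalTableName("ns__my_table")  -> "ns.my_table"
     */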

    private void trySchemaMutationLock() throws InterruptedException, TimeoutException {
        if (!schemaMutationLock.tryLock(configManager.getConfig().schemaMutationTimeoutMillis(),
                TimeUnit.MILLISECONDS)) {
            throw new TimeoutException(
                    "AtlasDB was unable to get a lock on Cassandra system schema mutations for your cluster. Likely cause: Service(s) performing heavy schema mutations in parallel, or extremely heavy Cassandra cluster load.");
        }
    }
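
    /*
     * All schema-mutating methods in this class follow the same client-side pattern; a minimal sketch
     * (the body shown is hypothetical) for anyone adding another schema-changing operation:
     *
     *   try {
     *       trySchemaMutationLock();
     *       clientPool.runWithPooledResource(...);   // the system_add/update/drop call(s),
     *                                                // followed by waitForSchemaVersions(...)
     *   } catch (Exception e) {
     *       throw Throwables.throwUncheckedException(e);
     *   } finally {
     *       if (schemaMutationLock.isHeldByCurrentThread()) {
     *           schemaMutationLock.unlock();
     *       }
     *   }
     */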

    @Override
    public void compactInternally(String tableName) {
        Preconditions.checkArgument(!Strings.isNullOrEmpty(tableName),
                "tableName:[%s] should not be null or empty.", tableName);
        CassandraKeyValueServiceConfig config = configManager.getConfig();
        if (!compactionManager.isPresent()) {
            log.error(
                    "No compaction client was configured, but compact was called. If you actually want to clear deleted data immediately "
                            + "from Cassandra, lower your gc_grace_seconds setting and run `nodetool compact {} {}`.",
                    config.keyspace(), tableName);
            return;
        }
        long timeoutInSeconds = config.jmx().get().compactionTimeoutSeconds();
        String keyspace = config.keyspace();
        try {
            alterGcAndTombstone(keyspace, tableName, 0, 0.0f);
            compactionManager.get().performTombstoneCompaction(timeoutInSeconds, keyspace, tableName);
        } catch (TimeoutException e) {
            log.error("Compaction for {}.{} could not finish in {} seconds.", keyspace, tableName, timeoutInSeconds,
                    e);
            log.error(compactionManager.get().getCompactionStatus());
        } catch (InterruptedException e) {
            log.error("Compaction for {}.{} was interrupted.", keyspace, tableName, e);
            Thread.currentThread().interrupt();
        } finally {
            alterGcAndTombstone(keyspace, tableName, CassandraConstants.GC_GRACE_SECONDS,
                    CassandraConstants.TOMBSTONE_THRESHOLD_RATIO);
        }
    }

    private void alterGcAndTombstone(final String keyspace, final String tableName, final int gcGraceSeconds,
            final float tombstoneThresholdRatio) {
        Preconditions.checkArgument(!Strings.isNullOrEmpty(keyspace), "keyspace:[%s] should not be null or empty.",
                keyspace);
        Preconditions.checkArgument(!Strings.isNullOrEmpty(tableName),
                "tableName:[%s] should not be null or empty.", tableName);
        Preconditions.checkArgument(gcGraceSeconds >= 0, "gc_grace_seconds:[%s] should not be negative.",
                gcGraceSeconds);
        Preconditions.checkArgument(tombstoneThresholdRatio >= 0.0f && tombstoneThresholdRatio <= 1.0f,
                "tombstone_threshold_ratio:[%s] should be between [0.0, 1.0]", tombstoneThresholdRatio);

        try {
            trySchemaMutationLock();
            clientPool.runWithPooledResource(new FunctionCheckedException<Client, Void, Exception>() {
                @Override
                public Void apply(Client client) throws NotFoundException, InvalidRequestException, TException {
                    KsDef ks = client.describe_keyspace(keyspace);
                    List<CfDef> cfs = ks.getCf_defs();
                    for (CfDef cf : cfs) {
                        if (cf.getName().equalsIgnoreCase(tableName)) {
                            cf.setGc_grace_seconds(gcGraceSeconds);
                            cf.setCompaction_strategy_options(ImmutableMap.of("tombstone_threshold",
                                    String.valueOf(tombstoneThresholdRatio)));
                            client.system_update_column_family(cf);
                            CassandraKeyValueServices.waitForSchemaVersions(client, tableName,
                                    configManager.getConfig().schemaMutationTimeoutMillis());
                            log.trace("gc_grace_seconds is set to {} for {}.{}", gcGraceSeconds, keyspace,
                                    tableName);
                            log.trace("tombstone_threshold_ratio is set to {} for {}.{}", tombstoneThresholdRatio,
                                    keyspace, tableName);
                        }
                    }
                    return null;
                }
            });
        } catch (Exception e) {
            log.error(
                    "Exception encountered while setting gc_grace_seconds:{} and tombstone_threshold:{} for {}.{}",
                    gcGraceSeconds, tombstoneThresholdRatio, keyspace, tableName, e);
        } finally {
            if (schemaMutationLock.isHeldByCurrentThread()) {
                schemaMutationLock.unlock();
            }
        }
    }
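
    /*
     * Flow sketch for the two methods above: gc_grace_seconds and the tombstone_threshold compaction
     * option are temporarily dropped to 0 so a compaction can actually purge deleted data, and are
     * restored to CassandraConstants.GC_GRACE_SECONDS / CassandraConstants.TOMBSTONE_THRESHOLD_RATIO
     * in the finally block. Without a configured compaction manager, the manual equivalent (as the
     * error message in compactInternally suggests) is roughly `nodetool compact <keyspace> <table>`
     * after lowering gc_grace_seconds on the table.
     */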

    private <V> Map<InetAddress, Map<Cell, V>> partitionMapByHost(Iterable<Map.Entry<Cell, V>> cells) {
        Map<InetAddress, List<Map.Entry<Cell, V>>> partitionedByHost = partitionByHost(cells,
                new Function<Map.Entry<Cell, V>, byte[]>() {
                    @Override
                    public byte[] apply(Entry<Cell, V> entry) {
                        return entry.getKey().getRowName();
                    }
                });
        Map<InetAddress, Map<Cell, V>> cellsByHost = Maps.newHashMap();
        for (Map.Entry<InetAddress, List<Map.Entry<Cell, V>>> hostAndCells : partitionedByHost.entrySet()) {
            Map<Cell, V> cellsForHost = Maps.newHashMapWithExpectedSize(hostAndCells.getValue().size());
            for (Map.Entry<Cell, V> entry : hostAndCells.getValue()) {
                cellsForHost.put(entry.getKey(), entry.getValue());
            }
            cellsByHost.put(hostAndCells.getKey(), cellsForHost);
        }
        return cellsByHost;
    }

    private <V> Map<InetAddress, List<V>> partitionByHost(Iterable<V> iterable, Function<V, byte[]> keyExtractor) {
        // Ensure that the same key goes to the same partition. This is important when writing multiple columns
        // to the same row, since that is normally a single write in Cassandra, whereas splitting the columns
        // into different requests results in multiple writes.
        ListMultimap<ByteBuffer, V> partitionedByKey = ArrayListMultimap.create();
        for (V value : iterable) {
            partitionedByKey.put(ByteBuffer.wrap(keyExtractor.apply(value)), value);
        }
        ListMultimap<InetAddress, V> valuesByHost = ArrayListMultimap.create();
        for (ByteBuffer key : partitionedByKey.keySet()) {
            InetAddress host = tokenAwareMapper.getRandomHostForKey(key.array());
            valuesByHost.putAll(host, partitionedByKey.get(key));
        }
        return Multimaps.asMap(valuesByHost);
    }
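
    /*
     * Illustrative example of the partitioning above (hosts and rows are hypothetical): if rows r1 and
     * r2 map to host A and row r3 maps to host B via tokenAwareMapper, then for the input
     * [(r1,c1), (r1,c2), (r2,c1), (r3,c1)] the result is
     *
     *   A -> [(r1,c1), (r1,c2), (r2,c1)]
     *   B -> [(r3,c1)]
     *
     * keeping every column of a given row in a single request so the row is still one Cassandra write.
     */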

    /*
     * Similar to executor.invokeAll, but cancels all remaining tasks if one fails and doesn't spawn new threads if
     * there is only one task
     */
    private <V> List<V> runAllTasksCancelOnFailure(List<Callable<V>> tasks) {
        if (tasks.size() == 1) {
            try {
                // Callable<Void> returns null, so we can't collect the results into an ImmutableList
                return Collections.singletonList(tasks.get(0).call());
            } catch (Exception e) {
                throw Throwables.throwUncheckedException(e);
            }
        }

        List<Future<V>> futures = Lists.newArrayListWithCapacity(tasks.size());
        for (Callable<V> task : tasks) {
            futures.add(executor.submit(task));
        }
        try {
            List<V> results = Lists.newArrayListWithCapacity(tasks.size());
            for (Future<V> future : futures) {
                results.add(future.get());
            }
            return results;
        } catch (Exception e) {
            throw Throwables.throwUncheckedException(e);
        } finally {
            for (Future<V> future : futures) {
                future.cancel(true);
            }
        }
    }
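
    /*
     * Minimal usage sketch (the tasks are hypothetical): submit a batch of independent writes and fail
     * fast, cancelling whatever is still queued or running if any one of them throws.
     *
     *   List<Callable<Void>> tasks = ImmutableList.of(writeToHostA, writeToHostB);
     *   runAllTasksCancelOnFailure(tasks);
     */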

    private static class TableCellAndValue {
        private final String tableName;
        private final Cell cell;
        private final byte[] value;

        public static final Function<TableCellAndValue, byte[]> EXTRACT_ROW_NAME_FUNCTION = new Function<TableCellAndValue, byte[]>() {
            @Override
            public byte[] apply(TableCellAndValue input) {
                return input.cell.getRowName();
            }
        };

        public static final Function<TableCellAndValue, Long> SIZING_FUNCTION = new Function<TableCellAndValue, Long>() {
            @Override
            public Long apply(TableCellAndValue input) {
                return input.value.length + Cells.getApproxSizeOfCell(input.cell);
            }
        };

        public TableCellAndValue(String tableName, Cell cell, byte[] value) {
            this.tableName = tableName;
            this.cell = cell;
            this.value = value;
        }
    }
}