org.apache.cassandra.db.SystemKeyspace.java Source code

Introduction

Here is the source code for org.apache.cassandra.db.SystemKeyspace.java. This class manages Cassandra's node-local system tables: peer addresses and tokens, compaction and truncation records, bootstrap state, counter ids, serialized schema, Paxos state, and SSTable read rates.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.DataInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.util.*;
import javax.management.openmbean.*;

import com.google.common.base.Function;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.KSMetaData;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.cql3.QueryProcessor;
import org.apache.cassandra.cql3.UntypedResultSet;
import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
import org.apache.cassandra.db.compaction.CompactionHistoryTabularData;
import org.apache.cassandra.db.commitlog.ReplayPosition;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.marshal.*;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.io.sstable.SSTableReader;
import org.apache.cassandra.io.util.DataOutputBuffer;
import org.apache.cassandra.locator.IEndpointSnitch;
import org.apache.cassandra.metrics.RestorableMeter;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.service.paxos.Commit;
import org.apache.cassandra.service.paxos.PaxosState;
import org.apache.cassandra.thrift.cassandraConstants;
import org.apache.cassandra.transport.Server;
import org.apache.cassandra.utils.*;

import static org.apache.cassandra.cql3.QueryProcessor.processInternal;

public class SystemKeyspace {
    private static final Logger logger = LoggerFactory.getLogger(SystemKeyspace.class);

    // see CFMetaData for schema definitions
    public static final String PEERS_CF = "peers";
    public static final String PEER_EVENTS_CF = "peer_events";
    public static final String LOCAL_CF = "local";
    public static final String INDEX_CF = "IndexInfo";
    public static final String COUNTER_ID_CF = "NodeIdInfo";
    public static final String HINTS_CF = "hints";
    public static final String RANGE_XFERS_CF = "range_xfers";
    public static final String BATCHLOG_CF = "batchlog";
    // see layout description in the DefsTables class header
    public static final String SCHEMA_KEYSPACES_CF = "schema_keyspaces";
    public static final String SCHEMA_COLUMNFAMILIES_CF = "schema_columnfamilies";
    public static final String SCHEMA_COLUMNS_CF = "schema_columns";
    public static final String SCHEMA_TRIGGERS_CF = "schema_triggers";
    public static final String COMPACTION_LOG = "compactions_in_progress";
    public static final String PAXOS_CF = "paxos";
    public static final String SSTABLE_ACTIVITY_CF = "sstable_activity";
    public static final String COMPACTION_HISTORY_CF = "compaction_history";

    private static final String LOCAL_KEY = "local";
    private static final ByteBuffer ALL_LOCAL_NODE_ID_KEY = ByteBufferUtil.bytes("Local");

    public static final List<String> allSchemaCfs = Arrays.asList(SCHEMA_KEYSPACES_CF, SCHEMA_COLUMNFAMILIES_CF,
            SCHEMA_COLUMNS_CF, SCHEMA_TRIGGERS_CF);

    private static volatile Map<UUID, Pair<ReplayPosition, Long>> truncationRecords;

    public enum BootstrapState {
        NEEDS_BOOTSTRAP, COMPLETED, IN_PROGRESS
    }

    private static DecoratedKey decorate(ByteBuffer key) {
        return StorageService.getPartitioner().decorateKey(key);
    }

    public static void finishStartup() {
        setupVersion();

        copyAllAliasesToColumnsProper();

        // add entries to system schema columnfamilies for the hardcoded system definitions
        KSMetaData ksmd = Schema.instance.getKSMetaData(Keyspace.SYSTEM_KS);

        // delete old, possibly obsolete entries in schema columnfamilies
        for (String cfname : Arrays.asList(SystemKeyspace.SCHEMA_KEYSPACES_CF,
                SystemKeyspace.SCHEMA_COLUMNFAMILIES_CF, SystemKeyspace.SCHEMA_COLUMNS_CF,
                SystemKeyspace.SCHEMA_TRIGGERS_CF))
            processInternal(String.format("DELETE FROM system.%s WHERE keyspace_name = '%s'", cfname, ksmd.name));

        // (+1 to timestamp to make sure we don't get shadowed by the tombstones we just added)
        ksmd.toSchema(FBUtilities.timestampMicros() + 1).apply();
    }

    // Starting with 2.0 (CASSANDRA-5125) we keep all the 'aliases' in system.schema_columns together with the regular columns,
    // but only for newly-created tables. This migration handles tables created before 2.0.
    private static void copyAllAliasesToColumnsProper() {
        for (UntypedResultSet.Row row : processInternal(
                String.format("SELECT * FROM system.%s", SCHEMA_COLUMNFAMILIES_CF))) {
            CFMetaData table = CFMetaData.fromSchema(row);
            String query = String.format(
                    "SELECT writetime(type) " + "FROM system.%s "
                            + "WHERE keyspace_name = '%s' AND columnfamily_name = '%s'",
                    SCHEMA_COLUMNFAMILIES_CF, table.ksName, table.cfName);
            long timestamp = processInternal(query).one().getLong("writetime(type)");
            try {
                table.toSchema(timestamp).apply();
            } catch (ConfigurationException e) {
                // shouldn't happen: the metadata was just read back from the schema tables
            }
        }
    }

    private static void setupVersion() {
        String req = "INSERT INTO system.%s (key, release_version, cql_version, thrift_version, native_protocol_version, data_center, rack, partitioner, rpc_address, broadcast_address) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')";
        IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, FBUtilities.getReleaseVersionString(),
                QueryProcessor.CQL_VERSION.toString(), cassandraConstants.VERSION, Server.CURRENT_VERSION,
                snitch.getDatacenter(FBUtilities.getBroadcastAddress()),
                snitch.getRack(FBUtilities.getBroadcastAddress()),
                DatabaseDescriptor.getPartitioner().getClass().getName(),
                DatabaseDescriptor.getRpcAddress().getHostAddress(),
                FBUtilities.getBroadcastAddress().getHostAddress()));
    }

    /**
     * Write a compaction log entry, unless the column family belongs to the system keyspace.
     *
     * @param cfs the ColumnFamilyStore being compacted
     * @param toCompact sstables to compact
     * @return compaction task id, or null if cfs belongs to the system keyspace
     */
    public static UUID startCompaction(ColumnFamilyStore cfs, Iterable<SSTableReader> toCompact) {
        if (Keyspace.SYSTEM_KS.equals(cfs.keyspace.getName()))
            return null;

        UUID compactionId = UUIDGen.getTimeUUID();
        String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, inputs) VALUES (%s, '%s', '%s', {%s})";
        Iterable<Integer> generations = Iterables.transform(toCompact, new Function<SSTableReader, Integer>() {
            public Integer apply(SSTableReader sstable) {
                return sstable.descriptor.generation;
            }
        });
        processInternal(String.format(req, COMPACTION_LOG, compactionId, cfs.keyspace.getName(), cfs.name,
                StringUtils.join(Sets.newHashSet(generations), ',')));
        forceBlockingFlush(COMPACTION_LOG);
        return compactionId;
    }
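
    // Illustrative usage (a sketch, not the project's actual call sites): compaction code
    // is expected to bracket the work with startCompaction/finishCompaction, e.g.
    //
    //   UUID taskId = SystemKeyspace.startCompaction(cfs, sstables);
    //   try { runCompaction(cfs, sstables); }                     // hypothetical helper
    //   finally { if (taskId != null) SystemKeyspace.finishCompaction(taskId); }
    //
    // The null check matters: startCompaction returns null for system-keyspace CFs, and
    // finishCompaction asserts a non-null task id.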

    /**
     * Deletes the entry for this compaction from the set of compactions in progress.  The compaction does not need
     * to complete successfully for this to be called.
     * @param taskId what was returned from {@code startCompaction}
     */
    public static void finishCompaction(UUID taskId) {
        assert taskId != null;

        String req = "DELETE FROM system.%s WHERE id = %s";
        processInternal(String.format(req, COMPACTION_LOG, taskId));
        forceBlockingFlush(COMPACTION_LOG);
    }

    /**
     * Returns a Map whose keys are KS.CF pairs and whose values are maps from sstable generation numbers to the
     * task ID of the compaction they were participating in.
     */
    public static Map<Pair<String, String>, Map<Integer, UUID>> getUnfinishedCompactions() {
        String req = "SELECT * FROM system.%s";
        UntypedResultSet resultSet = processInternal(String.format(req, COMPACTION_LOG));

        Map<Pair<String, String>, Map<Integer, UUID>> unfinishedCompactions = new HashMap<>();
        for (UntypedResultSet.Row row : resultSet) {
            String keyspace = row.getString("keyspace_name");
            String columnfamily = row.getString("columnfamily_name");
            Set<Integer> inputs = row.getSet("inputs", Int32Type.instance);
            UUID taskID = row.getUUID("id");

            Pair<String, String> kscf = Pair.create(keyspace, columnfamily);
            Map<Integer, UUID> generationToTaskID = unfinishedCompactions.get(kscf);
            if (generationToTaskID == null)
                generationToTaskID = new HashMap<>(inputs.size());

            for (Integer generation : inputs)
                generationToTaskID.put(generation, taskID);

            unfinishedCompactions.put(kscf, generationToTaskID);
        }
        return unfinishedCompactions;
    }
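
    // Shape of the result (hypothetical values): if generations 5 and 6 of ks1.cf1 were
    // part of one unfinished compaction with task id T, the map would contain
    // {(ks1, cf1) -> {5 -> T, 6 -> T}}; startup code can use this to clean up leftovers
    // from compactions that were interrupted mid-write.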

    public static void discardCompactionsInProgress() {
        ColumnFamilyStore compactionLog = Keyspace.open(Keyspace.SYSTEM_KS).getColumnFamilyStore(COMPACTION_LOG);
        compactionLog.truncateBlocking();
    }

    public static void updateCompactionHistory(String ksname, String cfname, long compactedAt, long bytesIn,
            long bytesOut, Map<Integer, Long> rowsMerged) {
        // don't write anything when the history table itself is compacted, since that would in turn cause new compactions
        if (ksname.equals("system") && cfname.equals(COMPACTION_HISTORY_CF))
            return;
        String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) "
                + "VALUES (%s, '%s', '%s', %d, %d, %d, {%s})";
        processInternal(String.format(req, COMPACTION_HISTORY_CF, UUIDGen.getTimeUUID().toString(), ksname, cfname,
                compactedAt, bytesIn, bytesOut, FBUtilities.toString(rowsMerged)));
    }

    public static TabularData getCompactionHistory() throws OpenDataException {
        UntypedResultSet queryResultSet = processInternal("SELECT * from system.compaction_history");
        return CompactionHistoryTabularData.from(queryResultSet);
    }

    public static synchronized void saveTruncationRecord(ColumnFamilyStore cfs, long truncatedAt,
            ReplayPosition position) {
        String req = "UPDATE system.%s SET truncated_at = truncated_at + %s WHERE key = '%s'";
        processInternal(String.format(req, LOCAL_CF, truncationAsMapEntry(cfs, truncatedAt, position), LOCAL_KEY));
        truncationRecords = null;
        forceBlockingFlush(LOCAL_CF);
    }

    /**
     * Remove the stored truncation record for the specified column family.
     */
    public static synchronized void removeTruncationRecord(UUID cfId) {
        String req = "DELETE truncated_at[%s] from system.%s WHERE key = '%s'";
        processInternal(String.format(req, cfId, LOCAL_CF, LOCAL_KEY));
        truncationRecords = null;
        forceBlockingFlush(LOCAL_CF);
    }

    private static String truncationAsMapEntry(ColumnFamilyStore cfs, long truncatedAt, ReplayPosition position) {
        DataOutputBuffer out = new DataOutputBuffer();
        try {
            ReplayPosition.serializer.serialize(position, out);
            out.writeLong(truncatedAt);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return String.format("{%s: 0x%s}", cfs.metadata.cfId,
                ByteBufferUtil.bytesToHex(ByteBuffer.wrap(out.getData(), 0, out.getLength())));
    }
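
    // Shape of the entry built above (hypothetical values): {<cfId>: 0x<hex>}, a CQL map
    // literal keyed by table id, where the hex blob is the serialized ReplayPosition
    // followed by the 8-byte truncated-at timestamp. truncationRecordFromBlob below
    // decodes the same layout.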

    public static ReplayPosition getTruncatedPosition(UUID cfId) {
        Pair<ReplayPosition, Long> record = getTruncationRecord(cfId);
        return record == null ? null : record.left;
    }

    public static long getTruncatedAt(UUID cfId) {
        Pair<ReplayPosition, Long> record = getTruncationRecord(cfId);
        return record == null ? Long.MIN_VALUE : record.right;
    }

    private static synchronized Pair<ReplayPosition, Long> getTruncationRecord(UUID cfId) {
        if (truncationRecords == null)
            truncationRecords = readTruncationRecords();
        return truncationRecords.get(cfId);
    }

    private static Map<UUID, Pair<ReplayPosition, Long>> readTruncationRecords() {
        UntypedResultSet rows = processInternal(
                String.format("SELECT truncated_at FROM system.%s WHERE key = '%s'", LOCAL_CF, LOCAL_KEY));

        Map<UUID, Pair<ReplayPosition, Long>> records = new HashMap<>();

        if (!rows.isEmpty() && rows.one().has("truncated_at")) {
            Map<UUID, ByteBuffer> map = rows.one().getMap("truncated_at", UUIDType.instance, BytesType.instance);
            for (Map.Entry<UUID, ByteBuffer> entry : map.entrySet())
                records.put(entry.getKey(), truncationRecordFromBlob(entry.getValue()));
        }

        return records;
    }

    private static Pair<ReplayPosition, Long> truncationRecordFromBlob(ByteBuffer bytes) {
        try {
            DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(bytes));
            return Pair.create(ReplayPosition.serializer.deserialize(in),
                    in.available() > 0 ? in.readLong() : Long.MIN_VALUE);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Record tokens being used by another node
     */
    public static synchronized void updateTokens(InetAddress ep, Collection<Token> tokens) {
        if (ep.equals(FBUtilities.getBroadcastAddress())) {
            removeEndpoint(ep);
            return;
        }

        String req = "INSERT INTO system.%s (peer, tokens) VALUES ('%s', %s)";
        processInternal(String.format(req, PEERS_CF, ep.getHostAddress(), tokensAsSet(tokens)));
    }

    public static synchronized void updatePreferredIP(InetAddress ep, InetAddress preferred_ip) {
        String req = "INSERT INTO system.%s (peer, preferred_ip) VALUES ('%s', '%s')";
        processInternal(String.format(req, PEERS_CF, ep.getHostAddress(), preferred_ip.getHostAddress()));
        forceBlockingFlush(PEERS_CF);
    }

    public static synchronized void updatePeerInfo(InetAddress ep, String columnName, String value) {
        if (ep.equals(FBUtilities.getBroadcastAddress()))
            return;

        String req = "INSERT INTO system.%s (peer, %s) VALUES ('%s', %s)";
        processInternal(String.format(req, PEERS_CF, columnName, ep.getHostAddress(), value));
    }

    public static synchronized void updateHintsDropped(InetAddress ep, UUID timePeriod, int value) {
        // with 30 day TTL
        String req = "UPDATE system.%s USING TTL 2592000 SET hints_dropped[ %s ] = %s WHERE peer = '%s'";
        processInternal(String.format(req, PEER_EVENTS_CF, timePeriod.toString(), value, ep.getHostAddress()));
    }

    public static synchronized void updateSchemaVersion(UUID version) {
        String req = "INSERT INTO system.%s (key, schema_version) VALUES ('%s', %s)";
        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, version.toString()));
    }

    private static String tokensAsSet(Collection<Token> tokens) {
        Token.TokenFactory factory = StorageService.getPartitioner().getTokenFactory();
        StringBuilder sb = new StringBuilder();
        sb.append("{");
        Iterator<Token> iter = tokens.iterator();
        while (iter.hasNext()) {
            sb.append("'").append(factory.toString(iter.next())).append("'");
            if (iter.hasNext())
                sb.append(",");
        }
        sb.append("}");
        return sb.toString();
    }
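
    // Example (illustrative tokens): for tokens rendering as "-9223372036854775808" and "0"
    // this returns the CQL set literal {'-9223372036854775808','0'}, ready to splice into
    // the INSERT statements above.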

    private static Collection<Token> deserializeTokens(Collection<String> tokensStrings) {
        Token.TokenFactory factory = StorageService.getPartitioner().getTokenFactory();
        List<Token> tokens = new ArrayList<Token>(tokensStrings.size());
        for (String tk : tokensStrings)
            tokens.add(factory.fromString(tk));
        return tokens;
    }

    /**
     * Remove stored tokens being used by another node
     */
    public static synchronized void removeEndpoint(InetAddress ep) {
        String req = "DELETE FROM system.%s WHERE peer = '%s'";
        processInternal(String.format(req, PEERS_CF, ep.getHostAddress()));
    }

    /**
     * Update the system keyspace with the new tokens for this node.
     */
    public static synchronized void updateTokens(Collection<Token> tokens) {
        assert !tokens.isEmpty() : "removeEndpoint should be used instead";
        String req = "INSERT INTO system.%s (key, tokens) VALUES ('%s', %s)";
        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, tokensAsSet(tokens)));
        forceBlockingFlush(LOCAL_CF);
    }

    /**
     * Convenience method to update the list of tokens in the local system keyspace.
     *
     * @param addTokens tokens to add
     * @param rmTokens tokens to remove
     * @return the collection of persisted tokens
     */
    public static synchronized Collection<Token> updateLocalTokens(Collection<Token> addTokens,
            Collection<Token> rmTokens) {
        Collection<Token> tokens = getSavedTokens();
        tokens.removeAll(rmTokens);
        tokens.addAll(addTokens);
        updateTokens(tokens);
        return tokens;
    }

    public static void forceBlockingFlush(String cfname) {
        if (!Boolean.getBoolean("cassandra.unsafesystem"))
            FBUtilities.waitOnFuture(Keyspace.open(Keyspace.SYSTEM_KS).getColumnFamilyStore(cfname).forceFlush());
    }
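
    // Note: flushes are skipped entirely when the JVM is started with the
    // cassandra.unsafesystem property, e.g.
    //
    //   java -Dcassandra.unsafesystem=true ...
    //
    // which trades durability of system-table updates for speed (useful mainly in tests).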

    /**
     * Return a multimap from each known peer's IP address to its stored tokens.
     */
    public static SetMultimap<InetAddress, Token> loadTokens() {
        SetMultimap<InetAddress, Token> tokenMap = HashMultimap.create();
        for (UntypedResultSet.Row row : processInternal("SELECT peer, tokens FROM system." + PEERS_CF)) {
            InetAddress peer = row.getInetAddress("peer");
            if (row.has("tokens"))
                tokenMap.putAll(peer, deserializeTokens(row.getSet("tokens", UTF8Type.instance)));
        }

        return tokenMap;
    }

    /**
     * Return a map from each known peer's IP address to its stored host ID.
     */
    public static Map<InetAddress, UUID> loadHostIds() {
        Map<InetAddress, UUID> hostIdMap = new HashMap<InetAddress, UUID>();
        for (UntypedResultSet.Row row : processInternal("SELECT peer, host_id FROM system." + PEERS_CF)) {
            InetAddress peer = row.getInetAddress("peer");
            if (row.has("host_id")) {
                hostIdMap.put(peer, row.getUUID("host_id"));
            }
        }
        return hostIdMap;
    }

    /**
     * Get the preferred IP for the given endpoint if one is known; otherwise return the endpoint itself.
     *
     * @param ep endpoint address to check
     * @return the preferred IP for the endpoint if present, otherwise ep
     */
    public static InetAddress getPreferredIP(InetAddress ep) {
        String req = "SELECT preferred_ip FROM system.%s WHERE peer='%s'";
        UntypedResultSet result = processInternal(String.format(req, PEERS_CF, ep.getHostAddress()));
        if (!result.isEmpty() && result.one().has("preferred_ip"))
            return result.one().getInetAddress("preferred_ip");
        return ep;
    }

    /**
     * Return a map from each peer's IP address to a map of its datacenter and rack info.
     */
    public static Map<InetAddress, Map<String, String>> loadDcRackInfo() {
        Map<InetAddress, Map<String, String>> result = new HashMap<InetAddress, Map<String, String>>();
        for (UntypedResultSet.Row row : processInternal("SELECT peer, data_center, rack from system." + PEERS_CF)) {
            InetAddress peer = row.getInetAddress("peer");
            if (row.has("data_center") && row.has("rack")) {
                Map<String, String> dcRack = new HashMap<String, String>();
                dcRack.put("data_center", row.getString("data_center"));
                dcRack.put("rack", row.getString("rack"));
                result.put(peer, dcRack);
            }
        }
        return result;
    }

    /**
     * One of three things will happen if you try to read the system keyspace:
     * 1. files are present and you can read them: great
     * 2. no files are there: great (new node is assumed)
     * 3. files are present but you can't read them: bad
     * @throws ConfigurationException
     */
    public static void checkHealth() throws ConfigurationException {
        Keyspace keyspace;
        try {
            keyspace = Keyspace.open(Keyspace.SYSTEM_KS);
        } catch (AssertionError err) {
            // this happens when a user switches from OPP to RP.
            ConfigurationException ex = new ConfigurationException("Could not read system keyspace!");
            ex.initCause(err);
            throw ex;
        }
        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(LOCAL_CF);

        String req = "SELECT cluster_name FROM system.%s WHERE key='%s'";
        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));

        if (result.isEmpty() || !result.one().has("cluster_name")) {
            // this is a brand new node
            if (!cfs.getSSTables().isEmpty())
                throw new ConfigurationException("Found system keyspace files, but they couldn't be loaded!");

            // no system files.  this is a new node.
            req = "INSERT INTO system.%s (key, cluster_name) VALUES ('%s', '%s')";
            processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, DatabaseDescriptor.getClusterName()));
            return;
        }

        String savedClusterName = result.one().getString("cluster_name");
        if (!DatabaseDescriptor.getClusterName().equals(savedClusterName))
            throw new ConfigurationException("Saved cluster name " + savedClusterName + " != configured name "
                    + DatabaseDescriptor.getClusterName());
    }

    public static Collection<Token> getSavedTokens() {
        String req = "SELECT tokens FROM system.%s WHERE key='%s'";
        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));
        return result.isEmpty() || !result.one().has("tokens") ? Collections.<Token>emptyList()
                : deserializeTokens(result.one().<String>getSet("tokens", UTF8Type.instance));
    }

    public static int incrementAndGetGeneration() {
        String req = "SELECT gossip_generation FROM system.%s WHERE key='%s'";
        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));

        int generation;
        if (result.isEmpty() || !result.one().has("gossip_generation")) {
            // seconds-since-epoch isn't a foolproof new generation
            // (where foolproof is "guaranteed to be larger than the last one seen at this ip address"),
            // but it's as close as sanely possible
            generation = (int) (System.currentTimeMillis() / 1000);
        } else {
            // Other nodes will ignore gossip messages about a node that has a lower generation than previously seen.
            final int storedGeneration = result.one().getInt("gossip_generation") + 1;
            final int now = (int) (System.currentTimeMillis() / 1000);
            if (storedGeneration >= now) {
                logger.warn(
                        "Using stored Gossip Generation {} as it is greater than current system time {}.  See CASSANDRA-3654 if you experience problems",
                        storedGeneration, now);
                generation = storedGeneration;
            } else {
                generation = now;
            }
        }

        req = "INSERT INTO system.%s (key, gossip_generation) VALUES ('%s', %d)";
        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, generation));
        forceBlockingFlush(LOCAL_CF);

        return generation;
    }
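
    // Worked example (assumed clock values): with a stored generation of 1700000000 and
    // current time 1700000100, storedGeneration becomes 1700000001 < now, so now wins;
    // had the clock jumped back to 1699999000, the stored value plus one would be kept
    // and the warning above logged.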

    public static BootstrapState getBootstrapState() {
        String req = "SELECT bootstrapped FROM system.%s WHERE key='%s'";
        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));

        if (result.isEmpty() || !result.one().has("bootstrapped"))
            return BootstrapState.NEEDS_BOOTSTRAP;

        return BootstrapState.valueOf(result.one().getString("bootstrapped"));
    }

    public static boolean bootstrapComplete() {
        return getBootstrapState() == BootstrapState.COMPLETED;
    }

    public static boolean bootstrapInProgress() {
        return getBootstrapState() == BootstrapState.IN_PROGRESS;
    }

    public static void setBootstrapState(BootstrapState state) {
        String req = "INSERT INTO system.%s (key, bootstrapped) VALUES ('%s', '%s')";
        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, state.name()));
        forceBlockingFlush(LOCAL_CF);
    }

    public static boolean isIndexBuilt(String keyspaceName, String indexName) {
        ColumnFamilyStore cfs = Keyspace.open(Keyspace.SYSTEM_KS).getColumnFamilyStore(INDEX_CF);
        QueryFilter filter = QueryFilter.getNamesFilter(decorate(ByteBufferUtil.bytes(keyspaceName)), INDEX_CF,
                FBUtilities.singleton(ByteBufferUtil.bytes(indexName), cfs.getComparator()),
                System.currentTimeMillis());
        return ColumnFamilyStore.removeDeleted(cfs.getColumnFamily(filter), Integer.MAX_VALUE) != null;
    }

    public static void setIndexBuilt(String keyspaceName, String indexName) {
        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(Keyspace.SYSTEM_KS, INDEX_CF);
        cf.addColumn(new Column(ByteBufferUtil.bytes(indexName), ByteBufferUtil.EMPTY_BYTE_BUFFER,
                FBUtilities.timestampMicros()));
        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, ByteBufferUtil.bytes(keyspaceName), cf);
        rm.apply();
        forceBlockingFlush(INDEX_CF);
    }

    public static void setIndexRemoved(String keyspaceName, String indexName) {
        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, ByteBufferUtil.bytes(keyspaceName));
        rm.delete(INDEX_CF, ByteBufferUtil.bytes(indexName), FBUtilities.timestampMicros());
        rm.apply();
    }

    /**
     * Read the host ID from the system keyspace, creating (and storing) one if
     * none exists.
     */
    public static UUID getLocalHostId() {
        UUID hostId = null;

        String req = "SELECT host_id FROM system.%s WHERE key='%s'";
        UntypedResultSet result = processInternal(String.format(req, LOCAL_CF, LOCAL_KEY));

        // Look up the Host UUID (return it if found)
        if (!result.isEmpty() && result.one().has("host_id")) {
            return result.one().getUUID("host_id");
        }

        // ID not found, generate a new one, persist, and then return it.
        hostId = UUID.randomUUID();
        logger.warn("No host ID found, created {} (Note: This should happen exactly once per node).", hostId);
        return setLocalHostId(hostId);
    }
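
    // Minimal usage sketch (not from the source): callers simply read the ID and rely on
    // the lazy creation above, e.g.
    //
    //   UUID hostId = SystemKeyspace.getLocalHostId();  // stored ID, or a freshly persisted one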

    /**
     * Sets the local host ID explicitly.  Should only be called outside of SystemKeyspace when replacing a node.
     */
    public static UUID setLocalHostId(UUID hostId) {
        String req = "INSERT INTO system.%s (key, host_id) VALUES ('%s', %s)";
        processInternal(String.format(req, LOCAL_CF, LOCAL_KEY, hostId));
        return hostId;
    }

    /**
     * Read the current local node id from the system keyspace, or return null if no
     * such node id is recorded.
     */
    public static CounterId getCurrentLocalCounterId() {
        Keyspace keyspace = Keyspace.open(Keyspace.SYSTEM_KS);

        // Get the last CounterId (CounterIds are TimeUUIDs, so they are ordered from oldest to newest)
        QueryFilter filter = QueryFilter.getSliceFilter(decorate(ALL_LOCAL_NODE_ID_KEY), COUNTER_ID_CF,
                ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, true, 1,
                System.currentTimeMillis());
        ColumnFamily cf = keyspace.getColumnFamilyStore(COUNTER_ID_CF).getColumnFamily(filter);
        if (cf != null && cf.getColumnCount() != 0)
            return CounterId.wrap(cf.iterator().next().name());
        else
            return null;
    }

    /**
     * Write a new current local node id to the system keyspace.
     *
     * @param newCounterId the new current local node id to record
     * @param now microsecond timestamp used for the record
     */
    public static void writeCurrentLocalCounterId(CounterId newCounterId, long now) {
        ByteBuffer ip = ByteBuffer.wrap(FBUtilities.getBroadcastAddress().getAddress());

        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(Keyspace.SYSTEM_KS, COUNTER_ID_CF);
        cf.addColumn(new Column(newCounterId.bytes(), ip, now));
        RowMutation rm = new RowMutation(Keyspace.SYSTEM_KS, ALL_LOCAL_NODE_ID_KEY, cf);
        rm.apply();
        forceBlockingFlush(COUNTER_ID_CF);
    }

    public static List<CounterId.CounterIdRecord> getOldLocalCounterIds() {
        List<CounterId.CounterIdRecord> l = new ArrayList<CounterId.CounterIdRecord>();

        Keyspace keyspace = Keyspace.open(Keyspace.SYSTEM_KS);
        QueryFilter filter = QueryFilter.getIdentityFilter(decorate(ALL_LOCAL_NODE_ID_KEY), COUNTER_ID_CF,
                System.currentTimeMillis());
        ColumnFamily cf = keyspace.getColumnFamilyStore(COUNTER_ID_CF).getColumnFamily(filter);

        CounterId previous = null;
        for (Column c : cf) {
            if (previous != null)
                l.add(new CounterId.CounterIdRecord(previous, c.timestamp()));

            // this will ignore the last column on purpose since it is the
            // current local node id
            previous = CounterId.wrap(c.name());
        }
        return l;
    }

    /**
     * @param cfName The name of the ColumnFamily responsible for part of the schema (keyspace, ColumnFamily, columns)
     * @return the ColumnFamilyStore holding the low-level serialized schema
     */
    public static ColumnFamilyStore schemaCFS(String cfName) {
        return Keyspace.open(Keyspace.SYSTEM_KS).getColumnFamilyStore(cfName);
    }

    public static List<Row> serializedSchema() {
        List<Row> schema = new ArrayList<>();

        schema.addAll(serializedSchema(SCHEMA_KEYSPACES_CF));
        schema.addAll(serializedSchema(SCHEMA_COLUMNFAMILIES_CF));
        schema.addAll(serializedSchema(SCHEMA_COLUMNS_CF));
        schema.addAll(serializedSchema(SCHEMA_TRIGGERS_CF));

        return schema;
    }

    /**
     * @param schemaCfName The name of the ColumnFamily responsible for part of the schema (keyspace, ColumnFamily, columns)
     * @return low-level schema representation (each row represents individual Keyspace or ColumnFamily)
     */
    public static List<Row> serializedSchema(String schemaCfName) {
        Token minToken = StorageService.getPartitioner().getMinimumToken();

        return schemaCFS(schemaCfName).getRangeSlice(
                new Range<RowPosition>(minToken.minKeyBound(), minToken.maxKeyBound()), null,
                new IdentityQueryFilter(), Integer.MAX_VALUE, System.currentTimeMillis());
    }

    public static Collection<RowMutation> serializeSchema() {
        Map<DecoratedKey, RowMutation> mutationMap = new HashMap<>();

        serializeSchema(mutationMap, SCHEMA_KEYSPACES_CF);
        serializeSchema(mutationMap, SCHEMA_COLUMNFAMILIES_CF);
        serializeSchema(mutationMap, SCHEMA_COLUMNS_CF);
        serializeSchema(mutationMap, SCHEMA_TRIGGERS_CF);

        return mutationMap.values();
    }

    private static void serializeSchema(Map<DecoratedKey, RowMutation> mutationMap, String schemaCfName) {
        for (Row schemaRow : serializedSchema(schemaCfName)) {
            if (Schema.ignoredSchemaRow(schemaRow))
                continue;

            RowMutation mutation = mutationMap.get(schemaRow.key);
            if (mutation == null) {
                mutation = new RowMutation(Keyspace.SYSTEM_KS, schemaRow.key.key);
                mutationMap.put(schemaRow.key, mutation);
            }

            mutation.add(schemaRow.cf);
        }
    }

    public static Map<DecoratedKey, ColumnFamily> getSchema(String cfName) {
        Map<DecoratedKey, ColumnFamily> schema = new HashMap<DecoratedKey, ColumnFamily>();

        for (Row schemaEntity : SystemKeyspace.serializedSchema(cfName))
            schema.put(schemaEntity.key, schemaEntity.cf);

        return schema;
    }

    public static ByteBuffer getSchemaKSKey(String ksName) {
        return AsciiType.instance.fromString(ksName);
    }

    public static Row readSchemaRow(String ksName) {
        DecoratedKey key = StorageService.getPartitioner().decorateKey(getSchemaKSKey(ksName));

        ColumnFamilyStore schemaCFS = SystemKeyspace.schemaCFS(SCHEMA_KEYSPACES_CF);
        ColumnFamily result = schemaCFS.getColumnFamily(
                QueryFilter.getIdentityFilter(key, SCHEMA_KEYSPACES_CF, System.currentTimeMillis()));

        return new Row(key, result);
    }

    /**
     * Fetches a subset of schema (table data, columns metadata or triggers) for the keyspace+table pair.
     *
     * @param schemaCfName the schema table to get the data from (schema_columnfamilies, schema_columns or schema_triggers)
     * @param ksName the keyspace of the table we are interested in
     * @param cfName the table we are interested in
     * @return a Row containing the schema data of a particular type for the table
     */
    public static Row readSchemaRow(String schemaCfName, String ksName, String cfName) {
        DecoratedKey key = StorageService.getPartitioner().decorateKey(getSchemaKSKey(ksName));
        ColumnFamilyStore schemaCFS = SystemKeyspace.schemaCFS(schemaCfName);
        ColumnFamily cf = schemaCFS.getColumnFamily(key, DefsTables.searchComposite(cfName, true),
                DefsTables.searchComposite(cfName, false), false, Integer.MAX_VALUE, System.currentTimeMillis());
        return new Row(key, cf);
    }

    public static PaxosState loadPaxosState(ByteBuffer key, CFMetaData metadata) {
        String req = "SELECT * FROM system.%s WHERE row_key = 0x%s AND cf_id = %s";
        UntypedResultSet results = processInternal(
                String.format(req, PAXOS_CF, ByteBufferUtil.bytesToHex(key), metadata.cfId));
        if (results.isEmpty())
            return new PaxosState(key, metadata);
        UntypedResultSet.Row row = results.one();
        Commit promised = row.has("in_progress_ballot")
                ? new Commit(key, row.getUUID("in_progress_ballot"), EmptyColumns.factory.create(metadata))
                : Commit.emptyCommit(key, metadata);
        // either we have both a recently accepted ballot and update or we have neither
        Commit accepted = row.has("proposal")
                ? new Commit(key, row.getUUID("proposal_ballot"), ColumnFamily.fromBytes(row.getBytes("proposal")))
                : Commit.emptyCommit(key, metadata);
        // either most_recent_commit and most_recent_commit_at will both be set, or neither
        Commit mostRecent = row.has("most_recent_commit")
                ? new Commit(key, row.getUUID("most_recent_commit_at"),
                        ColumnFamily.fromBytes(row.getBytes("most_recent_commit")))
                : Commit.emptyCommit(key, metadata);
        return new PaxosState(promised, accepted, mostRecent);
    }

    public static void savePaxosPromise(Commit promise) {
        String req = "UPDATE %s USING TIMESTAMP %d AND TTL %d SET in_progress_ballot = %s WHERE row_key = 0x%s AND cf_id = %s";
        processInternal(String.format(req, PAXOS_CF, UUIDGen.microsTimestamp(promise.ballot),
                paxosTtl(promise.update.metadata), promise.ballot, ByteBufferUtil.bytesToHex(promise.key),
                promise.update.id()));
    }

    public static void savePaxosProposal(Commit proposal) {
        processInternal(String.format(
                "UPDATE %s USING TIMESTAMP %d AND TTL %d SET proposal_ballot = %s, proposal = 0x%s WHERE row_key = 0x%s AND cf_id = %s",
                PAXOS_CF, UUIDGen.microsTimestamp(proposal.ballot), paxosTtl(proposal.update.metadata),
                proposal.ballot, ByteBufferUtil.bytesToHex(proposal.update.toBytes()),
                ByteBufferUtil.bytesToHex(proposal.key), proposal.update.id()));
    }

    private static int paxosTtl(CFMetaData metadata) {
        // keep paxos state around for at least 3h
        return Math.max(3 * 3600, metadata.getGcGraceSeconds());
    }
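
    // Quick arithmetic: the floor is 3 * 3600 = 10800 seconds. With the default
    // gc_grace_seconds of 864000 (10 days) this returns 864000; only tables whose
    // gc_grace_seconds is below 10800 fall back to the 3-hour floor.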

    public static void savePaxosCommit(Commit commit) {
        // We always erase the last proposal (using the commit timestamp, so we don't erase a more recent proposal if the commit is old),
        // even though that's really just an optimization, since SP.beginAndRepairPaxos will exclude accepted proposals older than the mrc.
        String cql = "UPDATE %s USING TIMESTAMP %d AND TTL %d SET proposal_ballot = null, proposal = null, most_recent_commit_at = %s, most_recent_commit = 0x%s WHERE row_key = 0x%s AND cf_id = %s";
        processInternal(String.format(cql, PAXOS_CF, UUIDGen.microsTimestamp(commit.ballot),
                paxosTtl(commit.update.metadata), commit.ballot, ByteBufferUtil.bytesToHex(commit.update.toBytes()),
                ByteBufferUtil.bytesToHex(commit.key), commit.update.id()));
    }

    /**
     * Returns a RestorableMeter tracking the average read rate of a particular SSTable, restoring the last-seen rate
     * from values in system.sstable_activity if present.
     * @param keyspace the keyspace the sstable belongs to
     * @param table the table the sstable belongs to
     * @param generation the generation number for the sstable
     */
    public static RestorableMeter getSSTableReadMeter(String keyspace, String table, int generation) {
        String cql = "SELECT * FROM %s WHERE keyspace_name='%s' and columnfamily_name='%s' and generation=%d";
        UntypedResultSet results = processInternal(
                String.format(cql, SSTABLE_ACTIVITY_CF, keyspace, table, generation));

        if (results.isEmpty())
            return new RestorableMeter();

        UntypedResultSet.Row row = results.one();
        double m15rate = row.getDouble("rate_15m");
        double m120rate = row.getDouble("rate_120m");
        return new RestorableMeter(m15rate, m120rate);
    }

    /**
     * Writes the current read rates for a given SSTable to system.sstable_activity
     */
    public static void persistSSTableReadMeter(String keyspace, String table, int generation,
            RestorableMeter meter) {
        // Store values with a ten-day TTL (864000 seconds) to handle corner cases where cleanup might not occur
        String cql = "INSERT INTO %s (keyspace_name, columnfamily_name, generation, rate_15m, rate_120m) VALUES ('%s', '%s', %d, %f, %f) USING TTL 864000";
        processInternal(String.format(cql, SSTABLE_ACTIVITY_CF, keyspace, table, generation,
                meter.fifteenMinuteRate(), meter.twoHourRate()));
    }

    /**
     * Clears persisted read rates from system.sstable_activity for SSTables that have been deleted.
     */
    public static void clearSSTableReadMeter(String keyspace, String table, int generation) {
        String cql = "DELETE FROM %s WHERE keyspace_name='%s' AND columnfamily_name='%s' and generation=%d";
        processInternal(String.format(cql, SSTABLE_ACTIVITY_CF, keyspace, table, generation));
    }
}
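
Example

For context, here is a minimal sketch of how a node's startup path might call into SystemKeyspace. It is illustrative only: the enclosing class and method are hypothetical, but every SystemKeyspace call appears in the listing above.

import java.util.UUID;

import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.exceptions.ConfigurationException;

public class StartupSketch {
    public static void initializeNode() throws ConfigurationException {
        // Verify the stored cluster_name matches the configuration (or record it on a new node).
        SystemKeyspace.checkHealth();

        // Read this node's host ID, creating and persisting one on first start.
        UUID hostId = SystemKeyspace.getLocalHostId();

        // Bump the gossip generation so peers accept our newly announced state.
        int generation = SystemKeyspace.incrementAndGetGeneration();

        // Record bootstrap progress in the local table.
        if (!SystemKeyspace.bootstrapComplete()) {
            SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.IN_PROGRESS);
            // ... stream data from peers ...
            SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.COMPLETED);
        }
    }
}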