package org.apache.cassandra.db;

import java.util.concurrent.TimeUnit;


import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.filter.*;
import org.apache.cassandra.db.lifecycle.View;
import org.apache.cassandra.db.partitions.*;
import org.apache.cassandra.db.rows.BaseRowIterator;
import org.apache.cassandra.db.transform.RTBoundValidator;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.dht.AbstractBounds;
import org.apache.cassandra.dht.Bounds;
import org.apache.cassandra.exceptions.RequestExecutionException;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.metrics.TableMetrics;
import org.apache.cassandra.schema.IndexMetadata;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.service.StorageProxy;
import org.apache.cassandra.tracing.Tracing;

/**
 * A read command that selects a (part of a) range of partitions.
 */
public class PartitionRangeReadCommand extends ReadCommand implements PartitionRangeReadQuery {
    // Deserializer for the selection part of this command; an instance of the nested Deserializer class.
    protected static final SelectionDeserializer selectionDeserializer = new Deserializer();

    // The data range this command queries; assigned once by the constructor and returned by dataRange().
    private final DataRange dataRange;

    /**
     * Builds a partition-range read command.
     * <p>
     * Every argument except {@code dataRange} is forwarded unchanged to the {@code ReadCommand} constructor, together
     * with {@code Kind.PARTITION_RANGE}.
     *
     * @param isDigest forwarded to the superclass constructor.
     * @param digestVersion forwarded to the superclass constructor.
     * @param acceptsTransient forwarded to the superclass constructor.
     * @param metadata forwarded to the superclass constructor.
     * @param nowInSec forwarded to the superclass constructor.
     * @param columnFilter forwarded to the superclass constructor.
     * @param rowFilter forwarded to the superclass constructor.
     * @param limits forwarded to the superclass constructor.
     * @param dataRange the data range this command queries; stored and later returned by {@link #dataRange()}.
     * @param index forwarded to the superclass constructor.
     */
    private PartitionRangeReadCommand(boolean isDigest, int digestVersion, boolean acceptsTransient,
            TableMetadata metadata, int nowInSec, ColumnFilter columnFilter, RowFilter rowFilter, DataLimits limits,
            DataRange dataRange, IndexMetadata index) {
        super(Kind.PARTITION_RANGE, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter,
                rowFilter, limits, index);
        this.dataRange = dataRange;
    }

    public static PartitionRangeReadCommand create(TableMetadata metadata, int nowInSec, ColumnFilter columnFilter,
            RowFilter rowFilter, DataLimits limits, DataRange dataRange) {
        return new PartitionRangeReadCommand(false, 0, false, metadata, nowInSec, columnFilter, rowFilter, limits,
                dataRange, findIndex(metadata, rowFilter));

    /**
     * Creates a new read command that queries all the data in the table.
     *
     * @param metadata the table to query.
     * @param nowInSec the time in seconds to use as "now" for this query.
     *
     * @return a newly created read command that queries everything in the table.
     */
    public static PartitionRangeReadCommand allDataRead(TableMetadata metadata, int nowInSec) {
        return new PartitionRangeReadCommand(false, 0, false, metadata, nowInSec, ColumnFilter.all(metadata),
                RowFilter.NONE, DataLimits.NONE, DataRange.allData(metadata.partitioner), null);

    public DataRange dataRange() {
        return dataRange;

    public ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key) {
        return dataRange.clusteringIndexFilter(key);

    public boolean isNamesQuery() {
        return dataRange.isNamesQuery();

    /**
     * Returns an equivalent command but that only queries data within the provided range.
     *
     * @param range the sub-range to restrict the command to. This method <b>assumes</b> that this is a proper sub-range
     * of the command this is applied to.
     * @param isRangeContinuation whether {@code range} is a direct continuation of whatever previous range we have
     * queried. This matters for the {@code DataLimits} that may contain state when we do paging and in the context of
     * parallel queries: that state only makes sense if the range queried is indeed the follow-up of whatever range we've
     * previously queried (that yielded said state). In practice this means that ranges for which {@code isRangeContinuation}
     * is false may have to be slightly pessimistic when counting data and may include a little bit more than necessary, and
     * this should be dealt with post-query (in the case of {@code StorageProxy.getRangeSlice()}, which uses this method
     * for replica queries, this is dealt with by re-counting results on the coordinator). Note that if this is the
     * first range we queried, then the {@code DataLimits} will have no state and the value of this parameter doesn't
     * matter.
     */
    public PartitionRangeReadCommand forSubRange(AbstractBounds<PartitionPosition> range,
            boolean isRangeContinuation) {
        // If we're not a continuation of whatever range we've previously queried, we should ignore the states of the
        // DataLimits as it's either useless, or misleading. This is particularly important for GROUP BY queries, where
        // DataLimits.CQLGroupByLimits.GroupByAwareCounter assumes that if GroupingState.hasClustering(), then we're in
        // the middle of a group, but we can't make that assumption if we query and range "in advance" of where we are
        // on the ring.
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), isRangeContinuation ? limits() : limits().withoutState(),
                dataRange().forSubRange(range), indexMetadata());

    public PartitionRangeReadCommand copy() {
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), limits(), dataRange(), indexMetadata());

    protected PartitionRangeReadCommand copyAsDigestQuery() {
        return new PartitionRangeReadCommand(true, digestVersion(), false, metadata(), nowInSec(), columnFilter(),
                rowFilter(), limits(), dataRange(), indexMetadata());

    protected PartitionRangeReadCommand copyAsTransientQuery() {
        return new PartitionRangeReadCommand(false, 0, true, metadata(), nowInSec(), columnFilter(), rowFilter(),
                limits(), dataRange(), indexMetadata());

    public PartitionRangeReadCommand withUpdatedLimit(DataLimits newLimits) {
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), newLimits, dataRange(), indexMetadata());

    public PartitionRangeReadCommand withUpdatedLimitsAndDataRange(DataLimits newLimits, DataRange newDataRange) {
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), newLimits, newDataRange, indexMetadata());

    public long getTimeout(TimeUnit unit) {
        return DatabaseDescriptor.getRangeRpcTimeout(unit);

    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime)
            throws RequestExecutionException {
        return StorageProxy.getRangeSlice(this, consistency, queryStartNanoTime);

    // NOTE(review): only the signature of this method is visible here; its body and closing brace are missing from
    // this view of the file and must be restored from the original source before this compiles.
    protected void recordLatency(TableMetrics metric, long latencyNanos) {

    /**
     * Builds the unfiltered partition iterator for this command from the memtables and sstables of a
     * {@code ColumnFamilyStore.ViewFragment}.
     * <p>
     * Visible flow: an iterator is created for each memtable and a scanner for each sstable, the oldest unrepaired
     * tombstone time is tracked along the way, and the collected inputs are merged lazily and handed to
     * {@code checkCacheFilter}. When no input was collected, {@code EmptyIterators.unfilteredPartition(metadata())}
     * is returned instead.
     * <p>
     * NOTE(review): several statements of this method are truncated in this view of the file (see the inline notes);
     * the description above only covers what is visible.
     *
     * @param cfs the column family store; final so it can be captured (visible use: none in the surviving lines).
     * @param executionController not referenced by any of the visible lines.
     * @return the merged, cache-filtered iterator, or an empty iterator when there is nothing to read.
     */
    public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs,
            ReadExecutionController executionController) {
        // NOTE(review): the initializer of 'view' is missing in this view of the file.
        ColumnFamilyStore.ViewFragment view =;
        // NOTE(review): the trailing argument(s) and the end of this trace call are not visible here.
        Tracing.trace("Executing seq scan across {} sstables for {}", view.sstables.size(),

        // fetch data from current memtable, historical memtables, and SSTables in the correct order.
        InputCollector<UnfilteredPartitionIterator> inputCollector = iteratorsForRange(view);
        try {
            for (Memtable memtable : view.memtables) {
                @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
                // NOTE(review): the remaining argument(s) of makePartitionIterator are not visible here.
                Memtable.MemtableUnfilteredPartitionIterator iter = memtable.makePartitionIterator(columnFilter(),
                oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, iter.getMinLocalDeletionTime());
                // NOTE(review): the call that receives this validated memtable iterator is not visible here.
                        RTBoundValidator.validate(iter, RTBoundValidator.Stage.MEMTABLE, false));

            SSTableReadsListener readCountUpdater = newReadCountUpdater();
            for (SSTableReader sstable : view.sstables) {
                @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
                // NOTE(review): the remaining argument(s) of getScanner are not visible here.
                UnfilteredPartitionIterator iter = sstable.getScanner(columnFilter(), dataRange(),
                // NOTE(review): the call that receives this validated sstable iterator is not visible here.
                        RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false));

                // Only sstables that are not repaired contribute to the oldest unrepaired tombstone.
                if (!sstable.isRepaired())
                    // NOTE(review): the second argument of Math.min is not visible here.
                    oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone,
            // iterators can be empty for offline tools
            if (inputCollector.isEmpty())
                return EmptyIterators.unfilteredPartition(metadata());

            // NOTE(review): the second argument of checkCacheFilter is not visible here.
            return checkCacheFilter(UnfilteredPartitionIterators.mergeLazily(inputCollector.finalizeIterators()),
        } catch (RuntimeException | Error e) {
            // NOTE(review): the cleanup performed inside this nested try, and the handling of its failure, are not
            // visible here; the original failure 'e' is rethrown afterwards.
            try {
            } catch (Exception e1) {
            throw e;

    /**
     * Creates a new {@code SSTableReadsListener} to update the SSTables read counts.
     *
     * @return a new {@code SSTableReadsListener} to update the SSTables read counts.
     */
    private static SSTableReadsListener newReadCountUpdater() {
        // Anonymous listener; only its onScanningStarted callback is declared in the visible code.
        return new SSTableReadsListener() {
            // NOTE(review): the body of this callback and all closing braces are missing from this view of the file;
            // restore them from the original source.
            public void onScanningStarted(SSTableReader sstable) {

    private UnfilteredPartitionIterator checkCacheFilter(UnfilteredPartitionIterator iter,
            final ColumnFamilyStore cfs) {
        class CacheFilter extends Transformation {
            public BaseRowIterator applyToPartition(BaseRowIterator iter) {
                // Note that we rely on the fact that until we actually advance 'iter', no really costly operation is actually done
                // (except for reading the partition key from the index file) due to the call to mergeLazily in queryStorage.
                DecoratedKey dk = iter.partitionKey();

                // Check if this partition is in the rowCache and if it is, if  it covers our filter
                CachedPartition cached = cfs.getRawCachedPartition(dk);
                ClusteringIndexFilter filter = dataRange().clusteringIndexFilter(dk);

                if (cached != null && cfs.isFilterFullyCoveredBy(filter, limits(), cached, nowInSec(),
                        iter.metadata().enforceStrictLiveness())) {
                    // We won't use 'iter' so close it now.

                    return filter.getUnfilteredRowIterator(columnFilter(), cached);

                return iter;
        return Transformation.apply(iter, new CacheFilter());

    public Verb verb() {
        return Verb.RANGE_REQ;

    /**
     * Appends the {@code WHERE} part of this command's CQL form to {@code sb}.
     * <p>
     * NOTE(review): several statements of this method are missing from this view of the file (see the inline notes),
     * so the code as shown does not reflect the intended control flow.
     *
     * @param sb the builder to append to.
     */
    protected void appendCQLWhereClause(StringBuilder sb) {
        // NOTE(review): the statement guarded by this condition is not visible here (presumably an early exit when
        // neither the data range nor the row filter restricts anything — confirm against the original source). As
        // shown, the next statement would become the body of this 'if'.
        if (dataRange.isUnrestricted() && rowFilter().isEmpty())

        sb.append(" WHERE ");
        // We put the row filter first because the data range can end by "ORDER BY"
        if (!rowFilter().isEmpty()) {
            // NOTE(review): the append of the row filter itself is not visible here — TODO confirm.
            // " AND " separates the row filter from the data range restriction when both are present.
            if (!dataRange.isUnrestricted())
                sb.append(" AND ");
        // NOTE(review): the statement guarded by the following condition, and the closing braces, are not visible here.
        if (!dataRange.isUnrestricted())

    /**
     * Allows post-processing of the result of the query after it has been reconciled on the coordinator
     * but before it is passed to the CQL layer to return the ResultSet.
     *
     * See CASSANDRA-8717 for why this exists.
     */
    public PartitionIterator postReconciliationProcessing(PartitionIterator result) {
        // NOTE(review): the initializer of 'cfs' is missing in this view of the file, as is the closing brace of
        // this method.
        ColumnFamilyStore cfs =;
        Index index = getIndex(cfs);
        // Without an index the result is returned untouched; otherwise the index's post-processor is applied to it.
        return index == null ? result : index.postProcessorFor(this).apply(result, this);

    public String toString() {
        return String.format("Read(%s columns=%s rowfilter=%s limits=%s %s)", metadata().toString(), columnFilter(),
                rowFilter(), limits(), dataRange().toString(metadata()));

    protected void serializeSelection(DataOutputPlus out, int version) throws IOException {
        DataRange.serializer.serialize(dataRange(), out, version, metadata());

    protected long selectionSerializedSize(int version) {
        return DataRange.serializer.serializedSize(dataRange(), version, metadata());

    /**
     * We are currently using PartitionRangeReadCommand for most index queries, even if they are explicitly restricted
     * to a single partition key. Returns true if that is the case.
     *
     * See CASSANDRA-11617 and CASSANDRA-11872 for details.
     */
    public boolean isLimitedToOnePartition() {
        return dataRange.keyRange instanceof Bounds && dataRange.startKey().kind() == PartitionPosition.Kind.ROW_KEY
                && dataRange.startKey().equals(dataRange.stopKey());

    public boolean isRangeRequest() {
        return true;

    private static class Deserializer extends SelectionDeserializer {
        public ReadCommand deserialize(DataInputPlus in, int version, boolean isDigest, int digestVersion,
                boolean acceptsTransient, TableMetadata metadata, int nowInSec, ColumnFilter columnFilter,
                RowFilter rowFilter, DataLimits limits, IndexMetadata index) throws IOException {
            DataRange range = DataRange.serializer.deserialize(in, version, metadata);
            return new PartitionRangeReadCommand(isDigest, digestVersion, acceptsTransient, metadata, nowInSec,
                    columnFilter, rowFilter, limits, range, index);