org.apache.cassandra.db.PartitionRangeReadCommand.java Source code

Introduction

Here is the source code for org.apache.cassandra.db.PartitionRangeReadCommand.java, Apache Cassandra's read command for selecting a (part of a) range of partitions.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.IOException;
import java.util.concurrent.TimeUnit;

import com.google.common.annotations.VisibleForTesting;

import org.apache.cassandra.net.MessageFlag;
import org.apache.cassandra.net.Verb;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.filter.*;
import org.apache.cassandra.db.lifecycle.View;
import org.apache.cassandra.db.partitions.*;
import org.apache.cassandra.db.rows.BaseRowIterator;
import org.apache.cassandra.db.transform.RTBoundValidator;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.dht.AbstractBounds;
import org.apache.cassandra.dht.Bounds;
import org.apache.cassandra.exceptions.RequestExecutionException;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.metrics.TableMetrics;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.schema.IndexMetadata;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.service.StorageProxy;
import org.apache.cassandra.tracing.Tracing;

/**
 * A read command that selects a (part of a) range of partitions.
 */
public class PartitionRangeReadCommand extends ReadCommand implements PartitionRangeReadQuery {
    protected static final SelectionDeserializer selectionDeserializer = new Deserializer();

    private final DataRange dataRange;

    private PartitionRangeReadCommand(boolean isDigest, int digestVersion, boolean acceptsTransient,
            TableMetadata metadata, int nowInSec, ColumnFilter columnFilter, RowFilter rowFilter, DataLimits limits,
            DataRange dataRange, IndexMetadata index) {
        super(Kind.PARTITION_RANGE, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter,
                rowFilter, limits, index);
        this.dataRange = dataRange;
    }

    public static PartitionRangeReadCommand create(TableMetadata metadata, int nowInSec, ColumnFilter columnFilter,
            RowFilter rowFilter, DataLimits limits, DataRange dataRange) {
        return new PartitionRangeReadCommand(false, 0, false, metadata, nowInSec, columnFilter, rowFilter, limits,
                dataRange, findIndex(metadata, rowFilter));
    }

    /**
     * Creates a new read command that queries all the data in the table.
     *
     * @param metadata the table to query.
     * @param nowInSec the time in seconds to use as "now" for this query.
     *
     * @return a newly created read command that queries everything in the table.
     */
    public static PartitionRangeReadCommand allDataRead(TableMetadata metadata, int nowInSec) {
        return new PartitionRangeReadCommand(false, 0, false, metadata, nowInSec, ColumnFilter.all(metadata),
                RowFilter.NONE, DataLimits.NONE, DataRange.allData(metadata.partitioner), null);
    }
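
    /*
     * Usage sketch (illustrative only, not part of the original class): building a command
     * that scans an entire table. The keyspace/table names are hypothetical; Schema.instance
     * and FBUtilities.nowInSeconds() are the usual sources for the metadata and the query
     * timestamp, but any TableMetadata and int "now" value would do.
     *
     *   TableMetadata metadata = Schema.instance.getTableMetadata("ks", "tbl");
     *   PartitionRangeReadCommand cmd =
     *       PartitionRangeReadCommand.allDataRead(metadata, FBUtilities.nowInSeconds());
     */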

    public DataRange dataRange() {
        return dataRange;
    }

    public ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key) {
        return dataRange.clusteringIndexFilter(key);
    }

    public boolean isNamesQuery() {
        return dataRange.isNamesQuery();
    }

    /**
     * Returns an equivalent command that only queries data within the provided range.
     *
     * @param range the sub-range to restrict the command to. This method <b>assumes</b> that this is a proper sub-range
     * of the range of the command it is applied to.
     * @param isRangeContinuation whether {@code range} is a direct continuation of whatever previous range we have
     * queried. This matters for the {@code DataLimits}, which may carry state when we do paging and in the context of
     * parallel queries: that state only makes sense if the range queried is indeed the follow-up of whatever range we
     * previously queried (and that yielded said state). In practice this means that ranges for which {@code isRangeContinuation}
     * is false may have to be slightly pessimistic when counting data and may include a little more than necessary, and
     * this should be dealt with post-query (in the case of {@code StorageProxy.getRangeSlice()}, which uses this method
     * for replica queries, this is dealt with by re-counting results on the coordinator). Note that if this is the
     * first range we query, then the {@code DataLimits} will have no state and the value of this parameter doesn't
     * matter.
     */
    public PartitionRangeReadCommand forSubRange(AbstractBounds<PartitionPosition> range,
            boolean isRangeContinuation) {
        // If we're not a continuation of whatever range we've previously queried, we should ignore the state of the
        // DataLimits, as it is either useless or misleading. This is particularly important for GROUP BY queries, where
        // DataLimits.CQLGroupByLimits.GroupByAwareCounter assumes that if GroupingState.hasClustering(), then we're in
        // the middle of a group, but we can't make that assumption if we query a range "in advance" of where we are
        // on the ring.
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), isRangeContinuation ? limits() : limits().withoutState(),
                dataRange().forSubRange(range), indexMetadata());
    }
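
    /*
     * Usage sketch (illustrative only): a caller that splits a command's range into
     * sub-ranges would pass isRangeContinuation == true for a range that directly follows
     * the previously queried one, and false for a range queried "in advance" (e.g. in
     * parallel), so that stateful DataLimits are only trusted on direct continuations.
     * firstRange and speculativeRange are hypothetical proper sub-ranges of cmd's range.
     *
     *   PartitionRangeReadCommand next = cmd.forSubRange(firstRange, true);
     *   PartitionRangeReadCommand ahead = cmd.forSubRange(speculativeRange, false);
     */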

    public PartitionRangeReadCommand copy() {
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), limits(), dataRange(), indexMetadata());
    }

    @Override
    protected PartitionRangeReadCommand copyAsDigestQuery() {
        return new PartitionRangeReadCommand(true, digestVersion(), false, metadata(), nowInSec(), columnFilter(),
                rowFilter(), limits(), dataRange(), indexMetadata());
    }

    @Override
    protected PartitionRangeReadCommand copyAsTransientQuery() {
        return new PartitionRangeReadCommand(false, 0, true, metadata(), nowInSec(), columnFilter(), rowFilter(),
                limits(), dataRange(), indexMetadata());
    }

    @Override
    public PartitionRangeReadCommand withUpdatedLimit(DataLimits newLimits) {
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), newLimits, dataRange(), indexMetadata());
    }

    @Override
    public PartitionRangeReadCommand withUpdatedLimitsAndDataRange(DataLimits newLimits, DataRange newDataRange) {
        return new PartitionRangeReadCommand(isDigestQuery(), digestVersion(), acceptsTransient(), metadata(),
                nowInSec(), columnFilter(), rowFilter(), newLimits, newDataRange, indexMetadata());
    }

    public long getTimeout(TimeUnit unit) {
        return DatabaseDescriptor.getRangeRpcTimeout(unit);
    }

    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime)
            throws RequestExecutionException {
        return StorageProxy.getRangeSlice(this, consistency, queryStartNanoTime);
    }

    protected void recordLatency(TableMetrics metric, long latencyNanos) {
        metric.rangeLatency.addNano(latencyNanos);
    }

    @VisibleForTesting
    public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs,
            ReadExecutionController executionController) {
        ColumnFamilyStore.ViewFragment view = cfs.select(View.selectLive(dataRange().keyRange()));
        Tracing.trace("Executing seq scan across {} sstables for {}", view.sstables.size(),
                dataRange().keyRange().getString(metadata().partitionKeyType));

        // fetch data from current memtable, historical memtables, and SSTables in the correct order.
        InputCollector<UnfilteredPartitionIterator> inputCollector = iteratorsForRange(view);
        try {
            for (Memtable memtable : view.memtables) {
                @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
                Memtable.MemtableUnfilteredPartitionIterator iter = memtable.makePartitionIterator(columnFilter(),
                        dataRange());
                oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, iter.getMinLocalDeletionTime());
                inputCollector.addMemtableIterator(
                        RTBoundValidator.validate(iter, RTBoundValidator.Stage.MEMTABLE, false));
            }

            SSTableReadsListener readCountUpdater = newReadCountUpdater();
            for (SSTableReader sstable : view.sstables) {
                @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
                UnfilteredPartitionIterator iter = sstable.getScanner(columnFilter(), dataRange(),
                        readCountUpdater);
                inputCollector.addSSTableIterator(sstable,
                        RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false));

                if (!sstable.isRepaired())
                    oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone,
                            sstable.getMinLocalDeletionTime());
            }
            // iterators can be empty for offline tools
            if (inputCollector.isEmpty())
                return EmptyIterators.unfilteredPartition(metadata());

            return checkCacheFilter(UnfilteredPartitionIterators.mergeLazily(inputCollector.finalizeIterators()),
                    cfs);
        } catch (RuntimeException | Error e) {
            try {
                inputCollector.close();
            } catch (Exception e1) {
                e.addSuppressed(e1);
            }
            throw e;
        }
    }

    /**
     * Creates a new {@code SSTableReadsListener} to update the SSTables read counts.
     * @return a new {@code SSTableReadsListener} to update the SSTables read counts.
     */
    private static SSTableReadsListener newReadCountUpdater() {
        return new SSTableReadsListener() {
            @Override
            public void onScanningStarted(SSTableReader sstable) {
                sstable.incrementReadCount();
            }
        };
    }

    private UnfilteredPartitionIterator checkCacheFilter(UnfilteredPartitionIterator iter,
            final ColumnFamilyStore cfs) {
        class CacheFilter extends Transformation {
            @Override
            public BaseRowIterator applyToPartition(BaseRowIterator iter) {
                // Note that we rely on the fact that, until we actually advance 'iter', no really costly operation is
                // performed (except for reading the partition key from the index file), thanks to the call to
                // mergeLazily in queryStorage.
                DecoratedKey dk = iter.partitionKey();

                // Check if this partition is in the row cache and, if it is, whether it covers our filter.
                CachedPartition cached = cfs.getRawCachedPartition(dk);
                ClusteringIndexFilter filter = dataRange().clusteringIndexFilter(dk);

                if (cached != null && cfs.isFilterFullyCoveredBy(filter, limits(), cached, nowInSec(),
                        iter.metadata().enforceStrictLiveness())) {
                    // We won't use 'iter' so close it now.
                    iter.close();

                    return filter.getUnfilteredRowIterator(columnFilter(), cached);
                }

                return iter;
            }
        }
        return Transformation.apply(iter, new CacheFilter());
    }

    @Override
    public Verb verb() {
        return Verb.RANGE_REQ;
    }

    protected void appendCQLWhereClause(StringBuilder sb) {
        if (dataRange.isUnrestricted() && rowFilter().isEmpty())
            return;

        sb.append(" WHERE ");
        // We put the row filter first because the data range can end by "ORDER BY"
        if (!rowFilter().isEmpty()) {
            sb.append(rowFilter());
            if (!dataRange.isUnrestricted())
                sb.append(" AND ");
        }
        if (!dataRange.isUnrestricted())
            sb.append(dataRange.toCQLString(metadata()));
    }
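
    /*
     * For illustration (hypothetical filters): with a non-empty row filter such as "v > 3"
     * and a restricted data range, this appends " WHERE v > 3 AND " followed by the CQL
     * rendering of the range (a restriction on the partition key or its token); with an
     * unrestricted range and an empty row filter it appends nothing at all.
     */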

    /**
     * Allows post-processing of the query result after it has been reconciled on the coordinator
     * but before it is passed to the CQL layer to return the ResultSet.
     *
     * See CASSANDRA-8717 for why this exists.
     */
    public PartitionIterator postReconciliationProcessing(PartitionIterator result) {
        ColumnFamilyStore cfs = Keyspace.open(metadata().keyspace).getColumnFamilyStore(metadata().name);
        Index index = getIndex(cfs);
        return index == null ? result : index.postProcessorFor(this).apply(result, this);
    }

    @Override
    public String toString() {
        return String.format("Read(%s columns=%s rowfilter=%s limits=%s %s)", metadata().toString(), columnFilter(),
                rowFilter(), limits(), dataRange().toString(metadata()));
    }

    protected void serializeSelection(DataOutputPlus out, int version) throws IOException {
        DataRange.serializer.serialize(dataRange(), out, version, metadata());
    }

    protected long selectionSerializedSize(int version) {
        return DataRange.serializer.serializedSize(dataRange(), version, metadata());
    }

    /*
     * We currently use PartitionRangeReadCommand for most index queries, even if they are explicitly restricted
     * to a single partition key. This method returns true if that is the case.
     *
     * See CASSANDRA-11617 and CASSANDRA-11872 for details.
     */
    public boolean isLimitedToOnePartition() {
        return dataRange.keyRange instanceof Bounds && dataRange.startKey().kind() == PartitionPosition.Kind.ROW_KEY
                && dataRange.startKey().equals(dataRange.stopKey());
    }
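
    /*
     * For illustration: the check above holds when the key range is a Bounds whose start
     * and stop are the same decorated partition key (kind ROW_KEY), which is what an index
     * query restricted to a single partition key ends up using as its key range.
     */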

    public boolean isRangeRequest() {
        return true;
    }

    private static class Deserializer extends SelectionDeserializer {
        public ReadCommand deserialize(DataInputPlus in, int version, boolean isDigest, int digestVersion,
                boolean acceptsTransient, TableMetadata metadata, int nowInSec, ColumnFilter columnFilter,
                RowFilter rowFilter, DataLimits limits, IndexMetadata index) throws IOException {
            DataRange range = DataRange.serializer.deserialize(in, version, metadata);
            return new PartitionRangeReadCommand(isDigest, digestVersion, acceptsTransient, metadata, nowInSec,
                    columnFilter, rowFilter, limits, range, index);
        }
    }
}
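
Example

For context, here is a minimal, hypothetical usage sketch that is not taken from the Cassandra code base. It assumes it runs inside a started Cassandra node where Schema.instance already knows a table ks.tbl (both names are made up), builds a full-table range read with allDataRead(), and executes it through the execute() method shown above, which routes to StorageProxy.getRangeSlice().

import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.PartitionRangeReadCommand;
import org.apache.cassandra.db.partitions.PartitionIterator;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.utils.FBUtilities;

public class RangeReadExample {
    public static void main(String[] args) {
        // Hypothetical table; Schema.instance is only populated inside a running node.
        TableMetadata metadata = Schema.instance.getTableMetadata("ks", "tbl");

        // A command that reads every partition of the table as of "now".
        PartitionRangeReadCommand cmd =
                PartitionRangeReadCommand.allDataRead(metadata, FBUtilities.nowInSeconds());

        // execute() delegates to StorageProxy.getRangeSlice(), querying replicas at the
        // requested consistency level. PartitionIterator is closeable, as is each
        // per-partition row iterator it yields.
        try (PartitionIterator partitions = cmd.execute(ConsistencyLevel.ONE,
                ClientState.forInternalCalls(), System.nanoTime())) {
            while (partitions.hasNext()) {
                partitions.next().close(); // consume or inspect the partition's rows here
            }
        }
    }
}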