Apache Phoenix coprocessor source: org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver (Java)
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.phoenix.coprocessor; import static org.apache.phoenix.query.QueryConstants.AGG_TIMESTAMP; import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN; import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN_FAMILY; import static org.apache.phoenix.query.QueryConstants.UNGROUPED_AGG_ROW_KEY; import static org.apache.phoenix.query.QueryServices.MUTATE_BATCH_SIZE_ATTRIB; import static org.apache.phoenix.query.QueryServices.MUTATE_BATCH_SIZE_BYTES_ATTRIB; import static org.apache.phoenix.schema.stats.StatisticsCollectionRunTracker.COMPACTION_UPDATE_STATS_ROW_COUNT; import static org.apache.phoenix.schema.stats.StatisticsCollectionRunTracker.CONCURRENT_UPDATE_STATS_ROW_COUNT; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.security.PrivilegedExceptionAction; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Set; import java.util.concurrent.Callable; import javax.annotation.concurrent.GuardedBy; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.CoprocessorEnvironment; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.CoprocessorHConnection; import org.apache.hadoop.hbase.client.Delete; import org.apache.hadoop.hbase.client.Durability; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.hadoop.hbase.client.Mutation; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.coprocessor.ObserverContext; import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.ipc.RpcControllerFactory; import org.apache.hadoop.hbase.ipc.controller.InterRegionServerIndexRpcControllerFactory; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.InternalScanner; import org.apache.hadoop.hbase.regionserver.Region; import org.apache.hadoop.hbase.regionserver.RegionScanner; import 
org.apache.hadoop.hbase.regionserver.ScanType; import org.apache.hadoop.hbase.regionserver.Store; import org.apache.hadoop.hbase.regionserver.StoreFile; import org.apache.hadoop.hbase.regionserver.compactions.CompactionRequest; import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.io.WritableUtils; import org.apache.phoenix.cache.ServerCacheClient; import org.apache.phoenix.coprocessor.MetaDataProtocol.MutationCode; import org.apache.phoenix.coprocessor.generated.PTableProtos; import org.apache.phoenix.exception.DataExceedsCapacityException; import org.apache.phoenix.execute.TupleProjector; import org.apache.phoenix.expression.Expression; import org.apache.phoenix.expression.ExpressionType; import org.apache.phoenix.expression.aggregator.Aggregator; import org.apache.phoenix.expression.aggregator.Aggregators; import org.apache.phoenix.expression.aggregator.ServerAggregators; import org.apache.phoenix.hbase.index.ValueGetter; import org.apache.phoenix.hbase.index.covered.update.ColumnReference; import org.apache.phoenix.hbase.index.util.GenericKeyValueBuilder; import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr; import org.apache.phoenix.hbase.index.util.KeyValueBuilder; import org.apache.phoenix.index.IndexMaintainer; import org.apache.phoenix.index.PhoenixIndexCodec; import org.apache.phoenix.jdbc.PhoenixDatabaseMetaData; import org.apache.phoenix.join.HashJoinInfo; import org.apache.phoenix.query.QueryConstants; import org.apache.phoenix.query.QueryServices; import org.apache.phoenix.query.QueryServicesOptions; import org.apache.phoenix.schema.ColumnFamilyNotFoundException; import org.apache.phoenix.schema.PColumn; import org.apache.phoenix.schema.PIndexState; import org.apache.phoenix.schema.PRow; import org.apache.phoenix.schema.PTable; import org.apache.phoenix.schema.PTableImpl; import org.apache.phoenix.schema.RowKeySchema; import org.apache.phoenix.schema.SortOrder; import org.apache.phoenix.schema.TableRef; import org.apache.phoenix.schema.ValueSchema.Field; import org.apache.phoenix.schema.stats.StatisticsCollectionRunTracker; import org.apache.phoenix.schema.stats.StatisticsCollector; import org.apache.phoenix.schema.stats.StatisticsCollectorFactory; import org.apache.phoenix.schema.tuple.EncodedColumnQualiferCellsList; import org.apache.phoenix.schema.tuple.MultiKeyValueTuple; import org.apache.phoenix.schema.tuple.PositionBasedMultiKeyValueTuple; import org.apache.phoenix.schema.tuple.Tuple; import org.apache.phoenix.schema.types.PBinary; import org.apache.phoenix.schema.types.PChar; import org.apache.phoenix.schema.types.PDataType; import org.apache.phoenix.schema.types.PDouble; import org.apache.phoenix.schema.types.PFloat; import org.apache.phoenix.schema.types.PLong; import org.apache.phoenix.transaction.PhoenixTransactionContext; import org.apache.phoenix.util.ByteUtil; import org.apache.phoenix.util.EncodedColumnsUtil; import org.apache.phoenix.util.EnvironmentEdgeManager; import org.apache.phoenix.util.ExpressionUtil; import org.apache.phoenix.util.IndexUtil; import org.apache.phoenix.util.KeyValueUtil; import org.apache.phoenix.util.LogUtil; import org.apache.phoenix.util.PropertiesUtil; import org.apache.phoenix.util.ScanUtil; import org.apache.phoenix.util.SchemaUtil; import org.apache.phoenix.util.ServerUtil; import org.apache.phoenix.util.StringUtil; import org.apache.phoenix.util.TimeKeeper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import 
com.google.common.base.Throwables; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.primitives.Ints; /** * Region observer that aggregates ungrouped rows(i.e. SQL query with aggregation function and no GROUP BY). * * * @since 0.1 */ public class UngroupedAggregateRegionObserver extends BaseScannerRegionObserver { // TODO: move all constants into a single class public static final String UNGROUPED_AGG = "UngroupedAgg"; public static final String DELETE_AGG = "DeleteAgg"; public static final String DELETE_CQ = "DeleteCQ"; public static final String DELETE_CF = "DeleteCF"; public static final String EMPTY_CF = "EmptyCF"; /** * This lock used for synchronizing the state of * {@link UngroupedAggregateRegionObserver#scansReferenceCount}, * {@link UngroupedAggregateRegionObserver#isRegionClosingOrSplitting} variables used to avoid possible * dead lock situation in case below steps: * 1. We get read lock when we start writing local indexes, deletes etc.. * 2. when memstore reach threshold, flushes happen. Since they use read (shared) lock they * happen without any problem until someone tries to obtain write lock. * 3. at one moment we decide to split/bulkload/close and try to acquire write lock. * 4. Since that moment all attempts to get read lock will be blocked. I.e. no more * flushes will happen. But we continue to fill memstore with local index batches and * finally we get RTBE. * * The solution to this is to not allow or delay operations acquire the write lock. * 1) In case of split we just throw IOException so split won't happen but it will not cause any harm. * 2) In case of bulkload failing it by throwing the exception. * 3) In case of region close by balancer/move wait before closing the reason and fail the query which * does write after reading. * * See PHOENIX-3111 for more info. */ private final Object lock = new Object(); /** * To maintain the number of scans used for create index, delete and upsert select operations * which reads and writes to same region in coprocessors. */ @GuardedBy("lock") private int scansReferenceCount = 0; @GuardedBy("lock") private boolean isRegionClosingOrSplitting = false; private static final Logger logger = LoggerFactory.getLogger(UngroupedAggregateRegionObserver.class); private KeyValueBuilder kvBuilder; private Configuration upsertSelectConfig; private Configuration compactionConfig; @Override public void start(CoprocessorEnvironment e) throws IOException { super.start(e); // Can't use ClientKeyValueBuilder on server-side because the memstore expects to // be able to get a single backing buffer for a KeyValue. this.kvBuilder = GenericKeyValueBuilder.INSTANCE; /* * We need to create a copy of region's configuration since we don't want any side effect of * setting the RpcControllerFactory. */ upsertSelectConfig = PropertiesUtil.cloneConfig(e.getConfiguration()); /* * Till PHOENIX-3995 is fixed, we need to use the * InterRegionServerIndexRpcControllerFactory. Although this would cause remote RPCs to use * index handlers on the destination region servers, it is better than using the regular * priority handlers which could result in a deadlock. 
         */
        upsertSelectConfig.setClass(RpcControllerFactory.CUSTOM_CONTROLLER_CONF_KEY,
                InterRegionServerIndexRpcControllerFactory.class, RpcControllerFactory.class);
        compactionConfig = PropertiesUtil.cloneConfig(e.getConfiguration());
        // lower the number of rpc retries, so we don't hang the compaction
        compactionConfig.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
                e.getConfiguration().getInt(QueryServices.METADATA_WRITE_RETRIES_NUMBER,
                        QueryServicesOptions.DEFAULT_METADATA_WRITE_RETRIES_NUMBER));
        compactionConfig.setInt(HConstants.HBASE_CLIENT_PAUSE,
                e.getConfiguration().getInt(QueryServices.METADATA_WRITE_RETRY_PAUSE,
                        QueryServicesOptions.DEFAULT_METADATA_WRITE_RETRY_PAUSE));
    }

    private void commitBatch(Region region, List<Mutation> mutations, long blockingMemstoreSize)
            throws IOException {
        if (mutations.isEmpty()) {
            return;
        }
        Mutation[] mutationArray = new Mutation[mutations.size()];
        // When the memstore size reaches blockingMemstoreSize, wait up to 3 seconds for a flush
        // to shrink the memstore so that writes to the region are allowed again.
        for (int i = 0; region.getMemstoreSize() > blockingMemstoreSize && i < 30; i++) {
            try {
                checkForRegionClosing();
                Thread.sleep(100);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IOException(e);
            }
        }
        // TODO: should we use the one that is all or none?
        logger.debug("Committing batch of " + mutations.size() + " mutations for "
                + region.getRegionInfo().getTable().getNameAsString());
        region.batchMutate(mutations.toArray(mutationArray), HConstants.NO_NONCE,
                HConstants.NO_NONCE);
    }

    private void setIndexAndTransactionProperties(List<Mutation> mutations, byte[] indexUUID,
            byte[] indexMaintainersPtr, byte[] txState, boolean useIndexProto) {
        for (Mutation m : mutations) {
            if (indexMaintainersPtr != null) {
                m.setAttribute(useIndexProto ? PhoenixIndexCodec.INDEX_PROTO_MD
                        : PhoenixIndexCodec.INDEX_MD, indexMaintainersPtr);
            }
            if (indexUUID != null) {
                m.setAttribute(PhoenixIndexCodec.INDEX_UUID, indexUUID);
            }
            if (txState != null) {
                m.setAttribute(BaseScannerRegionObserver.TX_STATE, txState);
            }
        }
    }

    private void commitBatchWithHTable(HTable table, List<Mutation> mutations) throws IOException {
        if (mutations.isEmpty()) {
            return;
        }
        logger.debug("Committing batch of " + mutations.size() + " mutations for " + table);
        try {
            table.batch(mutations);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * There is a chance that the region might be closing while running balancer/move/merge. In
     * this case, if the memstore size reaches blockingMemstoreSize, it is better to fail the
     * query because there is a high chance that the flush will not proceed and the memstore
     * won't be freed up.
     * @throws IOException
     */
    private void checkForRegionClosing() throws IOException {
        synchronized (lock) {
            if (isRegionClosingOrSplitting) {
                lock.notifyAll();
                throw new IOException(
                        "Region is getting closed. Not allowing to write to avoid possible deadlock.");
            }
        }
    }

    public static void serializeIntoScan(Scan scan) {
        scan.setAttribute(BaseScannerRegionObserver.UNGROUPED_AGG, QueryConstants.TRUE);
    }

    @Override
    public RegionScanner preScannerOpen(ObserverContext<RegionCoprocessorEnvironment> e, Scan scan,
            RegionScanner s) throws IOException {
        s = super.preScannerOpen(e, scan, s);
        if (ScanUtil.isAnalyzeTable(scan)) {
            // We are setting the start row and stop row such that it covers the entire region. As
            // part of PHOENIX-1263 we are storing the guideposts against the physical table rather
            // than individual tenant specific tables.
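            // Clearing the scan's filter below also drops any tenant- or view-specific filtering,
            // so statistics are collected over every row of the physical table's region.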
scan.setStartRow(HConstants.EMPTY_START_ROW); scan.setStopRow(HConstants.EMPTY_END_ROW); scan.setFilter(null); } return s; } public static class MutationList extends ArrayList<Mutation> { private long byteSize = 0l; public MutationList() { super(); } public MutationList(int size) { super(size); } @Override public boolean add(Mutation e) { boolean r = super.add(e); if (r) { this.byteSize += KeyValueUtil.calculateMutationDiskSize(e); } return r; } public long byteSize() { return byteSize; } @Override public void clear() { byteSize = 0l; super.clear(); } } @Override protected RegionScanner doPostScannerOpen(final ObserverContext<RegionCoprocessorEnvironment> c, final Scan scan, final RegionScanner s) throws IOException, SQLException { RegionCoprocessorEnvironment env = c.getEnvironment(); Region region = env.getRegion(); long ts = scan.getTimeRange().getMax(); boolean localIndexScan = ScanUtil.isLocalIndex(scan); if (ScanUtil.isAnalyzeTable(scan)) { byte[] gp_width_bytes = scan.getAttribute(BaseScannerRegionObserver.GUIDEPOST_WIDTH_BYTES); byte[] gp_per_region_bytes = scan.getAttribute(BaseScannerRegionObserver.GUIDEPOST_PER_REGION); // Let this throw, as this scan is being done for the sole purpose of collecting stats StatisticsCollector statsCollector = StatisticsCollectorFactory.createStatisticsCollector(env, region.getRegionInfo().getTable().getNameAsString(), ts, gp_width_bytes, gp_per_region_bytes); return collectStats(s, statsCollector, region, scan, env.getConfiguration()); } else if (ScanUtil.isIndexRebuild(scan)) { return rebuildIndices(s, region, scan, env.getConfiguration()); } int offsetToBe = 0; if (localIndexScan) { /* * For local indexes, we need to set an offset on row key expressions to skip * the region start key. */ offsetToBe = region.getRegionInfo().getStartKey().length != 0 ? region.getRegionInfo().getStartKey().length : region.getRegionInfo().getEndKey().length; ScanUtil.setRowKeyOffset(scan, offsetToBe); } final int offset = offsetToBe; PTable projectedTable = null; PTable writeToTable = null; byte[][] values = null; byte[] descRowKeyTableBytes = scan.getAttribute(UPGRADE_DESC_ROW_KEY); boolean isDescRowKeyOrderUpgrade = descRowKeyTableBytes != null; if (isDescRowKeyOrderUpgrade) { logger.debug("Upgrading row key for " + region.getRegionInfo().getTable().getNameAsString()); projectedTable = deserializeTable(descRowKeyTableBytes); try { writeToTable = PTableImpl.makePTable(projectedTable, true); } catch (SQLException e) { ServerUtil.throwIOException("Upgrade failed", e); // Impossible } values = new byte[projectedTable.getPKColumns().size()][]; } boolean useProto = false; byte[] localIndexBytes = scan.getAttribute(LOCAL_INDEX_BUILD_PROTO); useProto = localIndexBytes != null; if (localIndexBytes == null) { localIndexBytes = scan.getAttribute(LOCAL_INDEX_BUILD); } List<IndexMaintainer> indexMaintainers = localIndexBytes == null ? null : IndexMaintainer.deserialize(localIndexBytes, useProto); MutationList indexMutations = localIndexBytes == null ? 
new MutationList() : new MutationList(1024); RegionScanner theScanner = s; byte[] replayMutations = scan.getAttribute(BaseScannerRegionObserver.REPLAY_WRITES); byte[] indexUUID = scan.getAttribute(PhoenixIndexCodec.INDEX_UUID); byte[] txState = scan.getAttribute(BaseScannerRegionObserver.TX_STATE); List<Expression> selectExpressions = null; byte[] upsertSelectTable = scan.getAttribute(BaseScannerRegionObserver.UPSERT_SELECT_TABLE); boolean isUpsert = false; boolean isDelete = false; byte[] deleteCQ = null; byte[] deleteCF = null; byte[] emptyCF = null; HTable targetHTable = null; boolean isPKChanging = false; ImmutableBytesWritable ptr = new ImmutableBytesWritable(); if (upsertSelectTable != null) { isUpsert = true; projectedTable = deserializeTable(upsertSelectTable); targetHTable = new HTable(upsertSelectConfig, projectedTable.getPhysicalName().getBytes()); selectExpressions = deserializeExpressions( scan.getAttribute(BaseScannerRegionObserver.UPSERT_SELECT_EXPRS)); values = new byte[projectedTable.getPKColumns().size()][]; isPKChanging = ExpressionUtil.isPkPositionChanging(new TableRef(projectedTable), selectExpressions); } else { byte[] isDeleteAgg = scan.getAttribute(BaseScannerRegionObserver.DELETE_AGG); isDelete = isDeleteAgg != null && Bytes.compareTo(PDataType.TRUE_BYTES, isDeleteAgg) == 0; if (!isDelete) { deleteCF = scan.getAttribute(BaseScannerRegionObserver.DELETE_CF); deleteCQ = scan.getAttribute(BaseScannerRegionObserver.DELETE_CQ); } emptyCF = scan.getAttribute(BaseScannerRegionObserver.EMPTY_CF); } TupleProjector tupleProjector = null; byte[][] viewConstants = null; ColumnReference[] dataColumns = IndexUtil.deserializeDataTableColumnsToJoin(scan); final TupleProjector p = TupleProjector.deserializeProjectorFromScan(scan); final HashJoinInfo j = HashJoinInfo.deserializeHashJoinFromScan(scan); boolean useQualifierAsIndex = EncodedColumnsUtil .useQualifierAsIndex(EncodedColumnsUtil.getMinMaxQualifiersFromScan(scan)); if ((localIndexScan && !isDelete && !isDescRowKeyOrderUpgrade) || (j == null && p != null)) { if (dataColumns != null) { tupleProjector = IndexUtil.getTupleProjector(scan, dataColumns); viewConstants = IndexUtil.deserializeViewConstantsFromScan(scan); } ImmutableBytesWritable tempPtr = new ImmutableBytesWritable(); theScanner = getWrappedScanner(c, theScanner, offset, scan, dataColumns, tupleProjector, region, indexMaintainers == null ? null : indexMaintainers.get(0), viewConstants, p, tempPtr, useQualifierAsIndex); } if (j != null) { theScanner = new HashJoinRegionScanner(theScanner, p, j, ScanUtil.getTenantId(scan), env, useQualifierAsIndex, useNewValueColumnQualifier); } int maxBatchSize = 0; long maxBatchSizeBytes = 0L; MutationList mutations = new MutationList(); boolean needToWrite = false; Configuration conf = env.getConfiguration(); long flushSize = region.getTableDesc().getMemStoreFlushSize(); if (flushSize <= 0) { flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE); } /** * Slow down the writes if the memstore size more than * (hbase.hregion.memstore.block.multiplier - 1) times hbase.hregion.memstore.flush.size * bytes. This avoids flush storm to hdfs for cases like index building where reads and * write happen to all the table regions in the server. 
*/ final long blockingMemStoreSize = flushSize * (conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER, HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER) - 1); boolean buildLocalIndex = indexMaintainers != null && dataColumns == null && !localIndexScan; if (buildLocalIndex) { checkForLocalIndexColumnFamilies(region, indexMaintainers); } if (isDescRowKeyOrderUpgrade || isDelete || isUpsert || (deleteCQ != null && deleteCF != null) || emptyCF != null || buildLocalIndex) { needToWrite = true; maxBatchSize = conf.getInt(MUTATE_BATCH_SIZE_ATTRIB, QueryServicesOptions.DEFAULT_MUTATE_BATCH_SIZE); mutations = new MutationList(Ints.saturatedCast(maxBatchSize + maxBatchSize / 10)); maxBatchSizeBytes = conf.getLong(MUTATE_BATCH_SIZE_BYTES_ATTRIB, QueryServicesOptions.DEFAULT_MUTATE_BATCH_SIZE_BYTES); } Aggregators aggregators = ServerAggregators .deserialize(scan.getAttribute(BaseScannerRegionObserver.AGGREGATORS), conf); Aggregator[] rowAggregators = aggregators.getAggregators(); boolean hasMore; boolean hasAny = false; Pair<Integer, Integer> minMaxQualifiers = EncodedColumnsUtil.getMinMaxQualifiersFromScan(scan); Tuple result = useQualifierAsIndex ? new PositionBasedMultiKeyValueTuple() : new MultiKeyValueTuple(); if (logger.isDebugEnabled()) { logger.debug(LogUtil.addCustomAnnotations( "Starting ungrouped coprocessor scan " + scan + " " + region.getRegionInfo(), ScanUtil.getCustomAnnotations(scan))); } int rowCount = 0; final RegionScanner innerScanner = theScanner; boolean useIndexProto = true; byte[] indexMaintainersPtr = scan.getAttribute(PhoenixIndexCodec.INDEX_PROTO_MD); // for backward compatiblity fall back to look by the old attribute if (indexMaintainersPtr == null) { indexMaintainersPtr = scan.getAttribute(PhoenixIndexCodec.INDEX_MD); useIndexProto = false; } boolean acquiredLock = false; boolean incrScanRefCount = false; try { if (needToWrite) { synchronized (lock) { if (isRegionClosingOrSplitting) { throw new IOException( "Temporarily unable to write from scan because region is closing or splitting"); } scansReferenceCount++; incrScanRefCount = true; lock.notifyAll(); } } region.startRegionOperation(); acquiredLock = true; synchronized (innerScanner) { do { List<Cell> results = useQualifierAsIndex ? 
new EncodedColumnQualiferCellsList(minMaxQualifiers.getFirst(), minMaxQualifiers.getSecond(), encodingScheme) : new ArrayList<Cell>(); // Results are potentially returned even when the return value of s.next is false // since this is an indication of whether or not there are more values after the // ones returned hasMore = innerScanner.nextRaw(results); if (!results.isEmpty()) { rowCount++; result.setKeyValues(results); if (isDescRowKeyOrderUpgrade) { Arrays.fill(values, null); Cell firstKV = results.get(0); RowKeySchema schema = projectedTable.getRowKeySchema(); int maxOffset = schema.iterator(firstKV.getRowArray(), firstKV.getRowOffset() + offset, firstKV.getRowLength(), ptr); for (int i = 0; i < schema.getFieldCount(); i++) { Boolean hasValue = schema.next(ptr, i, maxOffset); if (hasValue == null) { break; } Field field = schema.getField(i); if (field.getSortOrder() == SortOrder.DESC) { // Special case for re-writing DESC ARRAY, as the actual byte value needs to change in this case if (field.getDataType().isArrayType()) { field.getDataType().coerceBytes(ptr, null, field.getDataType(), field.getMaxLength(), field.getScale(), field.getSortOrder(), field.getMaxLength(), field.getScale(), field.getSortOrder(), true); // force to use correct separator byte } // Special case for re-writing DESC CHAR or DESC BINARY, to force the re-writing of trailing space characters else if (field.getDataType() == PChar.INSTANCE || field.getDataType() == PBinary.INSTANCE) { int len = ptr.getLength(); while (len > 0 && ptr.get()[ptr.getOffset() + len - 1] == StringUtil.SPACE_UTF8) { len--; } ptr.set(ptr.get(), ptr.getOffset(), len); // Special case for re-writing DESC FLOAT and DOUBLE, as they're not inverted like they should be (PHOENIX-2171) } else if (field.getDataType() == PFloat.INSTANCE || field.getDataType() == PDouble.INSTANCE) { byte[] invertedBytes = SortOrder.invert(ptr.get(), ptr.getOffset(), ptr.getLength()); ptr.set(invertedBytes); } } else if (field.getDataType() == PBinary.INSTANCE) { // Remove trailing space characters so that the setValues call below will replace them // with the correct zero byte character. Note this is somewhat dangerous as these // could be legit, but I don't know what the alternative is. 
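                                    // For example (illustrative values only): a BINARY(4) value {1, 2} that was
                                    // stored with legacy space padding as {1, 2, 0x20, 0x20} is trimmed to {1, 2}
                                    // here, and the row key rebuilt below pads it with zero bytes instead.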
int len = ptr.getLength(); while (len > 0 && ptr.get()[ptr.getOffset() + len - 1] == StringUtil.SPACE_UTF8) { len--; } ptr.set(ptr.get(), ptr.getOffset(), len); } values[i] = ptr.copyBytes(); } writeToTable.newKey(ptr, values); if (Bytes.compareTo(firstKV.getRowArray(), firstKV.getRowOffset() + offset, firstKV.getRowLength(), ptr.get(), ptr.getOffset() + offset, ptr.getLength()) == 0) { continue; } byte[] newRow = ByteUtil.copyKeyBytesIfNecessary(ptr); if (offset > 0) { // for local indexes (prepend region start key) byte[] newRowWithOffset = new byte[offset + newRow.length]; System.arraycopy(firstKV.getRowArray(), firstKV.getRowOffset(), newRowWithOffset, 0, offset); ; System.arraycopy(newRow, 0, newRowWithOffset, offset, newRow.length); newRow = newRowWithOffset; } byte[] oldRow = Bytes.copy(firstKV.getRowArray(), firstKV.getRowOffset(), firstKV.getRowLength()); for (Cell cell : results) { // Copy existing cell but with new row key Cell newCell = new KeyValue(newRow, 0, newRow.length, cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(), cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(), cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()), cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()); switch (KeyValue.Type.codeToType(cell.getTypeByte())) { case Put: // If Put, point delete old Put Delete del = new Delete(oldRow); del.addDeleteMarker(new KeyValue(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(), cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(), cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(), cell.getTimestamp(), KeyValue.Type.Delete, ByteUtil.EMPTY_BYTE_ARRAY, 0, 0)); mutations.add(del); Put put = new Put(newRow); put.add(newCell); mutations.add(put); break; case Delete: case DeleteColumn: case DeleteFamily: case DeleteFamilyVersion: Delete delete = new Delete(newRow); delete.addDeleteMarker(newCell); mutations.add(delete); break; } } } else if (buildLocalIndex) { for (IndexMaintainer maintainer : indexMaintainers) { if (!results.isEmpty()) { result.getKey(ptr); ValueGetter valueGetter = maintainer.createGetterFromKeyValues( ImmutableBytesPtr.copyBytesIfNecessary(ptr), results); Put put = maintainer.buildUpdateMutation(kvBuilder, valueGetter, ptr, results.get(0).getTimestamp(), env.getRegion().getRegionInfo().getStartKey(), env.getRegion().getRegionInfo().getEndKey()); indexMutations.add(put); } } result.setKeyValues(results); } else if (isDelete) { // FIXME: the version of the Delete constructor without the lock // args was introduced in 0.94.4, thus if we try to use it here // we can no longer use the 0.94.2 version of the client. 
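                            // The Delete built below removes the whole row: constructing it with
                            // timestamp ts marks every cell in every column family with a
                            // timestamp <= ts as deleted.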
Cell firstKV = results.get(0); Delete delete = new Delete(firstKV.getRowArray(), firstKV.getRowOffset(), firstKV.getRowLength(), ts); if (replayMutations != null) { delete.setAttribute(REPLAY_WRITES, replayMutations); } mutations.add(delete); // force tephra to ignore this deletes delete.setAttribute(PhoenixTransactionContext.TX_ROLLBACK_ATTRIBUTE_KEY, new byte[0]); } else if (isUpsert) { Arrays.fill(values, null); int bucketNumOffset = 0; if (projectedTable.getBucketNum() != null) { values[0] = new byte[] { 0 }; bucketNumOffset = 1; } int i = bucketNumOffset; List<PColumn> projectedColumns = projectedTable.getColumns(); for (; i < projectedTable.getPKColumns().size(); i++) { Expression expression = selectExpressions.get(i - bucketNumOffset); if (expression.evaluate(result, ptr)) { values[i] = ptr.copyBytes(); // If SortOrder from expression in SELECT doesn't match the // column being projected into then invert the bits. if (expression.getSortOrder() != projectedColumns.get(i).getSortOrder()) { SortOrder.invert(values[i], 0, values[i], 0, values[i].length); } } else { values[i] = ByteUtil.EMPTY_BYTE_ARRAY; } } projectedTable.newKey(ptr, values); PRow row = projectedTable.newRow(kvBuilder, ts, ptr, false); for (; i < projectedColumns.size(); i++) { Expression expression = selectExpressions.get(i - bucketNumOffset); if (expression.evaluate(result, ptr)) { PColumn column = projectedColumns.get(i); if (!column.getDataType().isSizeCompatible(ptr, null, expression.getDataType(), expression.getSortOrder(), expression.getMaxLength(), expression.getScale(), column.getMaxLength(), column.getScale())) { throw new DataExceedsCapacityException(column.getDataType(), column.getMaxLength(), column.getScale(), column.getName().getString(), ptr); } column.getDataType().coerceBytes(ptr, null, expression.getDataType(), expression.getMaxLength(), expression.getScale(), expression.getSortOrder(), column.getMaxLength(), column.getScale(), column.getSortOrder(), projectedTable.rowKeyOrderOptimizable()); byte[] bytes = ByteUtil.copyKeyBytesIfNecessary(ptr); row.setValue(column, bytes); } } for (Mutation mutation : row.toRowMutations()) { if (replayMutations != null) { mutation.setAttribute(REPLAY_WRITES, replayMutations); } mutations.add(mutation); } for (i = 0; i < selectExpressions.size(); i++) { selectExpressions.get(i).reset(); } } else if (deleteCF != null && deleteCQ != null) { // No need to search for delete column, since we project only it // if no empty key value is being set if (emptyCF == null || result.getValue(deleteCF, deleteCQ) != null) { Delete delete = new Delete(results.get(0).getRowArray(), results.get(0).getRowOffset(), results.get(0).getRowLength()); delete.deleteColumns(deleteCF, deleteCQ, ts); // force tephra to ignore this deletes delete.setAttribute(PhoenixTransactionContext.TX_ROLLBACK_ATTRIBUTE_KEY, new byte[0]); mutations.add(delete); } } if (emptyCF != null) { /* * If we've specified an emptyCF, then we need to insert an empty * key value "retroactively" for any key value that is visible at * the timestamp that the DDL was issued. Key values that are not * visible at this timestamp will not ever be projected up to * scans past this timestamp, so don't need to be considered. * We insert one empty key value per row per timestamp. 
*/ Set<Long> timeStamps = Sets.newHashSetWithExpectedSize(results.size()); for (Cell kv : results) { long kvts = kv.getTimestamp(); if (!timeStamps.contains(kvts)) { Put put = new Put(kv.getRowArray(), kv.getRowOffset(), kv.getRowLength()); put.add(emptyCF, QueryConstants.EMPTY_COLUMN_BYTES, kvts, ByteUtil.EMPTY_BYTE_ARRAY); mutations.add(put); } } } if (ServerUtil.readyToCommit(mutations.size(), mutations.byteSize(), maxBatchSize, maxBatchSizeBytes)) { commit(region, mutations, indexUUID, blockingMemStoreSize, indexMaintainersPtr, txState, targetHTable, useIndexProto, isPKChanging); mutations.clear(); } // Commit in batches based on UPSERT_BATCH_SIZE_BYTES_ATTRIB in config if (ServerUtil.readyToCommit(indexMutations.size(), indexMutations.byteSize(), maxBatchSize, maxBatchSizeBytes)) { setIndexAndTransactionProperties(indexMutations, indexUUID, indexMaintainersPtr, txState, useIndexProto); commitBatch(region, indexMutations, blockingMemStoreSize); indexMutations.clear(); } aggregators.aggregate(rowAggregators, result); hasAny = true; } } while (hasMore); if (!mutations.isEmpty()) { commit(region, mutations, indexUUID, blockingMemStoreSize, indexMaintainersPtr, txState, targetHTable, useIndexProto, isPKChanging); mutations.clear(); } if (!indexMutations.isEmpty()) { commitBatch(region, indexMutations, blockingMemStoreSize); indexMutations.clear(); } } } finally { if (needToWrite && incrScanRefCount) { synchronized (lock) { scansReferenceCount--; if (scansReferenceCount < 0) { logger.warn( "Scan reference count went below zero. Something isn't correct. Resetting it back to zero"); scansReferenceCount = 0; } lock.notifyAll(); } } try { if (targetHTable != null) { targetHTable.close(); } } finally { try { innerScanner.close(); } finally { if (acquiredLock) region.closeRegionOperation(); } } } if (logger.isDebugEnabled()) { logger.debug(LogUtil.addCustomAnnotations( "Finished scanning " + rowCount + " rows for ungrouped coprocessor scan " + scan, ScanUtil.getCustomAnnotations(scan))); } final boolean hadAny = hasAny; KeyValue keyValue = null; if (hadAny) { byte[] value = aggregators.toBytes(rowAggregators); keyValue = KeyValueUtil.newKeyValue(UNGROUPED_AGG_ROW_KEY, SINGLE_COLUMN_FAMILY, SINGLE_COLUMN, AGG_TIMESTAMP, value, 0, value.length); } final KeyValue aggKeyValue = keyValue; RegionScanner scanner = new BaseRegionScanner(innerScanner) { private boolean done = !hadAny; @Override public boolean isFilterDone() { return done; } @Override public boolean next(List<Cell> results) throws IOException { if (done) return false; done = true; results.add(aggKeyValue); return false; } @Override public long getMaxResultSize() { return scan.getMaxResultSize(); } }; return scanner; } private void checkForLocalIndexColumnFamilies(Region region, List<IndexMaintainer> indexMaintainers) throws IOException { HTableDescriptor tableDesc = region.getTableDesc(); String schemaName = tableDesc.getTableName().getNamespaceAsString() .equals(NamespaceDescriptor.DEFAULT_NAMESPACE_NAME_STR) ? 
SchemaUtil.getSchemaNameFromFullName(tableDesc.getTableName().getNameAsString()) : tableDesc.getTableName().getNamespaceAsString(); String tableName = SchemaUtil.getTableNameFromFullName(tableDesc.getTableName().getNameAsString()); for (IndexMaintainer indexMaintainer : indexMaintainers) { Set<ColumnReference> coveredColumns = indexMaintainer.getCoveredColumns(); if (coveredColumns.isEmpty()) { byte[] localIndexCf = indexMaintainer.getEmptyKeyValueFamily().get(); // When covered columns empty we store index data in default column family so check for it. if (tableDesc.getFamily(localIndexCf) == null) { ServerUtil.throwIOException("Column Family Not Found", new ColumnFamilyNotFoundException(schemaName, tableName, Bytes.toString(localIndexCf))); } } for (ColumnReference reference : coveredColumns) { byte[] cf = IndexUtil.getLocalIndexColumnFamily(reference.getFamily()); HColumnDescriptor family = region.getTableDesc().getFamily(cf); if (family == null) { ServerUtil.throwIOException("Column Family Not Found", new ColumnFamilyNotFoundException(schemaName, tableName, Bytes.toString(cf))); } } } } private void commit(Region region, List<Mutation> mutations, byte[] indexUUID, long blockingMemStoreSize, byte[] indexMaintainersPtr, byte[] txState, HTable targetHTable, boolean useIndexProto, boolean isPKChanging) throws IOException { List<Mutation> localRegionMutations = Lists.newArrayList(); List<Mutation> remoteRegionMutations = Lists.newArrayList(); setIndexAndTransactionProperties(mutations, indexUUID, indexMaintainersPtr, txState, useIndexProto); separateLocalAndRemoteMutations(targetHTable, region, mutations, localRegionMutations, remoteRegionMutations, isPKChanging); commitBatch(region, localRegionMutations, blockingMemStoreSize); commitBatchWithHTable(targetHTable, remoteRegionMutations); localRegionMutations.clear(); remoteRegionMutations.clear(); } private void separateLocalAndRemoteMutations(HTable targetHTable, Region region, List<Mutation> mutations, List<Mutation> localRegionMutations, List<Mutation> remoteRegionMutations, boolean isPKChanging) { boolean areMutationsInSameTable = areMutationsInSameTable(targetHTable, region); //if we're writing to the same table, but the PK can change, that means that some //mutations might be in our current region, and others in a different one. if (areMutationsInSameTable && isPKChanging) { HRegionInfo regionInfo = region.getRegionInfo(); for (Mutation mutation : mutations) { if (regionInfo.containsRow(mutation.getRow())) { localRegionMutations.add(mutation); } else { remoteRegionMutations.add(mutation); } } } else if (areMutationsInSameTable && !isPKChanging) { localRegionMutations.addAll(mutations); } else { remoteRegionMutations.addAll(mutations); } } private boolean areMutationsInSameTable(HTable targetHTable, Region region) { return (targetHTable == null || Bytes.compareTo(targetHTable.getTableName(), region.getTableDesc().getTableName().getName()) == 0); } @Override public InternalScanner preCompact(final ObserverContext<RegionCoprocessorEnvironment> c, final Store store, final InternalScanner scanner, final ScanType scanType) throws IOException { // Compaction and split upcalls run with the effective user context of the requesting user. // This will lead to failure of cross cluster RPC if the effective user is not // the login user. Switch to the login user context to ensure we have the expected // security context. 
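        // The stats collector created below may write to the SYSTEM.STATS table, whose region
        // can be hosted on a different region server; those RPCs must go out with the region
        // server's own (login) credentials rather than those of the user who triggered the
        // compaction, hence the runAsLoginUser wrapper that follows.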
return User.runAsLoginUser(new PrivilegedExceptionAction<InternalScanner>() { @Override public InternalScanner run() throws Exception { TableName table = c.getEnvironment().getRegion().getRegionInfo().getTable(); InternalScanner internalScanner = scanner; if (scanType.equals(ScanType.COMPACT_DROP_DELETES)) { try { long clientTimeStamp = EnvironmentEdgeManager.currentTimeMillis(); StatisticsCollector stats = StatisticsCollectorFactory.createStatisticsCollector( c.getEnvironment(), table.getNameAsString(), clientTimeStamp, store.getFamily().getName()); internalScanner = stats.createCompactionScanner(c.getEnvironment(), store, scanner); } catch (IOException e) { // If we can't reach the stats table, don't interrupt the normal // compaction operation, just log a warning. if (logger.isWarnEnabled()) { logger.warn("Unable to collect stats for " + table, e); } } } return internalScanner; } }); } @Override public void postCompact(final ObserverContext<RegionCoprocessorEnvironment> e, final Store store, final StoreFile resultFile, CompactionRequest request) throws IOException { // If we're compacting all files, then delete markers are removed // and we must permanently disable an index that needs to be // partially rebuild because we're potentially losing the information // we need to successfully rebuilt it. if (request.isAllFiles() || request.isMajor()) { // Compaction and split upcalls run with the effective user context of the requesting user. // This will lead to failure of cross cluster RPC if the effective user is not // the login user. Switch to the login user context to ensure we have the expected // security context. User.runAsLoginUser(new PrivilegedExceptionAction<Void>() { @Override public Void run() throws Exception { MutationCode mutationCode = null; long disableIndexTimestamp = 0; try (CoprocessorHConnection coprocessorHConnection = new CoprocessorHConnection( compactionConfig, (HRegionServer) e.getEnvironment().getRegionServerServices()); HTableInterface htable = coprocessorHConnection .getTable(SchemaUtil.getPhysicalTableName( PhoenixDatabaseMetaData.SYSTEM_CATALOG_NAME_BYTES, compactionConfig))) { String tableName = e.getEnvironment().getRegion().getRegionInfo().getTable() .getNameAsString(); // FIXME: if this is an index on a view, we won't find a row for it in SYSTEM.CATALOG // Instead, we need to disable all indexes on the view. byte[] tableKey = SchemaUtil.getTableKeyFromFullName(tableName); Get get = new Get(tableKey); get.addColumn(PhoenixDatabaseMetaData.TABLE_FAMILY_BYTES, PhoenixDatabaseMetaData.INDEX_DISABLE_TIMESTAMP_BYTES); Result result = htable.get(get); if (!result.isEmpty()) { Cell cell = result.listCells().get(0); if (cell.getValueLength() > 0) { disableIndexTimestamp = PLong.INSTANCE.getCodec().decodeLong(cell.getValueArray(), cell.getValueOffset(), SortOrder.getDefault()); if (disableIndexTimestamp != 0) { logger.info( "Major compaction running while index on table is disabled. 
Clearing index disable timestamp: " + tableName); mutationCode = IndexUtil .updateIndexState(tableKey, 0L, htable, PIndexState.DISABLE) .getMutationCode(); } } } } catch (Throwable t) { // log, but swallow exception as we don't want to impact compaction logger.warn("Potential failure to permanently disable index during compaction " + e.getEnvironment().getRegionInfo().getTable().getNameAsString(), t); } finally { if (disableIndexTimestamp != 0 && mutationCode != MutationCode.TABLE_ALREADY_EXISTS && mutationCode != MutationCode.TABLE_NOT_FOUND) { logger.warn("Attempt to permanently disable index " + e.getEnvironment().getRegionInfo().getTable().getNameAsString() + " during compaction" + (mutationCode == null ? "" : " failed with code = " + mutationCode)); } } return null; } }); } } private static PTable deserializeTable(byte[] b) { try { PTableProtos.PTable ptableProto = PTableProtos.PTable.parseFrom(b); return PTableImpl.createFromProto(ptableProto); } catch (IOException e) { throw new RuntimeException(e); } } private RegionScanner rebuildIndices(final RegionScanner innerScanner, final Region region, final Scan scan, Configuration config) throws IOException { byte[] indexMetaData = scan.getAttribute(PhoenixIndexCodec.INDEX_PROTO_MD); boolean useProto = true; // for backward compatibility fall back to look up by the old attribute if (indexMetaData == null) { useProto = false; indexMetaData = scan.getAttribute(PhoenixIndexCodec.INDEX_MD); } boolean hasMore; int rowCount = 0; try { int maxBatchSize = config.getInt(MUTATE_BATCH_SIZE_ATTRIB, QueryServicesOptions.DEFAULT_MUTATE_BATCH_SIZE); long maxBatchSizeBytes = config.getLong(MUTATE_BATCH_SIZE_BYTES_ATTRIB, QueryServicesOptions.DEFAULT_MUTATE_BATCH_SIZE_BYTES); MutationList mutations = new MutationList(maxBatchSize); region.startRegionOperation(); byte[] uuidValue = ServerCacheClient.generateId(); synchronized (innerScanner) { do { List<Cell> results = new ArrayList<Cell>(); hasMore = innerScanner.nextRaw(results); if (!results.isEmpty()) { Put put = null; Delete del = null; for (Cell cell : results) { if (KeyValue.Type.codeToType(cell.getTypeByte()) == KeyValue.Type.Put) { if (put == null) { put = new Put(CellUtil.cloneRow(cell)); put.setAttribute(useProto ? PhoenixIndexCodec.INDEX_PROTO_MD : PhoenixIndexCodec.INDEX_MD, indexMetaData); put.setAttribute(PhoenixIndexCodec.INDEX_UUID, uuidValue); put.setAttribute(REPLAY_WRITES, REPLAY_ONLY_INDEX_WRITES); mutations.add(put); // Since we're replaying existing mutations, it makes no sense to write them to the wal put.setDurability(Durability.SKIP_WAL); } put.add(cell); } else { if (del == null) { del = new Delete(CellUtil.cloneRow(cell)); del.setAttribute(useProto ? 
PhoenixIndexCodec.INDEX_PROTO_MD : PhoenixIndexCodec.INDEX_MD, indexMetaData); del.setAttribute(PhoenixIndexCodec.INDEX_UUID, uuidValue); del.setAttribute(REPLAY_WRITES, REPLAY_ONLY_INDEX_WRITES); mutations.add(del); // Since we're replaying existing mutations, it makes no sense to write them to the wal del.setDurability(Durability.SKIP_WAL); } del.addDeleteMarker(cell); } } if (ServerUtil.readyToCommit(mutations.size(), mutations.byteSize(), maxBatchSize, maxBatchSizeBytes)) { region.batchMutate(mutations.toArray(new Mutation[mutations.size()]), HConstants.NO_NONCE, HConstants.NO_NONCE); uuidValue = ServerCacheClient.generateId(); mutations.clear(); } rowCount++; } } while (hasMore); if (!mutations.isEmpty()) { region.batchMutate(mutations.toArray(new Mutation[mutations.size()]), HConstants.NO_NONCE, HConstants.NO_NONCE); } } } catch (IOException e) { logger.error("IOException during rebuilding: " + Throwables.getStackTraceAsString(e)); throw e; } finally { region.closeRegionOperation(); } byte[] rowCountBytes = PLong.INSTANCE.toBytes(Long.valueOf(rowCount)); final KeyValue aggKeyValue = KeyValueUtil.newKeyValue(UNGROUPED_AGG_ROW_KEY, SINGLE_COLUMN_FAMILY, SINGLE_COLUMN, AGG_TIMESTAMP, rowCountBytes, 0, rowCountBytes.length); RegionScanner scanner = new BaseRegionScanner(innerScanner) { @Override public HRegionInfo getRegionInfo() { return region.getRegionInfo(); } @Override public boolean isFilterDone() { return true; } @Override public void close() throws IOException { innerScanner.close(); } @Override public boolean next(List<Cell> results) throws IOException { results.add(aggKeyValue); return false; } @Override public long getMaxResultSize() { return scan.getMaxResultSize(); } }; return scanner; } private RegionScanner collectStats(final RegionScanner innerScanner, StatisticsCollector stats, final Region region, final Scan scan, Configuration config) throws IOException { StatsCollectionCallable callable = new StatsCollectionCallable(stats, region, innerScanner, config, scan); byte[] asyncBytes = scan.getAttribute(BaseScannerRegionObserver.RUN_UPDATE_STATS_ASYNC_ATTRIB); boolean async = false; if (asyncBytes != null) { async = Bytes.toBoolean(asyncBytes); } long rowCount = 0; // in case of async, we report 0 as number of rows updated StatisticsCollectionRunTracker statsRunTracker = StatisticsCollectionRunTracker.getInstance(config); boolean runUpdateStats = statsRunTracker.addUpdateStatsCommandRegion(region.getRegionInfo()); if (runUpdateStats) { if (!async) { rowCount = callable.call(); } else { statsRunTracker.runTask(callable); } } else { rowCount = CONCURRENT_UPDATE_STATS_ROW_COUNT; logger.info( "UPDATE STATISTICS didn't run because another UPDATE STATISTICS command was already running on the region " + region.getRegionInfo().getRegionNameAsString()); } byte[] rowCountBytes = PLong.INSTANCE.toBytes(Long.valueOf(rowCount)); final KeyValue aggKeyValue = KeyValueUtil.newKeyValue(UNGROUPED_AGG_ROW_KEY, SINGLE_COLUMN_FAMILY, SINGLE_COLUMN, AGG_TIMESTAMP, rowCountBytes, 0, rowCountBytes.length); RegionScanner scanner = new BaseRegionScanner(innerScanner) { @Override public HRegionInfo getRegionInfo() { return region.getRegionInfo(); } @Override public boolean isFilterDone() { return true; } @Override public void close() throws IOException { // No-op because we want to manage closing of the inner scanner ourselves. // This happens inside StatsCollectionCallable. 
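                // In the asynchronous case the StatsCollectionCallable is handed off to the run
                // tracker's executor and finishes after this RPC has returned, so the inner
                // scanner must remain open here and is closed in the callable's own finally block.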
} @Override public boolean next(List<Cell> results) throws IOException { results.add(aggKeyValue); return false; } @Override public long getMaxResultSize() { return scan.getMaxResultSize(); } }; return scanner; } /** * * Callable to encapsulate the collection of stats triggered by * UPDATE STATISTICS command. * * Package private for tests. */ static class StatsCollectionCallable implements Callable<Long> { private final StatisticsCollector statsCollector; private final Region region; private final RegionScanner innerScanner; private final Configuration config; private final Scan scan; StatsCollectionCallable(StatisticsCollector s, Region r, RegionScanner rs, Configuration config, Scan scan) { this.statsCollector = s; this.region = r; this.innerScanner = rs; this.config = config; this.scan = scan; } @Override public Long call() throws IOException { return collectStatsInternal(); } private boolean areStatsBeingCollectedViaCompaction() { return StatisticsCollectionRunTracker.getInstance(config) .areStatsBeingCollectedOnCompaction(region.getRegionInfo()); } private long collectStatsInternal() throws IOException { long startTime = EnvironmentEdgeManager.currentTimeMillis(); region.startRegionOperation(); boolean hasMore = false; boolean noErrors = false; boolean compactionRunning = areStatsBeingCollectedViaCompaction(); long rowCount = 0; try { if (!compactionRunning) { statsCollector.init(); synchronized (innerScanner) { do { List<Cell> results = new ArrayList<Cell>(); hasMore = innerScanner.nextRaw(results); statsCollector.collectStatistics(results); rowCount++; compactionRunning = areStatsBeingCollectedViaCompaction(); } while (hasMore && !compactionRunning); noErrors = true; } } return compactionRunning ? COMPACTION_UPDATE_STATS_ROW_COUNT : rowCount; } catch (IOException e) { logger.error("IOException in update stats: " + Throwables.getStackTraceAsString(e)); throw e; } finally { try { if (noErrors && !compactionRunning) { statsCollector.updateStatistic(region, scan); logger.info("UPDATE STATISTICS finished successfully for scanner: " + innerScanner + ". Number of rows scanned: " + rowCount + ". 
Time: " + (System.currentTimeMillis() - startTime)); } if (compactionRunning) { logger.info( "UPDATE STATISTICS stopped in between because major compaction was running for region " + region.getRegionInfo().getRegionNameAsString()); } } finally { try { StatisticsCollectionRunTracker.getInstance(config) .removeUpdateStatsCommandRegion(region.getRegionInfo()); statsCollector.close(); } finally { try { innerScanner.close(); } finally { region.closeRegionOperation(); } } } } } } private static List<Expression> deserializeExpressions(byte[] b) { ByteArrayInputStream stream = new ByteArrayInputStream(b); try { DataInputStream input = new DataInputStream(stream); int size = WritableUtils.readVInt(input); List<Expression> selectExpressions = Lists.newArrayListWithExpectedSize(size); for (int i = 0; i < size; i++) { ExpressionType type = ExpressionType.values()[WritableUtils.readVInt(input)]; Expression selectExpression = type.newInstance(); selectExpression.readFields(input); selectExpressions.add(selectExpression); } return selectExpressions; } catch (IOException e) { throw new RuntimeException(e); } finally { try { stream.close(); } catch (IOException e) { throw new RuntimeException(e); } } } public static byte[] serialize(PTable projectedTable) { PTableProtos.PTable ptableProto = PTableImpl.toProto(projectedTable); return ptableProto.toByteArray(); } public static byte[] serialize(List<Expression> selectExpressions) { ByteArrayOutputStream stream = new ByteArrayOutputStream(); try { DataOutputStream output = new DataOutputStream(stream); WritableUtils.writeVInt(output, selectExpressions.size()); for (int i = 0; i < selectExpressions.size(); i++) { Expression expression = selectExpressions.get(i); WritableUtils.writeVInt(output, ExpressionType.valueOf(expression).ordinal()); expression.write(output); } return stream.toByteArray(); } catch (IOException e) { throw new RuntimeException(e); } finally { try { stream.close(); } catch (IOException e) { throw new RuntimeException(e); } } } @Override public void preSplit(ObserverContext<RegionCoprocessorEnvironment> c, byte[] splitRow) throws IOException { // Don't allow splitting if operations need read and write to same region are going on in the // the coprocessors to avoid dead lock scenario. See PHOENIX-3111. synchronized (lock) { isRegionClosingOrSplitting = true; if (scansReferenceCount > 0) { throw new IOException("Operations like local index building/delete/upsert select" + " might be going on so not allowing to split."); } } } @Override public void preBulkLoadHFile(ObserverContext<RegionCoprocessorEnvironment> c, List<Pair<byte[], String>> familyPaths) throws IOException { // Don't allow bulkload if operations need read and write to same region are going on in the // the coprocessors to avoid dead lock scenario. See PHOENIX-3111. synchronized (lock) { if (scansReferenceCount > 0) { throw new DoNotRetryIOException("Operations like local index building/delete/upsert select" + " might be going on so not allowing to bulkload."); } } } @Override public void preClose(ObserverContext<RegionCoprocessorEnvironment> c, boolean abortRequested) throws IOException { synchronized (lock) { isRegionClosingOrSplitting = true; while (scansReferenceCount > 0) { try { lock.wait(1000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } } } @Override protected boolean isRegionObserverFor(Scan scan) { return scan.getAttribute(BaseScannerRegionObserver.UNGROUPED_AGG) != null; } }