/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.cache.aggcache;

import static org.apache.phoenix.query.QueryConstants.AGG_TIMESTAMP;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN_FAMILY;
import static org.apache.phoenix.query.QueryServices.GROUPBY_MAX_CACHE_SIZE_ATTRIB;
import static org.apache.phoenix.query.QueryServices.GROUPBY_SPILL_FILES_ATTRIB;
import static org.apache.phoenix.query.QueryServicesOptions.DEFAULT_GROUPBY_MAX_CACHE_MAX;
import static org.apache.phoenix.query.QueryServicesOptions.DEFAULT_GROUPBY_SPILL_FILES;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.regionserver.RegionScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.cache.GlobalCache;
import org.apache.phoenix.cache.TenantCache;
import org.apache.phoenix.cache.aggcache.SpillManager.CacheEntry;
import org.apache.phoenix.coprocessor.BaseRegionScanner;
import org.apache.phoenix.coprocessor.GroupByCache;
import org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver;
import org.apache.phoenix.expression.aggregator.Aggregator;
import org.apache.phoenix.expression.aggregator.ServerAggregators;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
import org.apache.phoenix.memory.InsufficientMemoryException;
import org.apache.phoenix.memory.MemoryManager.MemoryChunk;
import org.apache.phoenix.util.Closeables;
import org.apache.phoenix.util.KeyValueUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The main entry point is in GroupedAggregateRegionObserver. It instantiates a SpillableGroupByCache and invokes a
 * get() method on it. There is no explicit "if key does not exist -> put into map" case, since the cache is a loading
 * cache and therefore handles the put under the covers. The final cache element accesses (the RegionScanner below) are
 * implemented in a streaming fashion, i.e. there is just an iterator over the cache and no up-front materialization of
 * the result.
 * SpillableGroupByCache implements an LRU cache using a LinkedHashMap with access order. There are configurable upper
 * and lower size limits in bytes, which are used as follows to compute the initial cache size in number of elements:
 * Max(lowerBoundElements, Min(upperBoundElements, estimatedCacheSize)). Once the number of cached elements exceeds
 * this number, the cache size is increased by a factor of 1.5.
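 * As a rough, purely illustrative sketch of that bound computation (the concrete numbers are made up and are not
 * Phoenix defaults; the real values come from GROUPBY_MAX_CACHE_SIZE_ATTRIB, SPGBY_CACHE_MIN_SIZE and the
 * caller-provided estimate), the constructor below effectively does:
 *
 * <pre>{@code
 * int estValueSize = 64;                                    // estimated bytes per aggregator tuple (assumed value)
 * int maxSizeNum   = (int) (1024L * 1024L / estValueSize);  // upper bound derived from the max cache size in bytes
 * int minSizeNum   = 4096 / estValueSize;                   // lower bound derived from the 4K minimum
 * int estSizeNum   = 5000;                                  // estimated number of distinct group-by keys (assumed)
 * int maxCacheSize = Math.max(minSizeNum, Math.min(maxSizeNum, estSizeNum));
 * // => Math.max(64, Math.min(16384, 5000)) == 5000 initial elements
 * }</pre>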
 * Growth continues until the additional memory to grow the cache can no longer be requested; at that point the cache
 * starts spilling elements. As long as no eviction happens, no spillable data structures are allocated; they are only
 * created once the first element is evicted from the cache. We cannot really make any assumptions about which keys
 * arrive at the map, but the LRU policy should at least cover the cases where some keys are slightly skewed and
 * therefore stay memory resident. Once a key gets evicted, the SpillManager is instantiated. It takes care of spilling
 * an element to disk and does all the SERDE work. It pre-allocates a configurable number of SpillFiles (spill
 * partitions), which are memory-mapped temp files. The SpillManager keeps a list of these and hash-distributes the
 * keys within this list. Once an element gets spilled, it is serialized and will only get deserialized again when it
 * is requested from the client, i.e. loaded back into the LRU cache. The SpillManager holds a single SpillMap object
 * in memory for every spill partition (SpillFile). The SpillMap is an in-memory Map representation of a single page of
 * spilled, serialized key/value pairs. To achieve fast key lookup, the key is hash-partitioned into random pages of
 * the current spill file. The code implements an extendible hashing approach which dynamically adjusts the hash
 * function in order to adapt to a growing number of storage pages and to avoid long chains of overflow buckets. For an
 * excellent discussion of the algorithm please refer to the following online resource:
 * http://db.inf.uni-tuebingen.de/files/teaching/ws1011/db2/db2-hash-indexes.pdf (a minimal sketch of the directory
 * lookup is included at the end of this comment). For this, each SpillFile keeps a directory of pointers to up to
 * Integer.MAX_VALUE 4K pages in memory, which allows each directory to address more pages than a single memory-mapped
 * temp file could theoretically store. In case directory doubling requests a page index that exceeds the limits of the
 * initial temp file, the implementation dynamically allocates additional temp files to the SpillFile. The directory
 * starts with a global depth of 1 and therefore a directory size of 2 buckets. Only during bucket splits and directory
 * doubling is more than one page temporarily kept in memory, until all elements have been redistributed. The current
 * implementation conducts bucket splits as long as an element does not fit onto a page; no overflow chain is created,
 * which might be an alternative. For get requests, each directory entry maintains a bloom filter to prevent page-in
 * operations in case an element has never been spilled before. The deserialization is only triggered when a key is
 * loaded back into the LRU cache. The aggregators are returned from the LRU cache and the next value is computed. In
 * case the key is not found on any page, the Loader creates new aggregators for it.
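 * To make the directory addressing concrete, the following is a deliberately simplified, hypothetical sketch of an
 * extendible-hashing directory lookup. It is not the actual SpillMap code, which additionally handles bucket splits,
 * directory doubling across multiple temp files and the per-bucket bloom filters:
 *
 * <pre>{@code
 * class DirectorySketch {
 *     int globalDepth = 1;                        // directory starts with 2 buckets
 *     int[] pageIds = new int[1 << globalDepth];  // maps hash suffix -> page index within the spill file
 *
 *     int pageFor(int keyHash) {
 *         // Use the low-order globalDepth bits of the hash to pick the page
 *         return pageIds[keyHash & ((1 << globalDepth) - 1)];
 *     }
 *
 *     void doubleDirectory() {
 *         // Doubling copies every pointer, so lookups stay valid and only the
 *         // overflowing bucket's entries need to be redistributed afterwards.
 *         int[] doubled = new int[pageIds.length * 2];
 *         System.arraycopy(pageIds, 0, doubled, 0, pageIds.length);
 *         System.arraycopy(pageIds, 0, doubled, pageIds.length, pageIds.length);
 *         pageIds = doubled;
 *         globalDepth++;
 *     }
 * }
 * }</pre>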
 */
public class SpillableGroupByCache implements GroupByCache {

    private static final Logger logger = LoggerFactory.getLogger(SpillableGroupByCache.class);

    // Min size of 1st level main memory cache in bytes --> lower bound
    private static final int SPGBY_CACHE_MIN_SIZE = 4096; // 4K

    // TODO Generally better to use Collection API with generics instead of
    // array types
    private final LinkedHashMap<ImmutableBytesWritable, Aggregator[]> cache;
    private SpillManager spillManager = null;
    private long totalNumElements;
    private final ServerAggregators aggregators;
    private final RegionCoprocessorEnvironment env;

    private final MemoryChunk chunk;

    /*
     * Inner class that makes the cache queryable for other classes that should not get the full instance. Queryable
     * view of the cache.
     */
    public class QueryCache {
        public boolean isKeyContained(ImmutableBytesPtr key) {
            return cache.containsKey(key);
        }
    }

    /**
     * Instantiates a loading LRU cache that stores key / aggregator[] tuples used for group by queries.
     *
     * @param env the region coprocessor environment
     * @param tenantId the tenant id used to look up the tenant cache
     * @param aggs the server aggregators
     * @param estSizeNum estimated number of distinct elements
     */
    public SpillableGroupByCache(final RegionCoprocessorEnvironment env, ImmutableBytesPtr tenantId,
            ServerAggregators aggs, final int estSizeNum) {
        totalNumElements = 0;
        this.aggregators = aggs;
        this.env = env;

        final int estValueSize = aggregators.getEstimatedByteSize();
        final TenantCache tenantCache = GlobalCache.getTenantCache(env, tenantId);

        // Compute the initial map size
        final Configuration conf = env.getConfiguration();
        final long maxCacheSizeConf = conf.getLong(GROUPBY_MAX_CACHE_SIZE_ATTRIB, DEFAULT_GROUPBY_MAX_CACHE_MAX);
        final int numSpillFilesConf = conf.getInt(GROUPBY_SPILL_FILES_ATTRIB, DEFAULT_GROUPBY_SPILL_FILES);

        final int maxSizeNum = (int) (maxCacheSizeConf / estValueSize);
        final int minSizeNum = (SPGBY_CACHE_MIN_SIZE / estValueSize);

        // use upper and lower bounds for the cache size
        final int maxCacheSize = Math.max(minSizeNum, Math.min(maxSizeNum, estSizeNum));
        final long estSize = GroupedAggregateRegionObserver.sizeOfUnorderedGroupByMap(maxCacheSize, estValueSize);
        try {
            this.chunk = tenantCache.getMemoryManager().allocate(estSize);
        } catch (InsufficientMemoryException ime) {
            logger.error("Requested Map size exceeds memory limit, please decrease max size via config parameter: "
                    + GROUPBY_MAX_CACHE_SIZE_ATTRIB);
            throw ime;
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Instantiating LRU groupby cache of element size: " + maxCacheSize);
        }

        // LRU cache implemented as LinkedHashMap with access order
        cache = new LinkedHashMap<ImmutableBytesWritable, Aggregator[]>(maxCacheSize, 0.75f, true) {
            boolean spill = false;
            int cacheSize = maxCacheSize;

            @Override
            protected boolean removeEldestEntry(Map.Entry<ImmutableBytesWritable, Aggregator[]> eldest) {
                if (!spill && size() > cacheSize) { // increase allocation
                    cacheSize *= 1.5f;
                    long estSize = GroupedAggregateRegionObserver.sizeOfUnorderedGroupByMap(cacheSize, estValueSize);
                    try {
                        chunk.resize(estSize);
                    } catch (InsufficientMemoryException im) {
                        // Cannot extend Map anymore, start spilling
                        spill = true;
                    }
                }

                if (spill) {
                    try {
                        if (spillManager == null) {
                            // Lazy instantiation of spillable data
                            // structures
                            //
                            // Only create spill data structs if LRU
                            // cache is too small
                            spillManager = new SpillManager(numSpillFilesConf, aggregators, env.getConfiguration(),
                                    new QueryCache());
                        }
                        spillManager.spill(eldest.getKey(), eldest.getValue());
                    } catch (IOException ioe) {
                        // Ensure that we always close and delete the temp files
                        try {
                            throw new RuntimeException(ioe);
                        } finally {
                            Closeables.closeQuietly(SpillableGroupByCache.this);
                        }
                    }
                    return true;
                }
                return false;
            }
        };
    }

    /**
     * Size function returns the current number of cached elements.
     */
    @Override
    public long size() {
        return totalNumElements;
    }

    /**
     * Extracts an element from the cache. If the element is not present in the in-memory cache or in the spill files,
     * the cache implements an implicit put() of a new key/value tuple and loads it into the cache.
     */
    @Override
    public Aggregator[] cache(ImmutableBytesPtr cacheKey) {
        ImmutableBytesPtr key = new ImmutableBytesPtr(cacheKey);
        Aggregator[] rowAggregators = cache.get(key);
        if (rowAggregators == null) {
            // If Aggregators not found for this distinct
            // value, clone our original one (we need one
            // per distinct value)
            if (spillManager != null) {
                // Spill manager present, check if key has been
                // spilled before
                try {
                    rowAggregators = spillManager.loadEntry(key);
                } catch (IOException ioe) {
                    // Ensure that we always close and delete the temp files
                    try {
                        throw new RuntimeException(ioe);
                    } finally {
                        Closeables.closeQuietly(SpillableGroupByCache.this);
                    }
                }
            }
            if (rowAggregators == null) {
                // No, key never spilled before, create a new tuple
                rowAggregators = aggregators.newAggregators(env.getConfiguration());
                if (logger.isDebugEnabled()) {
                    logger.debug("Adding new aggregate bucket for row key "
                            + Bytes.toStringBinary(key.get(), key.getOffset(), key.getLength()));
                }
            }
            if (cache.put(key, rowAggregators) == null) {
                totalNumElements++;
            }
        }
        return rowAggregators;
    }

    /**
     * Iterator over the cache and the spilled data structures by returning CacheEntries. CacheEntries are either
     * extracted from the LRU cache or from the spillable data structures. The key/value tuples are returned in
     * non-deterministic order.
     */
    private final class EntryIterator implements Iterator<Map.Entry<ImmutableBytesWritable, Aggregator[]>> {
        final Iterator<Map.Entry<ImmutableBytesWritable, Aggregator[]>> cacheIter;
        final Iterator<byte[]> spilledCacheIter;

        private EntryIterator() {
            cacheIter = cache.entrySet().iterator();
            if (spillManager != null) {
                spilledCacheIter = spillManager.newDataIterator();
            } else {
                spilledCacheIter = null;
            }
        }

        @Override
        public boolean hasNext() {
            return cacheIter.hasNext();
        }

        @Override
        public Map.Entry<ImmutableBytesWritable, Aggregator[]> next() {
            if (spilledCacheIter != null && spilledCacheIter.hasNext()) {
                try {
                    byte[] value = spilledCacheIter.next();
                    // Deserialize into a CacheEntry
                    Map.Entry<ImmutableBytesWritable, Aggregator[]> spilledEntry = spillManager.toCacheEntry(value);

                    boolean notFound = false;
                    // check against map and return only if not present
                    while (cache.containsKey(spilledEntry.getKey())) {
                        // LRU Cache entries always take precedence,
                        // since they are more up to date
                        if (spilledCacheIter.hasNext()) {
                            value = spilledCacheIter.next();
                            spilledEntry = spillManager.toCacheEntry(value);
                        } else {
                            notFound = true;
                            break;
                        }
                    }
                    if (!notFound) {
                        // Return a spilled entry, this only happens if the
                        // entry was not found in the LRU cache
                        return spilledEntry;
                    }
                } catch (IOException ioe) {
                    // TODO rework error handling
                    throw new RuntimeException(ioe);
                }
            }
            // Spilled elements exhausted
            // Finally return all elements from LRU cache
            Map.Entry<ImmutableBytesWritable, Aggregator[]> entry = cacheIter.next();
            return new CacheEntry<ImmutableBytesWritable>(entry.getKey(), entry.getValue());
        }

        /**
         * Remove??? Denied!!!
         */
        @Override
        public void remove() {
            throw new IllegalAccessError("Remove is not supported for this type of iterator");
        }
    }

    /**
     * Closes the cache and releases spill resources.
     *
     * @throws IOException
     */
    @Override
    public void close() throws IOException {
        // Close spillable resources
        Closeables.closeQuietly(spillManager);
        Closeables.closeQuietly(chunk);
    }

    @Override
    public RegionScanner getScanner(final RegionScanner s) {
        final Iterator<Entry<ImmutableBytesWritable, Aggregator[]>> cacheIter = new EntryIterator();

        // scanner using the spillable implementation
        return new BaseRegionScanner(s) {
            @Override
            public void close() throws IOException {
                try {
                    s.close();
                } finally {
                    // Always close gbCache and swallow possible Exceptions
                    Closeables.closeQuietly(SpillableGroupByCache.this);
                }
            }

            @Override
            public boolean next(List<Cell> results) throws IOException {
                if (!cacheIter.hasNext()) {
                    return false;
                }
                Map.Entry<ImmutableBytesWritable, Aggregator[]> ce = cacheIter.next();
                ImmutableBytesWritable key = ce.getKey();
                Aggregator[] aggs = ce.getValue();
                byte[] value = aggregators.toBytes(aggs);
                if (logger.isDebugEnabled()) {
                    logger.debug("Adding new distinct group: "
                            + Bytes.toStringBinary(key.get(), key.getOffset(), key.getLength())
                            + " with aggregators " + aggs.toString() + " value = " + Bytes.toStringBinary(value));
                }
                results.add(KeyValueUtil.newKeyValue(key.get(), key.getOffset(), key.getLength(),
                        SINGLE_COLUMN_FAMILY, SINGLE_COLUMN, AGG_TIMESTAMP, value, 0, value.length));
                return cacheIter.hasNext();
            }
        };
    }
}