org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetCache.java Source code

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetCache.java, which manages the mmap/mlock-based caching of HDFS blocks for a DataNode.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;

import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_POLLING_MS;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_POLLING_MS_DEFAULT;

import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.concurrent.Executor;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.time.DurationFormatUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Manages caching for an FsDatasetImpl by using the mmap(2) and mlock(2)
 * system calls to lock blocks into memory. Block checksums are verified upon
 * entry into the cache.
 */
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class FsDatasetCache {

    /**
     * A MappableBlock that we know about, paired with its cache state.
     */
    private static final class Value {
        final State state;
        final MappableBlock mappableBlock;

        Value(MappableBlock mappableBlock, State state) {
            this.mappableBlock = mappableBlock;
            this.state = state;
        }
    }

    private enum State {
        /**
         * The MappableBlock is in the process of being cached.
         */
        CACHING,

        /**
         * The MappableBlock was in the process of being cached, but it was
         * cancelled.  Only the FsDatasetCache#CachingTask can remove cancelled
         * MappableBlock objects.
         */
        CACHING_CANCELLED,

        /**
         * The MappableBlock is in the cache.
         */
        CACHED,

        /**
         * The MappableBlock is in the process of uncaching.
         */
        UNCACHING;

        /**
         * Whether we should advertise this block as cached to the NameNode and
         * clients.
         */
        public boolean shouldAdvertise() {
            return (this == CACHED);
        }
    }

    private static final Logger LOG = LoggerFactory.getLogger(FsDatasetCache.class);

    /**
     * Stores MappableBlock objects and the states they're in.
     */
    private final HashMap<ExtendedBlockId, Value> mappableBlockMap = new HashMap<ExtendedBlockId, Value>();

    private final AtomicLong numBlocksCached = new AtomicLong(0);

    private final FsDatasetImpl dataset;

    private final ThreadPoolExecutor uncachingExecutor;

    private final ScheduledThreadPoolExecutor deferredUncachingExecutor;

    private final long revocationMs;

    private final long revocationPollingMs;

    /**
     * The approximate amount of cache space in use.
     *
     * This number is an overestimate, counting bytes that will be used only
     * if pending caching operations succeed.  It does not take into account
     * pending uncaching operations.
     *
     * This overestimate is more useful to the NameNode than an underestimate,
     * since we don't want the NameNode to assign us more replicas than
     * we can cache, because of the current batch of operations.
     */
    private final UsedBytesCount usedBytesCount;

    public static class PageRounder {
        private final long osPageSize = NativeIO.POSIX.getCacheManipulator().getOperatingSystemPageSize();

        /**
         * Round up a number to the operating system page size.
         */
        public long round(long count) {
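            // Ceiling division: adding (osPageSize - 1) before the integer
            // divide rounds any partial page up to a whole page, since
            // mmap/mlock operate on whole pages.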
            long newCount = (count + (osPageSize - 1)) / osPageSize;
            return newCount * osPageSize;
        }
    }

    private class UsedBytesCount {
        private final AtomicLong usedBytes = new AtomicLong(0);

        private PageRounder rounder = new PageRounder();

        /**
         * Try to reserve more bytes.
         *
         * @param count    The number of bytes to add.  We will round this
         *                 up to the page size.
         *
         * @return         The new number of usedBytes if we succeeded;
         *                 -1 if we failed.
         */
        long reserve(long count) {
            count = rounder.round(count);
            while (true) {
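                // Optimistic concurrency: read the current total, propose a
                // new one, and commit with compareAndSet.  If another thread
                // updated usedBytes in the meantime, the CAS fails and we
                // retry against the fresh value.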
                long cur = usedBytes.get();
                long next = cur + count;
                if (next > maxBytes) {
                    return -1;
                }
                if (usedBytes.compareAndSet(cur, next)) {
                    return next;
                }
            }
        }

        /**
         * Release some bytes that we're using.
         *
         * @param count    The number of bytes to release.  We will round this
         *                 up to the page size.
         *
         * @return         The new number of usedBytes.
         */
        long release(long count) {
            count = rounder.round(count);
            return usedBytes.addAndGet(-count);
        }

        long get() {
            return usedBytes.get();
        }
    }

    /**
     * The total cache capacity in bytes.
     */
    private final long maxBytes;

    /**
     * Number of cache commands that could not be completed successfully.
     */
    AtomicLong numBlocksFailedToCache = new AtomicLong(0);

    /**
     * Number of uncache commands that could not be completed successfully.
     */
    AtomicLong numBlocksFailedToUncache = new AtomicLong(0);

    public FsDatasetCache(FsDatasetImpl dataset) {
        this.dataset = dataset;
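        // Cache capacity comes from the DataNode's configured locked-memory
        // limit (dfs.datanode.max.locked.memory).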
        this.maxBytes = dataset.datanode.getDnConf().getMaxLockedMemory();
        ThreadFactory workerFactory = new ThreadFactoryBuilder().setDaemon(true)
                .setNameFormat("FsDatasetCache-%d-" + dataset.toString()).build();
        this.usedBytesCount = new UsedBytesCount();
        this.uncachingExecutor = new ThreadPoolExecutor(0, 1, 60, TimeUnit.SECONDS,
                new LinkedBlockingQueue<Runnable>(), workerFactory);
        this.uncachingExecutor.allowCoreThreadTimeOut(true);
        this.deferredUncachingExecutor = new ScheduledThreadPoolExecutor(1, workerFactory);
        this.revocationMs = dataset.datanode.getConf().getLong(DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS,
                DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS_DEFAULT);
        long confRevocationPollingMs = dataset.datanode.getConf().getLong(DFS_DATANODE_CACHE_REVOCATION_POLLING_MS,
                DFS_DATANODE_CACHE_REVOCATION_POLLING_MS_DEFAULT);
        long minRevocationPollingMs = revocationMs / 2;
        if (minRevocationPollingMs < confRevocationPollingMs) {
            throw new RuntimeException("configured value " + confRevocationPollingMs + " for "
                    + DFS_DATANODE_CACHE_REVOCATION_POLLING_MS
                    + " is too high.  It must not be more than half of the value of "
                    + DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS + ".  Reconfigure this to "
                    + minRevocationPollingMs);
        }
        this.revocationPollingMs = confRevocationPollingMs;
    }

    /**
     * @return List of cached blocks suitable for translation into a
     * {@link BlockListAsLongs} for a cache report.
     */
    synchronized List<Long> getCachedBlocks(String bpid) {
        List<Long> blocks = new ArrayList<Long>();
        for (Iterator<Entry<ExtendedBlockId, Value>> iter = mappableBlockMap.entrySet().iterator(); iter
                .hasNext();) {
            Entry<ExtendedBlockId, Value> entry = iter.next();
            if (entry.getKey().getBlockPoolId().equals(bpid)) {
                if (entry.getValue().state.shouldAdvertise()) {
                    blocks.add(entry.getKey().getBlockId());
                }
            }
        }
        return blocks;
    }

    /**
     * Attempt to begin caching a block.
     */
    synchronized void cacheBlock(long blockId, String bpid, String blockFileName, long length, long genstamp,
            Executor volumeExecutor) {
        ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
        Value prevValue = mappableBlockMap.get(key);
        if (prevValue != null) {
            LOG.debug("Block with id {}, pool {} already exists in the " + "FsDatasetCache with state {}", blockId,
                    bpid, prevValue.state);
            numBlocksFailedToCache.incrementAndGet();
            return;
        }
        mappableBlockMap.put(key, new Value(null, State.CACHING));
        volumeExecutor.execute(new CachingTask(key, blockFileName, length, genstamp));
        LOG.debug("Initiating caching for Block with id {}, pool {}", blockId, bpid);
    }

    synchronized void uncacheBlock(String bpid, long blockId) {
        ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
        Value prevValue = mappableBlockMap.get(key);
        boolean deferred = false;

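        // A block is "anchored" while a short-circuit read client has pinned
        // it through shared memory.  If the munlock request can't be honored
        // immediately, defer the uncache and poll until the client releases
        // the block or the revocation timeout expires.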
        if (!dataset.datanode.getShortCircuitRegistry().processBlockMunlockRequest(key)) {
            deferred = true;
        }
        if (prevValue == null) {
            LOG.debug("Block with id {}, pool {} does not need to be uncached, "
                    + "because it is not currently in the mappableBlockMap.", blockId, bpid);
            numBlocksFailedToUncache.incrementAndGet();
            return;
        }
        switch (prevValue.state) {
        case CACHING:
            LOG.debug("Cancelling caching for block with id {}, pool {}.", blockId, bpid);
            mappableBlockMap.put(key, new Value(prevValue.mappableBlock, State.CACHING_CANCELLED));
            break;
        case CACHED:
            mappableBlockMap.put(key, new Value(prevValue.mappableBlock, State.UNCACHING));
            if (deferred) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("{} is anchored, and can't be uncached now.  Scheduling it " + "for uncaching in {} ",
                            key, DurationFormatUtils.formatDurationHMS(revocationPollingMs));
                }
                deferredUncachingExecutor.schedule(new UncachingTask(key, revocationMs), revocationPollingMs,
                        TimeUnit.MILLISECONDS);
            } else {
                LOG.debug("{} has been scheduled for immediate uncaching.", key);
                uncachingExecutor.execute(new UncachingTask(key, 0));
            }
            break;
        default:
            LOG.debug("Block with id {}, pool {} does not need to be uncached, " + "because it is in state {}.",
                    blockId, bpid, prevValue.state);
            numBlocksFailedToUncache.incrementAndGet();
            break;
        }
    }

    /**
     * Background worker that mmaps, mlocks, and checksums a block
     */
    private class CachingTask implements Runnable {
        private final ExtendedBlockId key;
        private final String blockFileName;
        private final long length;
        private final long genstamp;

        CachingTask(ExtendedBlockId key, String blockFileName, long length, long genstamp) {
            this.key = key;
            this.blockFileName = blockFileName;
            this.length = length;
            this.genstamp = genstamp;
        }

        @Override
        public void run() {
            boolean success = false;
            FileInputStream blockIn = null, metaIn = null;
            MappableBlock mappableBlock = null;
            ExtendedBlock extBlk = new ExtendedBlock(key.getBlockPoolId(), key.getBlockId(), length, genstamp);
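            // Reserve cache space before doing any I/O; if anything below
            // fails, the finally block releases the reservation so the
            // usedBytes overestimate is corrected.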
            long newUsedBytes = usedBytesCount.reserve(length);
            boolean reservedBytes = false;
            try {
                try {
                    if (newUsedBytes < 0) {
                        LOG.warn("Failed to cache " + key + ": could not reserve " + length
                                + " more bytes in the cache: " + DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY
                                + " of " + maxBytes + " exceeded.");
                        return;
                    }
                    reservedBytes = true;
                    blockIn = (FileInputStream) dataset.getBlockInputStream(extBlk, 0);
                    metaIn = DatanodeUtil.getMetaDataInputStream(extBlk, dataset);
                } catch (ClassCastException e) {
                    LOG.warn("Failed to cache " + key + ": Underlying blocks are not backed by files.", e);
                    return;
                } catch (FileNotFoundException e) {
                    LOG.info("Failed to cache " + key + ": failed to find backing " + "files.");
                    return;
                } catch (IOException e) {
                    LOG.warn("Failed to cache " + key + ": failed to open file", e);
                    return;
                }
                try {
                    mappableBlock = MappableBlock.load(length, blockIn, metaIn, blockFileName);
                } catch (ChecksumException e) {
                    // Exception message is bogus since this wasn't caused by a file read
                    LOG.warn("Failed to cache " + key + ": checksum verification failed.");
                    return;
                } catch (IOException e) {
                    LOG.warn("Failed to cache " + key, e);
                    return;
                }
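                // Re-check the state under the cache lock: uncacheBlock() may
                // have flipped it to CACHING_CANCELLED while we were loading
                // and checksumming the block.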
                synchronized (FsDatasetCache.this) {
                    Value value = mappableBlockMap.get(key);
                    Preconditions.checkNotNull(value);
                    Preconditions
                            .checkState(value.state == State.CACHING || value.state == State.CACHING_CANCELLED);
                    if (value.state == State.CACHING_CANCELLED) {
                        mappableBlockMap.remove(key);
                        LOG.warn("Caching of " + key + " was cancelled.");
                        return;
                    }
                    mappableBlockMap.put(key, new Value(mappableBlock, State.CACHED));
                }
                LOG.debug("Successfully cached {}.  We are now caching {} bytes in" + " total.", key, newUsedBytes);
                dataset.datanode.getShortCircuitRegistry().processBlockMlockEvent(key);
                numBlocksCached.addAndGet(1);
                dataset.datanode.getMetrics().incrBlocksCached(1);
                success = true;
            } finally {
                IOUtils.closeQuietly(blockIn);
                IOUtils.closeQuietly(metaIn);
                if (!success) {
                    if (reservedBytes) {
                        usedBytesCount.release(length);
                    }
                    LOG.debug("Caching of {} was aborted.  We are now caching only {} " + "bytes in total.", key,
                            usedBytesCount.get());
                    if (mappableBlock != null) {
                        mappableBlock.close();
                    }
                    numBlocksFailedToCache.incrementAndGet();

                    synchronized (FsDatasetCache.this) {
                        mappableBlockMap.remove(key);
                    }
                }
            }
        }
    }

    private class UncachingTask implements Runnable {
        private final ExtendedBlockId key;
        private final long revocationTimeMs;

        UncachingTask(ExtendedBlockId key, long revocationDelayMs) {
            this.key = key;
            if (revocationDelayMs == 0) {
                this.revocationTimeMs = 0;
            } else {
                this.revocationTimeMs = revocationDelayMs + Time.monotonicNow();
            }
        }

        private boolean shouldDefer() {
            /* If revocationTimeMs == 0, this is an immediate uncache request.
             * No clients were anchored at the time we made the request. */
            if (revocationTimeMs == 0) {
                return false;
            }
            /* Let's check if any clients still have this block anchored. */
            boolean anchored = !dataset.datanode.getShortCircuitRegistry().processBlockMunlockRequest(key);
            if (!anchored) {
                LOG.debug("Uncaching {} now that it is no longer in use " + "by any clients.", key);
                return false;
            }
            long delta = revocationTimeMs - Time.monotonicNow();
            if (delta < 0) {
                LOG.warn("Forcibly uncaching {} after {} " + "because client(s) {} refused to stop using it.", key,
                        DurationFormatUtils.formatDurationHMS(revocationTimeMs),
                        dataset.datanode.getShortCircuitRegistry().getClientNames(key));
                return false;
            }
            LOG.info("Replica {} still can't be uncached because some clients continue to use it.  "
                    + "Will wait for {}", key, DurationFormatUtils.formatDurationHMS(delta));
            return true;
        }

        @Override
        public void run() {
            Value value;

            if (shouldDefer()) {
                deferredUncachingExecutor.schedule(this, revocationPollingMs, TimeUnit.MILLISECONDS);
                return;
            }

            synchronized (FsDatasetCache.this) {
                value = mappableBlockMap.get(key);
            }
            Preconditions.checkNotNull(value);
            Preconditions.checkArgument(value.state == State.UNCACHING);

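            // Closing the MappableBlock munmaps the block and releases the
            // mlocked memory.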
            IOUtils.closeQuietly(value.mappableBlock);
            synchronized (FsDatasetCache.this) {
                mappableBlockMap.remove(key);
            }
            long newUsedBytes = usedBytesCount.release(value.mappableBlock.getLength());
            numBlocksCached.addAndGet(-1);
            dataset.datanode.getMetrics().incrBlocksUncached(1);
            if (revocationTimeMs != 0) {
                LOG.debug("Deferred uncaching of {} completed. usedBytes = {}", key, newUsedBytes);
            } else {
                LOG.debug("Uncaching of {} completed. usedBytes = {}", key, newUsedBytes);
            }
        }
    }

    // Stats related methods for FSDatasetMBean

    /**
     * Get the approximate amount of cache space used.
     */
    public long getCacheUsed() {
        return usedBytesCount.get();
    }

    /**
     * Get the maximum amount of bytes we can cache.  This is a constant.
     */
    public long getCacheCapacity() {
        return maxBytes;
    }

    public long getNumBlocksFailedToCache() {
        return numBlocksFailedToCache.get();
    }

    public long getNumBlocksFailedToUncache() {
        return numBlocksFailedToUncache.get();
    }

    public long getNumBlocksCached() {
        return numBlocksCached.get();
    }

    public synchronized boolean isCached(String bpid, long blockId) {
        ExtendedBlockId block = new ExtendedBlockId(blockId, bpid);
        Value val = mappableBlockMap.get(block);
        return (val != null) && val.state.shouldAdvertise();
    }
}
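
As a quick illustration of the page-rounding arithmetic used by PageRounder above, here is a
minimal standalone sketch. The class name PageRounderDemo and the fixed 4096-byte page size are
assumptions made for this example; the real class queries the page size at runtime via
NativeIO.POSIX.getCacheManipulator().getOperatingSystemPageSize().

public class PageRounderDemo {
    // Assumed page size, for illustration only.
    private static final long PAGE_SIZE = 4096;

    // Same ceiling-division idiom as PageRounder#round above.
    static long round(long count) {
        return ((count + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
    }

    public static void main(String[] args) {
        System.out.println(round(1));    // 4096: a single byte still pins one whole page
        System.out.println(round(4096)); // 4096: an exact page count is unchanged
        System.out.println(round(4097)); // 8192: one page plus one byte pins two pages
    }
}

This rounding is why usedBytes (and therefore getCacheUsed()) is accounted in whole pages rather
than raw block bytes: mlock(2) pins memory one page at a time, so a partially used page still
consumes a full page of locked memory.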