org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyRaid.java Source code


Introduction

Here is the source code for org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyRaid.java, a RAID-aware HDFS block placement policy that spreads blocks which can recover one another (the source blocks of a stripe and their parity blocks) across distinct datanodes.
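Before diving into the listing, here is how a NameNode might be pointed at this policy. This is a minimal sketch, assuming the dfs.block.replicator.classname key that this generation of HDFS uses to select its BlockPlacementPolicy implementation:

import org.apache.hadoop.conf.Configuration;

public class RaidPlacementPolicyConfig {
    public static void main(String[] args) {
        // Minimal sketch: select the RAID-aware placement policy by class name.
        // The property name is an assumption based on the DFSConfigKeys of
        // this HDFS line; verify it against your deployment.
        Configuration conf = new Configuration();
        conf.set("dfs.block.replicator.classname",
                "org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyRaid");
        System.out.println(conf.get("dfs.block.replicator.classname"));
    }
}

In a real deployment the property lives in hdfs-site.xml and is read once at NameNode startup; for any file that is not raided, the class simply delegates to BlockPlacementPolicyDefault.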

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.blockmanagement;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.LinkedHashMap;
import java.util.ArrayList;
import java.util.Map;
import java.util.Comparator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.namenode.*;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.raid.RaidNode;
import org.apache.hadoop.util.StringUtils;

/**
 * This BlockPlacementPolicy spreads out the group of blocks which are used by
 * RAID to recover each other. This is important for the availability of the
 * blocks. The class can be used by multiple threads, so it has to be
 * thread-safe.
 */
public class BlockPlacementPolicyRaid extends BlockPlacementPolicy {
    public static final Log LOG = LogFactory.getLog(BlockPlacementPolicyRaid.class);
    Configuration conf;
    private int stripeLength;
    private int xorParityLength;
    private int rsParityLength;
    private String xorPrefix = null;
    private String rsPrefix = null;
    private String raidTempPrefix = null;
    private String raidrsTempPrefix = null;
    private String raidHarTempPrefix = null;
    private String raidrsHarTempPrefix = null;
    private FSNamesystem namesystem = null;
    private BlockPlacementPolicyDefault defaultPolicy;

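    // Per-path block-location and inode full-path caches, shared by all
    // threads (see the Cache class below).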
    CachedLocatedBlocks cachedLocatedBlocks;
    CachedFullPathNames cachedFullPathNames;

    /** {@inheritDoc} */
    @Override
    public void initialize(Configuration conf, FSClusterStats stats, NetworkTopology clusterMap) {
        this.conf = conf;
        this.stripeLength = RaidNode.getStripeLength(conf);
        this.rsParityLength = RaidNode.rsParityLength(conf);
        this.xorParityLength = 1;
        try {
            this.xorPrefix = RaidNode.xorDestinationPath(conf).toUri().getPath();
            this.rsPrefix = RaidNode.rsDestinationPath(conf).toUri().getPath();
        } catch (IOException e) {
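            // Ignore and fall back to the default RAID locations below.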
        }
        if (this.xorPrefix == null) {
            this.xorPrefix = RaidNode.DEFAULT_RAID_LOCATION;
        }
        if (this.rsPrefix == null) {
            this.rsPrefix = RaidNode.DEFAULT_RAIDRS_LOCATION;
        }
        // Throws ClassCastException if we cannot cast here.
        this.namesystem = (FSNamesystem) stats;
        this.cachedLocatedBlocks = new CachedLocatedBlocks(namesystem);
        this.cachedFullPathNames = new CachedFullPathNames(namesystem);
        this.raidTempPrefix = RaidNode.xorTempPrefix(conf);
        this.raidrsTempPrefix = RaidNode.rsTempPrefix(conf);
        this.raidHarTempPrefix = RaidNode.xorHarTempPrefix(conf);
        this.raidrsHarTempPrefix = RaidNode.rsHarTempPrefix(conf);
        defaultPolicy = new BlockPlacementPolicyDefault(conf, stats, clusterMap);
    }

    @Override
    DatanodeDescriptor[] chooseTarget(String srcPath, int numOfReplicas, DatanodeDescriptor writer,
            List<DatanodeDescriptor> chosenNodes, long blocksize) {
        return chooseTarget(srcPath, numOfReplicas, writer, chosenNodes, null, blocksize);
    }

    @Override
    public DatanodeDescriptor[] chooseTarget(String srcPath, int numOfReplicas, DatanodeDescriptor writer,
            List<DatanodeDescriptor> chosenNodes, boolean returnChosenNodes, HashMap<Node, Node> excludedNodes,
            long blocksize) {
        try {
            FileType type = getFileType(srcPath);
            if (type == FileType.NOT_RAID) {
                return defaultPolicy.chooseTarget(srcPath, numOfReplicas, writer, chosenNodes, blocksize);
            }
            if (excludedNodes == null) {
                excludedNodes = new HashMap<Node, Node>();
            }
            addExcludedNodes(srcPath, type, excludedNodes);
            DatanodeDescriptor[] result = defaultPolicy.chooseTarget(numOfReplicas, writer, chosenNodes,
                    returnChosenNodes, excludedNodes, blocksize);
            // Record the new block's locations in the located-blocks cache so
            // that the placement of the remaining blocks can account for them.
            cachedLocatedBlocks.get(srcPath).add(new LocatedBlock(new ExtendedBlock(), result));
            return result;
        } catch (Exception e) {
            LOG.debug("Error happened when choosing a datanode to write to: " + StringUtils.stringifyException(e));
            return defaultPolicy.chooseTarget(srcPath, numOfReplicas, writer, chosenNodes, blocksize);
        }
    }

    @Override
    public int verifyBlockPlacement(String srcPath, LocatedBlock lBlk, int minRacks) {
        return defaultPolicy.verifyBlockPlacement(srcPath, lBlk, minRacks);
    }

    /** {@inheritDoc} */
    @Override
    public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode, Block block, short replicationFactor,
            Collection<DatanodeDescriptor> first, Collection<DatanodeDescriptor> second) {

        DatanodeDescriptor chosenNode = null;
        try {
            String path = cachedFullPathNames.get(inode);
            FileType type = getFileType(path);
            if (type == FileType.NOT_RAID) {
                return defaultPolicy.chooseReplicaToDelete(inode, block, replicationFactor, first, second);
            }
            List<LocatedBlock> companionBlocks = getCompanionBlocks(path, type, block);
            if (companionBlocks == null || companionBlocks.size() == 0) {
                // Use the default method if it is not a valid raided or parity file
                return defaultPolicy.chooseReplicaToDelete(inode, block, replicationFactor, first, second);
            }
            // Consider both candidate collections together and pick the
            // replica whose node carries the most companion blocks.
            Collection<DatanodeDescriptor> all = new HashSet<DatanodeDescriptor>();
            all.addAll(first);
            all.addAll(second);
            chosenNode = chooseReplicaToDelete(companionBlocks, all);
            if (chosenNode != null) {
                return chosenNode;
            }
            return defaultPolicy.chooseReplicaToDelete(inode, block, replicationFactor, first, second);
        } catch (Exception e) {
            LOG.debug("Error happened when choosing a replica to delete: " + StringUtils.stringifyException(e));
            return defaultPolicy.chooseReplicaToDelete(inode, block, replicationFactor, first, second);
        }
    }

    /**
     * Obtain the excluded nodes for the current block that is being written
     */
    void addExcludedNodes(String file, FileType type, HashMap<Node, Node> excluded) throws IOException {
        Collection<LocatedBlock> blocks = getCompanionBlocks(file, type, null);
        if (blocks == null) {
            return;
        }
        for (LocatedBlock b : blocks) {
            for (Node n : b.getLocations()) {
                excluded.put(n, n);
            }
        }
    }

    private DatanodeDescriptor chooseReplicaToDelete(Collection<LocatedBlock> companionBlocks,
            Collection<DatanodeDescriptor> dataNodes) throws IOException {

        if (dataNodes.isEmpty()) {
            return null;
        }
        // Count the number of replicas on each node and rack
        final Map<String, Integer> nodeCompanionBlockCount = countCompanionBlocks(companionBlocks, false);
        final Map<String, Integer> rackCompanionBlockCount = countCompanionBlocks(companionBlocks, true);

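        // Under this ordering, the maximum element is the node that carries
        // the most companion blocks, with ties broken by rack count and then
        // by least remaining space: the best candidate to delete from.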
        NodeComparator comparator = new NodeComparator(nodeCompanionBlockCount, rackCompanionBlockCount);
        return Collections.max(dataNodes, comparator);
    }

    /**
     * Count how many companion blocks are on each datanode or each rack
     * @param companionBlocks a collection of all the companion blocks
     * @param doRackCount if true, count the companion blocks per rack instead
     *                    of per datanode
     * @return a map from node (or rack) name to the number of companion blocks
     */
    static Map<String, Integer> countCompanionBlocks(Collection<LocatedBlock> companionBlocks,
            boolean doRackCount) {
        Map<String, Integer> result = new HashMap<String, Integer>();
        for (LocatedBlock block : companionBlocks) {
            for (DatanodeInfo d : block.getLocations()) {
                String name = doRackCount ? d.getParent().getName() : d.getName();
                if (result.containsKey(name)) {
                    int count = result.get(name) + 1;
                    result.put(name, count);
                } else {
                    result.put(name, 1);
                }
            }
        }
        return result;
    }

    /**
     * Compares datanodes by the number of companion blocks on the same node
     * and rack. If those counts are equal, the node with less remaining space
     * compares as larger, so it is preferred for deletion.
     */
    class NodeComparator implements Comparator<DatanodeDescriptor> {
        private Map<String, Integer> nodeBlockCount;
        private Map<String, Integer> rackBlockCount;

        private NodeComparator(Map<String, Integer> nodeBlockCount, Map<String, Integer> rackBlockCount) {
            this.nodeBlockCount = nodeBlockCount;
            this.rackBlockCount = rackBlockCount;
        }

        @Override
        public int compare(DatanodeDescriptor d1, DatanodeDescriptor d2) {
            int res = compareBlockCount(d1, d2, nodeBlockCount);
            if (res != 0) {
                return res;
            }
            res = compareBlockCount(d1.getParent(), d2.getParent(), rackBlockCount);
            if (res != 0) {
                return res;
            }
            if (d1.getRemaining() > d2.getRemaining()) {
                return -1;
            }
            if (d1.getRemaining() < d2.getRemaining()) {
                return 1;
            }
            return 0;
        }

        private int compareBlockCount(Node node1, Node node2, Map<String, Integer> blockCount) {
            Integer count1 = blockCount.get(node1.getName());
            Integer count2 = blockCount.get(node2.getName());
            count1 = count1 == null ? 0 : count1;
            count2 = count2 == null ? 0 : count2;
            if (count1 > count2) {
                return 1;
            }
            if (count1 < count2) {
                return -1;
            }
            return 0;
        }
    }

    /**
     * Obtain the companion blocks of the given block.
     * Companion blocks are defined as the blocks that can help recover each
     * other by using the raid decoder.
     * @param path the path of the file that contains the block
     * @param type the type of this file
     * @param block the given block,
     *              or null if it is the block that is currently being written
     * @return the block locations of the companion blocks
     */
    List<LocatedBlock> getCompanionBlocks(String path, FileType type, Block block) throws IOException {
        switch (type) {
        case NOT_RAID:
            return new ArrayList<LocatedBlock>();
        case XOR_HAR_TEMP_PARITY:
            return getCompanionBlocksForHarParityBlock(path, xorParityLength, block);
        case RS_HAR_TEMP_PARITY:
            return getCompanionBlocksForHarParityBlock(path, rsParityLength, block);
        case XOR_TEMP_PARITY:
            return getCompanionBlocksForParityBlock(getSourceFile(path, raidTempPrefix), path, xorParityLength,
                    block);
        case RS_TEMP_PARITY:
            return getCompanionBlocksForParityBlock(getSourceFile(path, raidrsTempPrefix), path, rsParityLength,
                    block);
        case XOR_PARITY:
            return getCompanionBlocksForParityBlock(getSourceFile(path, xorPrefix), path, xorParityLength, block);
        case RS_PARITY:
            return getCompanionBlocksForParityBlock(getSourceFile(path, rsPrefix), path, rsParityLength, block);
        case XOR_SOURCE:
            return getCompanionBlocksForSourceBlock(path, getParityFile(path), xorParityLength, block);
        case RS_SOURCE:
            return getCompanionBlocksForSourceBlock(path, getParityFile(path), rsParityLength, block);
        }
        return new ArrayList<LocatedBlock>();
    }

    private List<LocatedBlock> getCompanionBlocksForHarParityBlock(String parity, int parityLength, Block block)
            throws IOException {
        int blockIndex = getBlockIndex(parity, block);
        // consider only the parity file in this case because the source file's
        // block locations are not easy to obtain
        List<LocatedBlock> parityBlocks = cachedLocatedBlocks.get(parity);
        List<LocatedBlock> result = new ArrayList<LocatedBlock>();
        synchronized (parityBlocks) {
            int start = Math.max(0, blockIndex - parityLength + 1);
            int end = Math.min(parityBlocks.size(), blockIndex + parityLength);
            result.addAll(parityBlocks.subList(start, end));
        }
        return result;
    }

    private List<LocatedBlock> getCompanionBlocksForParityBlock(String src, String parity, int parityLength,
            Block block) throws IOException {
        int blockIndex = getBlockIndex(parity, block);
        List<LocatedBlock> result = new ArrayList<LocatedBlock>();
        List<LocatedBlock> parityBlocks = cachedLocatedBlocks.get(parity);
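        // Each stripe owns parityLength consecutive parity blocks, so integer
        // division maps a parity block index to its stripe index.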
        int stripeIndex = blockIndex / parityLength;
        synchronized (parityBlocks) {
            int parityStart = stripeIndex * parityLength;
            int parityEnd = Math.min(parityStart + parityLength, parityBlocks.size());
            // for parity, always consider the neighbor blocks as companion blocks
            if (parityStart < parityBlocks.size()) {
                result.addAll(parityBlocks.subList(parityStart, parityEnd));
            }
        }

        if (src == null) {
            return result;
        }
        List<LocatedBlock> sourceBlocks = cachedLocatedBlocks.get(src);
        synchronized (sourceBlocks) {
            int sourceStart = stripeIndex * stripeLength;
            int sourceEnd = Math.min(sourceStart + stripeLength, sourceBlocks.size());
            if (sourceStart < sourceBlocks.size()) {
                result.addAll(sourceBlocks.subList(sourceStart, sourceEnd));
            }
        }
        return result;
    }

    private List<LocatedBlock> getCompanionBlocksForSourceBlock(String src, String parity, int parityLength,
            Block block) throws IOException {
        int blockIndex = getBlockIndex(src, block);
        List<LocatedBlock> result = new ArrayList<LocatedBlock>();
        List<LocatedBlock> sourceBlocks = cachedLocatedBlocks.get(src);
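        // Source blocks are grouped stripeLength at a time; integer division
        // maps a source block index to its stripe index.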
        int stripeIndex = blockIndex / stripeLength;
        synchronized (sourceBlocks) {
            int sourceStart = stripeIndex * stripeLength;
            int sourceEnd = Math.min(sourceStart + stripeLength, sourceBlocks.size());
            if (sourceStart < sourceBlocks.size()) {
                result.addAll(sourceBlocks.subList(sourceStart, sourceEnd));
            }
        }
        if (parity == null) {
            return result;
        }
        List<LocatedBlock> parityBlocks = cachedLocatedBlocks.get(parity);
        synchronized (parityBlocks) {
            int parityStart = stripeIndex * parityLength;
            int parityEnd = Math.min(parityStart + parityLength, parityBlocks.size());
            if (parityStart < parityBlocks.size()) {
                result.addAll(parityBlocks.subList(parityStart, parityEnd));
            }
        }
        return result;
    }

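    /**
     * Return the index of the given block within the file, or the current
     * number of blocks if block is null (i.e. it is being written right now).
     */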
    private int getBlockIndex(String file, Block block) throws IOException {
        List<LocatedBlock> blocks = cachedLocatedBlocks.get(file);
        synchronized (blocks) {
            // null indicates the block that is currently being added; return
            // size() as its index in that case
            if (block == null) {
                return blocks.size();
            }
            for (int i = 0; i < blocks.size(); i++) {
                if (blocks.get(i).getBlock().getLocalBlock().equals(block)) {
                    return i;
                }
            }
        }
        throw new IOException("Cannot locate " + block + " in file " + file);
    }

    /**
     * Cache results for FSInodeInfo.getFullPathName()
     */
    static class CachedFullPathNames {
        FSNamesystem namesystem;

        CachedFullPathNames(FSNamesystem namesystem) {
            this.namesystem = namesystem;
        }

        private Cache<INodeWithHashCode, String> cacheInternal = new Cache<INodeWithHashCode, String>() {
            @Override
            public String getDirectly(INodeWithHashCode inode) throws IOException {
                namesystem.readLock();
                try {
                    return inode.getFullPathName();
                } finally {
                    namesystem.readUnlock();
                }
            }
        };

        static private class INodeWithHashCode {
            FSInodeInfo inode;

            INodeWithHashCode(FSInodeInfo inode) {
                this.inode = inode;
            }

            @Override
            public boolean equals(Object obj) {
                // Identity comparison on the wrapped inodes; comparing the
                // inode to the wrapper itself would never match and would
                // defeat every cache lookup.
                return obj instanceof INodeWithHashCode && inode == ((INodeWithHashCode) obj).inode;
            }

            @Override
            public int hashCode() {
                return System.identityHashCode(inode);
            }

            String getFullPathName() {
                return inode.getFullPathName();
            }
        }

        public String get(FSInodeInfo inode) throws IOException {
            return cacheInternal.get(new INodeWithHashCode(inode));
        }
    }

    /**
     * Cache results for FSNamesystem.getBlockLocations()
     */
    static class CachedLocatedBlocks extends Cache<String, List<LocatedBlock>> {
        FSNamesystem namesystem;

        CachedLocatedBlocks(FSNamesystem namesystem) {
            this.namesystem = namesystem;
        }

        @Override
        public List<LocatedBlock> getDirectly(String file) throws IOException {
            long len = NameNodeRaidUtil.getFileInfo(namesystem, file, true).getLen();
            List<LocatedBlock> result = NameNodeRaidUtil.getBlockLocations(namesystem, file, 0L, len, false, false)
                    .getLocatedBlocks();
            if (result == null || result.isEmpty()) {
                result = new ArrayList<LocatedBlock>();
            }
            return Collections.synchronizedList(result);
        }
    }

    static abstract class Cache<K, V> {
        private Map<K, ValueWithTime> cache;
        private static final long CACHE_TIMEOUT = 300000L; // 5 minutes
        // The timeout is long, but the consequence of a stale value is not serious

        Cache() {
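            // Bound the cache at MAX_ENTRIES by evicting the entry that was
            // inserted earliest (LinkedHashMap keeps insertion order).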
            Map<K, ValueWithTime> map = new LinkedHashMap<K, ValueWithTime>() {
                private static final long serialVersionUID = 1L;
                final private int MAX_ENTRIES = 50000;

                @Override
                protected boolean removeEldestEntry(Map.Entry<K, ValueWithTime> eldest) {
                    return size() > MAX_ENTRIES;
                }
            };
            this.cache = Collections.synchronizedMap(map);
        }

        // Note that this method may hold FSNamesystem.readLock() and it may
        // be called inside FSNamesystem.writeLock(). If we make this method
        // synchronized, it will deadlock.
        abstract protected V getDirectly(K key) throws IOException;

        public V get(K key) throws IOException {
            // This method is not synchronized, so we may read a stale value
            // here, but that is acceptable.
            ValueWithTime result = cache.get(key);
            long now = System.currentTimeMillis();
            if (result != null && now - result.cachedTime < CACHE_TIMEOUT) {
                return result.value;
            }
            result = new ValueWithTime();
            result.value = getDirectly(key);
            result.cachedTime = now;
            cache.put(key, result);
            return result.value;
        }

        private class ValueWithTime {
            V value = null;
            long cachedTime = 0L;
        }
    }

    /**
     * Get the path of the corresponding source file for a valid parity
     * file. Returns null if it does not exist.
     * @param parity the toUri path of the parity file
     * @param prefix the parity prefix to strip from the path
     * @return the toUri path of the source file
     */
    String getSourceFile(String parity, String prefix) throws IOException {
        if (isHarFile(parity)) {
            return null;
        }
        // remove the prefix
        String src = parity.substring(prefix.length());
        if (NameNodeRaidUtil.getFileInfo(namesystem, src, true) == null) {
            return null;
        }
        return src;
    }

    /**
     * Get the path of the corresponding parity file for a source file.
     * Returns null if it does not exist.
     * @param src the toUri path of the source file
     * @return the toUri path of the parity file
     */
    String getParityFile(String src) throws IOException {
        String xorParity = getParityFile(xorPrefix, src);
        if (xorParity != null) {
            return xorParity;
        }
        String rsParity = getParityFile(rsPrefix, src);
        if (rsParity != null) {
            return rsParity;
        }
        return null;
    }

    /**
     * Get the path of the parity file. Returns null if it does not exist.
     * @param parityPrefix usually "/raid/" or "/raidrs/"
     * @return the toUri path of the parity file
     */
    private String getParityFile(String parityPrefix, String src) throws IOException {
        String parity = parityPrefix + src;
        if (NameNodeRaidUtil.getFileInfo(namesystem, parity, true) == null) {
            return null;
        }
        return parity;
    }

    private boolean isHarFile(String path) {
        return path.lastIndexOf(RaidNode.HAR_SUFFIX) != -1;
    }

    enum FileType {
        NOT_RAID, XOR_HAR_TEMP_PARITY, XOR_TEMP_PARITY, XOR_PARITY, XOR_SOURCE, RS_HAR_TEMP_PARITY, RS_TEMP_PARITY, RS_PARITY, RS_SOURCE,
    }

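    /**
     * Classify a path by matching it against the temp, HAR temp, and final
     * parity prefixes; a path that has an existing parity file is a source
     * file, and anything else is not raided.
     */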
    FileType getFileType(String path) throws IOException {
        if (path.startsWith(raidHarTempPrefix + Path.SEPARATOR)) {
            return FileType.XOR_HAR_TEMP_PARITY;
        }
        if (path.startsWith(raidrsHarTempPrefix + Path.SEPARATOR)) {
            return FileType.RS_HAR_TEMP_PARITY;
        }
        if (path.startsWith(raidTempPrefix + Path.SEPARATOR)) {
            return FileType.XOR_TEMP_PARITY;
        }
        if (path.startsWith(raidrsTempPrefix + Path.SEPARATOR)) {
            return FileType.RS_TEMP_PARITY;
        }
        if (path.startsWith(xorPrefix + Path.SEPARATOR)) {
            return FileType.XOR_PARITY;
        }
        if (path.startsWith(rsPrefix + Path.SEPARATOR)) {
            return FileType.RS_PARITY;
        }
        String parity = getParityFile(path);
        if (parity == null) {
            return FileType.NOT_RAID;
        }
        if (parity.startsWith(xorPrefix + Path.SEPARATOR)) {
            return FileType.XOR_SOURCE;
        }
        if (parity.startsWith(rsPrefix + Path.SEPARATOR)) {
            return FileType.RS_SOURCE;
        }
        return FileType.NOT_RAID;
    }
}
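The stripe arithmetic in getCompanionBlocksForParityBlock and getCompanionBlocksForSourceBlock is easier to follow with concrete numbers. The walk-through below is a self-contained sketch with illustrative values (stripeLength = 10, parityLength = 4); in the policy itself these come from RaidNode.getStripeLength(conf) and RaidNode.rsParityLength(conf):

public class StripeArithmeticExample {
    public static void main(String[] args) {
        int stripeLength = 10; // source blocks per stripe (illustrative)
        int parityLength = 4;  // parity blocks per stripe (illustrative)

        int sourceBlockIndex = 23;                         // 24th source block
        int stripeIndex = sourceBlockIndex / stripeLength; // stripe 2

        // Companion source blocks: the whole stripe the block belongs to.
        int sourceStart = stripeIndex * stripeLength;      // 20
        int sourceEnd = sourceStart + stripeLength;        // 30, exclusive

        // Companion parity blocks: the parity group of the same stripe.
        int parityStart = stripeIndex * parityLength;      // 8
        int parityEnd = parityStart + parityLength;        // 12, exclusive

        System.out.println("stripe " + stripeIndex + ": source blocks ["
                + sourceStart + ", " + sourceEnd + "), parity blocks ["
                + parityStart + ", " + parityEnd + ")");
    }
}

Every block in these two ranges can help reconstruct any other through the raid decoder, which is why chooseTarget excludes their datanodes when placing a new block of the stripe and chooseReplicaToDelete prefers to delete replicas on nodes already crowded with companion blocks.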