org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.BlockPoolSlice.java Source code

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.BlockPoolSlice.java. A BlockPoolSlice represents the portion of a block pool that is stored on a single volume; taken together, all slices sharing a block pool ID across a cluster make up one block pool.
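
Before diving into the code, it helps to see the on-disk layout this class manages (reconstructed from the field comments and the constructor below; <bpid> is the block pool ID):

    <storage dir>/current/<bpid>/
        current/
            finalized/    finalized replicas, in hashed block subdirectories
            rbw/          replicas being written (RBW)
            dfsUsed       cached disk-usage value, written at shutdown
            replicas      replica cache file, written at shutdown
        tmp/              temporary replicas, wiped on restart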

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;

import com.google.common.io.Files;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.DU;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.hdfs.server.datanode.DataStorage;
import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil;
import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
import org.apache.hadoop.hdfs.server.datanode.ReplicaWaitingToBeRecovered;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.Iterator;
import java.util.Scanner;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.server.datanode.ReplicaBeingWritten;
import org.apache.hadoop.hdfs.server.protocol.BlockReport;
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.Time;

/**
 * A block pool slice represents a portion of a block pool stored on a volume.
 * Taken together, all BlockPoolSlices sharing a block pool ID across a
 * cluster represent a single block pool.
 * <p/>
 * This class is synchronized by {@link FsVolumeImpl}.
 */
class BlockPoolSlice {
    static final Log LOG = LogFactory.getLog(BlockPoolSlice.class);

    private final String bpid;
    private final FsVolumeImpl volume;
    // volume to which this BlockPool belongs
    private final File currentDir; // StorageDirectory/current/bpid/current
    // directory where finalized replicas are stored
    private final File finalizedDir;
    private final File rbwDir; // directory where RBW replicas are stored
    private final File tmpDir; // directory where temporary replicas are stored
    private static final String DU_CACHE_FILE = "dfsUsed";
    private volatile boolean dfsUsedSaved = false;
    private static final int SHUTDOWN_HOOK_PRIORITY = 30;
    private static final String REPLICA_CACHE_FILE = "replicas";
    private final long replicaCacheExpiry = 5 * 60 * 1000; // 5 minutes, in milliseconds

    // TODO:FEDERATION scalability issue - a thread per DU is needed
    private final DU dfsUsage;

    /**
     * Create a block pool slice
     *
     * @param bpid
     *     Block pool Id
     * @param volume
     *     {@link FsVolumeImpl} to which this BlockPool belongs
     * @param bpDir
     *     directory corresponding to the BlockPool
     * @param conf
     *     the configuration
     * @throws IOException
     */
    BlockPoolSlice(String bpid, FsVolumeImpl volume, File bpDir, Configuration conf) throws IOException {
        this.bpid = bpid;
        this.volume = volume;
        this.currentDir = new File(bpDir, DataStorage.STORAGE_DIR_CURRENT);
        this.finalizedDir = new File(currentDir, DataStorage.STORAGE_DIR_FINALIZED);
        if (!this.finalizedDir.exists()) {
            if (!this.finalizedDir.mkdirs()) {
                throw new IOException("Failed to mkdirs " + this.finalizedDir);
            }
        }

        // Files that were being written when the datanode was last shutdown
        // are now moved back to the data directory. It is possible that
        // in the future, we might want to do some sort of datanode-local
        // recovery for these blocks. For example, crc validation.
        //
        this.tmpDir = new File(bpDir, DataStorage.STORAGE_DIR_TMP);
        if (tmpDir.exists()) {
            FileUtil.fullyDelete(tmpDir);
        }
        this.rbwDir = new File(currentDir, DataStorage.STORAGE_DIR_RBW);
        final boolean supportAppends = conf.getBoolean(DFSConfigKeys.DFS_SUPPORT_APPEND_KEY,
                DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT);
        if (rbwDir.exists() && !supportAppends) {
            FileUtil.fullyDelete(rbwDir);
        }
        if (!rbwDir.mkdirs()) { // create rbw directory if it does not exist
            if (!rbwDir.isDirectory()) {
                throw new IOException("Mkdirs failed to create " + rbwDir.toString());
            }
        }
        if (!tmpDir.mkdirs()) {
            if (!tmpDir.isDirectory()) {
                throw new IOException("Mkdirs failed to create " + tmpDir.toString());
            }
        }
        // Use the cached value initially if available. Otherwise the following
        // call will block until the initial du command completes.
        this.dfsUsage = new DU(bpDir, conf, loadDfsUsed());
        this.dfsUsage.start();

        // Ensure the dfs usage is saved during shutdown.
        ShutdownHookManager.get().addShutdownHook(new Runnable() {
            @Override
            public void run() {
                if (!dfsUsedSaved) {
                    saveDfsUsed();
                }
            }
        }, SHUTDOWN_HOOK_PRIORITY);
    }

    File getDirectory() {
        return currentDir.getParentFile();
    }

    File getFinalizedDir() {
        return finalizedDir;
    }

    File getRbwDir() {
        return rbwDir;
    }

    File getTmpDir() {
        return tmpDir;
    }

    /** Decrement the cached dfs usage. It must be synchronized by the caller. */
    void decDfsUsed(long value) {
        dfsUsage.decDfsUsed(value);
    }

    long getDfsUsed() throws IOException {
        return dfsUsage.getUsed();
    }

    /**
     * Read in the cached DU value and return it if it is less than 600 seconds
     * old (DU update interval). Slight imprecision of dfsUsed is not critical
     * and skipping DU can significantly shorten the startup time.
     * If the cached value is not available or too old, -1 is returned.
     */
    long loadDfsUsed() {
        long cachedDfsUsed;
        long mtime;
        Scanner sc;

        try {
            sc = new Scanner(new File(currentDir, DU_CACHE_FILE), "UTF-8");
        } catch (FileNotFoundException fnfe) {
            return -1;
        }

        try {
            // Get the recorded dfsUsed from the file.
            if (sc.hasNextLong()) {
                cachedDfsUsed = sc.nextLong();
            } else {
                return -1;
            }
            // Get the recorded mtime from the file.
            if (sc.hasNextLong()) {
                mtime = sc.nextLong();
            } else {
                return -1;
            }

            // Return the cached value if mtime is okay.
            if (mtime > 0 && (Time.now() - mtime < 600000L)) {
                FsDatasetImpl.LOG.info("Cached dfsUsed found for " + currentDir + ": " + cachedDfsUsed);
                return cachedDfsUsed;
            }
            return -1;
        } finally {
            sc.close();
        }
    }
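
    // A worked example of the handshake above, with illustrative values:
    // saveDfsUsed() below writes "<dfsUsed> <mtime>", e.g.
    // "1073741824 1431020000000". On restart, loadDfsUsed() returns 1073741824
    // only if Time.now() - 1431020000000 is under 600000 ms (the 600-second DU
    // update interval); otherwise it returns -1 and a fresh du scan is run.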

    /**
     * Write the current dfsUsed to the cache file.
     */
    void saveDfsUsed() {
        File outFile = new File(currentDir, DU_CACHE_FILE);
        if (outFile.exists() && !outFile.delete()) {
            FsDatasetImpl.LOG.warn("Failed to delete old dfsUsed file in " + outFile.getParent());
        }

        try {
            long used = getDfsUsed();
            try (Writer out = new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8")) {
                // mtime is written last, so that truncated writes won't be valid.
                out.write(Long.toString(used) + " " + Long.toString(Time.now()));
                out.flush();
            }
        } catch (IOException ioe) {
            // If write failed, the volume might be bad. Since the cache file is
            // not critical, log the error and continue.
            FsDatasetImpl.LOG.warn("Failed to write dfsUsed to " + outFile, ioe);
        }
    }

    /**
     * Temporary files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createTmpFile(Block b) throws IOException {
        File f = new File(tmpDir, b.getBlockName());
        return DatanodeUtil.createTmpFile(b, f);
    }

    /**
     * RBW files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createRbwFile(Block b) throws IOException {
        File f = new File(rbwDir, b.getBlockName());
        return DatanodeUtil.createTmpFile(b, f);
    }

    File addBlock(Block b, File f) throws IOException {
        File blockDir = DatanodeUtil.idToBlockDir(finalizedDir, b.getBlockId());
        if (!blockDir.exists()) {
            if (!blockDir.mkdirs()) {
                throw new IOException("Failed to mkdirs " + blockDir);
            }
        }
        File blockFile = FsDatasetImpl.moveBlockFiles(b, f, blockDir);
        File metaFile = FsDatasetUtil.getMetaFile(blockFile, b.getGenerationStamp());
        dfsUsage.incDfsUsed(b.getNumBytes() + metaFile.length());
        return blockFile;
    }
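
    // Note: DatanodeUtil.idToBlockDir() maps the block ID to a hashed
    // subdirectory tree under finalized/, so no single directory accumulates
    // an unbounded number of block files.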

    void checkDirs() throws DiskErrorException {
        DiskChecker.checkDir(finalizedDir);
        DiskChecker.checkDir(tmpDir);
        DiskChecker.checkDir(rbwDir);
    }

    void getVolumeMap(ReplicaMap volumeMap) throws IOException {
        boolean success = readReplicasFromCache(volumeMap);
        if (!success) {
            // add finalized replicas
            addToReplicasMap(volumeMap, finalizedDir, true);
            // add rbw replicas
            addToReplicasMap(volumeMap, rbwDir, false);
        }
    }
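
    // Startup fast path: if the replica cache file written by the previous
    // shutdown parses cleanly, the full directory scan of finalized/ and rbw/
    // is skipped; any failure to read the cache falls back to the scan.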

    /**
     * Recover an unlinked tmp file on datanode restart. If the original block
     * file does not exist, the tmp file is renamed to the original file name
     * and the original file is returned; otherwise the tmp file is deleted and
     * null is returned.
     */
    File recoverTempUnlinkedBlock(File unlinkedTmp) throws IOException {
        File blockFile = FsDatasetUtil.getOrigFile(unlinkedTmp);
        if (blockFile.exists()) {
            // If the original block file still exists, then no recovery is needed.
            if (!unlinkedTmp.delete()) {
                throw new IOException("Unable to cleanup unlinked tmp file " + unlinkedTmp);
            }
            return null;
        } else {
            if (!unlinkedTmp.renameTo(blockFile)) {
                throw new IOException("Unable to rename unlinked tmp file " + unlinkedTmp);
            }
            return blockFile;
        }
    }
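
    // Illustration (hypothetical file names, assuming the ".unlinked" suffix
    // convention checked by FsDatasetUtil.isUnlinkTmpFile()): for a tmp file
    // "blk_123.unlinked", FsDatasetUtil.getOrigFile() yields "blk_123". If
    // "blk_123" already exists the tmp copy is redundant and is deleted
    // (returning null); otherwise the tmp file is renamed to "blk_123" and
    // returned so the caller can add it to the replica map.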

    private void addReplicaToReplicasMap(Block block, ReplicaMap volumeMap, boolean isFinalized)
            throws IOException {
        ReplicaInfo newReplica = null;
        long blockId = block.getBlockId();
        long genStamp = block.getGenerationStamp();
        if (isFinalized) {
            newReplica = new FinalizedReplica(blockId, block.getNumBytes(), genStamp, volume,
                    DatanodeUtil.idToBlockDir(finalizedDir, blockId));
        } else {
            File file = new File(rbwDir, block.getBlockName());
            boolean loadRwr = true;
            File restartMeta = new File(file.getParent() + File.separator + "." + file.getName() + ".restart");
            Scanner sc = null;
            try {
                sc = new Scanner(restartMeta, "UTF-8");
                // The restart meta file exists
                if (sc.hasNextLong() && (sc.nextLong() > Time.now())) {
                    // It didn't expire. Load the replica as a RBW.
                    // We don't know the expected block length, so just use 0
                    // and don't reserve any more space for writes.
                    newReplica = new ReplicaBeingWritten(blockId, validateIntegrityAndSetLength(file, genStamp),
                            genStamp, volume, file.getParentFile(), null, 0);
                    loadRwr = false;
                }
                sc.close();
                if (!restartMeta.delete()) {
                    FsDatasetImpl.LOG.warn("Failed to delete restart meta file: " + restartMeta.getPath());
                }
            } catch (FileNotFoundException fnfe) {
                // nothing to do here
            } finally {
                if (sc != null) {
                    sc.close();
                }
            }
            // Restart meta file doesn't exist or has expired.
            if (loadRwr) {
                newReplica = new ReplicaWaitingToBeRecovered(blockId, validateIntegrityAndSetLength(file, genStamp),
                        genStamp, volume, file.getParentFile());
            }
        }

        ReplicaInfo oldReplica = volumeMap.get(bpid, newReplica.getBlockId());
        if (oldReplica == null) {
            volumeMap.add(bpid, newReplica);
        } else {
            FsDatasetImpl.LOG.warn("Two block files with the same block id exist " + "on disk: "
                    + oldReplica.getBlockFile() + " and " + newReplica.getBlockFile());
        }
    }

    /**
     * Add replicas under the given directory to the volume map
     *
     * @param volumeMap
     *     the replicas map
     * @param dir
     *     an input directory
     * @param isFinalized
     *     true if the directory has finalized replicas;
     *     false if the directory has rbw replicas
     */
    void addToReplicasMap(ReplicaMap volumeMap, File dir, boolean isFinalized) throws IOException {
        File files[] = FileUtil.listFiles(dir);
        for (File file : files) {
            if (file.isDirectory()) {
                addToReplicasMap(volumeMap, file, isFinalized);
            }

            if (isFinalized && FsDatasetUtil.isUnlinkTmpFile(file)) {
                file = recoverTempUnlinkedBlock(file);
                if (file == null) { // the original block still exists, so we cover it
                    // in another iteration and can continue here
                    continue;
                }
            }
            if (!Block.isBlockFilename(file))
                continue;

            long genStamp = FsDatasetUtil.getGenerationStampFromFile(files, file);
            long blockId = Block.filename2id(file.getName());
            Block block = new Block(blockId, file.length(), genStamp);
            addReplicaToReplicasMap(block, volumeMap, isFinalized);
        }
    }

    /**
     * Find out the number of bytes in the block that match its crc.
     * <p/>
     * This algorithm assumes that data corruption caused by unexpected
     * datanode shutdown occurs only in the last crc chunk. So it checks
     * only the last chunk.
     *
     * @param blockFile
     *     the block file
     * @param genStamp
     *     generation stamp of the block
     * @return the number of valid bytes
     */
    private long validateIntegrityAndSetLength(File blockFile, long genStamp) {
        DataInputStream checksumIn = null;
        InputStream blockIn = null;
        try {
            final File metaFile = FsDatasetUtil.getMetaFile(blockFile, genStamp);
            long blockFileLen = blockFile.length();
            long metaFileLen = metaFile.length();
            int crcHeaderLen = DataChecksum.getChecksumHeaderSize();
            if (!blockFile.exists() || blockFileLen == 0 || !metaFile.exists() || metaFileLen < crcHeaderLen) {
                return 0;
            }
            checksumIn = new DataInputStream(
                    new BufferedInputStream(new FileInputStream(metaFile), HdfsConstants.IO_FILE_BUFFER_SIZE));

            // read and handle the common header here. For now just a version
            BlockMetadataHeader header = BlockMetadataHeader.readHeader(checksumIn);
            short version = header.getVersion();
            if (version != BlockMetadataHeader.VERSION) {
                FsDatasetImpl.LOG
                        .warn("Wrong version (" + version + ") for metadata file " + metaFile + " ignoring ...");
            }
            DataChecksum checksum = header.getChecksum();
            int bytesPerChecksum = checksum.getBytesPerChecksum();
            int checksumSize = checksum.getChecksumSize();
            long numChunks = Math.min((blockFileLen + bytesPerChecksum - 1) / bytesPerChecksum,
                    (metaFileLen - crcHeaderLen) / checksumSize);
            if (numChunks == 0) {
                return 0;
            }
            IOUtils.skipFully(checksumIn, (numChunks - 1) * checksumSize);
            blockIn = new FileInputStream(blockFile);
            long lastChunkStartPos = (numChunks - 1) * bytesPerChecksum;
            IOUtils.skipFully(blockIn, lastChunkStartPos);
            int lastChunkSize = (int) Math.min(bytesPerChecksum, blockFileLen - lastChunkStartPos);
            byte[] buf = new byte[lastChunkSize + checksumSize];
            checksumIn.readFully(buf, lastChunkSize, checksumSize);
            IOUtils.readFully(blockIn, buf, 0, lastChunkSize);

            checksum.update(buf, 0, lastChunkSize);
            long validFileLength;
            if (checksum.compare(buf, lastChunkSize)) { // last chunk matches crc
                validFileLength = lastChunkStartPos + lastChunkSize;
            } else { // last chunk is corrupt
                validFileLength = lastChunkStartPos;
            }

            // truncate if extra bytes are present without CRC
            if (blockFile.length() > validFileLength) {
                RandomAccessFile blockRAF = new RandomAccessFile(blockFile, "rw");
                try {
                    // truncate blockFile
                    blockRAF.setLength(validFileLength);
                } finally {
                    blockRAF.close();
                }
            }

            return validFileLength;
        } catch (IOException e) {
            FsDatasetImpl.LOG.warn(e);
            return 0;
        } finally {
            IOUtils.closeStream(checksumIn);
            IOUtils.closeStream(blockIn);
        }
    }
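
    // Worked example of the arithmetic above (illustrative numbers): with
    // CRC32 checksums (checksumSize = 4) and bytesPerChecksum = 512, a block
    // file of 1300 bytes has numChunks = ceil(1300 / 512) = 3, so
    // lastChunkStartPos = 2 * 512 = 1024 and lastChunkSize = 1300 - 1024 = 276.
    // If the stored CRC matches the recomputed one, validFileLength = 1300;
    // otherwise the last partial chunk is dropped and the block file is
    // truncated to 1024 bytes.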

    @Override
    public String toString() {
        return currentDir.getAbsolutePath();
    }

    void shutdown(BlockReport blocksListToPersist) {
        saveReplicas(blocksListToPersist);
        saveDfsUsed();
        dfsUsedSaved = true;
        dfsUsage.shutdown();
    }

    private boolean readReplicasFromCache(ReplicaMap volumeMap) {
        ReplicaMap tmpReplicaMap = new ReplicaMap(this);
        File replicaFile = new File(currentDir, REPLICA_CACHE_FILE);
        // Check whether the file exists or not.
        if (!replicaFile.exists()) {
            LOG.info("Replica Cache file: " + replicaFile.getPath() + " doesn't exist ");
            return false;
        }
        long fileLastModifiedTime = replicaFile.lastModified();
        if (System.currentTimeMillis() > fileLastModifiedTime + replicaCacheExpiry) {
            LOG.info("Replica Cache file: " + replicaFile.getPath() + " has gone stale");
            // Just to make findbugs happy
            if (!replicaFile.delete()) {
                LOG.info("Replica Cache file: " + replicaFile.getPath() + " cannot be deleted");
            }
            return false;
        }
        FileInputStream inputStream = null;
        try {
            inputStream = new FileInputStream(replicaFile);
            BlockReport blocksList = BlockReport.readFrom(inputStream);
            Iterator<BlockListAsLongs.BlockReportReplica> iterator = blocksList.iterator();
            while (iterator.hasNext()) {
                BlockReportReplica replica = iterator.next();
                switch (replica.getState()) {
                case FINALIZED:
                    addReplicaToReplicasMap(replica, tmpReplicaMap, true);
                    break;
                case RUR:
                case RBW:
                case RWR:
                    addReplicaToReplicasMap(replica, tmpReplicaMap, false);
                    break;
                default:
                    break;
                }
            }
            inputStream.close();
            // It is now safe to add the replicas to volumeMap. If any
            // exception had been thrown while parsing the cache file, we
            // would instead have fallen back to scanning all files on disk.
            for (ReplicaInfo info : tmpReplicaMap.replicas(bpid)) {
                volumeMap.add(bpid, info);
            }
            LOG.info("Successfully read replica from cache file : " + replicaFile.getPath());
            return true;
        } catch (Exception e) {
            // On any exception, fall back to reading from disk:
            // log the error and return false.
            LOG.info("Exception occurred while reading the replicas cache file: " + replicaFile.getPath(), e);
            return false;
        } finally {
            if (!replicaFile.delete()) {
                LOG.info("Failed to delete replica cache file: " + replicaFile.getPath());
            }
            // close the inputStream
            IOUtils.closeStream(inputStream);
        }
    }
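
    // Note on the cache lifecycle: saveReplicas() below persists the block
    // list at shutdown, readReplicasFromCache() consumes it once at startup,
    // and the finally block above always deletes the file afterwards, so a
    // cache is never reused across two restarts. A cache older than
    // replicaCacheExpiry (5 minutes) is treated as stale and ignored.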

    private void saveReplicas(BlockReport blocksListToPersist) {
        if (blocksListToPersist == null || blocksListToPersist.getNumberOfBlocks() == 0) {
            return;
        }
        File tmpFile = new File(currentDir, REPLICA_CACHE_FILE + ".tmp");
        if (tmpFile.exists() && !tmpFile.delete()) {
            LOG.warn("Failed to delete tmp replicas file in " + tmpFile.getPath());
            return;
        }
        File replicaCacheFile = new File(currentDir, REPLICA_CACHE_FILE);
        if (replicaCacheFile.exists() && !replicaCacheFile.delete()) {
            LOG.warn("Failed to delete replicas file in " + replicaCacheFile.getPath());
            return;
        }

        FileOutputStream out = null;
        try {
            out = new FileOutputStream(tmpFile);
            blocksListToPersist.writeTo(out);
            out.close();
            // Rename the tmp file to the replicas cache file
            Files.move(tmpFile, replicaCacheFile);
        } catch (Exception e) {
            // If write failed, the volume might be bad. Since the cache file is
            // not critical, log the error, delete both the files (tmp and cache)
            // and continue.
            LOG.warn("Failed to write replicas to cache ", e);
            if (replicaCacheFile.exists() && !replicaCacheFile.delete()) {
                LOG.warn("Failed to delete replicas file: " + replicaCacheFile.getPath());
            }
        } finally {
            IOUtils.closeStream(out);
            if (tmpFile.exists() && !tmpFile.delete()) {
                LOG.warn("Failed to delete tmp file in " + tmpFile.getPath());
            }
        }
    }
}
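
Example

The dfsUsed cache-file pattern above (a value and its mtime on one line, with the mtime written last so a truncated write parses as invalid) is easy to try in isolation. The following is a minimal, self-contained sketch of the same idea; the class name and file handling here are illustrative and are not part of the Hadoop source.

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Scanner;

public class DuCacheDemo {
    // 600-second freshness window, matching the DU update interval above.
    private static final long MAX_AGE_MS = 600_000L;

    /** Write "<used> <mtime>"; mtime goes last so a truncated write is invalid. */
    static void save(File cache, long used) throws Exception {
        try (Writer out = new OutputStreamWriter(new FileOutputStream(cache), "UTF-8")) {
            out.write(used + " " + System.currentTimeMillis());
        }
    }

    /** Return the cached value, or -1 if missing, malformed, or stale. */
    static long load(File cache) {
        try (Scanner sc = new Scanner(cache, "UTF-8")) {
            if (!sc.hasNextLong()) return -1;
            long used = sc.nextLong();
            if (!sc.hasNextLong()) return -1; // mtime missing: truncated write
            long mtime = sc.nextLong();
            if (mtime > 0 && System.currentTimeMillis() - mtime < MAX_AGE_MS) {
                return used; // cache is fresh enough to trust
            }
            return -1; // stale: caller should rescan
        } catch (FileNotFoundException e) {
            return -1;
        }
    }

    public static void main(String[] args) throws Exception {
        File cache = File.createTempFile("dfsUsed", null);
        save(cache, 1_073_741_824L);
        System.out.println(load(cache)); // prints 1073741824 (fresh cache)
    }
}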