org.apache.hadoop.hdfs.server.hightidenode.FileFixer.java Source code

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.hightidenode.FileFixer.java. The class periodically fetches the list of corrupt files from the namenode(s) and repairs missing blocks by copying data from an equivalent file, as defined by a HighTide policy.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hdfs.server.hightidenode;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.Collection;
import java.util.regex.Pattern;
import java.util.Random;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.channels.SocketChannel;

import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DFSInputStream;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockSender;
import org.apache.hadoop.hdfs.server.datanode.FSDataset;
import org.apache.hadoop.io.Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockMissingException;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.ipc.RPC;

import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.ProtocolCompatible;
import org.apache.hadoop.hdfs.protocol.HighTideProtocol;
import org.apache.hadoop.hdfs.protocol.PolicyInfo;
import org.apache.hadoop.hdfs.protocol.PolicyInfo.PathInfo;
import org.apache.hadoop.hdfs.server.hightidenode.metrics.HighTideNodeMetrics;

/**
 * This class fixes files by copying data from one of the files in the
 * equivalent set.
 * It periodically fetches the list of corrupt files from the namenode
 * and fixes missing blocks.
 */
public class FileFixer implements Runnable {
    public static final Log LOG = LogFactory.getLog("org.apache.hadoop.hdfs.hightide.FileFixer");
    private final Configuration conf;

    private volatile boolean running = true;
    private int blockFixInterval = 60 * 1000; // 1min
    private int numThreads = 100;

    // ThreadPool keep-alive time for threads over core pool size
    private static final long THREADS_KEEP_ALIVE_SECONDS = 60;

    // a queue to store corrupted files
    static class PathToPolicy {
        String spath;
        PolicyInfo pinfo;

        PathToPolicy(Path p, PolicyInfo info) {
            this.spath = p.toString();
            this.pinfo = info;
        }
    }

    private Collection<PolicyInfo> all; // list of all policies
    List<PathToPolicy> pathToPolicy; // find policy based on longest path match

    private PendingReplication filesBeingFixed; // files that are being fixed

    ThreadPoolExecutor executor; // threads to fix blocks

    FileFixer(Configuration conf) throws IOException {
        this.conf = conf;
        blockFixInterval = conf.getInt("hightide.blockfix.interval", blockFixInterval);
        numThreads = conf.getInt("hightide.blockfix.numthreads", numThreads);

        pathToPolicy = new LinkedList<PathToPolicy>();
        executor = new ThreadPoolExecutor(numThreads, numThreads, THREADS_KEEP_ALIVE_SECONDS, TimeUnit.SECONDS,
                new LinkedBlockingQueue<Runnable>());

        // start a thread to purge entries from this set automatically
        filesBeingFixed = new PendingReplication(conf.getInt("dfs.hightide.pending.timeout.sec", -1) * 1000L);
    }
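
    // Configuration knobs read above (defaults in parentheses):
    //   hightide.blockfix.interval        polling interval in ms (60000)
    //   hightide.blockfix.numthreads      size of the block-fix thread pool (100)
    //   dfs.hightide.pending.timeout.sec  lifetime of entries in filesBeingFixed,
    //                                     in seconds (-1)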

    /**
     * The list of all configured policies.
     */
    void setPolicyInfo(Collection<PolicyInfo> all) throws IOException {
        this.all = all;
        this.pathToPolicy.clear();

        // keep a reverse map from all top-level paths to policies
        for (PolicyInfo pinfo : all) {
            pathToPolicy.add(new PathToPolicy(pinfo.getSrcPath(), pinfo));
            for (PathInfo d : pinfo.getDestPaths()) {
                pathToPolicy.add(new PathToPolicy(d.rpath, pinfo));
            }
        }

        // Keep all paths sorted in reverse lexicographical order so that
        // the longest matching path comes first.
        Comparator<PathToPolicy> comp = new Comparator<PathToPolicy>() {
            public int compare(PathToPolicy p1, PathToPolicy p2) {
                return p2.spath.compareTo(p1.spath); // descending order
            }
        };
        Collections.sort(pathToPolicy, comp);
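        // Example: with policies rooted at /a and /a/b (hypothetical paths),
        // this sort places /a/b ahead of /a, so the prefix scan in fixFile()
        // matches the more specific policy first.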
    }

    /**
     * A singleton thread that finds corrupted files and then schedules
     * blocks to be copied. This thread talks only to NameNodes and does
     * not talk to any datanodes.
     */
    public void run() {
        while (running) {
            try {
                LOG.info("FileFixer continuing to run...");
                doFindFiles();
            } catch (Exception e) {
                LOG.error(StringUtils.stringifyException(e));
            } catch (Error err) {
                LOG.error("Exiting after encountering " + StringUtils.stringifyException(err));
                shutdown();
                throw err;
            }
            try {
                // Sleep before proceeding to fix more files.
                Thread.sleep(blockFixInterval);
            } catch (InterruptedException ie) {
                LOG.error("Encountering InturruptedException " + StringUtils.stringifyException(ie));
            }
        }
    }

    /*
     * Release all resources, shutdown any threads
     */
    void shutdown() {
        running = false;
        filesBeingFixed.stop();
    }

    /*
     * Returns the FileSystem for the given path. If the filesystem is
     * unreachable, log a warning and return null.
     */
    static FileSystem getFs(Configuration conf, Path p) {
        try {
            return p.getFileSystem(conf);
        } catch (Exception e) {
            // if a single namenode is down, log it and ignore. Continue to
            // fix other namenodes.
            LOG.warn("getFs: Unable to contact filesystem: " + p + " ignoring.... " + e);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Poll namenode(s) to find corrupted files. Enqueue blocks for replication 
     * if needed.
     */
    private void doFindFiles() throws IOException {
        Set<FileSystem> allFs = new HashSet<FileSystem>();
        Set<Path> filesToFix = new HashSet<Path>(); // files that are yet to be fixed

        // collect all unique filesystems in all policies.
        for (PolicyInfo pinfo : all) {
            FileSystem fs = getFs(pinfo.getConf(), pinfo.getSrcPath());
            if (fs != null) {
                allFs.add(fs);
            }
            for (PathInfo d : pinfo.getDestPaths()) {
                fs = getFs(pinfo.getConf(), d.rpath);
                if (fs != null) {
                    allFs.add(fs);
                }
            }
        }

        // make a RPC to all relevant namenodes to find corrupt files
        for (FileSystem fs : allFs) {
            if (!running)
                break;
            List<Path> corruptFiles = null;

            corruptFiles = getCorruptFilesFromNamenode(fs);

            // if we are not already fixing this one, then put it in the list
            // of files that need fixing.
            for (Path p : corruptFiles) {
                if (filesBeingFixed.add(p)) {
                    filesToFix.add(p);
                }
            }
        }

        if (!filesToFix.isEmpty()) {
            LOG.info("Found " + filesToFix.size() + " corrupt files.");
        }

        for (Path path : filesToFix) {
            if (!running)
                break;
            try {
                fixFile(path);
            } catch (IOException ie) {
                LOG.error("Error while processing " + path + ": " + StringUtils.stringifyException(ie));
                // For certain kinds of errors it might be good to remove
                // this file from filesBeingFixed, so that the fix is
                // attempted again in the very next iteration. For example,
                // a network exception can be retried immediately, whereas
                // a file length mismatch will never succeed on retry, so
                // it is better to retry it less frequently.
            }
        }
    }

    /**
     * Fix a specific file
     */
    private void fixFile(Path badFile) throws IOException {

        PolicyInfo pinfo = null;
        String filename = badFile.toString();

        LOG.info("File  = file to fix:" + badFile);

        // Find the policy that maps this file
        for (PathToPolicy pp : pathToPolicy) {
            if (filename.startsWith(pp.spath)) {
                pinfo = pp.pinfo;
                break;
            }
        }
        if (pinfo == null) {
            throw new IOException("Unable to find matching policy for " + badFile);
        }

        // process the file and fix it. 
        Path src;
        HighTideNode.getMetrics().fixAttempt.inc();

        // Extract the path suffix with substring rather than String.split:
        // split treats its argument as a regex and breaks on paths that
        // contain regex metacharacters.
        if (filename.startsWith(pinfo.getSrcPath().toString())) {
            // srcPath is corrupted, pick the first destPath as the source of truth.
            String suffix = filename.substring(pinfo.getSrcPath().toString().length());
            src = new Path(pinfo.getDestPaths().get(0).rpath.toString() + suffix);
        } else {
            // dest file is corrupted, copy from source to destination
            String suffix = filename.substring(pinfo.getDestPaths().get(0).rpath.toString().length());
            src = new Path(pinfo.getSrcPath().toString() + suffix);
        }
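        // 'src' is now badFile rebased onto the other policy root. For example
        // (hypothetical paths): with srcPath=/warehouse/tbl and destPath=/backup/tbl,
        // a corrupt /warehouse/tbl/part-0 maps to src=/backup/tbl/part-0.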
        DistributedFileSystem srcFs = (DistributedFileSystem) src.getFileSystem(pinfo.getConf());
        DistributedFileSystem destFs = (DistributedFileSystem) badFile.getFileSystem(pinfo.getConf());

        FileStatus sstat = srcFs.getFileStatus(src);
        FileStatus dstat = destFs.getFileStatus(badFile);

        // check that the modification times of the two files are the same
        if (sstat.getModificationTime() != dstat.getModificationTime()) {
            String msg = "Unable to fix file " + badFile + " because src " + src + " has modification time as "
                    + HighTideNode.dateForm.format(new Date(sstat.getModificationTime())) + " but destination "
                    + badFile + " has modification time as "
                    + HighTideNode.dateForm.format(new Date(dstat.getModificationTime()));
            LOG.error(msg);
            HighTideNode.getMetrics().fixFailedModTimeMismatch.inc();
            throw new IOException(msg);
        }

        // check that the block sizes of the two files are the same
        if (sstat.getBlockSize() != dstat.getBlockSize()) {
            String msg = "Unable to fix file " + badFile + " because src " + src + " has blocksize as "
                    + sstat.getBlockSize() + " but destination " + badFile + " has blocksize as "
                    + dstat.getBlockSize();
            LOG.error(msg);
            HighTideNode.getMetrics().fixFailedBlockSizeMismatch.inc();
            throw new IOException(msg);
        }

        // check that the lengths of the two files are the same
        if (sstat.getLen() != dstat.getLen()) {
            String msg = "Unable to fix file " + badFile + " because src " + src + " has size as " + sstat.getLen()
                    + " but destination " + badFile + " has size as " + dstat.getLen();
            LOG.error(msg);
            HighTideNode.getMetrics().fixFailedFileLengthMismatch.inc();
            throw new IOException(msg);
        }

        List<LocatedBlock> badBlocks = corruptBlocksInFile(destFs, badFile.toUri().getPath(), dstat);
        List<LocatedBlock> goodBlocks = srcFs.getClient().namenode
                .getBlockLocations(src.toUri().getPath(), 0L, sstat.getLen()).getLocatedBlocks();

        // for each of the bad blocks, find the good block
        for (LocatedBlock badBlock : badBlocks) {
            LocatedBlock found = null;
            for (LocatedBlock goodBlock : goodBlocks) {
                if (badBlock.getStartOffset() == goodBlock.getStartOffset()) {
                    found = goodBlock;
                    break;
                }
            }
            if (found == null || found.getLocations().length == 0) {
                String msg = "Could not find a good block location for badBlock " + badBlock + " in file "
                        + badFile;
                LOG.error(msg);
                HighTideNode.getMetrics().fixFailedNoGoodBlock.inc();
                throw new IOException(msg);
            }

            // execute asynchronously
            WorkItem bp = new WorkItem(badFile, found, badBlock, destFs, conf);
            LOG.info("Queueing up block " + badBlock.getBlock().getBlockName() + " to be fixed from block "
                    + found.getBlock().getBlockName());
            executor.execute(bp);
        }
    }

    /**
     * @return a list of corrupt files as obtained from the namenode.
     *         If the namenode is unreachable, an empty list is returned.
     */
    List<Path> getCorruptFilesFromNamenode(FileSystem fs) throws IOException {
        if (!(fs instanceof DistributedFileSystem)) {
            throw new IOException("Only DistributedFileSystem can be handled " + " by HighTide.");
        }

        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        List<Path> corruptFiles = new LinkedList<Path>();

        try {
            LOG.info("Checking filesystem: " + dfs.getUri());
            String[] files = DFSUtil.getCorruptFiles(dfs);
            for (String f : files) {
                Path p = new Path(f).makeQualified(fs);
                corruptFiles.add(p);
            }
            return corruptFiles;
        } catch (Exception e) {
            // if a single namenode is down, log it and ignore. Continue to
            // fix other namenodes.
            LOG.warn("getCorruptFilesFromNamenode: Unable to contact filesystem: " + fs.getUri() + " ignoring..."
                    + e);
            e.printStackTrace();
            return corruptFiles;
        }
    }

    /**
     * Returns the corrupt blocks in a file.
     **/
    List<LocatedBlock> corruptBlocksInFile(DistributedFileSystem fs, String uriPath, FileStatus stat)
            throws IOException {
        List<LocatedBlock> corrupt = new LinkedList<LocatedBlock>();
        LocatedBlocks locatedBlocks = fs.getClient().namenode.getBlockLocations(uriPath, 0, stat.getLen());
        for (LocatedBlock b : locatedBlocks.getLocatedBlocks()) {
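            // A block needs fixing if the namenode flagged it corrupt, or if
            // it holds data but no live replica location is known.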
            if (b.isCorrupt() || (b.getLocations().length == 0 && b.getBlockSize() > 0)) {
                LOG.info("Adding bad block for file " + uriPath);
                corrupt.add(b);
            }
        }
        return corrupt;
    }

    /**
     * Set up a session with the specified datanode
     */
    static ClientDatanodeProtocol createClientDatanodeProtocolProxy(DatanodeInfo datanodeid, Configuration conf)
            throws IOException {
        InetSocketAddress addr = NetUtils.createSocketAddr(datanodeid.getHost() + ":" + datanodeid.getIpcPort());
        if (ClientDatanodeProtocol.LOG.isDebugEnabled()) {
            ClientDatanodeProtocol.LOG.debug("ClientDatanodeProtocol addr=" + addr);
        }
        try {
            return (ClientDatanodeProtocol) RPC.getProxy(ClientDatanodeProtocol.class,
                    ClientDatanodeProtocol.versionID, addr, conf);
        } catch (RPC.VersionMismatch e) {
            long clientVersion = e.getClientVersion();
            long datanodeVersion = e.getServerVersion();
            if (clientVersion > datanodeVersion
                    && !ProtocolCompatible.isCompatibleClientDatanodeProtocol(clientVersion, datanodeVersion)) {
                throw new RPC.VersionIncompatible(ClientDatanodeProtocol.class.getName(), clientVersion,
                        datanodeVersion);
            }
            return (ClientDatanodeProtocol) e.getProxy();
        }
    }

    // A work item that copies a good replica of a block over the
    // corresponding missing block; executed asynchronously by the thread pool.
    static class WorkItem implements Runnable {
        Path badfile; // file to be fixed
        LocatedBlock goodBlock; // existing replica of missing block
        LocatedBlock badBlock; // missing block
        DistributedFileSystem destFs; // filesystem of destination
        Configuration conf;
        private static Random rand = new Random();

        WorkItem(Path file, LocatedBlock g, LocatedBlock b, FileSystem fs, Configuration conf) {
            this.goodBlock = g;
            this.badBlock = b;
            this.badfile = file;
            this.destFs = (DistributedFileSystem) fs;
            this.conf = conf;
        }

        @Override
        public void run() {

            String msg = "";
            try {
                // find a random datanode from the destination cluster
                DatanodeInfo[] targets = destFs.getClient().datanodeReport(DatanodeReportType.LIVE);
                DatanodeInfo target = targets[rand.nextInt(targets.length)];

                // find a source datanode from among the datanodes that host this block
                DatanodeInfo srcdn = goodBlock.getLocations()[rand.nextInt(goodBlock.getLocations().length)];

                // The RPC is asynchronous, i.e. the RPC will return immediately even before the
                // physical block copy occurs from the datanode.
                msg = "File " + badfile + ": Copying block " + goodBlock.getBlock().getBlockName() + " from "
                        + srcdn.getName() + " to block " + badBlock.getBlock().getBlockName() + " on "
                        + target.getName();
                LOG.info(msg);
                ClientDatanodeProtocol datanode = createClientDatanodeProtocolProxy(srcdn, conf);
                datanode.copyBlock(goodBlock.getBlock(), badBlock.getBlock(), target);
                RPC.stopProxy(datanode);
                HighTideNode.getMetrics().fixSuccessfullyStarted.inc();
            } catch (Throwable e) {
                HighTideNode.getMetrics().fixFailedDatanodeError.inc();
                LOG.error(msg + ". Failed to contact datanode. " + StringUtils.stringifyException(e));
            }
        }
    }
}
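
Usage

The sketch below shows one way a FileFixer could be driven. It is illustrative only: the constructor and setPolicyInfo are package-private, so the caller is assumed to live in org.apache.hadoop.hdfs.server.hightidenode (in the real deployment HighTideNode performs these steps), and the policies collection is hypothetical.

// A minimal sketch, assuming a package-local caller and an
// externally built Collection<PolicyInfo> named 'policies'.
Configuration conf = new Configuration();
conf.setInt("hightide.blockfix.interval", 30 * 1000); // poll every 30 seconds

FileFixer fixer = new FileFixer(conf);
fixer.setPolicyInfo(policies); // register the configured policies

Thread fixerThread = new Thread(fixer, "FileFixer");
fixerThread.setDaemon(true);
fixerThread.start();

// ... on shutdown:
fixer.shutdown();        // stop the polling loop and the pending-purge thread
fixerThread.interrupt(); // wake the thread if it is sleeping between polls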