org.apache.hadoop.mapred.DatanodeBenThread.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapred.DatanodeBenThread.java, a MapReduce-driven benchmark (test type "dnben") that stresses HDFS datanodes with concurrent read and write load.

Source

package org.apache.hadoop.mapred;
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.DistributedRaidFileSystem;
import org.apache.hadoop.hdfs.GeneralConstant;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.Random;
import java.util.Collections;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.commons.logging.Log;
import org.apache.hadoop.mapred.GenWriterThread.TokenBucket;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.server.namenode.NameNode;

/**
 * DatanodeBenThread is used to benchmark datanodes. Its test type is dnben.
 * There are two steps in dnben:
 * 1. Prepare step:
 *   Runs multiple mappers, each mapper running on one datanode.
 *   Each mapper writes N files that have only 1 replica.
 *   Because the first replica is written locally, we end up with
 *   N single-replica files for each datanode.
 * 2. Stress test step:
 *   It first randomly picks N (defined by -dn) victim datanodes
 *   from the cluster; these victim datanodes need to satisfy two conditions:
 *   a. alive in all namespaces
 *   b. has at least M (defined by -minfile) files under workdir
 *   Then each mapper spawns T threads, T/X threads per namespace,
 *   where X is the number of namespaces.
 *   Each thread has probability P (defined by pread) of becoming
 *   a read thread and probability 1-P of becoming a write thread.
 *   Read thread:
 *     Keeps picking a random file belonging to one victim node and
 *     reads one buffer of data from it. Because the victim node holds
 *     the only replica of the file, all reads go to victim nodes.
 *   Write thread:
 *     Passes the victim nodes as favored nodes to the DFSClient, so the
 *     namenode places the new files on victim nodes. Most of the writes
 *     therefore go to victim nodes.
 * @author weiyan
 */
@SuppressWarnings("deprecation")
public class DatanodeBenThread extends GenThread implements GeneralConstant {
    private static final Log LOG = LogFactory.getLog(DatanodeBenThread.class);
    public static final String TEST_TYPE = "dnben";
    public static final long DEFAULT_MAX_TIME_SEC = 60;
    public static final long DEFAULT_MIN_NUMBER_OF_FILES_PER_DATANODE = 60;
    public static final long DEFAULT_FILE_SIZE = 10; //10MB
    public static final long DEFAULT_DATANODE_NUMBER = 1;
    public static final float DEFAULT_READ_PERCENT = 1.0f;
    public static final long DEFAULT_MAX_NUMBER_OF_FILES_PER_THREAD = Long.MAX_VALUE;
    public static final short DEFAULT_REPLICATION_NUM = 3;

    public static final String MAX_TIME_SEC_KEY = "max.time.sec";
    public static final String FILE_SIZE_KEY = "dfs.MB.file.size";
    public static final String DATANODE_NUMBER_KEY = "dfs.stress.test.datanode.num";
    public static final String READ_PERCENT_KEY = "dfs.read.percent";
    public static final String MIN_FILE_PER_DATANODE_KEY = "dfs.min.file.per.datanode";
    public static final String RUNNING_TYPE_KEY = "dfs.running.type";
    public static final String MAX_NUMBER_OF_FILES_PER_THREAD_KEY = "dfs.max.nfile.per.thread";
    public static final String VICTIM_DATANODE_KEY = "dfs.victim.datanodes";

    public static enum RUNNING_TYPE {
        PREPARE, GENERAL, READ, WRITE
    }

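    /**
     * Per-job runtime settings shared by all threads: time limit, data rate,
     * file size and file-count limits, the datanode this mapper runs on
     * (prepare step), and the victim datanodes with their per-namespace
     * file lists (stress test step).
     */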
    public class DatanodeBenRunTimeConstants extends RunTimeConstants {
        long max_time = DEFAULT_MAX_TIME_SEC * 1000;
        long data_rate = DEFAULT_DATA_RATE * 1024;
        long min_file = DEFAULT_MIN_NUMBER_OF_FILES_PER_DATANODE;
        long file_size = DEFAULT_FILE_SIZE * 1024 * 1024;
        long max_files = DEFAULT_MAX_NUMBER_OF_FILES_PER_THREAD;
        String task_name = null;
        //Used by the prepare step
        String cur_datanode = null;
        //Used by the stress test (general) step
        InetSocketAddress[] victims = null;
        Set<String> victimSet = null;
        // namespace default uri -> {datanode host name -> list of files} 
        HashMap<String, HashMap<String, ArrayList<Path>>> pickLists = null;
    }

    private DatanodeBenRunTimeConstants rtc = null;
    private int id;
    private String file_prefix = "";
    private RUNNING_TYPE running_type = RUNNING_TYPE.GENERAL;
    private Random rb = new Random();
    private HashMap<String, ArrayList<Path>> nsPickLists = null;
    private DistributedFileSystem dfs = null;
    public TokenBucket tb = null;
    public short replication = 3;
    public float pread = DEFAULT_READ_PERCENT;
    public long max_size = DEFAULT_FILE_SIZE * 1024 * 1024;
    public String thread_name = null;
    // Counters
    private long read_size = 0;
    private long write_size = 0;
    private float average_read_rate = 0;
    private float average_write_rate = 0;
    private long total_num = 0;

    public DatanodeBenThread() {

    }

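    /**
     * Creates one benchmark thread. Prepare threads always write; otherwise
     * the thread becomes a read thread with probability pread or a write
     * thread (with its own output subdirectory) with probability 1 - pread.
     */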
    public DatanodeBenThread(Configuration conf, Path input, Path output, int id, RUNNING_TYPE init_type,
            DatanodeBenRunTimeConstants rtc) throws IOException {
        super(conf, input, output, rtc);
        this.rtc = rtc;
        this.replication = (short) conf.getInt(REPLICATION_KEY, DEFAULT_REPLICATION_NUM);
        this.max_size = conf.getLong(FILE_SIZE_KEY, DEFAULT_FILE_SIZE) * 1024 * 1024;
        this.pread = conf.getFloat(READ_PERCENT_KEY, DEFAULT_READ_PERCENT);
        this.tb = new TokenBucket(rtc.data_rate);
        this.id = id;
        this.thread_name = rtc.task_name + "_" + id;
        this.running_type = init_type;
        if (running_type.equals(RUNNING_TYPE.PREPARE)) {
            this.file_prefix = rtc.cur_datanode + thread_name + "_part";
        } else {
            this.file_prefix = thread_name + "_part";
            this.nsPickLists = rtc.pickLists.get(conf.get(FileSystem.FS_DEFAULT_NAME_KEY));
            this.dfs = (DistributedFileSystem) fs;
            float f = rb.nextFloat();
            if (f < pread + 1e-9) {
                this.running_type = RUNNING_TYPE.READ;
            } else {
                this.outputPath = new Path(outputPath, thread_name);
                this.running_type = RUNNING_TYPE.WRITE;
            }
        }
        fs.mkdirs(this.outputPath);
    }

    public DatanodeBenThread(Configuration conf, Path output, int id, RUNNING_TYPE running_type,
            DatanodeBenRunTimeConstants rtc) throws IOException {
        this(conf, null, output, id, running_type, rtc);
    }

    public DatanodeBenThread(JobConf conf) {
        super(conf);
    }

    /*
     * Look at the output directory to see how many files belong to 
     * the current datanode
     */
    public int getNumberOfFiles() throws IOException {
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        RemoteIterator<LocatedFileStatus> iter = dfs.listLocatedStatus(outputPath);
        int fn = 0;
        while (iter.hasNext()) {
            LocatedFileStatus lfs = iter.next();
            if (lfs.isDir())
                continue;
            if (lfs.getBlockLocations().length != 1)
                continue;
            String curHost = rtc.cur_datanode;
            for (String host : lfs.getBlockLocations()[0].getHosts()) {
                if (curHost.equals(host)) {
                    fn++;
                    break;
                }
            }
        }
        LOG.info(" Found " + fn + " files in " + dfs.getUri());
        return fn;
    }

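    /**
     * Creates files until the time limit or the per-thread file limit is hit.
     * Each file is filled with random data, throttled by the token bucket,
     * and rolled once it reaches the size limit or the time runs out. In the
     * prepare step the loop stops as soon as this datanode owns enough files;
     * in the stress test the victim datanodes are passed to create() as
     * favored nodes.
     */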
    public void write() throws Exception {
        long endTime = System.currentTimeMillis() + rtc.max_time;
        long currentId = 0;
        FSDataOutputStream out = null;
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        while (System.currentTimeMillis() < endTime && currentId < rtc.max_files) {
            if (running_type == RUNNING_TYPE.PREPARE) {
                //The number of files reached the minimum limit, stop writing
                if (getNumberOfFiles() > rtc.min_file)
                    break;
            }
            Path fileName = new Path(outputPath, file_prefix + currentId);
            try {
                out = dfs.create(fileName, FsPermission.getDefault(), false,
                        dfs.getConf().getInt("io.file.buffer.size", 4096), (short) replication,
                        dfs.getDefaultBlockSize(), dfs.getConf().getInt("io.bytes.per.checksum", 512), null,
                        rtc.victims);
                long size = 0;
                while (true) {
                    rb.nextBytes(buffer);
                    tb.getTokens(rtc.buffer_size);
                    out.write(buffer, 0, rtc.buffer_size);
                    size += rtc.buffer_size;
                    if (System.currentTimeMillis() > endTime || size + rtc.buffer_size > max_size) {
                        // Roll the file
                        out.close();
                        out = null;
                        currentId++;
                        files_processed++;
                        processed_size += size;
                        write_size += size;
                        Path fullName = fs.makeQualified(fileName);
                        BlockLocation bl = dfs.getClient().getBlockLocations(fullName.toUri().getPath(), 0L, 1L)[0];
                        String hosts = "";
                        for (String host : bl.getHosts()) {
                            hosts += host + " ";
                        }
                        LOG.info("[close (" + size + "B)] " + hosts + " file " + fullName);
                        break;
                    }
                }
            } catch (Exception e) {
                LOG.error("Error in writing file:" + fileName, e);
                this.errors.add(e);
            } finally {
                IOUtils.closeStream(out);
            }
        }
    }

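    /**
     * Until the time limit is reached, repeatedly picks a random victim
     * datanode and one of its single-replica files, seeks to a random offset
     * and reads one buffer of data from it.
     */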
    public void read() throws Exception {
        long endTime = System.currentTimeMillis() + rtc.max_time;
        while (System.currentTimeMillis() < endTime) {
            // Randomly pick a datanode from victims
            int idx = rb.nextInt(rtc.victims.length);
            // Randomly pick a file to read
            ArrayList<Path> fileList = nsPickLists.get(rtc.victims[idx].getHostName());
            int fid = rb.nextInt(fileList.size());
            Path readFile = fileList.get(fid);
            FSDataInputStream in = null;
            try {
                in = fs.open(readFile);
                if (in.isUnderConstruction()) {
                    LOG.info("file " + readFile + " is still open");
                }
                FileStatus fileStatus = fs.getFileStatus(readFile);
                long offset = rb.nextInt((int) Math.max(fileStatus.getLen() - rtc.buffer_size, 0) + 1);
                int size = 0;
                in.seek(offset);
                size = in.read(buffer, 0, rtc.buffer_size);
                if (size < 0) {
                    continue;
                }
                processed_size += size;
                read_size += size;
                LOG.info("Read file " + readFile + " from " + offset + " to " + (offset + size));
            } catch (Exception e) {
                LOG.error("Error in read file: " + readFile, e);
                this.errors.add(e);
            } finally {
                IOUtils.closeStream(in);
            }
            files_processed++;
            Thread.sleep(5);
        }
    }

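    /** Runs the thread: prepare and write threads call write(), read threads call read(). */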
    public void run() {
        try {
            switch (running_type) {
            case PREPARE:
                write();
                break;
            case READ:
                LOG.info("Read Thread: " + thread_name);
                read();
                break;
            case WRITE:
                LOG.info("Write Thread:" + thread_name);
                write();
                break;
            }
            LOG.info("Thread " + thread_name + " is done.");
        } catch (Exception ioe) {
            LOG.error("Error: ", ioe);
            this.errors.add(ioe);
        }
    }

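    /**
     * Scans workdir in one namespace and maps each datanode hostname to the
     * single-replica files stored on it. When no victim set is given,
     * datanodes with fewer than minFile files are removed from the map.
     */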
    private static HashMap<String, ArrayList<Path>> getNSPickLists(DistributedFileSystem dfs, String workdir,
            long minFile, Set<String> victims) throws IOException {
        HashMap<String, ArrayList<Path>> nsPickLists = new HashMap<String, ArrayList<Path>>();
        RemoteIterator<LocatedFileStatus> iter = dfs.listLocatedStatus(new Path(workdir));
        while (iter.hasNext()) {
            LocatedFileStatus lfs = iter.next();
            if (lfs.isDir())
                continue;
            if (lfs.getBlockLocations().length != 1)
                continue;
            for (String hostname : lfs.getBlockLocations()[0].getHosts()) {
                // Skip datanodes that are not in the victim set
                if (victims != null && !victims.contains(hostname))
                    continue;
                ArrayList<Path> value = null;
                if (!nsPickLists.containsKey(hostname)) {
                    value = new ArrayList<Path>();
                    nsPickLists.put(hostname, value);
                } else {
                    value = nsPickLists.get(hostname);
                }
                value.add(lfs.getPath());
            }
        }
        if (victims == null) {
            String[] hostnames = nsPickLists.keySet().toArray(new String[0]);
            //Remove the datanodes that do not have enough files
            for (String hostname : hostnames) {
                if (nsPickLists.get(hostname).size() < minFile) {
                    nsPickLists.remove(hostname);
                }
            }
        }
        return nsPickLists;
    }

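    /** Unwraps DistributedRaidFileSystem, if present, to reach the underlying DistributedFileSystem. */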
    private static DistributedFileSystem getDFS(FileSystem fs) throws IOException {
        if (fs instanceof DistributedRaidFileSystem)
            fs = ((DistributedRaidFileSystem) fs).getFileSystem();
        return (DistributedFileSystem) fs;
    }

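    /**
     * Builds the pick lists for all namespaces:
     * namespace default URI -> {datanode hostname -> list of single-replica files}.
     */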
    private static HashMap<String, HashMap<String, ArrayList<Path>>> getPickLists(List<JobConf> nameNodeConfs,
            String workdir, long minFile, Set<String> victims) throws IOException {
        HashMap<String, HashMap<String, ArrayList<Path>>> allList = new HashMap<String, HashMap<String, ArrayList<Path>>>();
        for (JobConf nameNodeConf : nameNodeConfs) {
            FileSystem fs = FileSystem.get(nameNodeConf);
            DistributedFileSystem dfs = getDFS(fs);
            HashMap<String, ArrayList<Path>> nsPickLists = getNSPickLists(dfs, workdir, minFile, victims);
            allList.put(nameNodeConf.get(FileSystem.FS_DEFAULT_NAME_KEY), nsPickLists);
        }
        return allList;
    }

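    /**
     * Returns the live, non-decommissioning datanodes of one namespace that
     * also appear in that namespace's pick list.
     */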
    private static Set<DatanodeInfo> getValidDatanodes(JobConf nameNodeConf, DatanodeBenRunTimeConstants rtc)
            throws IOException {
        FileSystem fs = FileSystem.get(nameNodeConf);
        DistributedFileSystem dfs = getDFS(fs);
        HashMap<String, ArrayList<Path>> nsPickLists = rtc.pickLists
                .get(nameNodeConf.get(FileSystem.FS_DEFAULT_NAME_KEY));
        DatanodeInfo[] dnStats = dfs.getLiveDataNodeStats();
        Set<DatanodeInfo> validDatanodes = new HashSet<DatanodeInfo>();
        for (DatanodeInfo dn : dnStats) {
            if (dn.isDecommissioned() || dn.isDecommissionInProgress()) {
                continue;
            }
            if (!nsPickLists.containsKey(dn.getHostName())) {
                continue;
            }
            validDatanodes.add(dn);
        }
        return validDatanodes;
    }

    /**
     * Randomly pick nDatanode datanodes to stress test.
     * A valid datanode must satisfy two conditions:
     * 1. alive in all namespaces
     * 2. has at least min_file files under the working directory.
     */
    public List<DatanodeInfo> getTestDatanodes(List<JobConf> nameNodeConfs, String workdir, long nDatanode,
            long minFile) throws IOException {
        this.rtc = new DatanodeBenRunTimeConstants();
        rtc.pickLists = getPickLists(nameNodeConfs, workdir, minFile, null);
        Set<DatanodeInfo> validDatanodes = getValidDatanodes(nameNodeConfs.get(0), rtc);
        for (int i = 1; i < nameNodeConfs.size(); i++) {
            Set<DatanodeInfo> tmpDatanodes = getValidDatanodes(nameNodeConfs.get(i), rtc);
            Set<DatanodeInfo> mergeDatanodes = new HashSet<DatanodeInfo>();
            for (DatanodeInfo dn : tmpDatanodes)
                if (validDatanodes.contains(dn)) {
                    mergeDatanodes.add(dn);
                }
            validDatanodes = mergeDatanodes;
        }
        LOG.info("There are " + validDatanodes.size() + " valid datanodes.");
        String logInfo = "";
        List<DatanodeInfo> dnList = new ArrayList<DatanodeInfo>();
        for (DatanodeInfo dn : validDatanodes) {
            logInfo += ' ' + dn.getHostName();
            dnList.add(dn);
        }
        LOG.info(logInfo);
        if (dnList.size() < nDatanode)
            return dnList;
        Collections.shuffle(dnList);
        return dnList.subList(0, (int) nDatanode);
    }

    /**
     * Write a small file to figure out which datanode we are running on
     */
    private String getRunningDatanode(Configuration conf) throws IOException {
        FileSystem fs = FileSystem.newInstance(conf);
        fs.mkdirs(new Path("/tmp"));
        Path fileName = new Path("/tmp", rtc.task_name + System.currentTimeMillis() + rb.nextInt());
        if (fs.exists(fileName)) {
            fs.delete(fileName);
        }
        FSDataOutputStream out = null;
        byte[] buffer = new byte[1];
        buffer[0] = '0';
        try {
            out = fs.create(fileName, (short) 1);
            out.write(buffer, 0, 1);
        } finally {
            IOUtils.closeStream(out);
        }
        fs = getDFS(fs);
        assert fs instanceof DistributedFileSystem;
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        BlockLocation[] lbs = dfs.getClient().getBlockLocations(fileName.toUri().getPath(), 0, 1);
        fs.delete(fileName);
        return lbs[0].getHosts()[0];
    }

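    /**
     * Builds one JobConf per namenode RPC address so that each namespace can
     * be reached through its own default URI.
     */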
    public static List<JobConf> getNameNodeConfs(JobConf conf) throws IOException {
        List<InetSocketAddress> nameNodeAddrs = DFSUtil.getClientRpcAddresses(conf, null);
        List<JobConf> nameNodeConfs = new ArrayList<JobConf>(nameNodeAddrs.size());
        for (InetSocketAddress nnAddr : nameNodeAddrs) {
            JobConf newConf = new JobConf(conf);
            newConf.set(NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY, nnAddr.getHostName() + ":" + nnAddr.getPort());
            NameNode.setupDefaultURI(newConf);
            nameNodeConfs.add(newConf);
        }
        return nameNodeConfs;
    }

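    /**
     * Builds the worker threads for one mapper. In the prepare step it
     * detects the local datanode, splits the required file count evenly over
     * threads and namespaces, and creates prepare threads; in the stress test
     * it parses the victim datanode list, loads the pick lists and assigns
     * threads to namespaces round-robin.
     */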
    @Override
    public GenThread[] prepare(JobConf conf, Text key, Text value) throws IOException {
        this.rtc = new DatanodeBenRunTimeConstants();
        super.prepare(conf, key, value, rtc);
        rtc.task_name = key.toString() + rtc.taskID;
        rtc.min_file = conf.getLong(MIN_FILE_PER_DATANODE_KEY, DEFAULT_MIN_NUMBER_OF_FILES_PER_DATANODE);
        rtc.max_time = conf.getLong(MAX_TIME_SEC_KEY, DEFAULT_MAX_TIME_SEC) * 1000;
        rtc.data_rate = conf.getLong(WRITER_DATARATE_KEY, DEFAULT_DATA_RATE) * 1024;
        rtc.file_size = conf.getLong(FILE_SIZE_KEY, DEFAULT_FILE_SIZE) * 1024 * 1024;
        LOG.info("data rate: " + rtc.data_rate);
        String working_dir = value.toString();
        int run_type = conf.getInt(RUNNING_TYPE_KEY, RUNNING_TYPE.GENERAL.ordinal());
        List<JobConf> nameNodeConfs = getNameNodeConfs(conf);
        if (run_type == RUNNING_TYPE.PREPARE.ordinal()) {
            rtc.cur_datanode = getRunningDatanode(conf);
            LOG.info("Current datanode is " + rtc.cur_datanode);
            // Make sure each namespace has the same number of threads
            long nthread_per_namespace = ((rtc.nthreads - 1) / nameNodeConfs.size() + 1);
            rtc.max_files = (rtc.min_file - 1) / nthread_per_namespace + 1;
            rtc.nthreads = nthread_per_namespace * nameNodeConfs.size();
            LOG.info("Number of threads: " + rtc.nthreads + " max_file:" + rtc.max_files);
            DatanodeBenThread[] threads = new DatanodeBenThread[(int) rtc.nthreads];
            int nsIdx = 0;
            for (int i = 0; i < rtc.nthreads; i++) {
                threads[i] = new DatanodeBenThread(nameNodeConfs.get(nsIdx), new Path(working_dir), i,
                        RUNNING_TYPE.PREPARE, rtc);
                nsIdx++;
                if (nsIdx == nameNodeConfs.size()) {
                    nsIdx = 0;
                }
            }
            return threads;
        } else {
            String[] victimStrs = conf.getStrings(VICTIM_DATANODE_KEY);
            LOG.info("Victim datanodes are :" + conf.get(VICTIM_DATANODE_KEY));
            rtc.victims = new InetSocketAddress[victimStrs.length];
            rtc.victimSet = new HashSet<String>();
            for (int i = 0; i < victimStrs.length; i++) {
                //hostname:port
                String[] values = victimStrs[i].split(":");
                rtc.victims[i] = new InetSocketAddress(values[0], Integer.parseInt(values[1]));
                rtc.victimSet.add(values[0]);
            }
            rtc.pickLists = getPickLists(nameNodeConfs, working_dir, Long.MAX_VALUE, rtc.victimSet);
            DatanodeBenThread[] threads = new DatanodeBenThread[(int) rtc.nthreads];
            int nsIdx = 0;
            for (int i = 0; i < rtc.nthreads; i++) {
                threads[i] = new DatanodeBenThread(nameNodeConfs.get(nsIdx), new Path(working_dir),
                        new Path(rtc.output_dir), i, RUNNING_TYPE.GENERAL, rtc);
                nsIdx++;
                if (nsIdx == nameNodeConfs.size()) {
                    nsIdx = 0;
                }
            }
            return threads;
        }
    }

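    /**
     * Sums the bytes read and written by all threads and reports them as
     * MB/sec rates over the whole execution time.
     */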
    @Override
    public Map<String, String> collectStats(JobConf conf, GenThread[] threads, long execTime) throws IOException {
        long total_read_size = 0;
        long total_write_size = 0;
        for (GenThread t : threads) {
            DatanodeBenThread dbt = (DatanodeBenThread) t;
            total_read_size += dbt.read_size;
            total_write_size += dbt.write_size;
        }
        float readRateMbSec = (float) total_read_size * 1000 / (execTime * MEGA);
        float writeRateMbSec = (float) total_write_size * 1000 / (execTime * MEGA);
        LOG.info("Read IO rate = " + readRateMbSec);
        LOG.info("Write IO rate = " + writeRateMbSec);
        Map<String, String> stat = super.collectStats(conf, threads, execTime);
        stat.put("readrate", String.valueOf(readRateMbSec));
        stat.put("writerate", String.valueOf(writeRateMbSec));
        return stat;
    }

    @Override
    public void reset() {
        average_read_rate = 0;
        average_write_rate = 0;
        total_num = 0;
    }

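    /** Accumulates the per-run read/write rates that output() later averages. */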
    @Override
    public void analyze(Map<String, String> stat) throws IOException {
        average_read_rate += Float.parseFloat(stat.get("readrate"));
        average_write_rate += Float.parseFloat(stat.get("writerate"));
        total_num++;
    }

    @Override
    public void output(FSDataOutputStream out) throws IOException {
        out.writeChars("Average Read (MB/sec): \t\t\t" + average_read_rate / total_num + "\n");
        out.writeChars("Average Write (MB/sec): \t\t" + average_write_rate / total_num + "\n");
    }
}
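
Example

The benchmark is normally driven by the surrounding GenThread benchmark framework, which is not part of this file. As a rough sketch only, the snippet below shows how the configuration keys declared in DatanodeBenThread could be set on a JobConf before the stress-test step. The driver class name, the victim hostnames and ports, and the submission path are assumptions for illustration, not part of this source.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.DatanodeBenThread;

public class DatanodeBenConfigSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // The keys below are the public constants declared in DatanodeBenThread.
        conf.setLong(DatanodeBenThread.MAX_TIME_SEC_KEY, 120);        // each thread runs for 120 seconds
        conf.setLong(DatanodeBenThread.FILE_SIZE_KEY, 10);            // 10 MB per file
        conf.setFloat(DatanodeBenThread.READ_PERCENT_KEY, 0.5f);      // ~half read threads, half write threads
        conf.setInt(DatanodeBenThread.RUNNING_TYPE_KEY,
                DatanodeBenThread.RUNNING_TYPE.GENERAL.ordinal());    // stress-test step (not prepare)
        // Placeholder victim datanodes in hostname:port form; replace with real hosts.
        conf.setStrings(DatanodeBenThread.VICTIM_DATANODE_KEY,
                "dn-host-1:50010", "dn-host-2:50010");
        // The actual MapReduce job is assembled and submitted by the benchmark
        // driver (GenThread framework), which is outside this listing.
    }
}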