org.apache.hama.bsp.BSPMaster.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hama.bsp.BSPMaster.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hama.bsp;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.InetSocketAddress;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hama.ipc.RPC;
import org.apache.hama.ipc.Server;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hama.Constants;
import org.apache.hama.HamaConfiguration;
import org.apache.hama.bsp.sync.MasterSyncClient;
import org.apache.hama.bsp.sync.ZKSyncBSPMasterClient;
import org.apache.hama.http.HttpServer;
import org.apache.hama.ipc.GroomProtocol;
import org.apache.hama.ipc.HamaRPCProtocolVersion;
import org.apache.hama.ipc.JobSubmissionProtocol;
import org.apache.hama.ipc.MasterProtocol;
import org.apache.hama.monitor.fd.FDProvider;
import org.apache.hama.monitor.fd.Supervisor;
import org.apache.hama.monitor.fd.UDPSupervisor;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;

/**
 * BSPMaster is responsible for controlling all the groom servers and for
 * managing BSP jobs. It has the following responsibilities:
 * <ol>
 * <li><b>Job submission</b>. BSPMaster is responsible for accepting new job
 * requests and assigning the job to scheduler for scheduling BSP Tasks defined
 * for the job.
 * <li><b>GroomServer monitoring</b>. BSPMaster keeps track of all the groom
 * servers in the cluster. It is responsible for adding new grooms to the
 * cluster, keeping tabs on all the grooms, and it can blacklist a groom that
 * fails the availability requirement.
 * <li>BSPMaster keeps track of all the task status for each job and handles the
 * failure of job as requested by the jobs.
 * </ol>
 */
public class BSPMaster
        implements JobSubmissionProtocol, MasterProtocol, GroomServerManager, Watcher, MonitorManager {

    public static final Log LOG = LogFactory.getLog(BSPMaster.class);
    public static final String localModeMessage = "Local mode detected, no launch of the daemon needed.";
    private static final int FS_ACCESS_RETRY_PERIOD = 10000;

    private HamaConfiguration conf;
    MasterSyncClient syncClient = null;

    /**
     * Constants for BSPMaster's status.
     */
    public static enum State {
        INITIALIZING, RUNNING
    }

    static long JOBINIT_SLEEP_INTERVAL = 2000;

    // States
    final AtomicReference<State> state = new AtomicReference<State>(State.INITIALIZING);

    // Attributes
    String masterIdentifier;
    // private Server interServer;
    private Server masterServer;

    // host and port
    private String host;
    private int port;

    // startTime
    private long startTime;

    // HTTP server
    private HttpServer infoServer;
    private int infoPort;

    // Filesystem
    static final String SUBDIR = "bspMaster";
    FileSystem fs = null;
    Path systemDir = null;
    // system directories: full access for the owner; group and others may
    // write and traverse but not list the contents (mode 0733)
    final static FsPermission SYSTEM_DIR_PERMISSION = FsPermission.createImmutable((short) 0733); // rwx-wx-wx
    // system files should have 700 permission
    final static FsPermission SYSTEM_FILE_PERMISSION = FsPermission.createImmutable((short) 0700); // rwx------

    // Jobs' Meta Data
    private Integer nextJobId = 1;
    private int totalSubmissions = 0; // how many jobs has been submitted by
                                      // clients
    private int totalTasks = 0; // currnetly running tasks
    private int totalTaskCapacity; // max tasks that groom server can run

    private Map<BSPJobID, JobInProgress> jobs = new TreeMap<BSPJobID, JobInProgress>();
    private TaskScheduler taskScheduler;

    // Groom Server Manager attributes
    // GroomServers cache
    protected ConcurrentMap<GroomServerStatus, GroomProtocol> groomServers = new ConcurrentHashMap<GroomServerStatus, GroomProtocol>();

    private final List<GroomServerStatus> blackList = new CopyOnWriteArrayList<GroomServerStatus>();

    private Instructor instructor;

    private final List<JobInProgressListener> jobInProgressListeners = new CopyOnWriteArrayList<JobInProgressListener>();

    private final List<GroomStatusListener> groomStatusListeners = new CopyOnWriteArrayList<GroomStatusListener>();

    private final AtomicReference<Supervisor> supervisor = new AtomicReference<Supervisor>();

    /**
     * Handles {@link ReportGroomStatusDirective}s: replaces the cached
     * {@link GroomServerStatus} key for the reporting groom, keeps the
     * {@code totalTasks} counter in step with the reported task count, and walks
     * every reported {@link TaskStatus} to update the owning job (task
     * completion, progress, failure/recovery, kill propagation) and to notify the
     * registered listeners.
     */
    private class ReportGroomStatusHandler implements DirectiveHandler {

        /**
         * Processes one status report.
         *
         * @param directive expected to be a {@link ReportGroomStatusDirective}; it
         *          is cast without a type check.
         * @throws DirectiveException if dispatching a kill-task action to the groom
         *           fails with an {@link IOException}.
         * @throws RuntimeException if the reporting groom is not present in the
         *           {@code groomServers} cache.
         */
        @Override
        public void handle(Directive directive) throws DirectiveException {
            // update GroomServerStatus held in the groomServers cache.
            GroomServerStatus groomStatus = ((ReportGroomStatusDirective) directive).getStatus();
            // groomServers cache contains groom server status reported back
            if (groomServers.containsKey(groomStatus)) {
                GroomServerStatus tmpStatus = null;
                // Find the stored key equal to the reported status so its task count
                // can be subtracted before the key is swapped for the fresh status.
                for (GroomServerStatus old : groomServers.keySet()) {
                    if (old.equals(groomStatus)) {
                        totalTasks -= old.countTasks();
                        tmpStatus = groomStatus;
                        updateGroomServersKey(old, tmpStatus);
                        break;
                    }
                }

                if (null != tmpStatus) {
                    // Account for the tasks the groom reports as of now.
                    totalTasks += tmpStatus.countTasks();

                    List<TaskStatus> tlist = tmpStatus.getTaskReports();
                    for (TaskStatus ts : tlist) {
                        // NOTE(review): neither jip nor tip is null-checked before use;
                        // confirm findJobById/findTaskInProgress cannot return null here.
                        JobInProgress jip = taskScheduler.findJobById(ts.getJobId());
                        TaskInProgress tip = jip.findTaskInProgress(ts.getTaskId().getTaskID());

                        // --- Step 1: react to the state of the individual task. ---
                        if (ts.getRunState() == TaskStatus.State.SUCCEEDED) {
                            jip.completedTask(tip, ts);
                            // increment counters only if successful
                            jip.getCounters().incrAllCounters(ts.getCounters());
                            for (GroomStatusListener listener : groomStatusListeners) {
                                listener.taskComplete(groomStatus, tip);
                            }
                        } else if (ts.getRunState() == TaskStatus.State.RUNNING) {
                            jip.getStatus().setProgress(ts.getSuperstepCount());
                            jip.getStatus().setSuperstepCount(ts.getSuperstepCount());
                        } else if (ts.getRunState() == TaskStatus.State.FAILED) {
                            // handleFailure() decides whether recovery is attempted;
                            // otherwise the whole job is marked failed and removed.
                            if (jip.handleFailure(tip)) {
                                recoverTask(jip);
                            } else {
                                jip.status.setRunState(JobStatus.FAILED);
                                jip.failedTask(tip, ts);
                                for (JobInProgressListener listener : jobInProgressListeners) {
                                    try {
                                        listener.jobRemoved(jip);
                                    } catch (IOException ioe) {
                                        LOG.error("Fail to alter scheduler a job is moved.", ioe);
                                    }
                                }
                            }
                        }
                        // --- Step 2: react to the (possibly just updated) job state. ---
                        if (jip.getStatus().getRunState() == JobStatus.SUCCEEDED) {
                            for (JobInProgressListener listener : jobInProgressListeners) {
                                try {
                                    listener.jobRemoved(jip);
                                } catch (IOException ioe) {
                                    LOG.error("Fail to alter scheduler a job is moved.", ioe);
                                }
                            }
                        } else if (jip.getStatus().getRunState() == JobStatus.RUNNING) {
                            jip.getStatus().setProgress(ts.getSuperstepCount());
                            jip.getStatus().setSuperstepCount(ts.getSuperstepCount());
                        } else if (jip.getStatus().getRunState() == JobStatus.KILLED) {

                            // The job was killed: tell the reporting groom to kill this
                            // task, then notify listeners that the job is gone.
                            GroomProtocol worker = findGroomServer(tmpStatus);
                            Directive d1 = new DispatchTasksDirective(
                                    new GroomServerAction[] { new KillTaskAction(ts.getTaskId()) });
                            try {
                                worker.dispatch(d1);
                            } catch (IOException ioe) {
                                throw new DirectiveException("Error when dispatching kill task" + " action.", ioe);
                            }

                            for (JobInProgressListener listener : jobInProgressListeners) {
                                try {
                                    listener.jobRemoved(jip);
                                } catch (IOException ioe) {
                                    LOG.error("Fail to alter scheduler a job is moved.", ioe);
                                }
                            }
                        }
                    }
                } else {
                    // containsKey() succeeded but no equal key was found while iterating.
                    throw new RuntimeException(
                            "BSPMaster contains GroomServerSatus, " + "but fail to retrieve it.");
                }
            } else {
                throw new RuntimeException("GroomServer not found." + groomStatus.getGroomName());
            }
        }
    }

    private class Instructor extends Thread {
        private final BlockingQueue<Directive> buffer = new LinkedBlockingQueue<Directive>();
        private final ConcurrentMap<Class<? extends Directive>, DirectiveHandler> handlers = new ConcurrentHashMap<Class<? extends Directive>, DirectiveHandler>();

        public void bind(Class<? extends Directive> instruction, DirectiveHandler handler) {
            handlers.putIfAbsent(instruction, handler);
        }

        public void put(Directive directive) {
            try {
                buffer.put(directive);
            } catch (InterruptedException ie) {
                LOG.error("Fail to put directive into queue.", ie);
            }
        }

        @Override
        public void run() {
            while (true) {
                try {
                    Directive directive = this.buffer.take();
                    handlers.get(directive.getClass()).handle(directive);
                } catch (InterruptedException ie) {
                    LOG.error("Unable to retrieve directive from the queue.", ie);
                    Thread.currentThread().interrupt();
                } catch (Exception e) {
                    LOG.error("Fail to execute directive command.", e);
                }
            }
        }
    }

    /**
     * Creates a BSPMaster using a freshly generated, timestamp-based identifier
     * and delegates to {@link #BSPMaster(HamaConfiguration, String)}.
     *
     * @param conf provides runtime parameters.
     * @throws IOException propagated from the delegated constructor.
     * @throws InterruptedException propagated from the delegated constructor.
     */
    public BSPMaster(HamaConfiguration conf) throws IOException, InterruptedException {
        this(conf, generateNewIdentifier());
    }

    /**
     * Creates the master with an explicit identifier. Instantiates the configured
     * task scheduler and, unless the master address resolves to local mode,
     * creates the RPC server, starts the HTTP info server, optionally creates the
     * failure-detection supervisor, and then (re)creates the system directory on
     * the file system, retrying every {@code FS_ACCESS_RETRY_PERIOD} milliseconds
     * until that succeeds or the thread is interrupted.
     *
     * @param conf provides runtime parameters.
     * @param identifier identifier of this master instance.
     * @throws IOException on permission problems with the system directory
     *           (rethrown {@link AccessControlException}) or failures while
     *           starting the servers or deleting local files.
     * @throws InterruptedException if the thread is interrupted while waiting for
     *           the file system.
     */
    BSPMaster(HamaConfiguration conf, String identifier) throws IOException, InterruptedException {
        this.conf = conf;
        this.masterIdentifier = identifier;

        // Create the scheduler and init scheduler services
        Class<? extends TaskScheduler> schedulerClass = conf.getClass("bsp.master.taskscheduler",
                SimpleTaskScheduler.class, TaskScheduler.class);
        this.taskScheduler = ReflectionUtils.newInstance(schedulerClass, conf);

        InetSocketAddress inetSocketAddress = getAddress(conf);
        // inetSocketAddress is null if we are in local mode, then we should start
        // nothing.
        if (inetSocketAddress != null) {
            host = inetSocketAddress.getHostName();
            port = inetSocketAddress.getPort();
            LOG.info("RPC BSPMaster: host " + host + " port " + port);

            startTime = System.currentTimeMillis();
            this.masterServer = RPC.getServer(this, host, port, conf);

            infoPort = conf.getInt("bsp.http.infoserver.port", 40013);

            infoServer = new HttpServer("bspmaster", host, infoPort, true, conf);
            infoServer.setAttribute("bsp.master", this);

            // starting webserver
            infoServer.start();

            if (conf.getBoolean("bsp.monitor.fd.enabled", false)) {
                this.supervisor.set(FDProvider.createSupervisor(
                        conf.getClass("bsp.monitor.fd.supervisor.class", UDPSupervisor.class, Supervisor.class),
                        conf));
            }

            while (!Thread.currentThread().isInterrupted()) {
                try {
                    if (fs == null) {
                        fs = FileSystem.get(conf);
                    }
                } catch (IOException e) {
                    LOG.error("Can't get connection to Hadoop Namenode!", e);
                }
                if (fs == null) {
                    // Without a file system getSystemDir() would dereference null
                    // and abort the constructor; wait and retry instead.
                    Thread.sleep(FS_ACCESS_RETRY_PERIOD);
                    continue;
                }
                try {
                    // clean up the system dir, which will only work if hdfs is out of
                    // safe mode
                    if (systemDir == null) {
                        systemDir = new Path(getSystemDir());
                    }

                    LOG.info("Cleaning up the system directory");
                    LOG.info(systemDir);
                    fs.delete(systemDir, true);
                    if (FileSystem.mkdirs(fs, systemDir, new FsPermission(SYSTEM_DIR_PERMISSION))) {
                        break;
                    }
                    LOG.error("Mkdirs failed to create " + systemDir);
                    LOG.info(SUBDIR);

                } catch (AccessControlException ace) {
                    LOG.warn("Failed to operate on bsp.system.dir (" + systemDir + ") because of permissions.");
                    LOG.warn(
                            "Manually delete the bsp.system.dir (" + systemDir + ") and then start the BSPMaster.");
                    LOG.warn("Bailing out ... ");
                    throw ace;
                } catch (IOException ie) {
                    LOG.info("problem cleaning system directory: " + systemDir, ie);
                }
                Thread.sleep(FS_ACCESS_RETRY_PERIOD);
            }

            // The loop also ends when the thread was interrupted; surface that.
            if (Thread.currentThread().isInterrupted()) {
                throw new InterruptedException();
            }

            deleteLocalFiles(SUBDIR);
        } else {
            System.out.println(localModeMessage);
            LOG.info(localModeMessage);
        }
    }

    /**
     * Registers a GroomServer at startup: opens an RPC proxy to the groom, caches
     * it under the given status (unless an equal key is already cached) and
     * notifies the groom status listeners.
     *
     * @param status status of the registering groom; must not be null.
     * @return true if the groom was registered; false if the proxy could not be
     *         created or any exception occurred while doing so.
     * @throws NullPointerException if {@code status} is null.
     */
    @Override
    public boolean register(GroomServerStatus status) throws IOException {
        if (status == null) {
            LOG.error("No groom server status.");
            throw new NullPointerException("No groom server status.");
        }

        try {
            InetSocketAddress groomAddress = resolveWorkerAddress(status.getRpcServer());
            GroomProtocol groomProxy = (GroomProtocol) RPC.waitForProxy(GroomProtocol.class,
                    HamaRPCProtocolVersion.versionID, groomAddress, this.conf);
            if (groomProxy == null) {
                LOG.warn("Fail to create Worker client at host");
                return false;
            }
            // TODO: need to check if peer name has changed
            groomServers.putIfAbsent(status, groomProxy);
        } catch (Exception failure) {
            // Any failure while connecting is reported as an unsuccessful registration.
            LOG.error("Fail to register GroomServer " + status.getGroomName(), failure);
            return false;
        }

        for (GroomStatusListener statusListener : groomStatusListeners) {
            statusListener.groomServerRegistered(status);
        }

        LOG.info(status.getGroomName() + " is added.");
        return true;
    }

    /**
     * Parses a {@code host:port} string into a socket address. Only the first two
     * colon-separated fields are used.
     *
     * @param data address in {@code host:port} form.
     * @return socket address for the given host and port.
     */
    public static InetSocketAddress resolveWorkerAddress(String data) {
        String[] hostAndPort = data.split(":");
        String hostName = hostAndPort[0];
        int portNumber = Integer.parseInt(hostAndPort[1]);
        return new InetSocketAddress(hostName, portNumber);
    }

    /**
     * Re-keys a cached groom proxy: removes the entry stored under {@code old}
     * and stores the same proxy under {@code newKey}, holding the
     * {@code groomServers} monitor for both steps.
     *
     * @param old key currently present in the cache.
     * @param newKey key that replaces it.
     */
    private void updateGroomServersKey(GroomServerStatus old, GroomServerStatus newKey) {
        synchronized (groomServers) {
            groomServers.put(newKey, groomServers.remove(old));
        }
    }

    /**
     * Queues a directive reported by a groom for asynchronous handling by the
     * instructor thread.
     *
     * @param directive directive to enqueue.
     * @return always true.
     */
    @Override
    public boolean report(Directive directive) throws IOException {
        this.instructor.put(directive);
        return true;
    }

    // /////////////////////////////////////////////////////////////
    // BSPMaster methods
    // /////////////////////////////////////////////////////////////

    /**
     * Returns the directory of a job inside the system directory; the directory
     * is named after the job id.
     */
    Path getSystemDirectoryForJob(BSPJobID id) {
        String systemDirectory = getSystemDir();
        return new Path(systemDirectory, id.toString());
    }

    /**
     * Returns the values configured under {@code bsp.local.dir}, exactly as
     * returned by the configuration.
     */
    String[] getLocalDirs() throws IOException {
        final String[] configuredDirs = conf.getStrings("bsp.local.dir");
        return configuredDirs;
    }

    /**
     * Recursively deletes every configured local directory. Does nothing when no
     * local directories are configured.
     *
     * @throws IOException if the local file system cannot be obtained or a
     *           delete fails.
     */
    void deleteLocalFiles() throws IOException {
        String[] localDirs = getLocalDirs();
        if (localDirs == null) {
            // Nothing configured under bsp.local.dir; nothing to delete.
            return;
        }
        for (String localDir : localDirs) {
            FileSystem.getLocal(conf).delete(new Path(localDir), true);
        }
    }

    /**
     * Recursively deletes {@code subdir} beneath every configured local
     * directory. When no local directories are configured this only logs an
     * informational message.
     *
     * @param subdir name of the sub directory to remove from each local directory.
     * @throws IOException if the local file system cannot be obtained or a
     *           delete fails.
     */
    void deleteLocalFiles(String subdir) throws IOException {
        String[] localDirs = getLocalDirs();
        if (localDirs == null) {
            // Explicit check rather than catching NullPointerException, so that
            // unrelated null dereferences are not silently swallowed.
            LOG.info("No local directories configured (bsp.local.dir); nothing to delete.");
            return;
        }
        for (String localDir : localDirs) {
            FileSystem.getLocal(conf).delete(new Path(localDir, subdir), true);
        }
    }

    /**
     * Builds a local path for {@code pathString} by delegating to the
     * configuration's {@code getLocalPath} with the {@code bsp.local.dir}
     * property.
     */
    Path getLocalPath(String pathString) throws IOException {
        final Path localPath = conf.getLocalPath("bsp.local.dir", pathString);
        return localPath;
    }

    /**
     * Starts the BSP Master process with a newly generated identifier.
     *
     * @param conf The Hama configuration.
     * @return the started BSPMaster instance.
     * @throws IOException propagated from {@link #startMaster(HamaConfiguration, String)}.
     * @throws InterruptedException propagated from {@link #startMaster(HamaConfiguration, String)}.
     */
    public static BSPMaster startMaster(HamaConfiguration conf) throws IOException, InterruptedException {
        final String identifier = generateNewIdentifier();
        return startMaster(conf, identifier);
    }

    /**
     * Starts the BSP Master process: constructs the master, initializes the
     * synchronization client, wires the master into the task scheduler, starts
     * the failure-detection supervisor when enabled, and finally starts the
     * scheduler.
     *
     * @param conf The Hama configuration
     * @param identifier Identifier for the master instance.
     * @return the started BSPMaster instance
     * @throws IOException propagated from the BSPMaster constructor.
     * @throws InterruptedException propagated from the BSPMaster constructor.
     */
    public static BSPMaster startMaster(HamaConfiguration conf, String identifier)
            throws IOException, InterruptedException {
        final BSPMaster master = new BSPMaster(conf, identifier);

        // The sync client has to be ready before the scheduler is started.
        master.initZK(conf);

        master.taskScheduler.setGroomServerManager(master);
        master.taskScheduler.setMonitorManager(master);

        final boolean failureDetectionEnabled = conf.getBoolean("bsp.monitor.fd.enabled", false);
        if (failureDetectionEnabled) {
            master.supervisor.get().start();
        }

        master.taskScheduler.start();
        return master;
    }

    /**
     * Creates the ZooKeeper-backed global synchronization client, stores it in
     * {@code syncClient} and initializes it with the given configuration.
     *
     * @param conf Hama configuration.
     */
    private void initZK(HamaConfiguration conf) {
        syncClient = new ZKSyncBSPMasterClient();
        syncClient.init(conf);
    }

    /**
     * Returns the global synchronization client used by this BSPMaster.
     *
     * @return the synchronization client, or null if it has not been
     *         initialized.
     */
    public MasterSyncClient getSyncClient() {
        return syncClient;
    }

    /**
     * Resolves the master's socket address from the configuration.
     *
     * @param conf configuration holding {@code bsp.master.address} (default
     *          {@code "localhost"}) and {@code bsp.master.port} (default 40000).
     * @return the master's address, or null when the address is configured as
     *         {@code "local"}, which denotes local mode rather than a host name.
     */
    public static InetSocketAddress getAddress(Configuration conf) {
        // Never null: "localhost" is supplied as the default value.
        String masterAddress = conf.get("bsp.master.address", "localhost");
        if (masterAddress.equals("local")) {
            return null;
        }
        int fallbackPort = conf.getInt("bsp.master.port", 40000);
        return NetUtils.createSocketAddr(masterAddress, fallbackPort);
    }

    /**
     * Generates a BSPMaster identifier from the current time, formatted with the
     * pattern {@code yyyyMMddHHmm}.
     *
     * @return String BSPMaster identification number
     */
    private static String generateNewIdentifier() {
        SimpleDateFormat minuteStamp = new SimpleDateFormat("yyyyMMddHHmm");
        Date now = new Date();
        return minuteStamp.format(now);
    }

    /**
     * Starts serving: launches the directive-processing thread, starts the RPC
     * server, switches the master state to {@code RUNNING} and then blocks until
     * the RPC server terminates.
     *
     * @throws IllegalStateException if no RPC server exists (the constructor
     *           creates none in local mode).
     * @throws InterruptedException if interrupted while waiting for the RPC
     *           server to stop.
     * @throws IOException declared for callers; see the RPC server for details.
     */
    public void offerService() throws InterruptedException, IOException {
        if (this.masterServer == null) {
            // Fail before any thread is started so nothing is leaked.
            throw new IllegalStateException("RPC master server is not available (local mode?).");
        }

        // The instructor must exist before the RPC server accepts calls:
        // report() dereferences it for every incoming directive.
        instructor = new Instructor();
        instructor.bind(ReportGroomStatusDirective.class, new ReportGroomStatusHandler());
        instructor.start();

        this.masterServer.start();

        state.set(State.RUNNING);

        LOG.info("Starting RUNNING");

        this.masterServer.join();

        LOG.info("Stopped RPC Master server.");
    }

    // //////////////////////////////////////////////////
    // InterServerProtocol
    // //////////////////////////////////////////////////
    /**
     * Returns the RPC protocol version for the protocols this master serves
     * ({@link MasterProtocol} and {@link JobSubmissionProtocol}).
     *
     * @param protocol fully qualified name of the requested protocol.
     * @param clientVersion version of the client; not consulted here.
     * @return the Hama RPC protocol version.
     * @throws IOException if the protocol name is not one of the two above.
     */
    @Override
    public long getProtocolVersion(String protocol, long clientVersion) throws IOException {
        boolean supported = protocol.equals(MasterProtocol.class.getName())
                || protocol.equals(JobSubmissionProtocol.class.getName());
        if (!supported) {
            throw new IOException("Unknown protocol to BSPMaster: " + protocol);
        }
        return HamaRPCProtocolVersion.versionID;
    }

    // //////////////////////////////////////////////////
    // JobSubmissionProtocol
    // //////////////////////////////////////////////////
    /**
     * Returns a new job id. The numeric part increases sequentially and is
     * combined with this master's identifier.
     *
     * @return a job id not handed out before by this master instance.
     */
    @Override
    public BSPJobID getNewJobId() throws IOException {
        int id;
        // Lock on the master rather than on nextJobId: that field is a boxed
        // Integer that is reassigned below, so it would be a different (and
        // possibly JVM-wide cached) monitor on each call and would not provide
        // mutual exclusion.
        synchronized (this) {
            id = nextJobId;
            nextJobId = id + 1;
        }
        return new BSPJobID(this.masterIdentifier, id);
    }

    /**
     * Submits a job. A job id that is already known is not started again; its
     * current status is returned instead.
     *
     * @param jobID id of the job being submitted.
     * @param jobFile path of the job file.
     * @return status of the already known or newly added job.
     */
    @Override
    public JobStatus submitJob(BSPJobID jobID, String jobFile) throws IOException {
        JobInProgress known = jobs.get(jobID);
        if (known != null) {
            // job already running, don't start twice
            LOG.info("The job (" + jobID + ") was already submitted");
            return known.getStatus();
        }

        JobInProgress submitted = new JobInProgress(jobID, new Path(jobFile), this, this.conf);
        totalSubmissions += 1;
        if (LOG.isDebugEnabled()) {
            LOG.debug("Submitting job number = " + totalSubmissions + " id = " + submitted.getJobID());
        }
        return addJob(jobID, submitted);
    }

    // //////////////////////////////////////////////////
    // GroomServerManager functions
    // //////////////////////////////////////////////////

    /**
     * Builds a snapshot of the cluster status and refreshes
     * {@code totalTaskCapacity} from the configuration.
     *
     * @param detailed when true the result carries a map of groom statuses keyed
     *          by {@code hostName:DEFAULT_GROOM_INFO_SERVER}; otherwise only the
     *          number of grooms.
     * @return the cluster status snapshot.
     */
    @Override
    public ClusterStatus getClusterStatus(boolean detailed) {
        // give the caller a snapshot of the cluster status
        final int groomCount = groomServers.size();

        Map<String, GroomServerStatus> groomsByAddress = null;
        if (detailed) {
            groomsByAddress = new HashMap<String, GroomServerStatus>();
            for (GroomServerStatus groom : groomServers.keySet()) {
                String address = groom.getGroomHostName() + ":" + Constants.DEFAULT_GROOM_INFO_SERVER;
                groomsByAddress.put(address, groom);
            }
        }

        // Capacity defaults to (tasks per groom) x (number of grooms) unless
        // MAX_TASKS is configured explicitly.
        final int perGroomLimit = conf.getInt(Constants.MAX_TASKS_PER_GROOM, 3);
        this.totalTaskCapacity = conf.getInt(Constants.MAX_TASKS, perGroomLimit * groomCount);

        return detailed
                ? new ClusterStatus(groomsByAddress, totalTasks, totalTaskCapacity, state.get())
                : new ClusterStatus(groomCount, totalTasks, totalTaskCapacity, state.get());
    }

    /**
     * Looks up the cached RPC proxy of a groom.
     *
     * @param status key identifying the groom.
     * @return the proxy stored for that status, or null if none is cached.
     */
    @Override
    public GroomProtocol findGroomServer(GroomServerStatus status) {
        final GroomProtocol groomProxy = groomServers.get(status);
        return groomProxy;
    }

    /**
     * Returns the value view of the groom cache, i.e. the RPC proxies of all
     * cached grooms. The returned collection is the map's own view, not a copy.
     */
    @Override
    public Collection<GroomProtocol> findGroomServers() {
        return this.groomServers.values();
    }

    /**
     * Returns the key view of the groom cache, i.e. the statuses of all cached
     * grooms. The returned collection is the map's own view, not a copy.
     */
    @Override
    public Collection<GroomServerStatus> groomServerStatusKeySet() {
        return this.groomServers.keySet();
    }

    /**
     * Registers a listener that is notified about job additions, removals and
     * task recovery requests.
     *
     * @param listener listener to add.
     */
    @Override
    public void addJobInProgressListener(JobInProgressListener listener) {
        this.jobInProgressListeners.add(listener);
    }

    /**
     * Unregisters a previously added job listener.
     *
     * @param listener listener to remove.
     */
    @Override
    public void removeJobInProgressListener(JobInProgressListener listener) {
        this.jobInProgressListeners.remove(listener);
    }

    /**
     * Registers a listener that is notified about groom registration and task
     * completion.
     *
     * @param listener listener to add.
     */
    @Override
    public void addGroomStatusListener(GroomStatusListener listener) {
        this.groomStatusListeners.add(listener);
    }

    /**
     * Unregisters a previously added groom status listener.
     *
     * @param listener listener to remove.
     */
    @Override
    public void removeGroomStatusListener(GroomStatusListener listener) {
        this.groomStatusListeners.remove(listener);
    }

    /**
     * Moves every cached groom whose host name equals {@code host} from the
     * groom cache to the black list. The groom is added to the black list even
     * when removing it from the cache fails; that failure is only logged.
     *
     * @param host host name of the groom(s) to black-list.
     */
    @Override
    public void moveToBlackList(String host) {
        LOG.info("[moveToBlackList()]Host to be moved to black list: " + host);
        for (GroomServerStatus candidate : groomServerStatusKeySet()) {
            LOG.info("[moveToBlackList()]GroomServerStatus's host name:" + candidate.getGroomHostName() + " host:"
                    + host);
            if (!candidate.getGroomHostName().equals(host)) {
                continue;
            }
            GroomProtocol cachedProxy = findGroomServer(candidate);
            boolean removed = groomServers.remove(candidate, cachedProxy);
            if (!removed) {
                LOG.error("Fail to remove " + host + " out of groom server cache!");
            }
            blackList.add(candidate);
            LOG.info("[moveToBlackList()] " + host + " is successfully moved to black list.");
        }
    }

    /**
     * Removes every black-listed groom whose host name equals {@code host} from
     * the black list and logs the outcome of each removal.
     *
     * @param host host name of the groom(s) to take off the black list.
     */
    @Override
    public void removeOutOfBlackList(String host) {
        for (GroomServerStatus listed : blackList) {
            if (!listed.getGroomHostName().equals(host)) {
                continue;
            }
            if (blackList.remove(listed)) {
                LOG.info("Successfully remove " + host + " out of black list.");
            } else {
                LOG.error("Fail to remove " + host + " out of black list.");
            }
        }
    }

    /**
     * Returns the statuses of the grooms currently held in the groom cache;
     * equivalent to {@link #groomServerStatusKeySet()}.
     */
    @Override
    public Collection<GroomServerStatus> alive() {
        final Collection<GroomServerStatus> cached = groomServerStatusKeySet();
        return cached;
    }

    /**
     * Returns the black-listed groom statuses. The internal list itself is
     * returned, not a copy.
     */
    @Override
    public Collection<GroomServerStatus> blackList() {
        return this.blackList;
    }

    /**
     * Finds the first black-listed groom with the given host name.
     *
     * @param host host name to look for; must not be null.
     * @return the matching status, or null if no black-listed groom has that
     *         host name.
     */
    @Override
    public GroomServerStatus findInBlackList(String host) {
        GroomServerStatus match = null;
        for (GroomServerStatus listed : blackList) {
            if (host.equals(listed.getGroomHostName())) {
                match = listed;
                break;
            }
        }
        return match;
    }

    /**
     * Returns the master's name in {@code host:port} form, built from the RPC
     * host and port fields.
     */
    public String getBSPMasterName() {
        return this.host + ":" + this.port;
    }

    /**
     * Returns the value of {@code System.currentTimeMillis()} recorded by the
     * constructor when not in local mode; the field is not assigned otherwise.
     */
    public long getStartTime() {
        return this.startTime;
    }

    /**
     * Returns the identifier this master was constructed with.
     */
    public String getBSPMasterIdentifier() {
        return this.masterIdentifier;
    }

    /**
     * Returns the port of the HTTP info server as read from
     * {@code bsp.http.infoserver.port} by the constructor.
     */
    public int getHttpPort() {
        return this.infoPort;
    }

    /**
     * Core job submission logic: stores the job in the job table and notifies
     * every registered {@link JobInProgressListener}. Callers must perform all
     * checks before invoking this method. Listener failures are logged and do
     * not abort the submission.
     *
     * @param jobId id of the submitted job; the table key is taken from the
     *          job's profile rather than from this argument.
     * @param job the job to add.
     * @return the status of the added job.
     */
    private synchronized JobStatus addJob(BSPJobID jobId, JobInProgress job) {
        synchronized (jobs) {
            BSPJobID tableKey = job.getProfile().getJobID();
            jobs.put(tableKey, job);
            for (JobInProgressListener jobListener : jobInProgressListeners) {
                try {
                    jobListener.jobAdded(job);
                } catch (IOException notifyFailure) {
                    LOG.error("Fail to alter Scheduler a job is added.", notifyFailure);
                }
            }
        }
        return job.getStatus();
    }

    /**
     * Requests recovery of a failed task: notifies every registered
     * {@link JobInProgressListener} through {@code recoverTaskInJob} so the task
     * can be scheduled again. Also increments {@code totalSubmissions}. Listener
     * failures are logged and do not stop the remaining notifications.
     *
     * @param job the job containing the failed task.
     */
    private synchronized void recoverTask(JobInProgress job) {
        ++totalSubmissions;
        for (JobInProgressListener listener : jobInProgressListeners) {
            try {
                listener.recoverTaskInJob(job);
            } catch (IOException ioe) {
                // The message used to claim "a job is added", which was misleading
                // on the recovery path.
                LOG.error("Fail to notify scheduler that a task of job " + job.getJobID() + " needs recovery.",
                        ioe);
            }
        }
    }

    /**
     * Returns the statuses of all known jobs that are still in the
     * {@code RUNNING} or {@code PREP} state.
     */
    @Override
    public JobStatus[] jobsToComplete() throws IOException {
        final Collection<JobInProgress> knownJobs = jobs.values();
        return getJobStatus(knownJobs, true);
    }

    /**
     * Returns the statuses of all known jobs, regardless of their run state.
     */
    @Override
    public JobStatus[] getAllJobs() throws IOException {
        LOG.debug("returns all jobs: " + jobs.size());
        final Collection<JobInProgress> knownJobs = jobs.values();
        return getJobStatus(knownJobs, false);
    }

    /**
     * Collects job statuses, first refreshing each status with the job's start
     * time, number of tasks, user name and job name.
     *
     * @param jips jobs to inspect; null yields an empty array.
     * @param toComplete when true only jobs in the {@code RUNNING} or
     *          {@code PREP} state are included; when false all jobs are.
     * @return the selected statuses in iteration order.
     */
    private synchronized static JobStatus[] getJobStatus(Collection<JobInProgress> jips, boolean toComplete) {
        if (jips == null) {
            return new JobStatus[] {};
        }

        List<JobStatus> selected = new ArrayList<JobStatus>();
        for (JobInProgress jip : jips) {
            JobStatus status = jip.getStatus();

            // The status object is updated in place for every job, included or not.
            status.setStartTime(jip.getStartTime());
            status.setNumOfTasks(jip.getNumOfTasks());
            status.setUsername(jip.getProfile().getUser());
            status.setName(jip.getJobName());

            if (!toComplete || status.getRunState() == JobStatus.RUNNING
                    || status.getRunState() == JobStatus.PREP) {
                selected.add(status);
            }
        }

        return selected.toArray(new JobStatus[selected.size()]);
    }

    /**
     * Returns the URI of the file system used by the master, as a string.
     *
     * @throws IllegalStateException if the file system has not been obtained yet.
     */
    @Override
    public synchronized String getFilesystemName() throws IOException {
        if (fs != null) {
            return fs.getUri().toString();
        }
        throw new IllegalStateException("FileSystem object not available yet");
    }

    /**
     * Returns the system directory in which BSP stores its control files: the
     * value of {@code bsp.system.dir} (default {@code /tmp/hadoop/bsp/system}),
     * qualified against the master's file system.
     */
    @Override
    public String getSystemDir() {
        String configured = conf.get("bsp.system.dir", "/tmp/hadoop/bsp/system");
        Path qualified = fs.makeQualified(new Path(configured));
        return qualified.toString();
    }

    /**
     * Returns the profile of a known job.
     *
     * @param jobid id of the job.
     * @return the job's profile, or null if the job id is unknown.
     */
    @Override
    public JobProfile getJobProfile(BSPJobID jobid) throws IOException {
        synchronized (this) {
            JobInProgress found = jobs.get(jobid);
            return (found == null) ? null : found.getProfile();
        }
    }

    /**
     * Returns the status of a known job.
     *
     * @param jobid id of the job.
     * @return the job's status, or null if the job id is unknown.
     */
    @Override
    public JobStatus getJobStatus(BSPJobID jobid) throws IOException {
        synchronized (this) {
            JobInProgress found = jobs.get(jobid);
            return (found == null) ? null : found.getStatus();
        }
    }

    /**
     * Kills the job with the given id. An unknown id is only logged.
     *
     * @param jobid id of the job to kill.
     */
    @Override
    public void killJob(BSPJobID jobid) throws IOException {
        JobInProgress target = jobs.get(jobid);
        if (target != null) {
            killJob(target);
            return;
        }
        LOG.info("killJob(): JobId " + jobid.toString() + " is not a valid job");
    }

    /**
     * Logs the kill request and asks the job to kill itself.
     *
     * @param job the job to kill.
     */
    private synchronized static void killJob(JobInProgress job) {
        final String announcement = "Killing job " + job.getJobID();
        LOG.info(announcement);
        job.kill();
    }

    /**
     * Not implemented: no task is killed and both arguments are ignored.
     *
     * @param taskId id of the task attempt (ignored).
     * @param shouldFail whether the task should be marked failed (ignored).
     * @return always false.
     */
    @Override
    public boolean killTask(TaskAttemptID taskId, boolean shouldFail) throws IOException {
        // Intentionally a stub; see the method documentation.
        return false;
    }

    /**
     * Reflectively creates a master instance by invoking a public one-argument
     * constructor of {@code masterClass} with {@code conf}.
     * <p>
     * A constructor declared with exactly {@link Configuration} is preferred. If
     * none exists, any public one-argument constructor whose parameter type
     * accepts the runtime type of {@code conf} is used instead. The fallback is
     * needed because {@code Class.getConstructor} matches declared parameter
     * types exactly, so a constructor declared with a {@code Configuration}
     * subtype (such as {@code BSPMaster(HamaConfiguration)}) would never be
     * found by the exact lookup.
     *
     * @param masterClass class to instantiate.
     * @param conf configuration passed to the constructor.
     * @return the new master instance.
     * @throws RuntimeException if no suitable constructor exists or construction
     *           fails; the original exception is attached as the cause.
     */
    public static BSPMaster constructMaster(Class<? extends BSPMaster> masterClass, final Configuration conf) {
        try {
            Constructor<?> c = findConfigurationConstructor(masterClass, conf);
            return masterClass.cast(c.newInstance(conf));
        } catch (Exception e) {
            throw new RuntimeException("Failed construction of " + "Master: " + masterClass.toString()
                    + ((e.getCause() != null) ? e.getCause().getMessage() : ""), e);
        }
    }

    /**
     * Finds the public one-argument constructor to call with {@code conf}: the
     * exact {@code Configuration} signature first, then any constructor whose
     * single parameter type is assignable from the argument's runtime type.
     */
    private static Constructor<?> findConfigurationConstructor(Class<?> masterClass, Object conf)
            throws NoSuchMethodException {
        try {
            return masterClass.getConstructor(Configuration.class);
        } catch (NoSuchMethodException exactLookupFailed) {
            for (Constructor<?> candidate : masterClass.getConstructors()) {
                Class<?>[] parameterTypes = candidate.getParameterTypes();
                if (parameterTypes.length == 1 && parameterTypes[0].isInstance(conf)) {
                    return candidate;
                }
            }
            throw exactLookupFailed;
        }
    }

    /**
     * Shuts down the BSP master: closes the synchronization client, stops the
     * failure-detection supervisor and stops the RPC server. Components that
     * were never created are skipped, e.g. the sync client when the master was
     * not started through {@code startMaster}, or the RPC server in local mode.
     */
    public void shutdown() {
        if (null != this.syncClient) {
            try {
                this.syncClient.close();
            } catch (IOException e) {
                LOG.error("Error closing the sync client", e);
            }
        }
        // Read the reference once so the null check and the call see the same
        // supervisor.
        Supervisor activeSupervisor = this.supervisor.get();
        if (null != activeSupervisor) {
            activeSupervisor.stop();
        }
        if (null != this.masterServer) {
            this.masterServer.stop();
        }
    }

    /**
     * Returns the master's current state ({@code INITIALIZING} or
     * {@code RUNNING}).
     */
    public BSPMaster.State currentState() {
        final BSPMaster.State current = state.get();
        return current;
    }

    /**
     * {@link Watcher} callback. The event is ignored.
     *
     * @param event the watched event (unused).
     */
    @Override
    public void process(WatchedEvent event) {
        // Intentionally empty.
    }

    /**
     * Returns the failure-detection supervisor.
     *
     * @return the supervisor, or null if none has been set.
     */
    @Override
    public Supervisor supervisor() {
        final Supervisor current = supervisor.get();
        return current;
    }

    TaskCompletionEvent[] EMPTY_EVENTS = new TaskCompletionEvent[0];

    /**
     * Returns task completion events of a job.
     *
     * @param jobid id of the job.
     * @param fromEventId first event id, passed through to the job.
     * @param maxEvents maximum number of events, passed through to the job.
     * @return the job's events; {@code EMPTY_EVENTS} while the job's tasks are
     *         not initialized yet; null if the job id is unknown.
     */
    @Override
    public TaskCompletionEvent[] getTaskCompletionEvents(BSPJobID jobid, int fromEventId, int maxEvents) {
        synchronized (this) {
            JobInProgress found = this.jobs.get(jobid);
            if (found == null) {
                return null;
            }
            return found.areTasksInited() ? found.getTaskCompletionEvents(fromEventId, maxEvents) : EMPTY_EVENTS;
        }
    }
}