Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.chinamobile.bcbsp.workermanager; import com.chinamobile.bcbsp.action.Directive; import com.chinamobile.bcbsp.action.KillStaffAction; import com.chinamobile.bcbsp.action.LaunchStaffAction; import com.chinamobile.bcbsp.action.WorkerManagerAction; import com.chinamobile.bcbsp.BSPConfiguration; import com.chinamobile.bcbsp.bspcontroller.Counters; import com.chinamobile.bcbsp.bspstaff.BSPStaffRunner; import com.chinamobile.bcbsp.bspstaff.BSPStaff.WorkerAgentForStaffInterface; import com.chinamobile.bcbsp.bspstaff.Staff; import com.chinamobile.bcbsp.Constants; import com.chinamobile.bcbsp.fault.storage.Fault; import com.chinamobile.bcbsp.fault.storage.Fault.Level; import com.chinamobile.bcbsp.http.HttpServer; import com.chinamobile.bcbsp.rpc.BSPRPCProtocolVersion; import com.chinamobile.bcbsp.rpc.ControllerProtocol; import com.chinamobile.bcbsp.rpc.WorkerManagerProtocol; import com.chinamobile.bcbsp.sync.SuperStepReportContainer; import com.chinamobile.bcbsp.thirdPartyInterface.HDFS.BSPFileSystem; import com.chinamobile.bcbsp.thirdPartyInterface.HDFS.impl.BSPFileSystemImpl; import com.chinamobile.bcbsp.thirdPartyInterface.HDFS.impl.BSPHdfsImpl; import 
com.chinamobile.bcbsp.thirdPartyInterface.HDFS.impl.BSPLocalFileSystemImpl; import com.chinamobile.bcbsp.thirdPartyInterface.Zookeeper.BSPZookeeper; import com.chinamobile.bcbsp.thirdPartyInterface.Zookeeper.impl.BSPZookeeperImpl; import com.chinamobile.bcbsp.util.BSPJobID; import com.chinamobile.bcbsp.util.BSPJob; import com.chinamobile.bcbsp.util.ClassLoaderUtil; import com.chinamobile.bcbsp.util.StaffAttemptID; import com.chinamobile.bcbsp.util.StaffStatus; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.lang.reflect.Constructor; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.ServerSocket; import java.util.ArrayList; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.LinkedBlockingQueue; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FSError; import org.apache.hadoop.fs.Path; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.Server; import org.apache.hadoop.metrics.MetricsContext; import org.apache.hadoop.metrics.MetricsUtil; import org.apache.hadoop.net.DNS; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.util.DiskChecker; import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.RunJar; import org.apache.hadoop.util.StringUtils; import org.apache.log4j.LogManager; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; /** * A WorkerManager is a process that manages staffs assigned by the * BSPController. 
Each WorkerManager contacts the BSPController, and it takes * assigned staffs and reports its status by means of periodical heart beats * with BSPController. Each WorkerManager is designed to run with HDFS or other * distributed storages. Basically, a WorkerManager and a data node should be * run on one physical node. */ public class WorkerManager implements Runnable, WorkerManagerProtocol, WorkerAgentProtocol, Watcher { /** Define Log variable for outputting messages */ private static final Log LOG = LogFactory.getLog(WorkerManager.class); /** State HEART_BEAT_INTERVAL */ private static volatile int HEART_BEAT_INTERVAL; /** Define the length of cacheQueue */ private static int CACHE_QUEUE_LENGTH = 20; /** State BSP configuration */ private Configuration conf; /** State constants */ static enum State { /** The state of WorkerManager */ NORMAL, COMPUTE, SYNC, BARRIER, STALE, INTERRUPTED, DENIED }; /** State BSPZookeeper */ private BSPZookeeper bspzk = null; /** WorkerManager has not been initialized */ private volatile boolean initialized = false; /** WorkerManager is running */ private volatile boolean running = true; /** WorkerManager is shuttingDown */ private volatile boolean shuttingDown = false; /** WorkerManager is initialized */ private boolean justInited = true; /** The name of workerManager */ private String workerManagerName; /** IP address of bsp controller */ private InetSocketAddress bspControllerAddr; /** IP address of bsp controller */ private InetSocketAddress standbyControllerAddr; /** State BSPFileSystem */ private BSPFileSystem bspsystemFS = null; /** State the number of failed jobs*/ private int failures; /** State the max count of staff*/ private int maxStaffsCount = 0; /** State the count of running staff*/ private Integer currentStaffsCount = 0; /** State the count of finished staff*/ private int finishedStaffsCount = 0; /** State fault list*/ private List<Fault> workerFaultList = null; /** The list of StaffStatus*/ private List<StaffStatus> 
reportStaffStatusList = null; /** Map for storing running staffs' StaffInProgress*/ private Map<StaffAttemptID, StaffInProgress> runningStaffs = null; /** Map for storing finished staffs' StaffInProgress*/ private Map<StaffAttemptID, StaffInProgress> finishedStaffs = null; /** Map for storing running jobs*/ private Map<BSPJobID, RunningJob> runningJobs = null; /** Map for storing finished jobs*/ private Map<BSPJobID, RunningJob> finishedJobs = null; /** Map for storing running JobToWorkerAgent*/ private Map<BSPJobID, WorkerAgentForJob> runningJobtoWorkerAgent = null; /** Define LaunchStaffManager object */ private LaunchStaffManager lsManager = new LaunchStaffManager(); /** Define rpc server */ private String rpcServer; /** Define worker server */ private Server workerServer; /** State ControllerProtocol */ private ControllerProtocol controllerClient; /** State ControllerProtocol */ private ControllerProtocol standbyControllerClient; /** IP address of staff */ private InetSocketAddress staffReportAddress; /** A server of Staff report server*/ private Server staffReportServer = null; /** The list of failed jobs */ private ArrayList<BSPJobID> failedJobList = new ArrayList<BSPJobID>(); /** Http server */ private HttpServer winfoServer; /** Http port */ private int winfoPort; /** Define variable workerMangerStatus */ private WorkerManagerStatus workerMangerStatus; /** Map for storing staffId and its StaffInProgress */ private Map<StaffAttemptID, StaffInProgress> reprotStaffsMap = null; /** The finish time of staff */ private long finishTime; /**For current free port counter. It will travel around 60001~65535 */ private int currentFreePort = 60000; /** * Constructor. * @param conf Configuration */ public WorkerManager(Configuration conf) throws IOException { LOG.info("worker start"); this.conf = conf; String mode = conf.get(Constants.BC_BSP_CONTROLLER_ADDRESS); if (!mode.equals("local")) { choseActiveControllerAddress(); } } /** * Initialize workerManager. 
*/ @SuppressWarnings("static-access") public synchronized void initialize() throws IOException { if (this.conf.get(Constants.BC_BSP_WORKERMANAGER_RPC_HOST) != null) { this.workerManagerName = conf.get(Constants.BC_BSP_WORKERMANAGER_RPC_HOST); } if (this.workerManagerName == null) { this.workerManagerName = DNS.getDefaultHost(conf.get("bsp.dns.interface", "default"), conf.get("bsp.dns.nameserver", "default")); } // check local disk checkLocalDirs(conf.getStrings(Constants.BC_BSP_LOCAL_DIRECTORY)); deleteLocalFiles("workerManager"); this.workerFaultList = new ArrayList<Fault>(); this.reportStaffStatusList = new ArrayList<StaffStatus>(); this.runningStaffs = new ConcurrentHashMap<StaffAttemptID, StaffInProgress>(); this.finishedStaffs = new ConcurrentHashMap<StaffAttemptID, StaffInProgress>(); this.runningJobs = new ConcurrentHashMap<BSPJobID, RunningJob>(); this.finishedJobs = new ConcurrentHashMap<BSPJobID, RunningJob>(); this.runningJobtoWorkerAgent = new ConcurrentHashMap<BSPJobID, WorkerAgentForJob>(); this.reprotStaffsMap = new ConcurrentHashMap<StaffAttemptID, StaffInProgress>(); this.conf.set(Constants.BC_BSP_WORKERAGENT_HOST, this.workerManagerName); this.conf.set(Constants.BC_BSP_WORKERMANAGER_RPC_HOST, this.workerManagerName); this.maxStaffsCount = conf.getInt(Constants.BC_BSP_WORKERMANAGER_MAXSTAFFS, 1); WorkerManager.HEART_BEAT_INTERVAL = conf.getInt(Constants.HEART_BEAT_INTERVAL, 1000); LOG.info("The max number of staffs is : " + this.maxStaffsCount); int rpcPort = -1; String rpcAddr = null; if (!this.initialized) { rpcAddr = conf.get(Constants.BC_BSP_WORKERMANAGER_RPC_HOST, Constants.DEFAULT_BC_BSP_WORKERMANAGER_RPC_HOST); rpcPort = conf.getInt(Constants.BC_BSP_WORKERMANAGER_RPC_PORT, 5000); if (-1 == rpcPort || null == rpcAddr) { throw new IllegalArgumentException("Error rpc address " + rpcAddr + " port" + rpcPort); } this.workerServer = RPC.getServer(this, rpcAddr, rpcPort, conf); this.workerServer.start(); this.rpcServer = rpcAddr + ":" + rpcPort; 
LOG.info("Worker rpc server --> " + rpcServer); } String address = conf.get(Constants.BC_BSP_WORKERMANAGER_REPORT_ADDRESS); InetSocketAddress socAddr = NetUtils.createSocketAddr(address); String bindAddress = socAddr.getHostName(); int tmpPort = socAddr.getPort(); // RPC initialization this.staffReportServer = RPC.getServer(this, bindAddress, tmpPort, 10, false, this.conf); this.staffReportServer.start(); // http server InetAddress addr = InetAddress.getLocalHost(); String ipSlave = addr.getHostAddress().toString(); winfoPort = conf.getInt("bcbsp.http.winfoserver.port", 40027); winfoServer = new HttpServer("bcbsp", ipSlave, winfoPort, true, conf); winfoServer.setAttribute("WorkerManager", this); LOG.info("prot: 40027"); LOG.info("ljn test : controllerClient before start "); winfoServer.start(); LOG.info("server has started"); LOG.info("ljn test : controllerClient before register "); // get the assigned address this.staffReportAddress = staffReportServer.getListenerAddress(); LOG.info("WorkerManager up at: " + this.staffReportAddress); DistributedCache.purgeCache(this.conf); LOG.info("ljn test : DistributedCache "); LOG.info("ljn test : bspControllerAddr " + bspControllerAddr); LOG.info("ljn test : BSPRPCProtocolVersion.versionID " + BSPRPCProtocolVersion.versionID); // establish the communication link to bsp master try { this.controllerClient = (ControllerProtocol) RPC.waitForProxy(ControllerProtocol.class, BSPRPCProtocolVersion.versionID, bspControllerAddr, conf); } catch (Exception e) { throw new RuntimeException(e); } LOG.info("ljn test : controllerClient controllerClient start "); // establish the communication link to standby bsp master if ("ha".equals(conf.get(Constants.BC_BSP_HA_FLAG, ""))) { this.standbyControllerClient = (ControllerProtocol) RPC.waitForProxy(ControllerProtocol.class, BSPRPCProtocolVersion.versionID, this.standbyControllerAddr, conf); } LOG.info("bspControllerAddr = " + bspControllerAddr + " standbyControllerAddr = " + 
standbyControllerAddr); LOG.info("ljn test : controllerClient before register "); // enroll in bsp controller if (-1 == rpcPort || null == rpcAddr) { throw new IllegalArgumentException("Error rpc address " + rpcAddr + " port" + rpcPort); } LOG.info("ljn test : controllerClient before lsManager.start "); this.lsManager.start(); LOG.info("ljn test : controllerClient before lsManager.start over"); workerMangerStatus = new WorkerManagerStatus(workerManagerName, cloneAndResetRunningStaffStatuses(), maxStaffsCount, currentStaffsCount, finishedStaffsCount, failures, this.rpcServer, workerFaultList); this.workerMangerStatus.setHost(bindAddress); this.workerMangerStatus.setHttpPort(this.staffReportAddress.toString()); this.workerMangerStatus.setLocalIp(ipSlave); LOG.info("ljn test : controllerClient before register "); if (!this.controllerClient.register(workerMangerStatus)) { LOG.error("There is a problem in establishing communication" + " link with BSPController"); throw new IOException("There is a problem in establishing" + " communication link with BSPController."); } else { LOG.info("have registed to bsp master"); } if ("ha".equals(conf.get(Constants.BC_BSP_HA_FLAG, ""))) { if (!this.standbyControllerClient.register(workerMangerStatus)) { LOG.error("There is a problem in establishing communication" + " link with BSPController"); throw new IOException( "There is a problem in establishing" + " communication link with BSPController."); } else { LOG.info("have registed to standby bsp master"); } } this.running = true; this.initialized = true; LOG.info("ljn test : controllerClient after register "); } /** * Get the IP address of staff report. * @return the port at which the staff tracker bound to */ public synchronized InetSocketAddress getStaffTrackerReportAddress() { return staffReportAddress; } /** * Launch staff manager. 
*/ private class LaunchStaffManager extends Thread { /** block queue */ private final BlockingQueue<Directive> buffer = new LinkedBlockingQueue<Directive>(); /** * Put directive into queue. * @param directive {@link Directive} */ public void put(Directive directive) { try { buffer.put(directive); } catch (InterruptedException ie) { LOG.error("Unable to put directive into queue.", ie); Thread.currentThread().interrupt(); } } @Override public void run() { while (true) { try { Directive directive = buffer.take(); // update tasks status ArrayList<WorkerManagerAction> actionList = directive.getActionList(); LOG.info("Got Response from BSPController with " + ((actionList != null) ? actionList.size() : 0) + " actions"); // perform actions boolean recovery = directive.isRecovery(); boolean changeWorkerState = directive.isChangeWorkerState(); int failCounter = directive.getFailCounter(); if (actionList != null) { for (WorkerManagerAction action : actionList) { try { if (action instanceof LaunchStaffAction) { if (recovery) { String localPath = conf.get(Constants.BC_BSP_LOCAL_DIRECTORY) + "/workerManager"; LOG.info("if(recovery == true)" + " " + localPath); if (new BSPLocalFileSystemImpl(conf).exists( new BSPHdfsImpl().newpath(localPath, ((LaunchStaffAction) action) .getStaff().getStaffAttemptId().toString()))) { new BSPLocalFileSystemImpl(conf).delete(new BSPHdfsImpl() .newpath(localPath, ((LaunchStaffAction) action).getStaff() .getStaffAttemptId().toString()), true); } } LaunchStaffAction temp = (LaunchStaffAction) action; LOG.info("Now will start staff " + temp.getStaff().getStaffID()); LOG.info("debug:" + temp.getStaff().getJobFile()); startNewStaff((LaunchStaffAction) action, directive, recovery, changeWorkerState, failCounter); // return true; } else { KillStaffAction killAction = (KillStaffAction) action; if (runningStaffs.containsKey(killAction.getStaffID())) { StaffInProgress sip = runningStaffs.get(killAction.getStaffID()); 
sip.staffStatus.setRunState(StaffStatus.State.KILLED); sip.killAndCleanup(true); } else { LOG.warn(killAction.getStaffID() + " is not in the runningStaffs " + "and the kill action is invalid."); } // return false; } } catch (IOException e) { LOG.error("Exception has been catched in WorkerManager--dispatch !", e); StaffInProgress sip = null; sip = runningStaffs .get(((LaunchStaffAction) action).getStaff().getStaffAttemptId()); sip.getStatus().setStage(0); // convenient for // the call in // controller sip.setStaffStatus(Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.DISK, Level.WARNING, sip.getStatus().getWorkerManager(), "IOException happened", sip.getStatus().getJobId().toString(), sip.getStatus().getStaffId().toString())); } } } } catch (Exception e) { //LOG.error(" [WorkerManager run]", e); throw new RuntimeException("WorkerManager run() exception", e); } } } } @Override public boolean dispatch(BSPJobID jobId, Directive directive, boolean recovery, boolean changeWorkerState, int failCounter) { directive.setRecovery(recovery); directive.setChangeWorkerState(changeWorkerState); directive.setFailCounter(failCounter); this.lsManager.put(directive); return true; } /** * Check local disk. * @param localDirs the string array of local disk */ private static void checkLocalDirs(String[] localDirs) throws DiskErrorException { boolean writable = false; if (localDirs != null) { for (int i = 0; i < localDirs.length; i++) { try { DiskChecker.checkDir(new File(localDirs[i])); LOG.info("Local System is Normal : " + localDirs[i]); writable = true; } catch (DiskErrorException e) { //LOG.error("BSP Processor local", e); throw new RuntimeException("WorkerManager checkLocalDirs" + " exception", e); } } } if (!writable) { throw new DiskErrorException("all local directories are not writable"); } } public String[] getLocalDirs() { return conf.getStrings(Constants.BC_BSP_LOCAL_DIRECTORY); } /** * Delete local file. 
*/ public void deleteLocalFiles() throws IOException { String[] localDirs = getLocalDirs(); for (int i = 0; i < localDirs.length; i++) { File f = new File(localDirs[i]); deleteLocalDir(f); } } /** * Delete local directory. * @param dir File */ public void deleteLocalDir(File dir) { if (dir == null || !dir.exists() || !dir.isDirectory()) { return; } for (File file : dir.listFiles()) { if (file.isFile()) { // delete the file file.delete(); } else if (file.isDirectory()) { // recursive delete the subdir deleteLocalDir(file); } } // delete the root directory dir.delete(); } /** * Delete local file. * @param subdir local file */ public void deleteLocalFiles(String subdir) throws IOException { try { String[] localDirs = getLocalDirs(); for (int i = 0; i < localDirs.length; i++) { new BSPLocalFileSystemImpl(this.conf).delete(new BSPHdfsImpl().newpath(localDirs[i], subdir), true); } } catch (NullPointerException e) { //LOG.error("[deleteLocalFiles]", e); throw new RuntimeException("WorkerManager deleteLocalFiles" + " exception", e); } } /** * Delete local file. */ public void cleanupStorage() throws IOException { deleteLocalFiles(); } /** * Clean up threads. */ private void startCleanupThreads() throws IOException { } /** * Update staff statistics. 
* @param jobId BSPJobID */ public void updateStaffStatistics(BSPJobID jobId) throws Exception { synchronized (currentStaffsCount) { currentStaffsCount--; } finishedStaffsCount++; if (finishedStaffs.size() > CACHE_QUEUE_LENGTH) { finishedStaffs.clear(); } synchronized (runningJobs) { int counter = runningJobs.get(jobId).getStaffCounter(); if (counter > 0) { runningJobs.get(jobId).setStaffCounter(counter - 1); } if (runningJobs.get(jobId).getStaffCounter() == 0) { if (finishedJobs.size() > CACHE_QUEUE_LENGTH) { finishedJobs.clear(); } finishedJobs.put(jobId, runningJobs.remove(jobId)); runningJobtoWorkerAgent.get(jobId).close(); runningJobtoWorkerAgent.remove(jobId); } } } /** * Set up WorkerManagerStatus,report to controller client. * @return the state of WorkerManager */ public State offerService() throws Exception { while (running && !shuttingDown) { try { this.reportStaffStatusList.clear(); Iterator<Entry<StaffAttemptID, StaffInProgress>> runningStaffsIt = runningStaffs.entrySet() .iterator(); Entry<StaffAttemptID, StaffInProgress> entry; while (runningStaffsIt.hasNext()) { entry = runningStaffsIt.next(); switch (entry.getValue().getStatus().getRunState()) { case COMMIT_PENDING: case UNASSIGNED: break; case RUNNING: this.reportStaffStatusList.add(entry.getValue().getStatus()); if (this.reprotStaffsMap.containsKey(entry.getKey())) { this.reprotStaffsMap.remove(entry.getKey()); } this.reprotStaffsMap.put(entry.getKey(), entry.getValue()); break; case SUCCEEDED: updateStaffStatistics(entry.getValue().getStatus().getJobId()); runningStaffsIt.remove(); finishedStaffs.put(entry.getKey(), entry.getValue()); if (this.reprotStaffsMap.containsKey(entry.getKey())) { this.reprotStaffsMap.remove(entry.getKey()); } this.reprotStaffsMap.put(entry.getKey(), entry.getValue()); LOG.info(entry.getKey() + " has succeed and been removed from the runningStaffs"); break; case FAULT: if (entry.getValue().runner.isAlive()) { 
entry.getValue().getStatus().setPhase(StaffStatus.Phase.CLEANUP); entry.getValue().runner.kill(); } this.reportStaffStatusList.add(entry.getValue().getStatus()); updateStaffStatistics(entry.getValue().getStatus().getJobId()); runningStaffsIt.remove(); finishedStaffs.put(entry.getKey(), entry.getValue()); if (this.reprotStaffsMap.containsKey(entry.getKey())) { this.reprotStaffsMap.remove(entry.getKey()); } this.reprotStaffsMap.put(entry.getKey(), entry.getValue()); LOG.error(entry.getKey() + " is fault and has been removed from the runningStaffs"); break; case STAFF_RECOVERY: break; case WORKER_RECOVERY: break; case FAILED: break; case KILLED: updateStaffStatistics(entry.getValue().getStatus().getJobId()); runningStaffsIt.remove(); finishedStaffs.put(entry.getKey(), entry.getValue()); if (this.reprotStaffsMap.containsKey(entry.getKey())) { this.reprotStaffsMap.remove(entry.getKey()); } this.reprotStaffsMap.put(entry.getKey(), entry.getValue()); LOG.warn(entry.getKey() + " has been killed manually and removed from the runningStaffs"); break; case FAILED_UNCLEAN: break; case KILLED_UNCLEAN: // TODO : This staff should be report and request // the cleanup task in the future. 
updateStaffStatistics(entry.getValue().getStatus().getJobId()); runningStaffsIt.remove(); finishedStaffs.put(entry.getKey(), entry.getValue()); if (this.reprotStaffsMap.containsKey(entry.getKey())) { this.reprotStaffsMap.remove(entry.getKey()); } this.reprotStaffsMap.put(entry.getKey(), entry.getValue()); LOG.warn(entry.getKey() + " has been killed manually and removed from the runningStaffs"); break; default: LOG.error("Unknown StaffStatus.State: " + entry.getValue().getStatus().getRunState()); } } this.reportStaffStatusList.clear(); Iterator<Entry<StaffAttemptID, StaffInProgress>> iter2 = reprotStaffsMap.entrySet().iterator(); while (iter2.hasNext()) { Entry<StaffAttemptID, StaffInProgress> entry2 = iter2.next(); // LOG.info(entry2.getKey()+"," + entry2.getValue().getRunState()); this.reportStaffStatusList.add(entry2.getValue().getStatus()); } reprotStaffsMap.clear(); this.workerMangerStatus.setStaffReports(this.reportStaffStatusList); this.workerMangerStatus.setMaxStaffsCount(maxStaffsCount); this.workerMangerStatus.setRunningStaffsCount(currentStaffsCount); this.workerMangerStatus.setFinishedStaffsCount(finishedStaffsCount); this.workerMangerStatus.setFailedStaffsCount(failures); this.workerMangerStatus.setWorkerFaultList(workerFaultList); try { boolean ret = this.controllerClient.report(new Directive(this.workerMangerStatus)); synchronized (this) { workerFaultList.clear(); } // list.add() need synchronize if (!ret) { LOG.error("fail to update"); } } catch (Exception ioe) { LOG.error("Fail to communicate with BSPController for reporting." + " in offerservice , Now will fresh controllerClient", ioe); this.ensureFreshControllerClient(); } Thread.sleep(HEART_BEAT_INTERVAL); } catch (InterruptedException ie) { //LOG.error("[offerService]", ie); throw new RuntimeException("WorkerManager offerService" + " InterruptedException", ie); } } return State.NORMAL; } /** * Set up StaffInProgess object, * and call localizeJob(StaffInProgress,Directive) to local. 
* @param action {@link LaunchStaffAction} * @param directive {@link Directive} * @param recovery the flag of recovery * @param changeWorkerState the flag of changeWorkerState * @param failCounter the counter of failed staff */ private void startNewStaff(LaunchStaffAction action, Directive directive, boolean recovery, boolean changeWorkerState, int failCounter) { Staff s = action.getStaff(); LOG.info("debug: in startNewStaff jobFile is " + s.getJobFile()); BSPJob jobConf = null; try { jobConf = new BSPJob(s.getJobID(), s.getJobFile()); jobConf.setInt("staff.fault.superstep", directive.getFaultSSStep()); } catch (IOException e1) { LOG.error("Exception has been catched in WorkerManager--startNewStaff-jobConf", e1); StaffInProgress sip = runningStaffs.get(action.getStaff().getStaffAttemptId()); sip.getStatus().setStage(0); // convenient for the call in // controller sip.setStaffStatus(Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.DISK, Level.WARNING, sip.getStatus().getWorkerManager(), "IOException happened", sip.getStatus().getJobId().toString(), sip.getStatus().getStaffId().toString())); } StaffInProgress sip = new StaffInProgress(s, jobConf, this.workerManagerName); sip.setFailCounter(failCounter); sip.setMigrateSS(directive.getMigrateSSStep()); if (recovery) { sip.getStatus().setRecovery(true); } if (changeWorkerState) { sip.setChangeWorkerState(true); } LOG.info("debug: before localizeJob(sip, directive); job type is " + jobConf.getJobType()); try { localizeJob(sip, directive); } catch (IOException e) { LOG.error("Exception has been catched in WorkerManager" + "--startNewStaff-localizeJob", e); sip = runningStaffs.get(action.getStaff().getStaffAttemptId()); // convenient for the call in sip.getStatus().setStage(0); // controller sip.setStaffStatus(Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.DISK, Level.WARNING, sip.getStatus().getWorkerManager(), "IOException happened", sip.getStatus().getJobId().toString(), sip.getStatus().getStaffId().toString())); 
} } /** * Localization of job. * @param sip StaffInProgress * @param directive {@link Directive} */ private void localizeJob(StaffInProgress sip, Directive directive) throws IOException { Staff staff = sip.getStaff(); conf.addResource(staff.getJobFile()); BSPJob defaultJobConf = new BSPJob((BSPConfiguration) conf); Path localJobFile = defaultJobConf.getLocalPath( Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/" + staff.getStaffID() + "/" + "job.xml"); Path localJarFile = null; // systemFS.copyToLocalFile(new Path(staff.getJobFile()), localJobFile); bspsystemFS.copyToLocalFile(new BSPHdfsImpl().newPath(staff.getJobFile()), localJobFile); BSPConfiguration confBsp = new BSPConfiguration(); confBsp.addResource(localJobFile); LOG.info("debug: conf.get(Constants.USER_BC_BSP_JOB_TYPE) " + confBsp.get(Constants.USER_BC_BSP_JOB_TYPE)); BSPJob jobConf = new BSPJob(confBsp, staff.getJobID().toString()); LOG.info("debug: conf.get(Constants.USER_BC_BSP_JOB_TYPE) " + confBsp.get(Constants.USER_BC_BSP_JOB_TYPE)); LOG.info("debug: job type is " + jobConf.getJobType()); if (Constants.USER_BC_BSP_JOB_TYPE_C.equals(jobConf.getJobType())) { LOG.info("debug: in LocalizeJob job.exe"); localJarFile = defaultJobConf.getLocalPath( Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/" + staff.getStaffID() + "/" + "jobC"); } else { LOG.info("debug: in in LocalizeJob job.jar"); localJarFile = defaultJobConf.getLocalPath( Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/" + staff.getStaffID() + "/" + "job.jar"); } Path jarFile = null; LOG.info("debug: job type is" + jobConf.getJobType()); if (Constants.USER_BC_BSP_JOB_TYPE_C.equals(jobConf.getJobType())) { LOG.info("debug: in LocalizeJob bofore jobConf.getJobExe =" + jobConf.getJobExe()); if (jobConf.getJobExe() != null) { jarFile = new Path(jobConf.getJobExe()); } LOG.info("jarFile is" + jarFile); jobConf.setJobExe(localJarFile.toString()); } else { if (jobConf.getJar() != null) { jarFile = new Path(jobConf.getJar()); } 
jobConf.setJar(localJarFile.toString()); } if (jarFile != null) { LOG.info("jarFile != null"); bspsystemFS.copyToLocalFile(jarFile, localJarFile); File workDir = new File(new File(localJobFile.toString()).getParent(), "work"); if (!workDir.mkdirs()) { if (!workDir.isDirectory()) { throw new IOException("Mkdirs failed to create " + workDir.toString()); } } if (!Constants.USER_BC_BSP_JOB_TYPE_C.equals(jobConf.getJobType())) { RunJar.unJar(new File(localJarFile.toString()), workDir); /** Add the user program jar to the system's classpath. */ ClassLoaderUtil.addClassPath(localJarFile.toString()); } } RunningJob rjob = addStaffToJob(staff.getJobID(), localJobFile, sip, directive, jobConf); LOG.info("debug:after addStaffToJob(staff.getJobID(), " + "localJobFile, sip, directive, jobConf); "); rjob.localized = true; sip.setFaultSSStep(directive.getFaultSSStep()); LOG.info("debug:before launchStaffForJob(sip, jobConf);"); launchStaffForJob(sip, jobConf); } /** * Launch staff for job. * @param sip StaffInProgress * @param jobConf BSPJob */ private void launchStaffForJob(StaffInProgress sip, BSPJob jobConf) { try { sip.setJobConf(jobConf); sip.launchStaff(); } catch (IOException ioe) { LOG.error("Exception has been catched in WorkerManager" + "--launchStaffForJob", ioe); sip.staffStatus.setRunState(StaffStatus.State.FAILED); sip.getStatus().setStage(0); // convenient for the call in // controller sip.setStaffStatus(Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.INDETERMINATE, sip.getStatus().getWorkerManager(), ioe.toString(), sip.getStatus().getJobId().toString(), sip.getStatus().getStaffId().toString())); } } /** * Add staff to job. 
* @param jobId BSPJobID * @param localJobFile the path of local job file * @param sip StaffInProgress * @param directive Directive * @param job BSPJob * @return running job */ private RunningJob addStaffToJob(BSPJobID jobId, Path localJobFile, StaffInProgress sip, Directive directive, BSPJob job) { synchronized (runningJobs) { RunningJob rJob = null; if (!runningJobs.containsKey(jobId)) { rJob = new RunningJob(jobId, localJobFile); rJob.localized = false; rJob.staffs = new HashSet<StaffInProgress>(); rJob.jobFile = localJobFile; runningJobs.put(jobId, rJob); // Create a new WorkerAgentForJob for a new job try { WorkerAgentForJob bspPeerForJob = new WorkerAgentForJob(conf, jobId, job, this); runningJobtoWorkerAgent.put(jobId, bspPeerForJob); } catch (IOException e) { LOG.error("Failed to create a WorkerAgentForJob for a new job" + jobId.toString()); } } else { rJob = runningJobs.get(jobId); } rJob.staffs.add(sip); int counter = rJob.getStaffCounter(); rJob.setStaffCounter(counter + 1); return rJob; } } /** * The data structure for initializing a job. */ static class RunningJob { /** State BSPJobID */ private BSPJobID jobId; /** State Path variable */ private Path jobFile; /** State the set of StaffInProgress */ private Set<StaffInProgress> staffs; /** Define the counter of staff */ private int staffCounter = 0; /** The flag of localized */ private boolean localized; /** The flag of keepJobFiles*/ private boolean keepJobFiles; /** * constructor. * @param jobId BSPJobID * @param jobFile Path */ RunningJob(BSPJobID jobId, Path jobFile) { this.jobId = jobId; localized = false; staffs = new HashSet<StaffInProgress>(); this.jobFile = jobFile; keepJobFiles = false; } RunningJob() { } Path getJobFile() { return jobFile; } BSPJobID getJobId() { return jobId; } public void setStaffCounter(int counter) { staffCounter = counter; } public int getStaffCounter() { return staffCounter; } } /** * Clone and reset running staff's StaffStatus. 
* @return the list of StaffStatus */ private synchronized List<StaffStatus> cloneAndResetRunningStaffStatuses() { List<StaffStatus> result = new ArrayList<StaffStatus>(runningStaffs.size()); for (StaffInProgress sip : runningStaffs.values()) { StaffStatus status = sip.getStatus(); result.add((StaffStatus) status.clone()); } return result; } /** * Init file system. */ public void initFileSystem() throws Exception { if (justInited) { String dir = controllerClient.getSystemDir(); if (dir == null) { LOG.error("Fail to get system directory."); throw new IOException("Fail to get system directory."); } bspsystemFS = new BSPFileSystemImpl(dir, conf); } justInited = false; } /** * run() method of WorkerManager. */ @Override public void run() { try { initialize(); initFileSystem(); startCleanupThreads(); boolean denied = false; while (running && !shuttingDown && !denied) { boolean staleState = false; try { while (running && !staleState && !shuttingDown && !denied) { try { State osState = offerService(); if (osState == State.STALE) { staleState = true; } else if (osState == State.DENIED) { denied = true; } } catch (Exception e) { if (!shuttingDown) { LOG.warn("Lost connection to BSP Controller [" + bspControllerAddr + "]. Retrying...", e); try { Thread.sleep(5000); } catch (InterruptedException ie) { //LOG.error("[run]", ie); throw new RuntimeException("WorkerManager run" + " InterruptedException", ie); } } } } } catch (Exception e) { //LOG.error("[run]", e); throw new RuntimeException("WorkerManager run" + " Exception", e); } if (shuttingDown) { return; } LOG.warn("Reinitializing local state"); initialize(); initFileSystem(); } } catch (Exception ioe) { LOG.error("Got fatal exception in WorkerManager: " + StringUtils.stringifyException(ioe)); LOG.error("WorkerManager will quit abnormally!"); close(); return; } } /** * Shut down the WorkerManager. 
*/ public synchronized void shutdown() throws IOException { LOG.info("Prepare to shutdown the WorkerManager"); shuttingDown = true; close(); } /** * Stop all Staff Process and WorkerAgentForJob. * Stop all RPC Server. * Clean up temporary files on the local disk. */ @Override @SuppressWarnings("deprecation") public synchronized void close() { this.running = false; this.initialized = false; try { for (StaffInProgress sip : runningStaffs.values()) { if (sip.runner.isAlive()) { sip.killAndCleanup(true); LOG.info(sip.getStatus().getStaffId() + " has been killed by system"); } } LOG.info("Succeed to stop all Staff Process"); for (Map.Entry<BSPJobID, WorkerAgentForJob> e : runningJobtoWorkerAgent.entrySet()) { e.getValue().close(); } LOG.info("Succeed to stop all WorkerAgentForJob"); this.workerServer.stop(); this.lsManager.stop(); RPC.stopProxy(controllerClient); if (staffReportServer != null) { staffReportServer.stop(); staffReportServer = null; } LOG.info("Succeed to stop all RPC Server"); cleanupStorage(); LOG.info("Succeed to cleanup temporary files on the local disk"); } catch (Exception e) { //LOG.error("Failed to execute the close()", e); throw new RuntimeException("WorkerManager run" + " Failed to execute the close()", e); } } /** * Start the WorkerManager * * @param hrs WorkerManager * @return the thread of startWorkerManager */ public static Thread startWorkerManager(final WorkerManager hrs) { return startWorkerManager(hrs, "regionserver" + hrs.workerManagerName); } /** * Start the WorkerManager * * @param hrs WorkerManager * @param name the name of thread * @return the thread of startWorkerManager */ public static Thread startWorkerManager(final WorkerManager hrs, final String name) { Thread t = new Thread(hrs); t.setName(name); t.start(); return t; } /** * StaffInProgress maintains all the info for a Staff that lives at this * WorkerManager. It maintains the Staff object, its StaffStatus, and the * BSPStaffRunner. 
*/
  class StaffInProgress {
    /** The staff tracked by this object. */
    private Staff staff;
    /** Agent of the staff process; notified by onKillStaff() when set. */
    private WorkerAgentForStaffInterface staffAgent;
    /** Job configuration supplied by the constructor or setJobConf(). */
    private BSPJob jobConf;
    /** Job configuration built by localizeStaff(); null until then. */
    private BSPJob localJobConf;
    /** Runner created by launchStaff(); null until then. */
    private BSPStaffRunner runner;
    /** Value returned by wasKilled(); never set to true in this class. */
    private volatile boolean wasKilled = false;
    /** Status of the staff. */
    private StaffStatus staffStatus;
    /** Value returned by getError(); initially "no". */
    private String error = "no";
    /** Fault super step handed to the runner in launchStaff(). */
    private int faultSSStep = 0;
    /** The flag of changeWorkerState. */
    private boolean changeWorkerState = false;
    /** The counter of failed staff. */
    private int failCounter = 0;
    /** Migrate super step. */
    private int migrateSuperStep = 0;

    /**
     * Constructor. The initial status is UNASSIGNED in phase STARTING.
     * @param staff Staff
     * @param jobConf BSPJob
     * @param workerManagerName the name of WorkerManager
     */
    public StaffInProgress(Staff staff, BSPJob jobConf,
        String workerManagerName) {
      this.staff = staff;
      this.jobConf = jobConf;
      this.localJobConf = null;
      // NOTE(review): the literal 0 and "running" are presumably the initial
      // progress and state string of StaffStatus - confirm against its
      // constructor.
      this.staffStatus = new StaffStatus(staff.getJobID(), staff.getStaffID(),
          0, StaffStatus.State.UNASSIGNED, "running", workerManagerName,
          StaffStatus.Phase.STARTING);
    }

    /** Creates an instance whose staff, configuration and status are null. */
    public StaffInProgress() {
    }

    /**
     * Set StaffStatus.
* Maps one of the Constants.SATAFF_STATUS codes onto the run state of
     * the wrapped StaffStatus; unknown codes are only logged.
     * @param stateStatus the state of staff
     * @param fault Fault stored when the state is FAULT
     */
    public void setStaffStatus(int stateStatus, Fault fault) {
      if (stateStatus == Constants.SATAFF_STATUS.RUNNING) {
        staffStatus.setRunState(StaffStatus.State.RUNNING);
      } else if (stateStatus == Constants.SATAFF_STATUS.SUCCEED) {
        staffStatus.setRunState(StaffStatus.State.SUCCEEDED);
        finishTime = System.currentTimeMillis();
        staffStatus.setFinishTime(finishTime);
      } else if (stateStatus == Constants.SATAFF_STATUS.FAULT) {
        staffStatus.setRunState(StaffStatus.State.FAULT);
        staffStatus.setFault(fault);
      } else {
        LOG.error("Unknown StaffStatus.State: <Constants.SATAFF_STATUS>"
            + stateStatus);
      }
    }

    /** @return the changeWorkerState flag */
    public boolean getChangeWorkerState() {
      return this.changeWorkerState;
    }

    /** @param changeWorkerState new value of the flag */
    public void setChangeWorkerState(boolean changeWorkerState) {
      this.changeWorkerState = changeWorkerState;
    }

    /** @return the error string */
    public String getError() {
      return error;
    }

    /** @return the fault super step */
    public int getFaultSSStep() {
      return this.faultSSStep;
    }

    /** @param faultSSStep the fault super step */
    public void setFaultSSStep(int faultSSStep) {
      this.faultSSStep = faultSSStep;
    }

    /** @param failCounter the number of failures */
    public void setFailCounter(int failCounter) {
      this.failCounter = failCounter;
    }

    /** @return the number of failures */
    public int getFailCounter() {
      return failCounter;
    }

    /**
     * Localize staff.
* @param task Staff */ private void localizeStaff(Staff task) throws IOException { Path localJobFile = this.jobConf.getLocalPath( Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/" + task.getStaffID() + "/job.xml"); // changed by chen Path localJarFile = null; if (Constants.USER_BC_BSP_JOB_TYPE_C.equals(this.getJobType())) { LOG.info("*************************************"); localJarFile = this.jobConf.getLocalPath( Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/" + task.getStaffID() + "/jobC"); } else { LOG.info("debug: in localizeStaff job.jar"); localJarFile = this.jobConf.getLocalPath( Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/" + task.getStaffID() + "/job.jar"); } String jobFile = task.getJobFile(); // systemFS.copyToLocalFile(new Path(jobFile), localJobFile); bspsystemFS.copyToLocalFile(new BSPHdfsImpl().newPath(jobFile), localJobFile); task.setJobFile(localJobFile.toString()); localJobConf = new BSPJob(task.getJobID(), localJobFile.toString()); localJobConf.set("bsp.task.id", task.getStaffID().toString()); String jarFile = null; LOG.info("debug: job type is " + this.getJobType()); if (Constants.USER_BC_BSP_JOB_TYPE_C.equals(this.getJobType())) { jarFile = localJobConf.getJobExe(); if (jarFile != null) { // systemFS.copyToLocalFile(new Path(jarFile), localJarFile); bspsystemFS.copyToLocalFile(new BSPHdfsImpl().newPath(jarFile), localJarFile); LOG.info("debug: jobExe=" + localJarFile.toString()); String localdir = Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER; Path localpath = this.jobConf .getLocalPath(Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/JobC"); File t = new File(localpath.toString()); if (!t.exists()) { // systemFS.copyToLocalFile(new Path(jarFile), localpath); // alter by gtt bspsystemFS.copyToLocalFile(new BSPHdfsImpl().newPath(jarFile), localpath); } LOG.info("localdir is :" + localdir); localJobConf.setJobExe(localJarFile.toString()); } } else { jarFile = localJobConf.getJar(); if (jarFile != null) { // systemFS.copyToLocalFile(new 
Path(jarFile), localJarFile); bspsystemFS.copyToLocalFile(new BSPHdfsImpl().newPath(jarFile), localJarFile); localJobConf.setJar(localJarFile.toString()); } } this.staffStatus.setMaxSuperStep(Long.parseLong(String.valueOf(localJobConf.getNumSuperStep()))); LOG.info("debug: localJarFile.toString() is " + localJarFile.toString()); this.staff.setJobExeLocalPath(localJarFile.toString()); LOG.debug("localizeStaff : " + localJobConf.getJar()); LOG.debug("localizeStaff : " + localJobFile.toString()); task.setConf(localJobConf); } public synchronized void setJobConf(BSPJob jobConf) { this.jobConf = jobConf; } public synchronized BSPJob getJobConf() { return localJobConf; } /** * Launch staff. * @throws IOException */ public void launchStaff() throws IOException { LOG.info("debug:before localizeStaff(staff);"); localizeStaff(staff); LOG.info("debug:after localizeStaff(staff);"); staffStatus.setRunState(StaffStatus.State.RUNNING); BSPJobID jobID = localJobConf.getJobID(); runningJobtoWorkerAgent.get(jobID).addStaffCounter(staff.getStaffAttemptId()); runningJobtoWorkerAgent.get(jobID).setJobConf(jobConf); runningStaffs.put(staff.getStaffAttemptId(), this); LOG.info("in launchStaff() jobC path is " + this.getStaff().getJobExeLocalPath()); synchronized (currentStaffsCount) { currentStaffsCount++; } LOG.info("debug:staff attemtId" + this.staff.getStaffAttemptId()); this.runner = staff.createRunner(WorkerManager.this); this.runner.setFaultSSStep(this.faultSSStep); LOG.info("debug: + before runner start"); this.runner.start(); } /** * This task has run on too long, and should be killed. * @param wasFailure flag of failure */ public synchronized void killAndCleanup(boolean wasFailure) throws IOException { onKillStaff(); runner.kill(); } /** * Kill staff. 
*/ private void onKillStaff() { if (this.staffAgent != null) { this.staffAgent.onKillStaff(); } } public Staff getStaff() { return staff; } public synchronized StaffStatus getStatus() { return staffStatus; } public StaffStatus.State getRunState() { return staffStatus.getRunState(); } /** * Judge that staff was killed. * @return flag of Staff that was killed */ public boolean wasKilled() { return wasKilled; } @Override public boolean equals(Object obj) { return (obj instanceof StaffInProgress) && staff.getStaffID().equals(((StaffInProgress) obj).getStaff().getStaffID()); } @Override public int hashCode() { return staff.getStaffID().hashCode(); } public void setStaffAgent(WorkerAgentForStaffInterface staffAgent) { this.staffAgent = staffAgent; } public int getMigrateSS() { return this.migrateSuperStep; } public void setMigrateSS(int superstep) { this.migrateSuperStep = superstep; } public String getJobType() { return this.jobConf.get(Constants.USER_BC_BSP_JOB_TYPE, ""); } /** For JUnit test. */ public void setStaffStatus(StaffStatus stfs) { this.staffStatus = stfs; } } public boolean isRunning() { return running; } /** * Construct WorkerManger. * @param workerManagerClass extends WorkerManager * @param conf Configuration * @return WorkerManager object */ public static WorkerManager constructWorkerManager(Class<? extends WorkerManager> workerManagerClass, final Configuration conf) { try { Constructor<? 
extends WorkerManager> c = workerManagerClass.getConstructor(Configuration.class); return c.newInstance(conf); } catch (Exception e) { throw new RuntimeException( "Failed construction of " + "WorkerManager: " + workerManagerClass.toString(), e); } } @Override public long getProtocolVersion(String protocol, long clientVersion) throws IOException { if (protocol.equals(WorkerManagerProtocol.class.getName())) { return BSPRPCProtocolVersion.versionID; } else if (protocol.equals(WorkerAgentProtocol.class.getName())) { return BSPRPCProtocolVersion.versionID; } else { throw new IOException("Unknown protocol to WorkerManager: " + protocol); } } /** * The main() for child processes. */ public static class Child { /** * WorkerManager's main method for * disposing and preserving WorkerManger. * @param args command parameters */ public static void main(String[] args) { BSPConfiguration defaultConf = new BSPConfiguration(); // report address String host = args[0]; int port = Integer.parseInt(args[1]); InetSocketAddress address = new InetSocketAddress(host, port); StaffAttemptID staffid = StaffAttemptID.forName(args[2]); int faultSSStep = Integer.parseInt(args[3]); String hostName = args[4]; String jobType = args[5]; LOG.info(staffid + ": Child Starts"); LOG.info("=*=*=*=*=*=*=*=*=*=*=*" + "=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*"); WorkerAgentProtocol umbilical = null; Staff staff = null; BSPJob job = null; try { umbilical = (WorkerAgentProtocol) RPC.getProxy(WorkerAgentProtocol.class, BSPRPCProtocolVersion.versionID, address, defaultConf); staff = umbilical.getStaff(staffid); LOG.info("debug:job.xml path = " + staff.getJobFile()); defaultConf.addResource(new Path(staff.getJobFile())); job = new BSPJob(staff.getJobID(), staff.getJobFile()); LOG.info("debug:job.exe path = " + staff.getJobExeLocalPath()); LOG.info("debug:job.jar path = " + job.getJar()); // use job-specified working directory new BSPFileSystemImpl(job.getConf()).setWorkingDirectory(new 
BSPHdfsImpl().getWorkingDirectory()); boolean recovery = umbilical.getStaffRecoveryState(staffid); boolean changeWorkerState = umbilical.getStaffChangeWorkerState(staffid); int failCounter = umbilical.getFailCounter(staffid); job.setInt("staff.fault.superstep", faultSSStep); int migrateStep = umbilical.getMigrateSuperStep(staffid); if (Constants.USER_BC_BSP_JOB_TYPE_C.equals(jobType)) { staff.runC(job, staff, umbilical, recovery, changeWorkerState, failCounter, hostName); } else if (job.getComputeState() == 0) { staff.run(job, staff, umbilical, recovery, changeWorkerState, migrateStep, failCounter, hostName); // run the task } else if (job.getComputeState() == 1) { staff.runPartition(job, staff, umbilical, recovery, changeWorkerState, migrateStep, failCounter, hostName); } else if (job.getComputeState() == 2) { //add for SGA-Graph staff.runSGAGraph(job, staff, umbilical, recovery, changeWorkerState, migrateStep, failCounter, hostName); } LOG.info("staff " + staffid + "run complete!"); } catch (ClassNotFoundException cnfE) { LOG.error("Exception has been catched in WorkerManager--Error running child", cnfE); // Report back any failures, for diagnostic purposes ByteArrayOutputStream baos = new ByteArrayOutputStream(); cnfE.printStackTrace(new PrintStream(baos)); umbilical.setStaffStatus(staffid, Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.CRITICAL, umbilical.getWorkerManagerName(job.getJobID(), staffid), cnfE.toString(), job.toString(), staffid.toString()), 0); } catch (FSError e) { LOG.error("Exception has been catched in WorkerManager--FSError from child", e); umbilical.setStaffStatus(staffid, Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.CRITICAL, umbilical.getWorkerManagerName(job.getJobID(), staffid), e.toString(), job.toString(), staffid.toString()), 0); } catch (Throwable throwable) { LOG.error("Exception has been catched in WorkerManager--Error running child", throwable); // Report back any 
failures, for diagnostic purposes ByteArrayOutputStream baos = new ByteArrayOutputStream(); throwable.printStackTrace(new PrintStream(baos)); umbilical.setStaffStatus(staffid, Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.CRITICAL, umbilical.getWorkerManagerName(job.getJobID(), staffid), throwable.toString(), job.toString(), staffid.toString()), 0); } finally { RPC.stopProxy(umbilical); MetricsContext metricsContext = MetricsUtil.getContext("mapred"); metricsContext.close(); // Shutting down log4j of the child-vm... // This assumes that on return from Staff.run() // there is no more logging done. LogManager.shutdown(); } } } @Override public Staff getStaff(StaffAttemptID staffid) throws IOException { StaffInProgress sip = runningStaffs.get(staffid); if (sip != null) { return sip.getStaff(); } else { LOG.warn(staffid + " is not in the runningStaffs"); return null; } } @Override public boolean getStaffRecoveryState(StaffAttemptID staffId) { return runningStaffs.get(staffId).getStatus().isRecovery(); } @Override public boolean getStaffChangeWorkerState(StaffAttemptID staffId) { return runningStaffs.get(staffId).getChangeWorkerState(); } @Override public int getFailCounter(StaffAttemptID staffId) { return this.runningStaffs.get(staffId).getFailCounter(); } @Override public boolean ping(StaffAttemptID staffId) throws IOException { return false; } @Override public void done(StaffAttemptID staffId, boolean shouldBePromoted) throws IOException { } @Override public void fsError(StaffAttemptID staffId, String message) throws IOException { } @Override public String getWorkerManagerName(BSPJobID jobId, StaffAttemptID staffId) { return runningJobtoWorkerAgent.get(jobId).getWorkerManagerName(jobId, staffId); } @Override public boolean localBarrier(BSPJobID jobId, StaffAttemptID staffId, int superStepCounter, SuperStepReportContainer ssrc) { LOG.info("ljn test : workermanager localBarrier is in step " + superStepCounter); if 
(this.runningStaffs.containsKey(staffId)) { this.runningStaffs.get(staffId).getStatus().setProgress(superStepCounter + 1); } return runningJobtoWorkerAgent.get(jobId).localBarrier(jobId, staffId, superStepCounter, ssrc); } @Override public void addCounters(BSPJobID jobId, Counters pCounters) { this.runningJobtoWorkerAgent.get(jobId).addCounters(pCounters); } @Override public int getNumberWorkers(BSPJobID jobId, StaffAttemptID staffId) { return runningJobtoWorkerAgent.get(jobId).getNumberWorkers(jobId, staffId); } @Override public void setNumberWorkers(BSPJobID jobId, StaffAttemptID staffId, int num) { runningJobtoWorkerAgent.get(jobId).setNumberWorkers(jobId, staffId, num); } @Override public void addStaffReportCounter(BSPJobID jobId) { runningJobtoWorkerAgent.get(jobId).addStaffReportCounter(); } public String getWorkerManagerName() { return this.workerManagerName; } @Override public BSPJobID getBSPJobID() { return null; } @Override public void setStaffStatus(StaffAttemptID staffId, int staffStatus, Fault fault, int stage) { this.runningStaffs.get(staffId).setStaffStatus(staffStatus, fault); this.runningStaffs.get(staffId).getStatus().setStage(stage); } /** * Get StaffStatus. * @param staffId StaffAttemptID * @return StaffStatus */ public StaffStatus getStaffStatus(StaffAttemptID staffId) { return this.runningStaffs.get(staffId).getStatus(); } /** * This method is used to set mapping table that shows the partition to the * worker. According to Job ID get WorkerAgentForJob and call its method to * set this mapping table. * @param jobId BSPJobID * @param partitionId id of partition * @param hostName the name of host */ @Override public void setWorkerNametoPartitions(BSPJobID jobId, int partitionId, String hostName) { this.runningJobtoWorkerAgent.get(jobId).setWorkerNametoPartitions(jobId, partitionId, hostName); } /** * Get the hostName of the workerManager. 
* @return hostName */ public String getHostName() { return this.conf.get(Constants.BC_BSP_WORKERAGENT_HOST, Constants.DEFAULT_BC_BSP_WORKERAGENT_HOST); } @Override public void clearFailedJobList() { this.failedJobList.clear(); } @Override public void addFailedJob(BSPJobID jobId) { this.failedJobList.add(jobId); } @Override public int getFailedJobCounter() { return this.failedJobList.size(); } @Override public synchronized int getFreePort() { ServerSocket s; this.currentFreePort = this.currentFreePort + 1; int count = 0; for (; this.currentFreePort <= 65536; this.currentFreePort++) { count++; if (count > 5535) { LOG.info("[WorkerManager: getFreePort()] attempts " + "to get a free port over 5535 times!"); return 60000; } if (this.currentFreePort > 65535) { this.currentFreePort = 60001; } try { LOG.info("debug:this.currentFreePort is " + this.currentFreePort); s = new ServerSocket(this.currentFreePort); s.close(); return this.currentFreePort; } catch (IOException e) { LOG.info("debug:this.currentFreePort is " + this.currentFreePort); LOG.error("[WokerManager] caught", e); } } return 60000; } @Override public void setStaffAgentAddress(StaffAttemptID staffID, String addr) { if (this.runningStaffs.containsKey(staffID)) { StaffInProgress sip = this.runningStaffs.get(staffID); String[] addrs = addr.split(":"); InetSocketAddress address = new InetSocketAddress(addrs[0], Integer.parseInt(addrs[1])); WorkerAgentForStaffInterface staffAgent = null; try { staffAgent = (WorkerAgentForStaffInterface) RPC.getProxy(WorkerAgentForStaffInterface.class, WorkerAgentForStaffInterface.versionID, address, this.conf); } catch (IOException e) { LOG.error("[WorkerManager] caught: ", e); } sip.setStaffAgent(staffAgent); } } @Override public void process(WatchedEvent event) { LOG.info("now in process"); LOG.info("event type is " + event.getType()); try { if (event.getType().toString().equals("NodeDeleted")) { LOG.info("in NodeDeleted"); if (bspzk != null) { if 
(bspzk.equaltoStat(Constants.BSPCONTROLLER_STANDBY_LEADER, true)) { String standControllerAddr = getData(Constants.BSPCONTROLLER_LEADER); InetSocketAddress newStandbyAddr = NetUtils.createSocketAddr(standControllerAddr); if (!this.standbyControllerAddr.equals(newStandbyAddr)) { this.bspControllerAddr = this.standbyControllerAddr; this.standbyControllerAddr = newStandbyAddr; this.controllerClient = (ControllerProtocol) RPC.getProxy(ControllerProtocol.class, BSPRPCProtocolVersion.versionID, this.bspControllerAddr, conf); this.standbyControllerClient = (ControllerProtocol) RPC.getProxy( ControllerProtocol.class, BSPRPCProtocolVersion.versionID, this.standbyControllerAddr, conf); } LOG.info("now the active is " + this.bspControllerAddr.toString() + "and the standby is " + this.standbyControllerAddr.toString()); } } } else if (event.getType().toString().equals("NodeDataChanged")) { // watch the standby bspzk.exists(Constants.BSPCONTROLLER_STANDBY_LEADER, true); // establish the communication link to standby bsp master this.standbyControllerClient = (ControllerProtocol) RPC.getProxy(ControllerProtocol.class, BSPRPCProtocolVersion.versionID, this.standbyControllerAddr, conf); LOG.info("bspControllerAddr = " + bspControllerAddr + " standbyControllerAddr = " + standbyControllerAddr); if (!this.standbyControllerClient.register(workerMangerStatus)) { LOG.error("There is a problem in establishing communication" + " link with BSPController"); throw new IOException( "There is a problem in establishing" + " communication link with BSPController."); } else { LOG.info("have registed to standby bsp master"); } } } catch (Exception e) { LOG.error("problem happened when register to standby controller " + e.toString()); } } /** * Get data from the path of ZooKeeper. 
* @param path the path of ZooKeeper * @return data from the path of ZooKeeper * @throws KeeperException * @throws InterruptedException */ public String getData(String path) throws KeeperException, InterruptedException { if (bspzk != null) { byte[] b = bspzk.getData(path, false, null); return new String(b); } return null; } /** * Get the address of BspController whose role is Active */ public void choseActiveControllerAddress() { String zkAddress = conf.get(Constants.ZOOKEEPER_QUORUM) + ":" + conf.getInt(Constants.ZOOKEPER_CLIENT_PORT, Constants.DEFAULT_ZOOKEPER_CLIENT_PORT); try { this.bspzk = new BSPZookeeperImpl(zkAddress, Constants.SESSION_TIME_OUT, this); if (bspzk != null) { if (bspzk.equaltoStat(Constants.BSPCONTROLLER_LEADER, true)) { String controllerAddr = getData(Constants.BSPCONTROLLER_LEADER); LOG.info("active controller Address is " + controllerAddr); this.bspControllerAddr = NetUtils.createSocketAddr(controllerAddr); } else { LOG.error("could not get the active BspController's " + "address,please restart the System"); } // s = zk.exists(Constants.BSPCONTROLLER_STANDBY_LEADER, true); if (bspzk.equaltoStat(Constants.BSPCONTROLLER_STANDBY_LEADER, true)) { String standControllerAddr = getData(Constants.BSPCONTROLLER_STANDBY_LEADER); LOG.info("standby controller Address is " + standControllerAddr); this.standbyControllerAddr = NetUtils.createSocketAddr(standControllerAddr); } else { LOG.info("could not get the standby BspController's address," + "please restart the standby System"); } } } catch (Exception e) { LOG.error("could not get the active BspController's address," + "please restart the System:" + e.getMessage()); } } /** * Ensure fresh jobSubmitClient. 
* Makes the standby controller client the active one and swaps the active
   * and standby controller addresses.
   */
  public void ensureFreshControllerClient() {
    this.controllerClient = this.standbyControllerClient;
    InetSocketAddress temp = this.bspControllerAddr;
    this.bspControllerAddr = this.standbyControllerAddr;
    this.standbyControllerAddr = temp;
    // Plain concatenation prints "null" for a missing address instead of
    // throwing; the former try/catch around the second message could never
    // trigger, because the first message already dereferenced both fields.
    LOG.info("now the active is " + this.bspControllerAddr
        + "and the standby is " + this.standbyControllerAddr);
    LOG.info(" in workerManager, Now will try to connect "
        + this.standbyControllerAddr);
  }

  @Override
  public int getMigrateSuperStep(StaffAttemptID staffId) {
    return runningStaffs.get(staffId).getMigrateSS();
  }

  @Override
  public boolean updateWorkerJobState(StaffAttemptID staffId) {
    return this.runningJobtoWorkerAgent.get(staffId.getJobID())
        .updateStaffsReporter(staffId);
  }

  @Override
  public void clearStaffRC(BSPJobID jobId) {
    runningJobtoWorkerAgent.get(jobId).clearStaffRC(jobId);
  }

  public Configuration getConf() {
    return conf;
  }

  public Integer getCurrentStaffsCount() {
    return currentStaffsCount;
  }

  public void setCurrentStaffsCount(Integer currentStaffsCount) {
    this.currentStaffsCount = currentStaffsCount;
  }

  public int getFinishedStaffsCount() {
    return finishedStaffsCount;
  }

  public void setFinishedStaffsCount(int finishedStaffsCount) {
    this.finishedStaffsCount = finishedStaffsCount;
  }

  public Map<StaffAttemptID, StaffInProgress> getFinishedStaffs() {
    return finishedStaffs;
  }

  public void setFinishedStaffs(
      Map<StaffAttemptID, StaffInProgress> finishedStaffs) {
    this.finishedStaffs = finishedStaffs;
  }

  public Map<BSPJobID, RunningJob> getRunningJobs() {
    return runningJobs;
  }

  public void setRunningJobs(Map<BSPJobID, RunningJob> runningJobs) {
    this.runningJobs = runningJobs;
  }

  public Map<BSPJobID, WorkerAgentForJob> getRunningJobtoWorkerAgent() {
    return runningJobtoWorkerAgent;
  }

  public void setRunningJobtoWorkerAgent(
      Map<BSPJobID, WorkerAgentForJob> runningJobtoWorkerAgent) {
    this.runningJobtoWorkerAgent = runningJobtoWorkerAgent;
  }

  public Map<StaffAttemptID, StaffInProgress> getRunningStaffs() {
    return runningStaffs;
  }

  public void setRunningStaffs(
      Map<StaffAttemptID, StaffInProgress> runningStaffs) {
    this.runningStaffs = runningStaffs;
  }

  public ArrayList<BSPJobID> getFailedJobList() {
    return failedJobList;
  }

  public void setFailedJobList(ArrayList<BSPJobID> failedJobList) {
    this.failedJobList = failedJobList;
  }

  public ControllerProtocol getControllerClient() {
    return controllerClient;
  }

  public void setControllerClient(ControllerProtocol controllerClient) {
    this.controllerClient = controllerClient;
  }

  public boolean isJustInited() {
    return justInited;
  }

  public void setJustInited(boolean justInited) {
    this.justInited = justInited;
  }

  public InetSocketAddress getBspControllerAddr() {
    return bspControllerAddr;
  }

  public void setBspControllerAddr(InetSocketAddress bspControllerAddr) {
    this.bspControllerAddr = bspControllerAddr;
  }

  public ControllerProtocol getStandbyControllerClient() {
    return standbyControllerClient;
  }

  public void setStandbyControllerClient(
      ControllerProtocol standbyControllerClient) {
    this.standbyControllerClient = standbyControllerClient;
  }

  public InetSocketAddress getStandbyControllerAddr() {
    return standbyControllerAddr;
  }

  public void setStandbyControllerAddr(
      InetSocketAddress standbyControllerAddr) {
    this.standbyControllerAddr = standbyControllerAddr;
  }

  public BSPZookeeper getBspzk() {
    return bspzk;
  }

  public void setBspzk(BSPZookeeper bspzk) {
    this.bspzk = bspzk;
  }
}