Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.chinamobile.bcbsp.sync; import java.util.ArrayList; import java.util.List; import java.util.concurrent.CountDownLatch; import org.apache.commons.logging.LogFactory; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; import org.apache.zookeeper.ZooDefs.Ids; import org.apache.commons.logging.Log; import com.chinamobile.bcbsp.BSPConfiguration; import com.chinamobile.bcbsp.Constants; import com.chinamobile.bcbsp.Constants.BspCounters; import com.chinamobile.bcbsp.bspcontroller.Counters; import com.chinamobile.bcbsp.bspcontroller.JobInProgressControlInterface; import com.chinamobile.bcbsp.thirdPartyInterface.Zookeeper.BSPZookeeper; import com.chinamobile.bcbsp.thirdPartyInterface.Zookeeper.impl.BSPCreateModeImpl; import com.chinamobile.bcbsp.thirdPartyInterface.Zookeeper.impl.BSPZookeeperImpl; import com.chinamobile.bcbsp.thirdPartyInterface.Zookeeper.impl.BSPkeeperStateImpl; import com.chinamobile.bcbsp.util.BSPJobID; /** * GeneralSSController GeneralSSController for completing the general SuperStep * synchronization control. This class is connected to JobInProgress. * * @author * @version */ public class GeneralSSController implements Watcher, GeneralSSControllerInterface { /** The log of the class. */ private static final Log LOG = LogFactory.getLog(GeneralSSController.class); /** The bsp configuration. */ private BSPConfiguration conf; /** The job inprogress controller. */ private JobInProgressControlInterface jip; /** The id of the bsp job. */ private BSPJobID jobId; /** The superstep counter. */ // private int superStepCounter = 0; private int superStepCounter = -2; /** The fault superstepcounter. */ private int faultSuperStepCounter = 0; /** The base checke number. */ private int checkNumBase; /** The zookeeper of the bsp. */ private BSPZookeeper bspzk = null; /** The zookeeper address. */ private final String zookeeperAddr; /** The root of the zookeeper. */ private final String bspZKRoot; /** The volatile variable of the thread. */ private volatile Integer mutex = 0; /** The stage flag. */ private int stageFlag = 1; /** The object of he zookeeper run. */ private ZooKeeperRun zkRun = new ZooKeeperRun(); /** The object of the counters. */ private Counters counters = new Counters(); /** The connected Latch. */ private CountDownLatch connectedLatch = null; /** * The operation of the zookeeper. * * @author */ public class ZooKeeperRun extends Thread { /** * @param ssc * SuperStepCommand * @throws Exception * exception. */ public void startNextSuperStep(SuperStepCommand ssc) throws Exception { int nextSuperStep = ssc.getNextSuperStepNum(); jip.reportLOG(jobId.toString() + " the next superstepnum is : " + nextSuperStep); if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + nextSuperStep, false)) { bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + nextSuperStep, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } else { jip.reportLOG("The node hash exists" + bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + nextSuperStep); List<String> tmpList = new ArrayList<String>(); tmpList = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + nextSuperStep, false); for (String e : tmpList) { bspzk.delete( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + nextSuperStep + "/" + e, bspzk.exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + nextSuperStep + "/" + e, false).getAversion()); } } if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + nextSuperStep, false)) { bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + nextSuperStep, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } else { List<String> tmpList = new ArrayList<String>(); tmpList = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + nextSuperStep, false); for (String e : tmpList) { bspzk.delete( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + nextSuperStep + "/" + e, bspzk.exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + nextSuperStep + "/" + e, false).getAversion()); } } if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate", false)) { bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } else { jip.reportLOG("The node hash exists" + bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate" + "/"); List<String> tmpList = new ArrayList<String>(); // Stat tmpStat = null; tmpList = bspzk.getChildren(bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate", false); for (String e : tmpList) { bspzk.delete(bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate" + "/" + e, bspzk .exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate" + "/" + e, false) .getAversion()); } } if (bspzk.equaltostat( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + nextSuperStep, false)) { bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + nextSuperStep, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } else { List<String> tmpList = new ArrayList<String>(); tmpList = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + nextSuperStep, false); for (String e : tmpList) { bspzk.delete( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + nextSuperStep + "/" + e, bspzk.exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + nextSuperStep + "/" + e, false).getAversion()); } } if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + Constants.COMMAND_NAME, false)) { LOG.info("no such node ,now will create it"); bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + Constants.COMMAND_NAME, ssc.toString().getBytes(), Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } jip.reportLOG(jobId.toString() + " command of next is " + ssc.toString()); jip.reportLOG( jobId.toString() + " [Write Command Path] " + bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + Constants.COMMAND_NAME); jip.reportLOG(jobId.toString() + " leave the barrier of " + superStepCounter); if (nextSuperStep % Constants.K == 0) { if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-m" + "/" + String.valueOf(nextSuperStep / Constants.K), false)) { bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-m" + "/" + String.valueOf(nextSuperStep / Constants.K), new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } else { List<String> tmpList = new ArrayList<String>(); tmpList = bspzk.getChildren(bspZKRoot + "/" + jobId.toString().substring(17) + "-m" + "/" + String.valueOf(nextSuperStep / Constants.K), false); for (String e : tmpList) { bspzk.delete( bspZKRoot + "/" + jobId.toString().substring(17) + "-m" + "/" + String.valueOf(nextSuperStep / Constants.K) + "/" + e, bspzk.exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-m" + "/" + String.valueOf(nextSuperStep / Constants.K) + "/" + e, false) .getAversion()); } } } } /** * @param command * SuperStepCommand * @throws Exception * exception. */ public void stopNextSuperStep(String command) throws Exception { if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + Constants.COMMAND_NAME, false)) { bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + Constants.COMMAND_NAME, command.getBytes(), Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } jip.reportLOG(jobId.toString() + " command of next is " + command); jip.reportLOG(jobId.toString() + " prepare to quit"); } /** * @param ableCheckPoint * while can Check Point */ public void cleanReadHistory(int ableCheckPoint) { List<String> tmpList = new ArrayList<String>(); try { tmpList = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + ableCheckPoint, false); for (String e : tmpList) { bspzk.delete( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + ableCheckPoint + "/" + e, bspzk.exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + ableCheckPoint + "/" + e, false).getAversion()); jip.reportLOG("The node hash exists" + bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + ableCheckPoint + "/" + e); } } catch (Exception exc) { jip.reportLOG(jobId.toString() + " [cleanReadHistory]" + exc.getMessage()); } } /** * This is a thread and execute the logic control. */ @Override public void run() { boolean jobEndFlag = true; try { if (bspzk.equaltostat( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + superStepCounter, false)) { // LOG.info("ljn test : zk create " + bspZKRoot + "/" // + jobId.toString().substring(17) + "-ss" + "/" + superStepCounter); bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + superStepCounter, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); // ljn SGA-Graph // bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + // "-ss" // + "/" + -2, new byte[0], BSPIds.OPEN_ACL_UNSAFE, // new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter, false)) { bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); // ljn SGA-Graph // bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + // "-sc" // + "/" + -2, new byte[0], BSPIds.OPEN_ACL_UNSAFE, // new BSPCreateModeImpl().getPERSISTENT()); } //ljn sga-garaph if (bspzk.equaltostat( bspZKRoot + "/" + jobId.toString().substring(17) + "-m" + "/" + String.valueOf(0), false)) { bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-m" + "/" + String.valueOf(0), new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + superStepCounter, false)) { bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + superStepCounter, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); // bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) // + "-counters" + "/" + -2, new byte[0], // BSPIds.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } // ljn SGA-Graph // LOG.info("ljn test : bspzk.create " + bspZKRoot + "/" // + jobId.toString().substring(17) + "-e" + "/" + 0); if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-e" + "/" + 0, false)) { bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) + "-e" + "/" + 0, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); // bspzk.create(bspZKRoot + "/" + jobId.toString().substring(17) // + "-counters" + "/" + -2, new byte[0], // BSPIds.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } } catch (Exception e) { throw new RuntimeException(e); } while (jobEndFlag) { try { long start = System.currentTimeMillis(); setStageFlag(Constants.SUPERSTEP_STAGE.FIRST_STAGE); generalSuperStepBarrier(checkNumBase * 2); // LOG.info("ljn test : SUPERSTEP_STAGE.SECOND_STAGE before. "); setStageFlag(Constants.SUPERSTEP_STAGE.SECOND_STAGE); // LOG.info("ljn test : SUPERSTEP_STAGE.SECOND_STAGE after and checkNumBase is " // + checkNumBase); SuperStepCommand ssc = getSuperStepCommand(checkNumBase); GeneralSSController.this.counters.findCounter(BspCounters.TIME_IN_SYNC_MS) .increment(System.currentTimeMillis() - start); updateJobCounters(checkNumBase); switch (ssc.getCommandType()) { case Constants.COMMAND_TYPE.START: startNextSuperStep(ssc); setCheckNumBase(); superStepCounter = ssc.getNextSuperStepNum(); jip.setSuperStepCounter(superStepCounter); // LOG.info("ljn test : START checkNumBase is " + checkNumBase); break; case Constants.COMMAND_TYPE.START_AND_CHECKPOINT: startNextSuperStep(ssc); generalSuperStepBarrier(checkNumBase * 3); jip.setAbleCheckPoint(superStepCounter); LOG.info("ableCheckPoint: " + superStepCounter); superStepCounter = ssc.getNextSuperStepNum(); jip.setSuperStepCounter(superStepCounter); // LOG.info("ljn test : START_AND_CHECKPOINT checkNumBase is " // + checkNumBase); break; case Constants.COMMAND_TYPE.START_AND_RECOVERY: cleanReadHistory(ssc.getAbleCheckPoint()); startNextSuperStep(ssc); setCheckNumBase(); // LOG.info("ljn test : START_AND_RECOVERY checkNumBase is " // + checkNumBase); superStepCounter = ssc.getAbleCheckPoint(); generalSuperStepBarrier(checkNumBase * 1); superStepCounter = ssc.getNextSuperStepNum(); jip.setSuperStepCounter(superStepCounter); break; case Constants.COMMAND_TYPE.STOP: stopNextSuperStep(ssc.toString()); jobEndFlag = quitBarrier(); jip.clearStaffsForJob(); break; default: jip.reportLOG(jobId.toString() + " Unkonwn command of " + ssc.getCommandType()); } } catch (Exception e) { jip.reportLOG(jobId.toString() + "error: " + e.toString()); throw new RuntimeException(e); } } } } /** * Generate the GeneralSSController to control the synchronization between * SuperSteps * * @param jobId * The id of the bspjob. */ public GeneralSSController(BSPJobID jobId) { this.jobId = jobId; this.conf = new BSPConfiguration(); this.zookeeperAddr = conf.get(Constants.ZOOKEEPER_QUORUM) + ":" + conf.getInt(Constants.ZOOKEPER_CLIENT_PORT, Constants.DEFAULT_ZOOKEPER_CLIENT_PORT); this.bspZKRoot = Constants.BSPJOB_ZOOKEEPER_DIR_ROOT; setup(); } /** For JUnit test. */ public GeneralSSController(BSPJobID jobId, BSPConfiguration conf) { this.jobId = jobId; this.conf = conf; this.zookeeperAddr = conf.get(Constants.ZOOKEEPER_QUORUM) + ":" + conf.getInt(Constants.ZOOKEPER_CLIENT_PORT, Constants.DEFAULT_ZOOKEPER_CLIENT_PORT); ; this.bspZKRoot = Constants.BSPJOB_ZOOKEEPER_DIR_ROOT; setup(); } @Override public boolean isCommandBarrier() { try { if (bspzk.equaltostat(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + Constants.COMMAND_NAME, false)) { return false; } else { return true; } } catch (Exception e) { jip.reportLOG("[isCommandBarrier] " + e.getMessage()); // return false; throw new RuntimeException(e); } } @Override public void setJobInProgressControlInterface(JobInProgressControlInterface aJip) { this.jip = aJip; // this.superStepCounter = aJip.getSuperStepCounter(); // ljn SGA-Graph } @Override public void setCheckNumBase() { this.checkNumBase = jip.getCheckNum(); } /** * @return stageflag. */ public int getStageFlag() { return stageFlag; } /** * @param stageFlag * The stageFlag. */ public void setStageFlag(int stageFlag) { this.stageFlag = stageFlag; } @Override public void process(WatchedEvent event) { synchronized (mutex) { mutex.notify(); } if (event.getState() == new BSPkeeperStateImpl().getSyncConnected()) { this.connectedLatch.countDown(); } } /** * get zookeeper. * * @return BSPZookeeper */ private BSPZookeeper getZooKeeper() { try { if (this.bspzk == null) { this.connectedLatch = new CountDownLatch(1); this.bspzk = new BSPZookeeperImpl(this.zookeeperAddr, Constants.SESSION_TIME_OUT, this); this.zkWaitConnected(bspzk); return bspzk; } else { return this.bspzk; } } catch (Exception e) { LOG.error("[getZooKeeper]", e); throw new RuntimeException(e); } } /** * @param bspzk * BSPzookeeper. */ public void zkWaitConnected(BSPZookeeper bspzk) { if (bspzk.equaltoState()) { try { this.connectedLatch.await(); } catch (InterruptedException e) { throw new IllegalStateException(e); } } } @Override public void setup() { try { this.bspzk = this.getZooKeeper(); if (bspzk != null) { if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-s", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-s", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-d", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-d", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-ss", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-ss", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-sc", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-sc", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-counters", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-counters", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-migrate", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-migrate", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } // ljn SGA-Graph if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-e", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-e", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } if (bspzk.equaltostat(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-m", false)) { bspzk.create(this.bspZKRoot + "/" + this.jobId.toString().substring(17) + "-m", new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); } } } catch (Exception e) { jip.reportLOG(jobId.toString() + " [setup]" + e.getMessage()); throw new RuntimeException(e); } } @Override public void cleanup() { List<String> list = new ArrayList<String>(); List<String> tmpList = new ArrayList<String>(); try { try { list.clear(); list = bspzk.getChildren(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-s", false); for (String e : list) { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-s" + "/" + e, bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-s" + "/" + e, false) .getVersion()); } } catch (Exception e) { LOG.info("The exception is " + e.getStackTrace()); } finally { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-s", bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-s", false).getVersion()); } jip.reportLOG(jobId.toString() + "delete the -s"); try { list.clear(); list = bspzk.getChildren(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-d", false); for (String e : list) { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-d" + "/" + e, bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-d" + "/" + e, false) .getVersion()); } } catch (Exception e) { LOG.info("The exception is " + e.getStackTrace()); } finally { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-d", bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-d", false).getVersion()); } jip.reportLOG(jobId.toString() + "delete the -d"); try { list.clear(); list = bspzk.getChildren(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate", false); for (String e : list) { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate" + "/" + e, bspzk.exists( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate" + "/" + e, false).getVersion()); } } catch (Exception e) { LOG.info("The exception is " + e.getStackTrace()); } finally { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate", bspzk.exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-migrate", false) .getVersion()); } jip.reportLOG(jobId.toString() + "delete the -migrate"); list.clear(); list = bspzk.getChildren(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss", false); for (String e : list) { try { tmpList.clear(); tmpList = bspzk.getChildren( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + e, false); for (String ee : tmpList) { bspzk.delete( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + e + "/" + ee, bspzk.exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + e + "/" + ee, false).getAversion()); } } catch (Exception exc) { exc.printStackTrace(); } finally { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + e, bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + e, false) .getVersion()); } } bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss", bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-ss", false).getVersion()); jip.reportLOG(jobId.toString() + "delete the -ss"); list.clear(); list = bspzk.getChildren(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc", false); for (String e : list) { try { tmpList.clear(); tmpList = bspzk.getChildren( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + e, false); for (String ee : tmpList) { bspzk.delete( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + e + "/" + ee, bspzk.exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + e + "/" + ee, false).getAversion()); } } catch (Exception exc) { LOG.info("The exception is " + exc.getStackTrace()); } finally { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + e, bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + e, false) .getVersion()); } } bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc", bspzk .exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-sc", false).getVersion()); jip.reportLOG(jobId.toString() + "delete the -sc"); list.clear(); list = bspzk.getChildren(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters", false); for (String e : list) { try { tmpList.clear(); tmpList = bspzk.getChildren( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + e, false); for (String ee : tmpList) { bspzk.delete( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + e + "/" + ee, bspzk.exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + e + "/" + ee, false).getAversion()); } } catch (Exception exc) { LOG.info("The exception is " + exc.getStackTrace()); } finally { bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + e, bspzk.exists( this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + e, false).getVersion()); } } bspzk.delete(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters", bspzk.exists(this.bspZKRoot + "/" + jobId.toString().substring(17) + "-counters", false) .getVersion()); jip.reportLOG(jobId.toString() + "delete the -counters"); } catch (Exception e) { throw new RuntimeException(e); } } @Override public void start() { this.zkRun.start(); } @Override @SuppressWarnings("deprecation") public void stop() { this.zkRun.stop(); } @Override public boolean generalSuperStepBarrier(int checkNum) { List<String> list = new ArrayList<String>(); try { jip.reportLOG(jobId.toString() + " enter the barrier of " + superStepCounter); LOG.info("ljn test : start the generalSuperStepBarrier check number is " + checkNum); while (true) { synchronized (mutex) { list.clear(); list = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + superStepCounter, true); LOG.info("ljn test : generalSuperStepBarrier while loop list -ss is : " + bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + superStepCounter); if (list.size() < checkNum) { mutex.wait(); } else { break; } } } LOG.info("ljn test : over the generalSuperStepBarrier check number is " + checkNum); return true; } catch (Exception e) { jip.reportLOG(jobId.toString() + "error: " + e.toString()); throw new RuntimeException(e); } } @Override public SuperStepCommand getSuperStepCommand(int checkNum) { List<String> list = new ArrayList<String>(); try { while (true) { synchronized (mutex) { list.clear(); list = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter, true); if (list.size() < checkNum) { jip.reportLOG("[getSuperStepCommand]: " + list.size() + " instead of " + checkNum); mutex.wait(); } else { jip.reportLOG("[getSuperStepCommand]: " + list.size()); break; } } } SuperStepReportContainer[] ssrcs = new SuperStepReportContainer[checkNumBase]; int counter = 0; for (String e : list) { LOG.info(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + e + " get Superstep command checkNumBase=" + checkNumBase + " list.size=" + list.size()); if (!e.equals(Constants.COMMAND_NAME)) { byte[] b = bspzk.getData( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + e, false, bspzk.exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + superStepCounter + "/" + e, false)); ssrcs[counter] = new SuperStepReportContainer(new String(b)); LOG.info("ljn test : checkNumBase is " + checkNumBase + " not command." + " counter is " + counter); counter++; } } SuperStepCommand ssc = jip.generateCommand(ssrcs); return ssc; } catch (Exception e) { LOG.error("!!!!!!!!!!!!!!!!!!!!!!"); LOG.error(e); throw new RuntimeException(e); } } @Override public boolean quitBarrier() { List<String> list = new ArrayList<String>(); try { while (true) { synchronized (mutex) { list.clear(); list = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + superStepCounter, true); if (list.size() > 0) { mutex.wait(); } else { break; } } } } catch (Exception e) { throw new RuntimeException(e); } finally { jip.completedJob(); return false; } } @Override public void recoveryBarrier(List<String> WMNames) { LOG.info("recoveryBarrier: this.superStepCounter " + superStepCounter); faultSuperStepCounter = superStepCounter; int base = WMNames.size(); switch (this.stageFlag) { case Constants.SUPERSTEP_STAGE.FIRST_STAGE: try { jip.reportLOG("recoveried: " + this.jobId.toString() + " enter the firstStageSuperStepBarrier of " + Integer.toString(superStepCounter)); for (int i = 0; i < base; i++) { bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + Integer.toString(superStepCounter) + "/" + WMNames.get(i) + "-recovery" + 0, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-ss" + "/" + Integer.toString(superStepCounter) + "/" + WMNames.get(i) + "-recovery" + 1, new byte[0], Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); LOG.info("first--recoveryBarrier: " + "recovery" + i); } jip.reportLOG("recoveried: " + this.jobId.toString() + " enter the secondStageSuperStepBarrier(first) of " + Integer.toString(superStepCounter)); for (int i = 0; i < base; i++) { bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + Integer.toString(superStepCounter) + "/" + WMNames.get(i) + "-recovery" + i, "RECOVERY".getBytes(), Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + Integer.toString(superStepCounter) + "/" + WMNames.get(i), this.counters.makeEscapedCompactString().getBytes(), Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); LOG.info("second-(first)--recoveryBarrier: " + "recovery" + i); } } catch (KeeperException e) { e.printStackTrace(); throw new RuntimeException(e); } catch (InterruptedException e) { e.printStackTrace(); throw new RuntimeException(e); } break; case Constants.SUPERSTEP_STAGE.SECOND_STAGE: try { jip.reportLOG( "recoveried " + this.jobId.toString() + " enter the secondStageSuperStepBarrier(second) of" + " superStepCounter: " + Integer.toString(superStepCounter)); for (int i = 0; i < base; i++) { bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-sc" + "/" + Integer.toString(superStepCounter) + "/" + WMNames.get(i) + "-recovery" + i, "RECOVERY".getBytes(), Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); bspzk.create( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + Integer.toString(superStepCounter) + "/" + WMNames.get(i), this.counters.makeEscapedCompactString().getBytes(), Ids.OPEN_ACL_UNSAFE, new BSPCreateModeImpl().getPERSISTENT()); LOG.info("second--recoveryBarrier: " + "recovery" + i); } } catch (KeeperException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } break; default: jip.reportLOG(jobId.toString() + " Unkonwn command of "); } } /** * add by chen HA recovery need to Know the running job's SuperStep. */ @Override public void setCurrentSuperStep() { List<String> list = new ArrayList<String>(); try { list = bspzk.getChildren(bspZKRoot + "/" + jobId.toString().substring(17) + "-ss", false); int tempStepCounter = list.size() - 1; for (int i = 0; i < tempStepCounter; i++) { superStepCounter = i; this.updateJobCounters(checkNumBase); } superStepCounter = list.size() - 1; jip.setSuperStepCounter(superStepCounter); LOG.info("HA the superStepCounter=" + superStepCounter); } catch (KeeperException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } } /** * Get counters from ZooKeeper and update jip Counters. */ public void updateJobCounters(int checkNum) { List<String> list = new ArrayList<String>(); try { while (true) { synchronized (mutex) { list.clear(); list = bspzk.getChildren( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + superStepCounter, true); if (list.size() < checkNum) { jip.reportLOG("[getCounters]: " + list.size() + " instead of " + checkNum); mutex.wait(); } else { jip.reportLOG("[getCounters]: " + list.size()); break; } } } for (String e : list) { LOG.info(bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + superStepCounter + "/" + e); byte[] b = bspzk.getData( bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + superStepCounter + "/" + e, false, bspzk.exists(bspZKRoot + "/" + jobId.toString().substring(17) + "-counters" + "/" + superStepCounter + "/" + e, false)); this.counters.incrAllCounters(Counters.fromEscapedCompactString(new String(b))); } jip.setCounters(this.counters); } catch (Exception e) { throw new RuntimeException(e); } } }