Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.chinamobile.bcbsp.bspstaff; import com.chinamobile.bcbsp.ActiveMQBroker; import com.chinamobile.bcbsp.BSPConfiguration; import com.chinamobile.bcbsp.Constants; import com.chinamobile.bcbsp.Constants.BspCounters; import com.chinamobile.bcbsp.api.AggregateValue; import com.chinamobile.bcbsp.api.Aggregator; import com.chinamobile.bcbsp.api.BSP; //import com.chinamobile.bcbsp.api.Edge; import com.chinamobile.bcbsp.api.Partitioner; import com.chinamobile.bcbsp.api.RecordParse; import com.chinamobile.bcbsp.api.Vertex; import com.chinamobile.bcbsp.bspcontroller.Counters; import com.chinamobile.bcbsp.comm.BSPMessage; //import com.chinamobile.bcbsp.comm.BSPMessage; //import com.chinamobile.bcbsp.comm.Communicator; import com.chinamobile.bcbsp.comm.CommunicatorInterface; import com.chinamobile.bcbsp.comm.CommunicatorNew; import com.chinamobile.bcbsp.comm.GraphStaffHandler; import com.chinamobile.bcbsp.comm.IMessage; import com.chinamobile.bcbsp.comm.MessageManagerInterface; //import com.chinamobile.bcbsp.comm.RPCCommunicator; import com.chinamobile.bcbsp.comm.io.util.MemoryAllocator; //import com.chinamobile.bcbsp.examples.bytearray.pagerank.PageRankMessage; import 
com.chinamobile.bcbsp.fault.storage.AggValueCheckpoint; import com.chinamobile.bcbsp.fault.storage.Checkpoint; import com.chinamobile.bcbsp.fault.storage.Fault; import com.chinamobile.bcbsp.fault.storage.Fault.Level; import com.chinamobile.bcbsp.fault.storage.Fault.Type; import com.chinamobile.bcbsp.graph.GraphDataFactory; import com.chinamobile.bcbsp.graph.GraphDataInterface; import com.chinamobile.bcbsp.io.InputFormat; import com.chinamobile.bcbsp.io.OutputFormat; import com.chinamobile.bcbsp.io.RecordReader; import com.chinamobile.bcbsp.io.RecordWriter; import com.chinamobile.bcbsp.ml.BSPPeer; import com.chinamobile.bcbsp.partition.HashPartitioner; import com.chinamobile.bcbsp.partition.HashWithBalancerWritePartition; import com.chinamobile.bcbsp.partition.HashWritePartition; import com.chinamobile.bcbsp.partition.NotDivideWritePartition; import com.chinamobile.bcbsp.partition.RangeWritePartition; import com.chinamobile.bcbsp.partition.RecordParseDefault; import com.chinamobile.bcbsp.partition.WritePartition; //import com.chinamobile.bcbsp.pipes.Application; import com.chinamobile.bcbsp.router.routeparameter; import com.chinamobile.bcbsp.subgraph.SubGraphManager; import com.chinamobile.bcbsp.subgraph.util.InDegreeOwnerBSP; import com.chinamobile.bcbsp.subgraph.util.InstabilityVertexIndex; import com.chinamobile.bcbsp.subgraph.util.MigrateVertexCommand; import com.chinamobile.bcbsp.sync.StaffSSController; import com.chinamobile.bcbsp.sync.StaffSSControllerInterface; import com.chinamobile.bcbsp.sync.SuperStepCommand; import com.chinamobile.bcbsp.sync.SuperStepReportContainer; import com.chinamobile.bcbsp.thirdPartyInterface.HDFS.BSPFileSystem; import com.chinamobile.bcbsp.thirdPartyInterface.HDFS.impl.BSPFileSystemImpl; import com.chinamobile.bcbsp.thirdPartyInterface.HDFS.impl.BSPHdfsImpl; import com.chinamobile.bcbsp.util.BSPJob; import com.chinamobile.bcbsp.util.BSPJobID; import com.chinamobile.bcbsp.util.StaffAttemptID; import 
com.chinamobile.bcbsp.workermanager.WorkerAgentProtocol; import com.chinamobile.bcbsp.workermanager.WorkerManager; //import com.chinamobile.bcbsp.pipes.Application; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.net.InetSocketAddress; import java.net.URI; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; //import org.apache.hadoop.fs.FileSystem; //import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.serializer.Deserializer; import org.apache.hadoop.io.serializer.SerializationFactory; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RPC.Server; import org.apache.hadoop.ipc.VersionedProtocol; import org.apache.hadoop.util.ReflectionUtils; /** * BSPStaff A BSPStaff is an entity that executes the local computation of a * BSPJob. A BSPJob usually consists of many BSPStaffs which are distributed * among the workers. * * @author * @version */ public class BSPStaff extends Staff implements GraphStaffHandler { /** * The worker agent for staff. 
*/ private WorkerAgentForStaffInterface staffAgent; /** * The BSP Job configuration. */ private BSPJob bspJob; /** * ActiveMQBroker Provides message middleware ActiveMQ messaging service. */ private ActiveMQBroker activeMQBroker; /** * The avtiveMQ port. */ private int activeMQPort; /** * The communication tool for BSP. It manages the outgoing and incoming queues * of each staff. */ private CommunicatorInterface communicator; /* Zhicheng Liu added */ /** * The partition RPC port. */ private int partitionRPCPort; /** * The split information. */ private BytesWritable rawSplit = new BytesWritable(); /** * The split class. */ private String rawSplitClass; /** * The graph data. */ private SubGraphManager graphData; /** * <partitionID--hostName:port1-port2>. */ private HashMap<Integer, String> partitionToWorkerManagerHostWithPorts = new HashMap<Integer, String>(); /** * The hash bucket to partition. */ private HashMap<Integer, Integer> hashBucketToPartition = null; /** * The range router. */ private HashMap<Integer, Integer> RangeRouter = null; /** * The router parameter. */ private routeparameter routerparameter = new routeparameter(); /** * Get the range router. * * @return RangeRouter */ public HashMap<Integer, Integer> getRangeRouter() { return RangeRouter; } /** * Set the range router. * * @param rangeRouter * the range router */ public void setRangeRouter(HashMap<Integer, Integer> rangeRouter) { RangeRouter = rangeRouter; } // variable for barrier /** * The staff super step controller. */ private StaffSSControllerInterface sssc; /** * Total staff number. */ private int staffNum = 0; /** * Total worker manager number. */ private int workerMangerNum = 0; /** * The local barrier number. */ private int localBarrierNum = 0; // variable for local computation /** * The max super step number. */ private int maxSuperStepNum = 0; /** * The current super step counter. */ private int currentSuperStepCounter = 0; /** * The active counter. 
*/ private long activeCounter = 0; /** * The local compute flag. true:start the local compute. */ private boolean flag = true; /** * The super Step command. */ private SuperStepCommand ssc; // For Partition /** * The partitioner. */ private Partitioner<Text> partitioner; /** * */ private int numCopy = 100; /** * The number of verteices from other staff that could not be parsed. */ private int lost = 0; /** * Loadbalance vertex size */ private int vertexSize = 0; // For Aggregation /** Map for user registered aggregate values. */ private HashMap<String, Class<? extends AggregateValue<?, ?>>> nameToAggregateValue = new HashMap<String, Class<? extends AggregateValue<?, ?>>>(); /** Map for user registered aggregatros. */ private HashMap<String, Class<? extends Aggregator<?>>> nameToAggregator = new HashMap<String, Class<? extends Aggregator<?>>>(); /** Map to cache of the aggregate values aggregated for each vertex. */ @SuppressWarnings("unchecked") private HashMap<String, AggregateValue> aggregateValues = new HashMap<String, AggregateValue>(); /** Map to instance of the aggregate values for the current vertex. */ @SuppressWarnings("unchecked") private HashMap<String, AggregateValue> aggregateValuesCurrent = new HashMap<String, AggregateValue>(); /** Map to cache of the aggregate values calculated last super step. */ @SuppressWarnings("unchecked") private HashMap<String, AggregateValue> aggregateResults = new HashMap<String, AggregateValue>(); /** * The record parse to parse graph data from HDFS. */ private RecordParse recordParse = null; /** * The log in log4j,to write logs. */ private static final Log LOG = LogFactory.getLog(BSPStaff.class); /** * The recovery times. */ private int recoveryTimes = 0; /** The rpc server */ private Server server; /** The c++ application in java */ // private Application application; // add by chen /** * The counters in system. */ private Counters counters; /* Zhicheng Liu added */ /** * The staff start time. 
*/ private long staffStartTime = 0; /** * The staff end time. */ private long staffEndTime = 0; /** * The staff run time. */ private long staffRunTime = 0; // ms /** * Read and write check point time. */ private long rwCheckPointT = 0; // ms /** * Load data time. */ private long loadDataT = 0; // ms /** * The migrate Staff cost. */ private long migrateCost = 0; // ms /** The byte number of graphData. */ private long graphBytes = 0; /** The per message length. */ private int messagePerLength = 0; /** The byte number of incoming messages. */ private long messageBytes = 0; /** The migrate flag,true:migrate the Staff. */ private boolean migrateFlag = false; /** The Staff is a slow staff. */ private boolean hasMigrateStaff = false; /** The migrate mode,true:open migrate mode. */ private boolean openMigrateMode = false; /* Feng added */ /** Judge need aggValues checkpoint. */ private boolean aggCpFlag = false; // Baoxing Yang added /***/ private int fs = 0; /** Feng added for migrate staff string messages */ private ConcurrentLinkedQueue<String> migrateMessagesString = new ConcurrentLinkedQueue<String>(); /** Feng added for migrated staff flag */ private boolean migratedStaffFlag = false; private BSP bsp; private Map<String, LinkedList<IMessage>> icomMess = null; // SGA graph HashMap<Integer, Vertex> migrateVertexMap = new HashMap<Integer, Vertex>(); private MigrateVertexCommand migrateVertexCommand; private String migrateDirBase; private String migratePartitionDir; private boolean evaluateflag; private float golbalIntravelRatio; private HashMap<Integer, Long> computeTimeMap = new HashMap<Integer, Long>(); /** The default constructor. */ public BSPStaff() { } /** * The constructor of BSPStaff. * * @param jobId * The current BSP Job id. * @param jobFile * The BSP Job file. * @param staffId * The current BSP Staff id. 
* @param partition * The partition owns by the current staff * @param splitClass * The split class * @param split */ public BSPStaff(BSPJobID jobId, String jobFile, StaffAttemptID staffId, int partition, String splitClass, BytesWritable split) { this.setJobId(jobId); this.setJobFile(jobFile); this.setSid(staffId); this.setPartition(partition); this.rawSplitClass = splitClass; this.rawSplit = split; } /** * Get staff number. * * @return staff number */ public int getStaffNum() { return staffNum; } /** * Get the * * @return */ public int getNumCopy() { return numCopy; } /** * @param numCopy */ public void setNumCopy(int numCopy) { this.numCopy = numCopy; } /** * Get the Hash bucket to partition. * * @return Hash bucket to partition */ public HashMap<Integer, Integer> getHashBucketToPartition() { return this.hashBucketToPartition; } /** * Set the Hash bucket to partition. * * @param hashBucketToPartition * the Hash bucket to partition. */ public void setHashBucketToPartition(HashMap<Integer, Integer> hashBucketToPartition) { this.hashBucketToPartition = hashBucketToPartition; } /** * Get graph data. * * @return graph data */ public GraphDataInterface getGraphData() { return graphData; } /** * Set graph data. * * @param graph * graph data */ public void setGraphData(SubGraphManager graph) { this.graphData = graph; } @Override public BSPStaffRunner createRunner(WorkerManager workerManager) { return new BSPStaffRunner(this, workerManager, this.bspJob); } /** just for test */ public void loadData(BSPJob job) throws ClassNotFoundException, IOException, InterruptedException { int i = 0; this.partitioner = (Partitioner<Text>) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job.getConf()); if (i == 1) { throw new ClassNotFoundException(); } else if (i == 2) { throw new IOException(); } else if (i == 3) { throw new InterruptedException(); } } /** * loadData: load data for the staff. 
* * @param job * BSP job configuration * @param workerAgent * Protocol that staff child process uses to contact its parent process * @return boolean * @throws ClassNotFoundException * @throws IOException * e * @throws InterruptedException * e */ @SuppressWarnings("unchecked") public boolean loadData(BSPJob job, WorkerAgentProtocol workerAgent, WorkerAgentForStaffInterface aStaffAgent) throws ClassNotFoundException, IOException, InterruptedException { // rebuild the input split RecordReader input = null; org.apache.hadoop.mapreduce.InputSplit split = null; if (rawSplitClass.equals("no")) { input = null; } else { DataInputBuffer splitBuffer = new DataInputBuffer(); splitBuffer.reset(rawSplit.getBytes(), 0, rawSplit.getLength()); SerializationFactory factory = new SerializationFactory(job.getConf()); Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit>) factory .getDeserializer(job.getConf().getClassByName(rawSplitClass)); deserializer.open(splitBuffer); split = deserializer.deserialize(null); // rebuild the InputFormat class according to the user configuration InputFormat inputformat = (InputFormat) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_INPUT_FORMAT_CLASS, InputFormat.class), job.getConf()); inputformat.initialize(job.getConf()); input = inputformat.createRecordReader(split, job); input.initialize(split, job.getConf()); } SuperStepReportContainer ssrc = new SuperStepReportContainer(); ssrc.setPartitionId(this.getPartition()); this.numCopy = (int) (1 / (job.getConf().getFloat(Constants.USER_BC_BSP_JOB_BALANCE_FACTOR, Constants.USER_BC_BSP_JOB_BALANCE_FACTOR_DEFAULT))); ssrc.setNumCopy(numCopy); ssrc.setCheckNum(this.staffNum); StaffSSControllerInterface lsssc = new StaffSSController(this.getJobId(), this.getSid(), workerAgent); long start = System.currentTimeMillis(); LOG.info("in BCBSP with PartitionType is: Hash" + " start time:" + start); 
if (this.staffNum == 1 || job.getConf().getBoolean(Constants.USER_BC_BSP_JOB_ISDIVIDE, false)) { this.partitioner = (Partitioner<Text>) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job.getConf()); this.partitioner.setNumPartition(this.staffNum); this.partitioner.intialize(job, split); WritePartition writePartition = new NotDivideWritePartition(); /* * RecordParse recordParse = (RecordParse) ReflectionUtils .newInstance( * job.getConf() .getClass( Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, * RecordParseDefault.class), job .getConf()); recordParse.init(job); * //add by chen for null bug this.recordParse = recordParse; * //this.recordParse.init(job); */ writePartition.setRecordParse(this.recordParse); writePartition.setStaff(this); writePartition.write(input); ssrc.setDirFlag(new String[] { "1" }); ssrc.setCheckNum(this.staffNum); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); LOG.info("The number of verteices from other staff" + " that cound not be parsed:" + this.lost); LOG.info("in BCBSP with PartitionType is:HASH" + " the number of HeadNode in this partition is:" + graphData.sizeForAll()); graphData.finishAdd(); ssrc.setCheckNum(this.staffNum * 2); ssrc.setDirFlag(new String[] { "2" }); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); } else { this.partitioner = (Partitioner<Text>) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job.getConf()); WritePartition writePartition = (WritePartition) ReflectionUtils.newInstance(job.getConf().getClass( Constants.USER_BC_BSP_JOB_WRITEPARTITION_CLASS, HashWritePartition.class), job.getConf()); int multiple = 1; if (writePartition instanceof HashWithBalancerWritePartition) { this.partitioner.setNumPartition(this.staffNum * numCopy); multiple = 2; } else { this.partitioner.setNumPartition(this.staffNum); multiple = 1; if (writePartition instanceof 
RangeWritePartition) { multiple = 2; } } this.partitioner.intialize(job, split); /* * RecordParse recordParse = (RecordParse) ReflectionUtils .newInstance( * job.getConf() .getClass( Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, * RecordParseDefault.class), job .getConf()); recordParse.init(job); // * this.recordParse = (RecordParse) ReflectionUtils.newInstance( // * job.getConf().getClass( // Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, * // RecordParseDefault.class), job.getConf()); // * this.recordParse.init(job); this.recordParse = recordParse; */ writePartition.setPartitioner(partitioner); writePartition.setRecordParse(this.recordParse); writePartition.setStaff(this); writePartition.setWorkerAgent(aStaffAgent); writePartition.setSsrc(ssrc); writePartition.setSssc(lsssc); writePartition.setTotalCatchSize(job.getConf().getInt(Constants.USER_BC_BSP_JOB_TOTALCACHE_SIZE, Constants.USER_BC_BSP_JOB_TOTALCACHE_SIZE_DEFAULT)); int threadNum = job.getConf().getInt(Constants.USER_BC_BSP_JOB_SENDTHREADNUMBER, Constants.USER_BC_BSP_JOB_SENDTHREADNUMBER_DEFAULT); if (threadNum > this.staffNum) { threadNum = this.staffNum - 1; } writePartition.setSendThreadNum(threadNum); writePartition.write(input); ssrc.setDirFlag(new String[] { "1" }); ssrc.setCheckNum(this.staffNum * multiple); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); LOG.info("The number of verteices from other staff that" + " cound not be parsed:" + this.lost); LOG.info("in BCBSP with PartitionType is:HASH" + " the number of HeadNode in this partition is:" + graphData.sizeForAll()); graphData.finishAdd(); ssrc.setCheckNum(this.staffNum * (multiple + 1)); ssrc.setDirFlag(new String[] { "2" }); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); } long end = System.currentTimeMillis(); LOG.info("in BCBSP with PartitionType is:HASH" + " end time:" + end); LOG.info( "in BCBSP with PartitionType is:HASH" + " using time:" + (float) (end - start) / 1000 + " seconds"); return true; } /** * loadData: 
load data for the staff c++. * * @param job * BSP job configuration. * @param workerAgent * Protocol that staff child process uses to contact its parent process * @param application * communication with c++ process. * @return boolean * @throws ClassNotFoundException * e * @throws IOException * e * @throws InterruptedException * e */ /** * saveResult: save the local computation result on the HDFS(SequenceFile) * Changed IN 20140313 For Reconstruction.. * * @param job * BSP Job configuration * @param staff * the current BSP Staff * @return boolean */ @SuppressWarnings("unchecked") public boolean saveResult(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent) { try { OutputFormat outputformat = (OutputFormat) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_OUTPUT_FORMAT_CLASS, OutputFormat.class), job.getConf()); outputformat.initialize(job.getConf()); RecordWriter output = outputformat.getRecordWriter(job, this.getSid()); // Note Changed 20140313 this.graphData.saveAllVertices(this, output); output.close(job); graphData.clean(); } catch (Exception e) { LOG.error("Exception has been catched in BSPStaff--saveResult !", e); BSPConfiguration conf = new BSPConfiguration(); if (this.recoveryTimes < conf.getInt(Constants.BC_BSP_JOB_RECOVERY_ATTEMPT_MAX, 0)) { recovery(job, staff, workerAgent); } else { workerAgent.setStaffStatus(this.getSid(), Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.DISK, Fault.Level.INDETERMINATE, workerAgent.getWorkerManagerName(job.getJobID(), this.getSid()), e.toString(), job.toString(), this.getSid().toString()), 2); LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*" + "=*=*=*=*=*=*=*=*=*"); LOG.error("Other Exception has happened and been catched, " + "the exception will be reported to WorkerManager", e); } } return true; } /** * Judge if this Staff should recovery. * * @return true:the Staff should recovery. 
*/ @Override public boolean recovery(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent) { this.recoveryTimes++; boolean success = saveResult(job, staff, workerAgent); return success == true; } // Just for testing /** * Test the first route. */ public void displayFirstRoute() { for (Entry<Integer, String> e : this.getPartitionToWorkerManagerNameAndPort().entrySet()) { LOG.info("partitionToWorkerManagerName : " + e.getKey() + " " + e.getValue()); } } // Just for testing /** * Test the second route. */ public void displaySecondRoute() { for (Entry<Integer, Integer> e : this.hashBucketToPartition.entrySet()) { LOG.info("partitionToRange : " + e.getKey() + " " + e.getValue()); } } /** * Get the local barrier number. * * @param hostName * the current compute node name. * @return local barrier nmuber. */ public final int getLocalBarrierNumber(final String hostName) { int localBarrierNumber = 0; // if (this.getPartitionToWorkerManagerNameAndPort() == null) { // LOG.info("ljn test : Run getLocalBarrierNumber getPartitionToWorkerManagerNameAndPort is null"); // } else { // LOG.info("ljn test : Run getLocalBarrierNumber getPartitionToWorkerManagerNameAndPort is not null"); // } for (Entry<Integer, String> entry : this.getPartitionToWorkerManagerNameAndPort().entrySet()) { String workerManagerName = entry.getValue().split(":")[0]; if (workerManagerName.equals(hostName)) { localBarrierNumber++; } } return localBarrierNumber; } /** * Delete the old check point. * * @param oldCheckpoint * the old check point * @param job * BSP job configuration * @return true:successfuly delete. 
*/ private boolean deleteOldCheckpoint(int oldCheckpoint, BSPJob job) { LOG.info("deleteOldCheckpoint--oldCheckpoint: " + oldCheckpoint); try { Configuration conf = new Configuration(); BSPConfiguration bspConf = new BSPConfiguration(); String uri = bspConf.get(Constants.BC_BSP_HDFS_NAME) + job.getConf().get(Constants.BC_BSP_CHECKPOINT_WRITEPATH) + "/" + job.getJobID().toString() + "/" + oldCheckpoint + "/"; // FileSystem fs = FileSystem.get(URI.create(uri), conf); BSPFileSystem bspfs = new BSPFileSystemImpl(URI.create(uri), conf); // if (fs.exists(new Path(uri))) { // fs.delete(new Path(uri), true); // } if (bspfs.exists(new BSPHdfsImpl().newPath(uri))) { bspfs.delete(new BSPHdfsImpl().newPath(uri), true); } } catch (IOException e) { LOG.error("Exception has happened and been catched!", e); return false; } return true; } /** * Run c++ local compute process for BSP job. * * @param job * BSP job configuration * @param workerAgent * Protocol that staff child process uses to contact its parent process * @param recovery * true:the Staff is a recovery Staff * @param changeWorkerState * To change worker state * @param failCounter * fail times * @param hostName * the current compute node name. */ @Override public void runC(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent, boolean recovery, boolean changeWorkerState, int failCounter, String hostName) { // } @SuppressWarnings({ "unchecked", "unused" }) /** * run the local computation. * @param job BSP job configuration * @param staff the current Staff * @param workerAgent Protocol that staff child process uses * to contact its parent process * @param recovery true:the Staff is a recovery Staff * @param changeWorkerState To change worker state * @param failCounter fail times * @param hostName the current compute node name. * Review comment: * (1) The codes inside this method are too messy. * Review time: 2011-11-30 * Reviewer: Hongxu Zhang. 
* Fix log: * (1) To make the codes neat and well-organized, I use more empty * lines and annotations * to organize the codes. * Fix time: 2011-12-1 * Programmer: Hu Zheng. */ /* * Review suggestion: allow user to determine whether to use load balance * Zhicheng Liu 2013/10/9 */ // public void run(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent, @Override public void runPartition(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent, boolean recovery, boolean changeWorkerState, int migrateSuperStep, int failCounter, String hostName) { WritePartition writePartition = (WritePartition) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_WRITEPARTITION_CLASS, HashWritePartition.class), job.getConf()); // record the number of failures of this staff LOG.info("BSPStaff---run()--changeWorkerState: " + changeWorkerState + "[HostName] " + hostName); staff.setFailCounter(failCounter); // Note Memory Deploy 20140312 MemoryAllocator ma = new MemoryAllocator(job); ma.PrintMemoryInfo(LOG); ma.setupBeforeLoadGraph(LOG); initializeBefore(job, workerAgent); // instalize the staff. SuperStepReportContainer ssrc = newInstanceSSRC(); this.sssc = new StaffSSController(this.getJobId(), this.getSid(), workerAgent); Checkpoint cp = new Checkpoint(job); AggValueCheckpoint aggcp = new AggValueCheckpoint(job); /* Zhicheng Liu added */ this.openMigrateMode = this.bspJob.getConf().get("bcbsp.loadbalance", "false").equals("true") ? true : false; /* Feng added */ this.aggCpFlag = this.bspJob.getConf().get("bcbsp.aggValuesCheckpoint", "false").equals("true") ? 
true : false; if (this.getGraphDataFactory() == null) { this.setGraphDataFactory(new GraphDataFactory(job.getConf())); } // prepare for staff try { this.counters = new Counters(); // for staff migtare if (openMigrateMode && migrateSuperStep != 0) { boolean staffMigrateFlag = true;// flag to judge a staff migrate // to set the init vertex path LOG.info("Migrate new staff " + this.getSid()); this.currentSuperStepCounter = migrateSuperStep - 1; prepareForMigrate(workerAgent, hostName); prepareMigrateGraphdata(job, staff, cp); intializePartitionForRecovery(job, writePartition); } else if (!recovery) { ssrc.setCheckNum(this.staffNum); int runpartitionRPCPort = workerAgent.getFreePort(); /* Zhicheng Liu added */ this.partitionRPCPort = runpartitionRPCPort; ssrc.setPort1(runpartitionRPCPort); this.activeMQPort = workerAgent.getFreePort(); ssrc.setPort2(this.activeMQPort); LOG.info("[BSPStaff] Get the port for partitioning RPC is : " + runpartitionRPCPort + "!"); LOG.info("[BSPSr Ataff] Get the port foctiveMQ Broker is : " + this.activeMQPort + "!"); initializeAfter(job, workerAgent); recordWorkerManagerNameAndPort(ssrc); this.staffAgent = new WorkerAgentForStaff(job.getConf()); workerAgent.setStaffAgentAddress(this.getSid(), this.staffAgent.address()); initStaffNum(hostName); initWorkerNum(workerAgent); /** Clock */ long start = System.currentTimeMillis(); try { loadData(job, workerAgent, this.staffAgent); } catch (Exception e) { throw new RuntimeException("Load data Exception in BSP staff runPartition", e); } long end = System.currentTimeMillis(); LOG.info("[==>Clock<==] <BSP Partition compute load Data> used " + (end - start) / 1000f + " seconds"); if (this.openMigrateMode) { this.loadDataT = (end - start) * 2; updateMigrateStatistics(); } } else { // for recovery LOG.info("The recoveried staff begins to read checkpoint"); LOG.info("The fault SuperStepCounter is : " + job.getInt("staff.fault.superstep", 0)); prepareRecoverySchedule(job, workerAgent, hostName); 
prepareRecoveryGraphdata(job, staff, cp); resetRecoverySuperStepReportContainer(ssrc); freshRecoveryPort(ssrc, workerAgent); this.currentSuperStepCounter = ssc.getNextSuperStepNum(); LOG.info("ljn test : The recoveried staff begins to read checkpoint ** current superstep is " + this.currentSuperStepCounter); intializePartitionForRecovery(job, writePartition); } } catch (ClassNotFoundException cnfE) { faultProcessLocalCompute(job, staff, workerAgent, cnfE, 0); return; } catch (IOException ioE) { faultProcessLocalCompute(job, staff, workerAgent, ioE, 0); return; // } catch (InterruptedException iE) { // faultProcessLocalCompute(job, staff, workerAgent, iE, 0); // return; } this.bsp = (BSP) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_WORK_CLASS, BSP.class), job.getConf()); prpareAggregate(job, migrateSuperStep); /* Zhicheng Liu added */ if (openMigrateMode && migrateSuperStep != 0) { decapsulateAggForMigrate(); } String commOption = job.getCommucationOption(); try { // configuration before local computation bsp.setup(staff); this.icomMess = null; if (openMigrateMode && migrateSuperStep != 0) { // read message from // hdfs for recovery processStaffMigrate(job, staff, cp, ssrc, workerAgent); // this.currentSuperStepCounter++; } prepareRoute(writePartition); this.issueCommunicator(commOption, hostName, 0, job, icomMess); BSPStaffContext context = new BSPStaffContext(job, this.currentSuperStepCounter); context.setCommHandler(communicator); // Begin local computation for the partition compute. 
while (this.flag) { /* Zhicheng Liu added waiting migrate staff for */ if (this.openMigrateMode && this.hasMigrateStaff && migrateSuperStep == 0) { LOG.info("Having staff needed to migrate, so update the" + " globle routing"); this.hasMigrateStaff = false; resetPortsMigrate(ssrc, workerAgent, hostName); rebuildCommunicator(job, hostName); } staffStartTime = System.currentTimeMillis(); this.activeCounter = 0; if (openMigrateMode && migrateSuperStep != 0) { recovery = false; LOG.info("Test migrateSuperStep counter! " + migrateSuperStep); migrateSuperStep = 0; this.migratedStaffFlag = true; LOG.info("this.currentSuperStepCounter is " + this.currentSuperStepCounter); } else if (!recovery) { this.graphData.setRecovryFlag(recovery); } else {// for recovery recovery = false; } prepareForOneSuperstep(staff, context, bsp, job); long start = System.currentTimeMillis(); // graph data processing and BSPPeer compute. if (this.openMigrateMode == true && this.migrateMessagesString != null) { this.migrateMessagesString.clear(); } this.graphData.setMigratedStaffFlag(migratedStaffFlag); this.graphData.processingByBucket(this, bsp, job, currentSuperStepCounter, context); long end = System.currentTimeMillis(); reportTimeOneStep(start, end); start = System.currentTimeMillis(); this.communicator.noMoreMessagesForSending(); while (true) { if (this.communicator.isSendingOver()) { break; } } start = System.currentTimeMillis(); staffEndTime = start; staffRunTime = staffEndTime - staffStartTime; processSendSSRC(ssrc); end = System.currentTimeMillis(); LOG.info("[==>Clock<==] <Sending over sync> used " + (end - start) / 1000f + " seconds"); start = end; this.communicator.noMoreMessagesForReceiving(); while (true) { if (this.communicator.isReceivingOver()) { break; } } if (this.openMigrateMode) { updateMggMigrateStatistics(); } processReceiveSSRC(ssrc); updateCounter(); this.communicator.exchangeIncomeQueues(); reportMessage(); if ((this.currentSuperStepCounter + 1) >= this.maxSuperStepNum) { 
this.communicator.clearOutgoingQueues(); this.communicator.clearIncomedQueues(); this.activeCounter = 0; } else { this.activeCounter = this.graphData.getActiveCounter(); } encapsulateAgg(ssrc); setSecondBarrier(ssrc, context); sssc.setCounters(this.counters); this.counters.clearCounters(); if (this.openMigrateMode) { updateMigrateCost(ssrc); } this.ssc = sssc.secondStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); LOG.info("[==>Clock<==] <StaffSSController's rebuild session> used " + StaffSSController.rebuildTime / 1000f + " seconds"); StaffSSController.rebuildTime = 0; if (this.openMigrateMode) { confirmMigrateStaff(); } if (ssc.getCommandType() == Constants.COMMAND_TYPE.START_AND_RECOVERY) { LOG.info("ljn test : command in run partition is START_AND_RECOVERY "); prepareForRecovery(workerAgent, hostName); } decapsulateAgg(aggcp, job, staff); // command tye to switch. switch (ssc.getCommandType()) { case Constants.COMMAND_TYPE.START: LOG.info("Get the CommandType is : START"); if (openMigrateMode && migrateFlag) { writeMigrateData(cp, job, staff); this.flag = false; break; } this.currentSuperStepCounter = ssc.getNextSuperStepNum(); this.flag = true; break; case Constants.COMMAND_TYPE.START_AND_CHECKPOINT: LOG.info("Get the CommandTye is : START_AND_CHECKPOINT"); if (openMigrateMode && migrateFlag) { writeMigrateData(cp, job, staff); } processCheckpointCommand(cp, job, staff, ssrc); if (openMigrateMode && migrateFlag) { this.flag = false; } else { this.currentSuperStepCounter = ssc.getNextSuperStepNum(); this.flag = true; } break; case Constants.COMMAND_TYPE.START_AND_RECOVERY: LOG.info("Get the CommandTye is : START_AND_RECOVERY"); this.currentSuperStepCounter = ssc.getAbleCheckPoint(); processRecoveryCommand(cp, job, staff, ssrc, hostName); displayFirstRoute(); recovery = true; this.flag = true; break; case Constants.COMMAND_TYPE.STOP: LOG.info("Get the CommandTye is : STOP"); LOG.info("Staff will save the computation result and then quit!"); 
this.currentSuperStepCounter = ssc.getNextSuperStepNum(); this.flag = false; break; default: LOG.error("ERROR! " + ssc.getCommandType() + " is not a valid CommandType, so the staff will save the " + "computation result and quit!"); flag = false; } workerAgent.setStaffStatus(this.getSid(), Constants.SATAFF_STATUS.RUNNING, null, 1); } this.communicator.complete(); } catch (IOException ioe) { // try over faultProcessLocalCompute(job, staff, workerAgent, ioe, 1); LOG.error("Staff will quit abnormally"); return; } catch (Exception e) { faultProcessLocalCompute(job, staff, workerAgent, e, 1); LOG.error("Staff will quit abnormally"); return; } if (!this.migrateFlag) { try { finishStaff(job, staff, workerAgent, ssrc); LOG.info("Staff is completed successfully"); } catch (Exception e) { reportErrorStaff(workerAgent, job, staff, e); } } else { finishMigrateStaff(bsp, staff, workerAgent, recovery); } } private void reportErrorStaff(WorkerAgentProtocol workerAgent, BSPJob job, Staff staff, Exception e) { faultProcessLocalCompute(job, staff, workerAgent, e, 2); } private void finishStaff(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent, SuperStepReportContainer ssrc) { saveResult(job, staff, workerAgent); ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.SAVE_RESULT_STAGE); ssrc.setDirFlag(new String[] { "1", "2", "write", "read" }); sssc.saveResultStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); // cleanup after local computation bsp.cleanup(staff); stopActiveMQBroker(); try { done(workerAgent); } catch (IOException e) { throw new RuntimeException(e); } workerAgent.setStaffStatus(this.getSid(), Constants.SATAFF_STATUS.SUCCEED, null, 1); LOG.info("The max SuperStep num is " + this.maxSuperStepNum); } private void processRecoveryCommand(Checkpoint cp, BSPJob job, Staff staff, SuperStepReportContainer ssrc, String hostName) { // // clean first // int version = job.getGraphDataVersion(); // this.graphData = 
this.getGraphDataFactory().createGraphData(version, // this); // this.graphData.clean(); // // this.graphData = cp.readCheckPoint( // // new BSPHdfsImpl().newPath(ssc.getInitWritePath()), job, // // staff); // // ljn modefied 20170701 // this.graphData = cp.readCheckPoint( // new BSPHdfsImpl().newPath(ssc.getInitReadPath()), job, staff); // ssrc.setPartitionId(this.getPartition()); // ssrc.setLocalBarrierNum(this.localBarrierNum); // ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.READ_CHECKPOINT_STAGE); // ssrc.setDirFlag(new String[] {"read"}); // ssrc.setCheckNum(this.workerMangerNum * 1); // // ssrc.setPort2(this.activeMQPort); // LOG.info("[BSPStaff] Get the port for ActiveMQ Broker is : " // + this.activeMQPort + "!"); // this.setPartitionToWorkerManagerNameAndPort(sssc // .checkPointStageSuperStepBarrier(this.currentSuperStepCounter, ssrc)); // displayFirstRoute(); // /* // * Feng added for new version staff recovery reinitialize communicator for // * send messages // */ // this.communicator = new CommunicatorNew(this.getJobId(), job, // this.getPartition(), partitioner); // this.communicator.initialize(this.routerparameter, // this.getPartitionToWorkerManagerNameAndPort(), this.graphData); // this.communicator.start(hostName, this); // this.communicator.setPartitionToWorkerManagerNamePort(this // .getPartitionToWorkerManagerNameAndPort()); // // this.communicator.setPartitionToWorkerManagerNamePort(this // // .getPartitionToWorkerManagerNameAndPort()); // // this.currentSuperStepCounter = ssc.getNextSuperStepNum(); // // this.communicator.clearOutgoingQueues(); // this.communicator.clearIncomedQueues(); } /** * Initialize the staff state. 
*/ public void initializeBefore(BSPJob job, WorkerAgentProtocol workerAgent) { this.bspJob = job; this.maxSuperStepNum = job.getNumSuperStep(); this.staffNum = job.getNumBspStaff(); } public void initializeAfter(BSPJob job, WorkerAgentProtocol workerAgent) { // ***note for putHeadNode Error @author Liu Jinpeng 2013/06/26 { this.recordParse = (RecordParse) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, RecordParseDefault.class), job.getConf()); this.recordParse.init(job); int version = job.getGraphDataVersion(); if (version != 5) { LOG.error("Nor for SGA_Graph."); } this.graphData = (SubGraphManager) this.getGraphDataFactory().createGraphData(version, this); } // this.counters = new Counters(); } /** * Return a new SuperStepReportContainer for staff. * * @return ssrc */ public SuperStepReportContainer newInstanceSSRC() { SuperStepReportContainer ssrc = new SuperStepReportContainer(); ssrc.setPartitionId(this.getPartition()); return ssrc; } public void recordWorkerManagerNameAndPort(SuperStepReportContainer ssrc) { this.partitionToWorkerManagerHostWithPorts = sssc.scheduleBarrier(ssrc); LOG.info("ljn test migrate : record the partitionToWorkerManagerHostWithPorts is " + partitionToWorkerManagerHostWithPorts); ; // record the map from partitions to workermanagers for (Integer e : this.partitionToWorkerManagerHostWithPorts.keySet()) { String[] nameAndPorts = this.partitionToWorkerManagerHostWithPorts.get(e).split(":"); String[] ports = nameAndPorts[1].split("-"); this.getPartitionToWorkerManagerNameAndPort().put(e, nameAndPorts[0] + ":" + ports[1]); } } // /** // * Set the state of ssrc in no recorvy staff. // * @param ssrc // */ // public void setSSRCState(SuperStepReportContainer ssrc) { // ssrc.setCheckNum(this.staffNum); // ssrc.setPort1(partitionRPCPort); // ssrc.setPort2(this.activeMQPort); // } /** * For partition and for WorkerManager to invoke rpc method of Staff. 
* * @param job * @param workerAgent */ public void initCommunicatePort(BSPJob job, WorkerAgentProtocol workerAgent) { /* Zhicheng Liu added */ this.partitionRPCPort = workerAgent.getFreePort(); this.activeMQPort = workerAgent.getFreePort(); LOG.info("[BSPStaff] Get the port for partitioning RPC is : " + partitionRPCPort + "!"); } // initialize the number of local staffs and the number of // workers of the same job /** * Initialize the number of local staffs. * * @param hostName */ public void initStaffNum(String hostName) { this.localBarrierNum = getLocalBarrierNumber(hostName); } /** * Initialize the number of workers of the same job. * * @param hostName */ public void initWorkerNum(WorkerAgentProtocol workerAgent) { this.workerMangerNum = workerAgent.getNumberWorkers(this.getJobId(), this.getSid()); displayFirstRoute(); } /** * Prepare the aggregator and aggretevalue before compute. * * @param job */ public void prpareAggregate(BSPJob job, int migrateSuperStep) { /** Clock */ // long start = System.currentTimeMillis(); // load aggregators and aggregate values. loadAggregators(job); // long end = System.currentTimeMillis(); // LOG.info("[==>Clock<==] <loadAggregators> used " + (end - start) / 1000f // + " seconds"); /* Zhicheng Liu added */ /* * if (openMigrateMode && migrateSuperStep != 0) { // String[] aggValues = * this.ssc.getAggValues(); // if (aggValues != null) { // * decapsulateAggregateValues(aggValues); // } // } */ if (openMigrateMode && migrateSuperStep != 0) { String[] aggValues = this.ssc.getAggValues(); if (aggValues != null) { decapsulateAggregateValues(aggValues); } } } /** * Initialize the route of bsp staff for partition cumpute. 
* * @param writePartition */ public void prepareRoute(WritePartition writePartition) { this.routerparameter.setPartitioner(partitioner); if (writePartition instanceof HashWithBalancerWritePartition) { this.routerparameter.setHashBucketToPartition(this.getHashBucketToPartition()); } else { if (writePartition instanceof RangeWritePartition) { this.routerparameter.setRangeRouter(this.getRangeRouter()); } } } /** * Prepare the local compute for this superstep. * * @param staff * @param context * @param bsp * @param job */ public void prepareForOneSuperstep(Staff staff, BSPStaffContext context, BSP bsp, BSPJob job) { this.communicator.setStaffId(staff.getStaffID().toString()); this.communicator.begin(this.currentSuperStepCounter); context.refreshSuperStep(this.currentSuperStepCounter); SuperStepContext ssContext = new SuperStepContext(job, currentSuperStepCounter); publishAggregateValues(ssContext); bsp.initBeforeSuperStep(ssContext); initBeforeSuperStepForAggregateValues(ssContext); } /** * Report the time information for one superstep. */ public void reportTimeOneStep(long start, long end) { /** Clocks */ LOG.info("[BSPStaff] Vertex computing is over for the super step <" + this.currentSuperStepCounter + ">"); LOG.info("[==>Clock<==] [Local computing for partition ] used " + (end - start) / 1000f + " seconds" + " in superstep " + currentSuperStepCounter); LOG.info("[==>Clock<==] ...(Load Graph Data Time) used " + loadGraphTime / 1000f + " seconds" + " in superstep " + currentSuperStepCounter); this.loadGraphTime = 0; LOG.info("[==>Clock<==] ...(Aggregate Time) used " + aggregateTime / 1000f + " seconds" + " in superstep " + currentSuperStepCounter); this.aggregateTime = 0; LOG.info("[==>Clock<==] ...(Compute Time) used " + computeTime / 1000f + " seconds" + " in superstep " + currentSuperStepCounter); LOG.info("[==>Clock<==] ...(Collect Messages Time) used " + collectMsgsTime / 1000f + " seconds"); this.collectMsgsTime = 0; } /** * After send update ssrc. 
* * @param ssrc */ public void processSendSSRC(SuperStepReportContainer ssrc) { ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.FIRST_STAGE); ssrc.setDirFlag(new String[] { "1" }); ssrc.setCheckNum(this.workerMangerNum); sssc.firstStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); } /** * After receive update ssrc. * * @param ssrc */ public void processReceiveSSRC(SuperStepReportContainer ssrc) { ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.FIRST_STAGE); ssrc.setDirFlag(new String[] { "2" }); ssrc.setCheckNum(this.workerMangerNum * 2); sssc.firstStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); } /** * Update the counter after one superstep. */ public void updateCounter() { this.counters.findCounter(BspCounters.MESSAGES_NUM_SENT) .increment(this.communicator.getCombineOutgoMessageCounter()); this.counters.findCounter(BspCounters.MESSAGES_NUM_RECEIVED) .increment(this.communicator.getReceivedMessageCounter()); this.counters.findCounter(BspCounters.MESSAGE_BYTES_SENT) .increment(this.communicator.getCombineOutgoMessageBytesCounter()); this.counters.findCounter(BspCounters.MESSAGE_BYTES_RECEIVED) .increment(this.communicator.getReceivedMessageBytesCounter()); } public void reportMessage() { this.messageBytes = communicator.getIncomedQueuesSize() * this.messagePerLength; LOG.info("[BSPStaff] Communicator has received " + this.communicator.getIncomedQueuesSize() + " messages totally for the super step <" + this.currentSuperStepCounter + ">"); } /** * Encapsulate the aggregate values into String[]. */ public void encapsulateAgg(SuperStepReportContainer ssrc) { String[] aggValues = encapsulateAggregateValues(); ssrc.setAggValues(aggValues); } /** * Set the state of ssrc for senond barrier. 
* * @param ssrc */ // public void setSecondBarrier(SuperStepReportContainer ssrc, // BSPStaffContext context) { // ssrc.setAggValues(ssrc.getAggValues()); // ssrc.setLocalBarrierNum(this.localBarrierNum); // ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.SECOND_STAGE); // // to here // LOG.info("[WorkerManagerNum]" + this.workerMangerNum); // ssrc.setCheckNum(this.workerMangerNum + 1); // if (context.getCurrentSuperStepCounter() > 0 && !(context.getActiveFLag())) // { // ssrc.setJudgeFlag(0); // } else { // ssrc.setJudgeFlag(this.activeCounter // + this.communicator.getIncomedQueuesSize() // + this.communicator.getOutgoingQueuesSize()); // } // } // ljn SGA_Graph public void setSecondBarrier(SuperStepReportContainer ssrc, BSPStaffContext context) { ssrc.setAggValues(ssrc.getAggValues()); ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.SECOND_STAGE); // to here LOG.info("[WorkerManagerNum]" + this.workerMangerNum); ssrc.setCheckNum(this.workerMangerNum + 1); // if (context.getCurrentSuperStepCounter() > 0 && // !(context.getActiveFLag())) { // ssrc.setJudgeFlag(0); // } else { if (context.getCurrentSuperStepCounter() > 0) { ssrc.setJudgeFlag(this.activeCounter + this.communicator.getIncomedQueuesSize() + this.communicator.getOutgoingQueuesSize()); } else { ssrc.setJudgeFlag(1); } } /** * Prepare the network, checknumber and superstep for recovery. 
* * @param workerAgent * @param hostName */ private void prepareForRecovery(WorkerAgentProtocol workerAgent, String hostName) { LOG.info("[Command]--[routeTableSize]" + ssc.getPartitionToWorkerManagerNameAndPort().size()); this.setPartitionToWorkerManagerNameAndPort(ssc.getPartitionToWorkerManagerNameAndPort()); ArrayList<String> tmp = new ArrayList<String>(); for (String str : this.getPartitionToWorkerManagerNameAndPort().values()) { if (!tmp.contains(str)) { tmp.add(str); } } this.localBarrierNum = getLocalBarrierNumber(hostName); workerAgent.setNumberWorkers(this.getJobId(), this.getSid(), tmp.size()); tmp.clear(); this.workerMangerNum = workerAgent.getNumberWorkers(this.getJobId(), this.getSid()); displayFirstRoute(); } /** * Decapsulate the aggregate values from String[]. * * @param aggcp * @param job * @param staff */ public void decapsulateAgg(AggValueCheckpoint aggcp, BSPJob job, Staff staff) { String[] aggValues = this.ssc.getAggValues(); if (aggValues != null) { decapsulateAggregateValues(aggValues); /* for checkpoint the aggvalues */ if (aggCpFlag) { String aggValuesCp = aggValues.toString(); BSPConfiguration bspAggConf = new BSPConfiguration(); String agguri = bspAggConf.get(Constants.BC_BSP_HDFS_NAME) + this.getJobId() + "/" + this.getSid() + "/aggValueCheckpoint/aggCheckpoint.cp"; boolean aggsuccess; try { aggsuccess = aggcp.writeAggCheckPoint(aggValuesCp, new BSPHdfsImpl().newPath(agguri), job, staff); if (aggsuccess) { LOG.info("AggValues have been writen into aggCheckpoint!"); } else { LOG.info("Fail to write aggValues into aggCheckpoint!"); } } catch (IOException e) { new RuntimeException(e); } // Feng added end-else } // end-if } // end-if } private void processCheckpointCommand(Checkpoint cp, BSPJob job, Staff staff, SuperStepReportContainer ssrc) { boolean success; try { success = cp.writeCheckPoint(this.graphData, new BSPHdfsImpl().newPath(ssc.getInitWritePath()), job, staff); if (success) { deleteOldCheckpoint(ssc.getOldCheckPoint(), job); } 
ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.WRITE_CHECKPOINT_SATGE); ssrc.setDirFlag(new String[] { "write" }); ssrc.setCheckNum(this.workerMangerNum * 3); sssc.checkPointStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); } catch (Exception e) { throw new RuntimeException(e); } } /** * Prepare for recovery , reset the parameters. * * @param job * @param workerAgent * @param hostName */ private void prepareRecoverySchedule(BSPJob job, WorkerAgentProtocol workerAgent, String hostName) { this.ssc = sssc.secondStageSuperStepBarrierForRecovery(job.getInt("staff.fault.superstep", 0)); this.setPartitionToWorkerManagerNameAndPort(ssc.getPartitionToWorkerManagerNameAndPort()); this.localBarrierNum = getLocalBarrierNumber(hostName); ArrayList<String> tmp = new ArrayList<String>(); for (String str : this.getPartitionToWorkerManagerNameAndPort().values()) { if (!tmp.contains(str)) { tmp.add(str); } } workerAgent.setNumberWorkers(this.getJobId(), this.getSid(), tmp.size()); tmp.clear(); this.workerMangerNum = workerAgent.getNumberWorkers(this.getJobId(), this.getSid()); this.currentSuperStepCounter = ssc.getAbleCheckPoint(); } /** * Prepare the graphdata from checkpoint for recovery. * * @param job * @param staff */ private void prepareRecoveryGraphdata(BSPJob job, Staff staff, Checkpoint cp) { // int version = job.getGraphDataVersion(); // this.graphData = this.getGraphDataFactory().createGraphData(version, // this); // this.graphData.clean(); // // /* Zhicheng Liu added */ // long tmpTS = System.currentTimeMillis(); // // this.graphData = cp.readCheckPoint( // new BSPHdfsImpl().newPath(ssc.getInitReadPath()), job, staff); // // /* Zhicheng Liu added */ // long tmpTE = System.currentTimeMillis(); // this.rwCheckPointT = (tmpTE - tmpTS) * 2; } /** * Reset the ssrc for recovery staff. 
* * @param ssrc */ private void resetRecoverySuperStepReportContainer(SuperStepReportContainer ssrc) { ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.READ_CHECKPOINT_STAGE); ssrc.setDirFlag(new String[] { "read" }); ssrc.setCheckNum(this.workerMangerNum * 1); } /** * Get the new port of ActiveMQ. * * @param ssrc * @param workerAgent */ private void freshRecoveryPort(SuperStepReportContainer ssrc, WorkerAgentProtocol workerAgent) { this.activeMQPort = workerAgent.getFreePort(); ssrc.setPort2(this.activeMQPort); LOG.info("[BSPStaff] ReGet the port for ActiveMQ Broker is : " + this.activeMQPort + "!"); this.setPartitionToWorkerManagerNameAndPort( sssc.checkPointStageSuperStepBarrier(this.currentSuperStepCounter, ssrc)); displayFirstRoute(); } /** * Rebuid the partition and read data from checkpoint for intializing. * * @param job * @param writePartition * @throws ClassNotFoundException * @throws IOException */ private void intializePartitionForRecovery(BSPJob job, WritePartition writePartition) throws ClassNotFoundException, IOException { this.currentSuperStepCounter = ssc.getNextSuperStepNum(); LOG.info("Now, this super step count is " + this.currentSuperStepCounter); this.partitioner = (Partitioner<Text>) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job.getConf()); if (writePartition instanceof HashWithBalancerWritePartition) { this.partitioner.setNumPartition(this.staffNum * numCopy); } else { this.partitioner.setNumPartition(this.staffNum); } org.apache.hadoop.mapreduce.InputSplit split = null; if (rawSplitClass.equals("no")) { } else { DataInputBuffer splitBuffer = new DataInputBuffer(); splitBuffer.reset(rawSplit.getBytes(), 0, rawSplit.getLength()); SerializationFactory factory = new SerializationFactory(job.getConf()); Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<? 
extends org.apache.hadoop.mapreduce.InputSplit>) factory .getDeserializer(job.getConf().getClassByName(rawSplitClass)); deserializer.open(splitBuffer); split = deserializer.deserialize(null); } this.partitioner.intialize(job, split); displayFirstRoute(); } private void faultProcessLocalCompute(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent, Exception e, int faultNumber) { LOG.error("Exception has been catched in BSPStaff--run--before" + " local computing !", e); Type type; Level level; if (e instanceof ClassNotFoundException) { type = Fault.Type.SYSTEMSERVICE; level = Fault.Level.CRITICAL; } else if (e instanceof IOException) { type = Fault.Type.DISK; level = Fault.Level.INDETERMINATE; } else if (e instanceof InterruptedException) { type = Fault.Type.SYSTEMSERVICE; level = Fault.Level.CRITICAL; } else { type = Fault.Type.SYSTEMSERVICE; level = Fault.Level.INDETERMINATE; } workerAgent.setStaffStatus(staff.getStaffAttemptId(), Constants.SATAFF_STATUS.FAULT, new Fault(type, level, workerAgent.getWorkerManagerName(job.getJobID(), staff.getStaffAttemptId()), e.toString(), job.getJobID().toString(), staff.getStaffAttemptId().toString()), faultNumber); LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=" + "*=*=*=*=*=*=*=*=*"); LOG.error("Exception has happened and been catched, " + "the exception will be reported to WorkerManager", e); } /** * Prepare the network and checknumber for Migrate. 
* * @param workerAgent * @param hostName */ private void prepareForMigrate(WorkerAgentProtocol workerAgent, String hostName) { this.ssc = sssc.secondStageSuperStepBarrierForRecovery(this.currentSuperStepCounter); this.setPartitionToWorkerManagerNameAndPort(ssc.getPartitionToWorkerManagerNameAndPort()); this.localBarrierNum = getLocalBarrierNumber(hostName); ArrayList<String> tmp = new ArrayList<String>(); for (String str : this.getPartitionToWorkerManagerNameAndPort().values()) { if (!tmp.contains(str)) { tmp.add(str); } } // Update the worker information workerAgent.setNumberWorkers(this.getJobId(), this.getSid(), tmp.size()); tmp.clear(); this.workerMangerNum = workerAgent.getNumberWorkers(this.getJobId(), this.getSid()); } /** * Prepare the graphdata from checkpoint for migrate. * * @param job * @param staff * @throws IOException */ private void prepareMigrateGraphdata(BSPJob job, Staff staff, Checkpoint cp) throws IOException { // int version = job.getGraphDataVersion(); // this.graphData = this.getGraphDataFactory().createGraphData(version, // this); // this.graphData.clean(); // // this.graphData.setMigratedStaffFlag(true); // BSPConfiguration bspConf = new BSPConfiguration(); // String uri = bspConf.get(Constants.BC_BSP_HDFS_NAME) + this.getJobId() // + "/" + this.getSid() + "/migrate/graph.cp"; // long tmpTS = System.currentTimeMillis(); // this.graphData = cp.readCheckPoint(new BSPHdfsImpl().newPath(uri), job, // staff); // long tmpTE = System.currentTimeMillis(); // this.rwCheckPointT = (tmpTE - tmpTS) * 2; // if (graphData.getVertexSize() == 0) { // vertexSize = 10; // } // this.graphBytes = graphData.sizeForAll() * vertexSize; // LOG.info("readGraph from checkpoint: this.graphBytes is " + // this.graphBytes); // Configuration conf = new Configuration(); // BSPFileSystem bspfs = new BSPFileSystemImpl(URI.create(uri), conf); // if (bspfs.exists(new BSPHdfsImpl().newPath(uri))) { // bspfs.delete(new BSPHdfsImpl().newPath(uri), true); // LOG.info("Has 
deleted the checkpoint of graphData on hdfs"); // } } /** * Update the the graph Statistics for Migrate. */ private void updateMigrateStatistics() { if (graphData.getVertexSize() == 0) { vertexSize = 8; } this.graphBytes = graphData.sizeForAll() * vertexSize; } /** * Update the the message Statistics for Migrate. */ private void updateMggMigrateStatistics() { if (communicator.getIncomingQueuesSize() == 0) { // ljn test motify // this.messagePerLength = 0; this.messagePerLength = 10; // LOG.info("incoming message is null, so this.messagePerLength = 0"); LOG.info("incoming message is null, so this.messagePerLength = 10"); } else { while (true) { BSPMessage msg = (BSPMessage) communicator.checkAMessage(); if (msg != null) { this.messagePerLength = msg.intoString().getBytes().length; break; } } // while } } /** * Decapsulate aggregate value for migrate. */ private void decapsulateAggForMigrate() { String[] aggValues = this.ssc.getAggValues(); if (aggValues != null) { decapsulateAggregateValues(aggValues); } } /** * Process the Migrate work for Staff. * * @param job * @param staff * @param cp * @param ssrc * @param workerAgent * @throws IOException */ private void processStaffMigrate(BSPJob job, Staff staff, Checkpoint cp, SuperStepReportContainer ssrc, WorkerAgentProtocol workerAgent) throws IOException { LOG.info("read message checkpoint from hdfs for migrate"); BSPConfiguration bspConf = new BSPConfiguration(); String uri = bspConf.get(Constants.BC_BSP_HDFS_NAME) + this.getJobId() + "/" + this.getSid() + "/migrate/message.cp"; icomMess = cp.readMessages(new BSPHdfsImpl().newPath(uri), job, staff); LOG.info("Migrate messages size! 
" + icomMess.size()); // Delete from hdfs Configuration conf = new Configuration(); // FileSystem fs = FileSystem.get(URI.create(uri), conf); BSPFileSystem bspfs = new BSPFileSystemImpl(URI.create(uri), conf); // if (fs.exists(new Path(uri))) { // LOG.info("Has delete message checkpoint from hdfs"); // fs.delete(new Path(uri), true); // } if (bspfs.exists(new BSPHdfsImpl().newPath(uri))) { LOG.info("Has delete message checkpoint from hdfs"); bspfs.delete(new BSPHdfsImpl().newPath(uri), true); } ssrc.setCheckNum(this.staffNum); int runpartitionRPCPort = workerAgent.getFreePort(); this.partitionRPCPort = runpartitionRPCPort; ssrc.setPort1(runpartitionRPCPort); this.activeMQPort = workerAgent.getFreePort(); ssrc.setPort2(this.activeMQPort); LOG.info("[BSPStaff] Get the port for partitioning RPC is : " + runpartitionRPCPort + "!"); LOG.info("[BSPStaff] Get the port for ActiveMQ Broker is : " + this.activeMQPort + "!"); this.partitionToWorkerManagerHostWithPorts = sssc.scheduleBarrierForMigrate(ssrc); // Get the globle information of route // record the map from partitions to workermanagers for (Integer e : this.partitionToWorkerManagerHostWithPorts.keySet()) { String[] nameAndPorts = this.partitionToWorkerManagerHostWithPorts.get(e).split(":"); String[] ports = nameAndPorts[1].split("-"); this.getPartitionToWorkerManagerNameAndPort().put(e, nameAndPorts[0] + ":" + ports[1]); } // Added later this.staffAgent = new WorkerAgentForStaff(job.getConf()); workerAgent.setStaffAgentAddress(this.getSid(), this.staffAgent.address()); } /** * Reset the localcheck number and communicator(RPC,ActiveMQ) for Migrating. 
* * @param ssrc * @param workerAgent */ private void resetPortsMigrate(SuperStepReportContainer ssrc, WorkerAgentProtocol workerAgent, String hostName) { ssrc.setCheckNum(this.staffNum); ssrc.setPort1(this.partitionRPCPort); ssrc.setPort2(this.activeMQPort); LOG.info("[BSPStaff migrate] Get the port for partitioning RPC is : " + this.partitionRPCPort + "!"); LOG.info("[BSPStaff migrate] Get the port for ActiveMQ Broker is : " + this.activeMQPort + "!"); this.partitionToWorkerManagerHostWithPorts = sssc.scheduleBarrierForMigrate(ssrc); // record the map from partitions to workermanagers for (Integer e : this.partitionToWorkerManagerHostWithPorts.keySet()) { String[] nameAndPorts = this.partitionToWorkerManagerHostWithPorts.get(e).split(":"); String[] ports = nameAndPorts[1].split("-"); this.getPartitionToWorkerManagerNameAndPort().put(e, nameAndPorts[0] + ":" + ports[1]); } ArrayList<String> tmp = new ArrayList<String>(); for (String str : this.getPartitionToWorkerManagerNameAndPort().values()) { String workerName = str.split(":")[0]; if (!tmp.contains(workerName)) { tmp.add(workerName); } } workerAgent.setNumberWorkers(this.getJobId(), this.getSid(), tmp.size()); tmp.clear(); this.workerMangerNum = workerAgent.getNumberWorkers(this.getJobId(), this.getSid()); LOG.info("get globle partitiontoWorkerNanagerNameAndPort is " + this.getPartitionToWorkerManagerNameAndPort()); this.localBarrierNum = getLocalBarrierNumber(hostName); } /** * Rebuild the communicator for migrate staff. 
* * @param job * @param hostName */ private void rebuildCommunicator(BSPJob job, String hostName) { /* * Feng added for new version loadbalance reinitialize communicator for send * messages */ this.communicator = new CommunicatorNew(this.getJobId(), job, this.getPartition(), partitioner); this.communicator.initialize(this.routerparameter, this.getPartitionToWorkerManagerNameAndPort(), this.graphData); this.communicator.start(hostName, this); this.communicator.setPartitionToWorkerManagerNamePort(this.getPartitionToWorkerManagerNameAndPort()); } /** * Compute the migrate cost and report migrate information to bspcontroller. * * @param ssrc */ private void updateMigrateCost(SuperStepReportContainer ssrc) { if (this.graphBytes == 0) { this.graphBytes = 1; } this.migrateCost = (this.rwCheckPointT != 0 ? this.rwCheckPointT : this.loadDataT) * (this.graphBytes + this.messageBytes) / this.graphBytes; ssrc.setStaffRunTime(this.staffRunTime); ssrc.setStaffID(this.getSid().getStaffID().getId()); ssrc.setCurrentSuperStep(this.currentSuperStepCounter); ssrc.setMigrateCost(migrateCost); LOG.info("start second barrier"); LOG.info("staffRunTime is " + this.staffRunTime); LOG.info("staffID is " + this.getSid()); LOG.info("currentSuperStepCounter is " + this.currentSuperStepCounter); LOG.info("migrateCost is " + this.migrateCost); } /** * Judge whether the staff is slow. */ private void confirmMigrateStaff() { String migrateStaffIDs = ssc.getMigrateStaffIDs(); if (!migrateStaffIDs.equals("")) { LOG.info("Get the superstep command, and shows that the" + " migate staff id is " + migrateStaffIDs); this.hasMigrateStaff = true; String[] ids = migrateStaffIDs.split(":"); for (int i = 0; i < ids.length; i++) { if (Integer.parseInt(ids[i]) == this.getSid().getStaffID().getId()) { this.migrateFlag = true; LOG.info("This staff should migrate!"); break; } } } } /** * Write the graph data and message into Checkpoint for migrate staff. 
* * @param cp * @param job * @param staff * @throws IOException * @throws InterruptedException */ private void writeMigrateData(Checkpoint cp, BSPJob job, Staff staff) throws IOException, InterruptedException { // Wrtie graph checkpoint BSPConfiguration bspConf = new BSPConfiguration(); String uri = bspConf.get(Constants.BC_BSP_HDFS_NAME) + this.getJobId() + "/" + this.getSid() + "/migrate/graph.cp"; boolean success = cp.writeCheckPoint(this.graphData, new BSPHdfsImpl().newPath(uri), job, staff); if (success) { LOG.info("Has Write graphData checkPoint success for migrate staff!"); } else { LOG.info("Can not write graphData checkPiont to hdfs"); } // Write incoming message checkpoint uri = bspConf.get(Constants.BC_BSP_HDFS_NAME) + this.getJobId() + "/" + this.getSid() + "/migrate/message.cp"; LOG.info("writeMessages size! " + this.migrateMessagesString.size()); success = cp.writeMessages(this.communicator, graphData, new BSPHdfsImpl().newPath(uri), job, this, staff, this.migrateMessagesString); if (success) { LOG.info("Has Write incoming messages into hdfs for migrate staff"); } else { LOG.info("Can not write messages checkpoint to hdfs"); } } /** * Finish the staff of one step for migrate staff. * * @param bsp * @param staff * @param workerAgent */ private void finishMigrateStaff(BSP bsp, Staff staff, WorkerAgentProtocol workerAgent, boolean recovery) { LOG.info("This staff stop running and migrate!"); bsp.cleanup(staff); stopActiveMQBroker(); try { done(workerAgent); LOG.info("setStaffStatus before! recovery " + recovery); workerAgent.setStaffStatus(this.getSid(), Constants.SATAFF_STATUS.SUCCEED, null, 1); boolean succ = workerAgent.updateWorkerJobState(this.getSid()); LOG.info("setStaffStatus after! recovery " + recovery); if (succ) { LOG.info("Update the infomation of staffs successfully!"); } else { LOG.info("Can not update the infomation of staffs successfully!"); } } catch (IOException e) { e.printStackTrace(); } } /** * Get BSP job configuration. 
*/ @Override public BSPJob getConf() { return this.bspJob; } /** * Set BSP job configuration. */ @Override public void setConf(BSPJob bspJob) { this.bspJob = bspJob; } /** Write and read split info to WorkerManager. */ @Override public void write(DataOutput out) throws IOException { super.write(out); Text.writeString(out, rawSplitClass); rawSplit.write(out); } @Override public void readFields(DataInput in) throws IOException { super.readFields(in); rawSplitClass = Text.readString(in); rawSplit.readFields(in); } /** * Get partition to worker manager name and port. */ /* * public HashMap<Integer, String> getPartitionToWorkerManagerNameAndPort() { * try{ return this.getPartitionToWorkerManagerNameAndPort(); }catch(Exception * e){ LOG.info("getPartitionToworkerManager error is:"+e); return null; } } */ @SuppressWarnings("unchecked") private void loadAggregators(BSPJob job) { int aggregateNum = job.getAggregateNum(); String[] aggregateNames = job.getAggregateNames(); for (int i = 0; i < aggregateNum; i++) { String name = aggregateNames[i]; this.nameToAggregator.put(name, job.getAggregatorClass(name)); this.nameToAggregateValue.put(name, job.getAggregateValueClass(name)); } try { // Instanciate each aggregate values. for (Entry<String, Class<? 
extends AggregateValue<?, ?>>> entry : this.nameToAggregateValue .entrySet()) { String aggName = entry.getKey(); AggregateValue aggValue; aggValue = entry.getValue().newInstance(); this.aggregateValuesCurrent.put(aggName, aggValue); } } catch (InstantiationException e) { LOG.error("[BSPStaff:loadAggregators]", e); } catch (IllegalAccessException e) { LOG.error("[BSPStaff:loadAggregators]", e); } } @SuppressWarnings("unchecked") private void initBeforeSuperStepForAggregateValues(SuperStepContext ssContext) { for (Entry<String, AggregateValue> entry : this.aggregateValuesCurrent.entrySet()) { AggregateValue aggValue = entry.getValue(); aggValue.initBeforeSuperStep(ssContext); } } @SuppressWarnings("unchecked") private void aggregate(ConcurrentLinkedQueue<IMessage> messages, BSPJob job, Vertex vertex, int superStepCount) { try { for (Entry<String, Class<? extends AggregateValue<?, ?>>> entry : this.nameToAggregateValue .entrySet()) { String aggName = entry.getKey(); // Init the aggregate value for this head node. AggregateValue aggValue1 = this.aggregateValuesCurrent.get(aggName); AggregationContext aggContext = new AggregationContext(job, vertex, superStepCount); publishAggregateValues(aggContext); aggValue1.initValue(messages.iterator(), aggContext); // Get the current aggregate value. AggregateValue aggValue0; aggValue0 = this.aggregateValues.get(aggName); // Get the aggregator for this kind of aggregate value. Aggregator<AggregateValue> aggregator; aggregator = (Aggregator<AggregateValue>) this.nameToAggregator.get(aggName).newInstance(); // Aggregate if (aggValue0 == null) { // the first time aggregate. 
aggValue0 = (AggregateValue) aggValue1.clone(); this.aggregateValues.put(aggName, aggValue0); } else { ArrayList<AggregateValue> tmpValues = new ArrayList<AggregateValue>(); tmpValues.add(aggValue0); tmpValues.add(aggValue1); AggregateValue aggValue = aggregator.aggregate(tmpValues); this.aggregateValues.put(aggName, aggValue); } } } catch (InstantiationException e) { LOG.error("[BSPStaff:aggregate]", e); } catch (IllegalAccessException e) { LOG.error("[BSPStaff:aggregate]", e); } } /** * To encapsulate the aggregation values to the String[]. The aggValues should * be in form as follows: [ AggregateName \t AggregateValue.toString() ] * * @return String[] */ @SuppressWarnings("unchecked") private String[] encapsulateAggregateValues() { int aggSize = this.aggregateValues.size(); String[] aggValues = new String[aggSize]; int ia = 0; for (Entry<String, AggregateValue> entry : this.aggregateValues.entrySet()) { aggValues[ia] = entry.getKey() + Constants.KV_SPLIT_FLAG + entry.getValue().toString(); ia++; } // The cache for this super step should be cleared for next super step. this.aggregateValues.clear(); return aggValues; } /** * To decapsulate the aggregation values from the String[]. The aggValues * should be in form as follows: [ AggregateName \t AggregateValue.toString() * ] * * @param aggValues * String[] */ @SuppressWarnings("unchecked") private void decapsulateAggregateValues(String[] aggValues) { for (int i = 0; i < aggValues.length; i++) { String[] aggValueRecord = aggValues[i].split(Constants.KV_SPLIT_FLAG); String aggName = aggValueRecord[0]; String aggValueString = aggValueRecord[1]; AggregateValue aggValue = null; try { aggValue = this.nameToAggregateValue.get(aggName).newInstance(); aggValue.initValue(aggValueString); // init the aggValue from // its string form. 
} catch (InstantiationException e1) { LOG.error("ERROR", e1); } catch (IllegalAccessException e1) { LOG.error("ERROR", e1); } // end-try if (aggValue != null) { this.aggregateResults.put(aggName, aggValue); } // end-if } // end-for } /** * To publish the aggregate values into the bsp's cache for user's accession * for the next super step. * * @param context * BSPStaffContext */ @SuppressWarnings("unchecked") private void publishAggregateValues(BSPStaffContext context) { for (Entry<String, AggregateValue> entry : this.aggregateResults.entrySet()) { context.addAggregateValues(entry.getKey(), entry.getValue()); } } /** * To publish the aggregate values into the super step context for the * bsp.initBeforeSuperStep for the next super step. * * @param context * SuperStepContext */ @SuppressWarnings("unchecked") private void publishAggregateValues(SuperStepContext context) { for (Entry<String, AggregateValue> entry : this.aggregateResults.entrySet()) { context.addAggregateValues(entry.getKey(), entry.getValue()); } } /** * To publish the aggregate values into the aggregation context for the * aggregation value's init of each vertex. * * @param context * AggregationContext */ @SuppressWarnings("unchecked") private void publishAggregateValues(AggregationContext context) { for (Entry<String, AggregateValue> entry : this.aggregateResults.entrySet()) { context.addAggregateValues(entry.getKey(), entry.getValue()); } } /** * WorkerAgentForStaffInterface.java. */ public interface WorkerAgentForStaffInterface extends VersionedProtocol { public static final long versionID = 0L; /** * This method is used to worker which this worker's partition id equals * belongPartition. 
* * @param jobId * the current BSP job id * @param staffId * the current Staff id * @param belongPartition * the current partition * @return worker manager */ WorkerAgentForStaffInterface getWorker(BSPJobID jobId, StaffAttemptID staffId, int belongPartition); /** * This method is used to put the HeadNode to WorkerAgentForJob's map. * * @param jobId * the current BSP job id * @param staffId * the current Staff id * @param belongPartition * the partitionID which the HeadNode belongs to */ void putHeadNode(BSPJobID jobId, StaffAttemptID staffId, int belongPartition, BytesWritable data, String type); /** * This method is used to put the HeadNode to WorkerAgentForJob's map. * * @param jobId * the current BSP job id * @param staffId * the current Staff id * @param belongPartition * the partitionID which the HeadNode belongs to */ void putHeadNode(BSPJobID jobId, StaffAttemptID staffId, int belongPartition, BytesWritable data); /** * Get the address of this WorkerAgentForStaff. * * @return address */ String address(); /** * This method will be invoked before the staff be killed, to notice the * staff to do some cleaning operations. */ void onKillStaff(); } /** * WorkerAgentForStaff.java. * * @author root */ public class WorkerAgentForStaff implements WorkerAgentForStaffInterface { /** <partitionID, hostName:port1-port2> */ private HashMap<Integer, String> partitionToWorkerManagerHostWithPorts = new HashMap<Integer, String>(); /** * The workers. */ private final Map<InetSocketAddress, WorkerAgentForStaffInterface> workers = new ConcurrentHashMap<InetSocketAddress, WorkerAgentForStaffInterface>(); /** * This class implements an IP Socket Address (IP address + port number). */ private InetSocketAddress workAddress; /** * .hadoop.ipc.RPC.Server. */ private Server server = null; /** * BSP job configruation. */ private Configuration conf; /** * Constructor of WorkerAgentForStaff. 
* * @param conf * BSP job configuration */ public WorkerAgentForStaff(Configuration conf) { this.partitionToWorkerManagerHostWithPorts = BSPStaff.this.partitionToWorkerManagerHostWithPorts; // LOG.info("ljn test : partitionToWorkerManagerHostWithPorts " // + partitionToWorkerManagerHostWithPorts); this.conf = conf; String[] hostandports = this.partitionToWorkerManagerHostWithPorts.get(BSPStaff.this.getPartition()) .split(":"); // LOG.info(this.partitionToWorkerManagerHostWithPorts.get(BSPStaff.this // .getPartition())); String[] ports = hostandports[1].split("-"); workAddress = new InetSocketAddress(hostandports[0], Integer.parseInt(ports[0])); reinitialize(); } /** * Reinitialize the Staff. */ private void reinitialize() { try { LOG.info("reinitialize() the WorkerAgentForStaff: " + getJobId().toString()); server = RPC.getServer(this, workAddress.getHostName(), workAddress.getPort(), conf); server.start(); LOG.info("WorkerAgentForStaff address:" + workAddress.getHostName() + " port:" + workAddress.getPort()); } catch (IOException e) { LOG.error("[reinitialize]", e); } } /** * Get WorkerAgentConnection * * @param addr * IP address + port number * @return The workerAgentConnection */ protected WorkerAgentForStaffInterface getWorkerAgentConnection(InetSocketAddress addr) { WorkerAgentForStaffInterface worker; synchronized (this.workers) { worker = workers.get(addr); if (worker == null) { try { worker = (WorkerAgentForStaffInterface) RPC.getProxy(WorkerAgentForStaffInterface.class, WorkerAgentForStaffInterface.versionID, addr, this.conf); } catch (IOException e) { LOG.error("[getWorkerAgentConnection]", e); } this.workers.put(addr, worker); } } return worker; } /** * Get Address. * * @param peerName * @return IP address + port number */ private InetSocketAddress getAddress(String peerName) { String[] workerAddrParts = peerName.split(":"); return new InetSocketAddress(workerAddrParts[0], Integer.parseInt(workerAddrParts[1])); } /** * This method is used to get worker. 
* * @param jobId * @param staffId * @param belongPartition * @return */ @Override public WorkerAgentForStaffInterface getWorker(BSPJobID jobId, StaffAttemptID staffId, int belongPartition) { String dstworkerName = null; dstworkerName = this.partitionToWorkerManagerHostWithPorts.get(belongPartition); // hostName:port1-port2 String[] hostAndPorts = dstworkerName.split(":"); String[] ports = hostAndPorts[1].split("-"); dstworkerName = hostAndPorts[0] + ":" + ports[0]; WorkerAgentForStaffInterface work = workers.get(getAddress(dstworkerName)); if (work == null) { work = getWorkerAgentConnection(getAddress(dstworkerName)); } return work; } /** * This method is used to put the HeadNode to WorkerAgentForJob's map. * * @param jobId * BSP job id * @param staffId * BSP Staff id * @param belongPartition * the partitionID which the HeadNode belongs to */ @Override // hash? @SuppressWarnings("unchecked") public void putHeadNode(BSPJobID jobId, StaffAttemptID staffId, int belongPartition, BytesWritable data) { DataInputStream in = new DataInputStream( new BufferedInputStream(new ByteArrayInputStream(data.getBytes()))); try { while (true) { Text key = new Text(); key.readFields(in); Text value = new Text(); value.readFields(in); if (key.getLength() > 0 && value.getLength() > 0) { if (BSPStaff.this.recordParse == null) { LOG.error("Test Null: BSPStaff.this.recordParse is NULL"); } Vertex vertex = BSPStaff.this.recordParse.recordParse(key.toString(), value.toString()); if (vertex == null) { BSPStaff.this.lost++; continue; } BSPStaff.this.graphData.addForAll(vertex); } else { break; } } } catch (IOException e) { LOG.error("ERROR", e); } } /** * This method is used to put the HeadNode to WorkerAgentForJob's map. * * @param jobId * @param staffId * @param belongPartition * the partitionID which the HeadNode belongs to */ @Override // hash? 
@SuppressWarnings("unchecked") public void putHeadNode(BSPJobID jobId, StaffAttemptID staffId, int belongPartition, BytesWritable data, String type) { DataInputStream in = new DataInputStream( new BufferedInputStream(new ByteArrayInputStream(data.getBytes()))); try { while (true) { Text key = new Text(); key.readFields(in); Text value = new Text(); value.readFields(in); if (key.getLength() > 0 && value.getLength() > 0) { // application.getDownlink().sendKeyValue(key.toString(), // value.toString()); // // if (vertex == null) { // BSPStaff.this.lost++; // continue; // } // BSPStaff.this.graphData.addForAll(vertex); } else { break; } } } catch (IOException e) { LOG.error("ERROR", e); } } @Override public long getProtocolVersion(String arg0, long arg1) throws IOException { return WorkerAgentForStaffInterface.versionID; } @Override public String address() { String hostName = this.workAddress.getHostName(); int port = this.workAddress.getPort(); return new String(hostName + ":" + port); } @Override public void onKillStaff() { BSPStaff.this.stopActiveMQBroker(); } } /** * Start the ActiveMQ. * * @param hostName * the compute nome name */ public void startActiveMQBroker(String hostName) { // brokerName = "hostName-partitionID" this.activeMQBroker = new ActiveMQBroker(hostName + "-" + this.getPartition()); try { this.activeMQBroker.startBroker(this.activeMQPort); LOG.info("[BSPStaff] starts ActiveMQ Broker successfully!"); } catch (Exception e) { LOG.error("[BSPStaff] caught: ", e); } } /** * Stop the ActiveMQ. */ public void stopActiveMQBroker() { if (this.activeMQBroker != null) { try { this.activeMQBroker.stopBroker(); LOG.info("[BSPStaff] stops ActiveMQ Broker successfully!"); } catch (Exception e) { LOG.error("[BSPStaff] caught: ", e); } } } /** * Start the rpc server. 
*
   * @param hostName
   *        the local compute node name
   */
  public void startRPCServer(String hostName) {
    try {
      server = RPC.getServer(this.communicator, hostName, this.activeMQPort,
          new Configuration());
      server.start();
      LOG.info("[BSPStaff] starts RPC Communication Server successfully!");
    } catch (IOException e) {
      // UnknownHostException is an IOException, so one handler covers both;
      // log through LOG like the rest of this class instead of stderr.
      LOG.error("[BSPStaff] caught: ", e);
    }
  }

  /**
   * Stop RPC server. Does nothing when no server was started.
   */
  public void stopRPCSever() {
    if (this.server == null) {
      return;
    }
    this.server.stop();
    LOG.info("[BSPStaff] stops RPC Communication Server successfully!");
  }

  /**
   * Get the recovery times.
   */
  @Override
  public int getRecoveryTimes() {
    return recoveryTimes;
  }

  /** Accumulated timings in milliseconds, updated by peerProcessing. */
  long loadGraphTime = 0;
  long aggregateTime = 0;
  long computeTime = 0;
  long collectMsgsTime = 0;

  // ============ Note Add 20140310 For Adjusting Local Computation ========
  /**
   * Process one vertex in the current super step: aggregate its incoming
   * messages, optionally record them for staff migration, and call the user
   * compute function.
   *
   * @param v
   *        the vertex to process; a null vertex is logged and skipped
   * @param bsp
   *        the user BSP whose compute function is called
   * @param job
   *        the current BSP job
   * @param superStepCounter
   *        the super step number used for the activity check
   * @param context
   *        the staff context handed to the compute function
   * @param activeFlag
   *        whether the vertex is active
   * @throws IOException
   */
  @Override
  public void vertexProcessing(Vertex v, BSP bsp, BSPJob job, int superStepCounter,
      BSPStaffContext context, boolean activeFlag) throws IOException {
    // /**Feng added for migrate staff messages*/
    // ConcurrentLinkedQueue<IMessage> migrateMessages;
    String migrateMessages = null;
    StringBuffer sb = new StringBuffer();
    /** Clock */
    // long tmpStart = System.currentTimeMillis();
    if (v == null) {
      LOG.error("Fail to get the HeadNode of index[" + "] "
          + "and the system will skip the record");
      return;
    }
    // loadGraphTime = loadGraphTime + (System.currentTimeMillis() - tmpStart);
    // Get the incomed message queue for this vertex.
    ConcurrentLinkedQueue<IMessage> messages =
        this.communicator.getMessageQueue(String.valueOf(v.getVertexID()));
    if (this.migratedStaffFlag == true) {
      // Feng test use the migrated messages
      messages = this.communicator.getMigrateMessageQueue(String.valueOf(v.getVertexID()));
    }
    // Aggregate the new values for each vertex. Clock the time
    // cost.
    // tmpStart = System.currentTimeMillis();
    aggregate(messages, job, v, this.currentSuperStepCounter);
    // aggregateTime = aggregateTime + (System.currentTimeMillis() - tmpStart);
    // Note Ever Fault.
    // Note Ever Edit /\2014-01-23
    context.refreshVertex(v);
    if (superStepCounter > 0) {
      if (!activeFlag && (messages.size() == 0)) {
        // An inactive vertex without messages has nothing to compute.
        return;
      }
      // for activemap update
      else {
        upadateActiveBitMap(v);
      }
    }
    if (this.openMigrateMode == true && messages.size() > 0) {
      // Record the messages of this vertex in string form for migration.
      Iterator<IMessage> iterator = messages.iterator();
      sb.append(v.getVertexID().toString() + Constants.MESSAGE_SPLIT);
      while (iterator.hasNext()) {
        IMessage msg = iterator.next();
        String msgID = msg.getMessageId().toString();
        String msgValue = msg.getContent().toString();
        if (msgID != null) {
          sb.append(msgID + Constants.SPLIT_FLAG + msgValue + Constants.SPACE_SPLIT_FLAG);
        }
        // NOTE(review): the growing buffer is added once per message, so a
        // vertex with several messages contributes cumulative prefixes.
        // Looks unintended -- confirm against the reader of
        // migrateMessagesString before moving this out of the loop.
        migrateMessages = sb.toString();
        this.migrateMessagesString.add(migrateMessages);
      }
    }
    Iterator<IMessage> messagesIter = messages.iterator();
    // Call the compute function for local computation.
    /*
     * Publish the total result aggregate values into the bsp's cache for the
     * user's function's accession in the next super step.
     */
    publishAggregateValues(context);
    /** Clock */
    // tmpStart = System.currentTimeMillis();
    try {
      bsp.compute(messagesIter, context);
    } catch (Exception e) {
      throw new RuntimeException("catch exception", e);
    }
    // computeTime = computeTime + (System.currentTimeMillis() - tmpStart);
    /** Clock */
    messages.clear();
    /** Clock */
    // tmpStart = System.currentTimeMillis();
    // collectMsgsTime = collectMsgsTime + (System.currentTimeMillis() -
    // tmpStart);
    /** Clock */
  }

  /**
   * Update the active map of every migrate index entry whose vertex id
   * string equals the id of the given vertex.
   *
   * @param v
   *        the vertex that is active in the current super step
   */
  public void upadateActiveBitMap(Vertex v) {
    for (InstabilityVertexIndex vIndex
        : this.graphData.getVertexStore().getMigrateTreeSet()) {
      if (vIndex.getVertexIdString().equals(v.getVertexID().toString())) {
        vIndex.getMigrateFactor().updateActiveMap(currentSuperStepCounter);
      }
    }
  }

  /**
   * Ask the communicator to prepare the messages of a bucket from the
   * previous super step; nothing is done in super step 0.
   */
  @Override
  public void preBucket(int bucket, int superstep) {
    if (superstep == 0) {
      return;
    }
    this.communicator.preBucketMessages(bucket, superstep - 1);
  }

  /**
   * Write the string form of a vertex to the output.
   */
  @Override
  public void saveResultOfVertex(Vertex v, RecordWriter output)
      throws IOException, InterruptedException {
    output.write(new Text(v.intoString()));
  }

  // ==============Add End======
  /**
   * Create the communicator selected by the communication option, then
   * initialize and start it.
   *
   * @param commOption
   *        the communication option of the job
   * @param hostName
   *        the host name the communicator is started with
   * @param migrateSuperStep
   *        non-zero when this staff resumes after a migration
   * @param job
   *        the current BSP job
   * @param icomMess
   *        incoming messages restored for a migrated staff
   */
  private void issueCommunicator(String commOption, String hostName, int migrateSuperStep,
      BSPJob job, Map<String, LinkedList<IMessage>> icomMess) {
    // Start an ActiveMQ Broker, create a communicator, initialize it,
    // and start it
    // Note TAG The Option Content
    LOG.info("[Comm Server Str Is ] ---->>>> " + commOption);
    // RPC communicator ?RPC Server
    if (commOption.equals(Constants.RPC_VERSION)) {
      // NOTE(review): this branch creates no communicator, so the
      // initialize call below relies on one having been set earlier.
      // LOG.info("Start Communicator[RPC]");
      // this.communicator = new RPCCommunicator(this.getJobId(), job,
      // this.getPartition(), partitioner);
      //
      // /* Zhicheng Liu added */
      // if (openMigrateMode && migrateSuperStep != 0) {
      // communicator.recoveryForMigrate(icomMess);
      // LOG.info("Migrate staff: incomed messages size is "
      // + communicator.getIncomedQueuesSize());
      // }
      //
    } else if (commOption.equals(Constants.RPC_BYTEARRAY_VERSION)) {
      LOG.info("Start Communicator[ByteArray]");
      this.communicator =
          new CommunicatorNew(this.getJobId(), job, this.getPartition(), partitioner);
      /* Feng added for new version loadbalance */
      if (openMigrateMode && migrateSuperStep != 0) {
        LOG.info("Recovery for migrate!");
        communicator.recoveryForMigrate(icomMess);
      }
    }
    // Note Do Sth Setting Up.
    this.communicator.initialize(this.routerparameter,
        this.getPartitionToWorkerManagerNameAndPort(), this.graphData);
    this.communicator.start(hostName, this);
  }

  // ============ Note Add 20140310 For Adjusting Local Computation ========
  /**
   * Process the peer (partition) in the current super step: aggregate the
   * incoming messages and call the user compute function.
   *
   * @param peer
   *        the peer to process; must not be null
   * @param bsp
   *        the user BSP whose compute function is called
   * @param job
   *        the current BSP job
   * @param superStepCounter
   *        the super step number used for the activity check
   * @param context
   *        the staff context handed to the compute function
   * @param activeFlag
   *        whether the peer is active
   */
  @Override
  public void peerProcessing(BSPPeer peer, BSP bsp, BSPJob job, int superStepCounter,
      BSPStaffContext context, boolean activeFlag) {
    /** Clock */
    long tmpStart = System.currentTimeMillis();
    if (peer == null) {
      throw new RuntimeException("No key-value to compute for staff " + this.getSid());
    }
    loadGraphTime = loadGraphTime + (System.currentTimeMillis() - tmpStart);
    // Get the incomed message queue for this peer(partition).
    try {
      ConcurrentLinkedQueue<IMessage> messages = this.communicator
          .getMessageQueue(String.valueOf(Constants.DEFAULT_PEER_DST_MESSSAGE_ID));
      // Aggregate the new values for each vertex. Clock the time
      // cost.
      publishAggregateValues(context);
      LOG.info("ljn test : peerProcessing messages size is " + messages.size());
      tmpStart = System.currentTimeMillis();
      aggregate(messages, job, peer, this.currentSuperStepCounter);
      aggregateTime = aggregateTime + (System.currentTimeMillis() - tmpStart);
      context.updatePeer(peer);
      if (superStepCounter > 0) {
        if (!activeFlag && (messages.size() == 0)) {
          return;
        }
      }
      Iterator<IMessage> messagesIter = messages.iterator();
      // Call the compute function for local computation.
      /*
       * Publish the total result aggregate values into the bsp's cache for the
       * user's function's accession in the next super step.
       */
      // publishAggregateValues(context);
      /** Clock */
      tmpStart = System.currentTimeMillis();
      try {
        bsp.compute(messagesIter, context);
      } catch (Exception e) {
        throw new RuntimeException("catch exception", e);
      }
      computeTime = computeTime + (System.currentTimeMillis() - tmpStart);
      /** Clock */
      messages.clear();
      peer.resetPair();
      /** Clock */
      tmpStart = System.currentTimeMillis();
      collectMsgsTime = collectMsgsTime + (System.currentTimeMillis() - tmpStart);
    } catch (Exception e) {
      throw new RuntimeException("catch exception on peer compute", e);
    }
    /** Clock */
  }

  /**
   * Aggregate for peer compute.
   *
   * @param messages
   *        the incoming messages of the peer
   * @param job
   *        the current BSP job
   * @param peer
   *        the peer being processed
   * @param currentSuperStepCounter
   *        the current super step number
   */
  private void aggregate(ConcurrentLinkedQueue<IMessage> messages, BSPJob job, BSPPeer peer,
      int currentSuperStepCounter) {
    try {
      for (Entry<String, Class<? extends AggregateValue<?, ?>>> entry
          : this.nameToAggregateValue.entrySet()) {
        String aggName = entry.getKey();
        // Init the aggregate value for this head node.
        AggregateValue aggValue1 = this.aggregateValuesCurrent.get(aggName);
        AggregationContext aggContext =
            new AggregationContext(job, peer, currentSuperStepCounter);
        publishAggregateValues(aggContext);
        aggValue1.initValue(messages.iterator(), aggContext);
        // Get the current aggregate value.
        AggregateValue aggValue0;
        aggValue0 = this.aggregateValues.get(aggName);
        // Get the aggregator for this kind of aggregate value.
        Aggregator<AggregateValue> aggregator;
        aggregator =
            (Aggregator<AggregateValue>) this.nameToAggregator.get(aggName).newInstance();
        // Aggregate
        if (aggValue0 == null) {
          // the first time aggregate.
          aggValue0 = (AggregateValue) aggValue1.clone();
          this.aggregateValues.put(aggName, aggValue0);
        } else {
          ArrayList<AggregateValue> tmpValues = new ArrayList<AggregateValue>();
          tmpValues.add(aggValue0);
          tmpValues.add(aggValue1);
          AggregateValue aggValue = aggregator.aggregate(tmpValues);
          this.aggregateValues.put(aggName, aggValue);
        }
      }
    } catch (InstantiationException e) {
      LOG.error("[BSPStaff:aggregate]", e);
    } catch (IllegalAccessException e) {
      LOG.error("[BSPStaff:aggregate]", e);
    }
  }

  /** For JUnit test. */
  public int getActiveMQPort() {
    return activeMQPort;
  }

  public void setActiveMQPort(int activeMQPort) {
    this.activeMQPort = activeMQPort;
  }

  public CommunicatorInterface getCommunicator() {
    return communicator;
  }

  public void setCommunicator(CommunicatorInterface communicator) {
    this.communicator = communicator;
  }

  /*
   * =================================For
   * SGA-Graph=============================================
   * @liujianan
   */
  /**
   * Run this staff in SGA-Graph mode: load the graph data, then loop over
   * super steps (optional vertex migration, local computation, message
   * exchange, two barriers) until the controller command ends the loop.
   *
   * @param job
   *        the current BSP job
   * @param staff
   *        the staff being run
   * @param workerAgent
   *        the agent used for ports, status reports and fault handling
   * @param recovery
   *        true when the staff starts in recovery mode; this entry point
   *        throws in that case unless it resumes a migrated staff
   * @param changeWorkerState
   *        only used in the log output
   * @param migrateSuperStep
   *        non-zero when this staff resumes after a migration
   * @param failCounter
   *        the number of failures of this staff so far
   * @param hostName
   *        the local host name
   */
  @Override
  public void runSGAGraph(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent,
      boolean recovery, boolean changeWorkerState, int migrateSuperStep, int failCounter,
      String hostName) {
    this.migrateDirBase = "/tmp/sgagraph/migrate" + this.getJobId().toString().substring(17);
    WritePartition writePartition = (WritePartition) ReflectionUtils.newInstance(
        job.getConf().getClass(Constants.USER_BC_BSP_JOB_WRITEPARTITION_CLASS,
            HashWritePartition.class),
        job.getConf());
    // record the number of failures of this staff
    LOG.info("BSPStaff---run()--SGAGraph changeWorkerState: " + changeWorkerState
        + "[HostName] " + hostName);
    staff.setFailCounter(failCounter);
    // Note Memory Deploy 20140312
    MemoryAllocator ma = new MemoryAllocator(job);
    ma.PrintMemoryInfo(LOG);
    ma.setupBeforeLoadGraph(LOG);
    // instalize the number
    initializeBefore(job, workerAgent);
    // instalize the staff.
    SuperStepReportContainer ssrc = newInstanceSSRC();
    this.sssc = new StaffSSController(this.getJobId(), this.getSid(), workerAgent);
    Checkpoint cp = new Checkpoint(job);
    AggValueCheckpoint aggcp = new AggValueCheckpoint(job);
    /* Zhicheng Liu added */
    this.openMigrateMode =
        this.bspJob.getConf().get("bcbsp.loadbalance", "false").equals("true") ? true : false;
    /* Feng added */
    this.aggCpFlag = this.bspJob.getConf().get("bcbsp.aggValuesCheckpoint", "false")
        .equals("true") ? true : false;
    if (this.getGraphDataFactory() == null) {
      this.setGraphDataFactory(new GraphDataFactory(job.getConf()));
    }
    this.counters = new Counters();
    // for staff migtare
    if (openMigrateMode && migrateSuperStep != 0) {
    } else if (!recovery) {
      ssrc.setCheckNum(this.staffNum);
      int runpartitionRPCPort = workerAgent.getFreePort();
      /* Zhicheng Liu added */
      // for communicator port
      this.partitionRPCPort = runpartitionRPCPort;
      ssrc.setPort1(runpartitionRPCPort);
      this.activeMQPort = workerAgent.getFreePort();
      ssrc.setPort2(this.activeMQPort);
      LOG.info("[BSPStaff] Get the port for partitioning RPC is : " + runpartitionRPCPort
          + "!");
      initializeAfter(job, workerAgent);
      recordWorkerManagerNameAndPort(ssrc);
      this.staffAgent = new WorkerAgentForStaff(job.getConf());
      workerAgent.setStaffAgentAddress(this.getSid(), this.staffAgent.address());
      initStaffNum(hostName);
      initWorkerNum(workerAgent);
      /** Clock */
      // long start = System.currentTimeMillis();
      try {
        // loadDataSGAGraph(job, workerAgent, this.staffAgent);
        loadData(job, workerAgent, this.staffAgent);
      } catch (Exception e) {
        throw new RuntimeException("Load data Exception in BSP staff runPartition", e);
      }
      // long end = System.currentTimeMillis();
      // LOG.info("[==>Clock<==] <BSP Partition compute load Data> used "
      // + (end - start) / 1000f + " seconds");
      if (this.openMigrateMode) {
        // this.loadDataT = (end - start) * 2;
        updateMigrateStatistics();
      }
    } else {
      throw new RuntimeException("Has fault !!", new Exception());
    }
    this.bsp = (BSP) ReflectionUtils.newInstance(
        job.getConf().getClass(Constants.USER_BC_BSP_JOB_WORK_CLASS, BSP.class),
        job.getConf());
    prpareAggregate(job, migrateSuperStep);
    /* Zhicheng Liu added */
    if (openMigrateMode && migrateSuperStep != 0) {
    }
    String commOption = job.getCommucationOption();
    try {
      // configuration before local computation
      bsp.setup(staff);
      this.icomMess = null;
      if (openMigrateMode && migrateSuperStep != 0) {
        // read message from
      }
      prepareRoute(writePartition);
      this.issueCommunicator(commOption, hostName, 0, job, icomMess);
      BSPStaffContext context = new BSPStaffContext(job, this.currentSuperStepCounter);
      context.setCommHandler(communicator);
      /*
       * Input Graph evaluation SGA-Graph @ljn
       */
      long startinDegreeStatistics = System.currentTimeMillis();
      inDegreeStatistics(context, staff, job, ssrc);
      long endinDegreeStatistics = System.currentTimeMillis();
      LOG.info("ljn report : inDegreeStatistics time is "
          + (endinDegreeStatistics - startinDegreeStatistics));
      // updateGraphOwnerMap();
      // Begin local computation for the partition compute.
      while (this.flag) {
        /* Zhicheng Liu added waiting migrate staff for */
        if (this.openMigrateMode && this.hasMigrateStaff && migrateSuperStep == 0) {
        }
        // for Vertex migrate before process compute.
        if (this.currentSuperStepCounter > 0 && !(this.evaluateflag)) {
          this.migrateVertexCommand = makeMigrateVertexCommand(this.graphData);
          // LOG.info("ljn test : migrateVertexCommand flag is " +
          // migrateVertexCommand.getMigrateFlag());
          if ((migrateVertexCommand != null) && migrateVertexCommand.getMigrateFlag()
              && (this.currentSuperStepCounter % Constants.K == 0)) {
            LOG.info("ljn report : max migrateVertex num is "
                + migrateVertexCommand.getMaxMigrateNum());
            long startMigrate = System.currentTimeMillis();
            processVerticeMigrate(migrateVertexCommand, sssc, ssrc);
            long endMigrate = System.currentTimeMillis();
            LOG.info("ljn report : Migrate Vertex time is " + (endMigrate - startMigrate));
          }
        }
        // Test migrate of SGA-graph
        // if (currentSuperStepCounter == 1) {
        // LOG.info("ljn test : enter migrate ");
        // processVerticeMigrate(migrateVertexCommand, sssc, ssrc);
        // }
        // staffStartTime = System.currentTimeMillis(); // no use
        this.activeCounter = 0;
        if (openMigrateMode && migrateSuperStep != 0) {
        } else if (!recovery) {
          this.graphData.setRecovryFlag(recovery);
        } else { // for recovery
          recovery = false;
        }
        prepareForOneSuperstep(staff, context, bsp, job);
        long start = System.currentTimeMillis();
        // graph data processing and BSPPeer compute.
        if (this.openMigrateMode == true && this.migrateMessagesString != null) {
          this.migrateMessagesString.clear();
        }
        this.graphData.setMigratedStaffFlag(migratedStaffFlag);
        // Graph-Centric compute flag
        if (this.currentSuperStepCounter % Constants.K == 0) {
          LOG.info("ljn test : this.currentSuperStepCounter%Constants.K is "
              + this.currentSuperStepCounter % Constants.K + " and evaluateflag is "
              + evaluateflag);
          context.startSubgraphCompute(this.evaluateflag);
        }
        this.graphData.processingByBucket(this, bsp, job, currentSuperStepCounter, context);
        long end = System.currentTimeMillis();
        reportTimeOneStep(start, end);
        start = System.currentTimeMillis();
        this.communicator.noMoreMessagesForSending();
        // Spin until the communicator reports that sending is over.
        while (true) {
          if (this.communicator.isSendingOver()) {
            break;
          }
        }
        // start = System.currentTimeMillis();
        staffEndTime = start;
        staffRunTime = staffEndTime - staffStartTime;
        this.computeTimeMap.put(this.currentSuperStepCounter, staffRunTime);
        LOG.info("ljn report : RUNNING TIME of Step " + this.currentSuperStepCounter + " is "
            + staffRunTime);
        processSendSSRC(ssrc);
        // end = System.currentTimeMillis();
        // LOG.info("[==>Clock<==] <Sending over sync> used " + (end - start)
        // / 1000f + " seconds");
        start = end;
        this.communicator.noMoreMessagesForReceiving();
        // Spin until the communicator reports that receiving is over.
        while (true) {
          if (this.communicator.isReceivingOver()) {
            break;
          }
        }
        // first stage superstep barrier
        processReceiveSSRC(ssrc);
        updateCounter();
        this.communicator.exchangeIncomeQueues();
        reportMessage();
        if ((this.currentSuperStepCounter + 1) >= this.maxSuperStepNum) {
          this.communicator.clearOutgoingQueues();
          this.communicator.clearIncomedQueues();
          this.activeCounter = 0;
        } else {
          this.activeCounter = this.graphData.getActiveCounter();
          LOG.info("ljn test : activeCounter is " + activeCounter
              + " currentSuperStepCounter is " + currentSuperStepCounter);
        }
        encapsulateAgg(ssrc);
        setSecondBarrier(ssrc, context);
        sssc.setCounters(this.counters);
        this.counters.clearCounters();
        if (this.openMigrateMode) {
          updateMigrateCost(ssrc);
        }
        // second stage superstep barrier
        // if(this.currentSuperStepCounter%Constants.K == 0){
        // this.evaluateflag =
        // this.sssc.graphEvaluateBarrier(this.currentSuperStepCounter/Constants.K,
        // this.staffNum, this.graphData.getGlobalFactor());
        // }else{
        // this.evaluateflag = false;
        // }
        this.ssc = sssc.secondStageSuperStepBarrier(this.currentSuperStepCounter, ssrc);
        LOG.info("[==>Clock<==] <StaffSSController's rebuild session> used "
            + StaffSSController.rebuildTime / 1000f + " seconds");
        StaffSSController.rebuildTime = 0;
        if (this.openMigrateMode) {
          confirmMigrateStaff();
        }
        if (ssc.getCommandType() == Constants.COMMAND_TYPE.START_AND_RECOVERY) {
          LOG.info("ljn test : command in run partition is START_AND_RECOVERY ");
          prepareForRecovery(workerAgent, hostName);
        }
        decapsulateAgg(aggcp, job, staff);
        // command tye to switch.
        switch (ssc.getCommandType()) {
        case Constants.COMMAND_TYPE.START:
          LOG.info("Get the CommandType is : START");
          if (openMigrateMode && migrateFlag) {
            writeMigrateData(cp, job, staff);
            this.flag = false;
            break;
          }
          this.currentSuperStepCounter = ssc.getNextSuperStepNum();
          this.flag = true;
          break;
        case Constants.COMMAND_TYPE.START_AND_CHECKPOINT:
          LOG.info("Get the CommandTye is : START_AND_CHECKPOINT");
          // if (openMigrateMode && migrateFlag) {
          // writeMigrateData(cp, job, staff);
          // }
          processCheckpointCommand(cp, job, staff, ssrc);
          if (openMigrateMode && migrateFlag) {
            this.flag = false;
          } else {
            this.currentSuperStepCounter = ssc.getNextSuperStepNum();
            this.flag = true;
          }
          break;
        case Constants.COMMAND_TYPE.START_AND_RECOVERY:
          LOG.info("Get the CommandTye is : START_AND_RECOVERY");
          this.currentSuperStepCounter = ssc.getAbleCheckPoint();
          processRecoveryCommand(cp, job, staff, ssrc, hostName);
          displayFirstRoute();
          recovery = true;
          this.flag = true;
          break;
        case Constants.COMMAND_TYPE.STOP:
          LOG.info("Get the CommandTye is : STOP");
          LOG.info("Staff will save the computation result and then quit!");
          this.currentSuperStepCounter = ssc.getNextSuperStepNum();
          this.flag = false;
          break;
        default:
          LOG.error("ERROR! " + ssc.getCommandType()
              + " is not a valid CommandType, so the staff will save the "
              + "computation result and quit!");
          flag = false;
        }
        workerAgent.setStaffStatus(this.getSid(), Constants.SATAFF_STATUS.RUNNING, null, 1);
      }
      this.communicator.complete();
    } catch (IOException ioe) {
      // try over
      faultProcessLocalCompute(job, staff, workerAgent, ioe, 1);
      LOG.error("Staff will quit abnormally");
      return;
    } catch (Exception e) {
      faultProcessLocalCompute(job, staff, workerAgent, e, 1);
      LOG.error("Staff will quit abnormally");
      return;
    }
    long allComputeTime = 0L;
    for (int key : this.computeTimeMap.keySet()) {
      allComputeTime = allComputeTime + this.computeTimeMap.get(key);
    }
    LOG.info("ljn report : ************ Compute time is " + allComputeTime
        + "*************");
    if (!this.migrateFlag) {
      try {
        finishStaff(job, staff, workerAgent, ssrc);
        LOG.info("Staff is completed successfully");
      } catch (Exception e) {
        reportErrorStaff(workerAgent, job, staff, e);
      }
    } else {
      finishMigrateStaff(bsp, staff, workerAgent, recovery);
    }
  }

  // /**
  // * update the graphdata ,add migrate vertice into the partition.
// */ // private void refreshGraphdata() { // ArrayList<Vertex> mirrorVertice = this.communicator.getMirrorVertexQueue(); // for (Vertex vertex : mirrorVertice) { // this.graphData.addForAll(vertex); // } // } private MigrateVertexCommand makeMigrateVertexCommand(SubGraphManager staffGraphData) { MigrateVertexCommand migrateVertexCommand = new MigrateVertexCommand(); migrateVertexCommand.setMigrateFlag(!evaluateflag); int maxMigrateNum = staffGraphData.sizeForAll() - staffGraphData.getInTravelNum(); if (maxMigrateNum > staffGraphData.sizeForAll() * 0.1) { maxMigrateNum = (int) (staffGraphData.sizeForAll() * 0.1); } if (maxMigrateNum > (1 - this.golbalIntravelRatio) * staffGraphData.sizeForAll()) { maxMigrateNum = (int) ((1 - this.golbalIntravelRatio) * staffGraphData.sizeForAll()); } // migrateVertexCommand.setMaxMigrateNum(maxMigrateNum); migrateVertexCommand.setMaxMigrateNum(800); return migrateVertexCommand; } @Override public void subGraphProcessing(SubGraphManager bspSubGraphManager, BSP bsp, BSPJob job, int superStepCounter, BSPStaffContext context) { long tmpStart = System.currentTimeMillis(); if (bspSubGraphManager == null) { throw new RuntimeException("The staff " + this.getSid() + " has no subgraph."); } this.computeSubGraph(bspSubGraphManager, job, context, bsp); // computeTime = computeTime + (System.currentTimeMillis() - tmpStart); // tmpStart = System.currentTimeMillis(); // collectMsgsTime = collectMsgsTime + (System.currentTimeMillis() - // tmpStart); } private void computeSubGraph(SubGraphManager bspSubGraphManager, BSPJob job, BSPStaffContext context, BSP bsp) { try { MessageManagerInterface messageQueues = communicator.getPartitionMessages(); // AbstractSequentialCompute seqSubgraph = new // SequentialSubGraphCompute(); // bspSubGraphManager.setSeqSubgraph(seqSubgraph); context.updateSubgraph(bspSubGraphManager); bsp.computeSubGraph(context, messageQueues); for (InstabilityVertexIndex vIndex : this.graphData.getVertexStore().getMigrateTreeSet()) 
{ if (messageQueues.getMessageQueue(vIndex.getVertexIdString()) == null || messageQueues.getMessageQueue(vIndex.getVertexIdString()).size() == 0) { continue; } else { vIndex.getMigrateFactor().updateActiveMap(currentSuperStepCounter); } } } catch (Exception e) { throw new RuntimeException("catch exception on subgraph compute", e); } } // /** // * Synchronization the mirror vertex between two partition. // */ // private void syncMirrorVertice() { // HashMap<Integer, Vertex> mirrorVertexs = new HashMap<Integer, Vertex>(); // for (String vidString : this.graphData.getVertexStore() // .getMirrorVertexMap().keySet()) { // Vertex aVertex = this.graphData.getVertexStore().getMirrorVertexMap() // .get(vidString).getVertex(); // int tarPartition = this.partitioner.getPartitionID(new Text(aVertex // .getVertexID().toString())); // mirrorVertexs.put(tarPartition, aVertex); // } // this.communicator.sendMigrateVertex(vertex);Vertex(mirrorVertexs); // } /** * loadData: load data for the staff in SGA-Graph. * * @param job * BSP job configuration * @param workerAgent * Protocol that staff child process uses to contact its parent process * @return boolean * @throws ClassNotFoundException * @throws IOException * e * @throws InterruptedException * e */ @SuppressWarnings("unchecked") public boolean loadDataSGAGraph(BSPJob job, WorkerAgentProtocol workerAgent, WorkerAgentForStaffInterface aStaffAgent) throws ClassNotFoundException, IOException, InterruptedException { // rebuild the input split RecordReader input = null; org.apache.hadoop.mapreduce.InputSplit split = null; if (rawSplitClass.equals("no")) { input = null; } else { DataInputBuffer splitBuffer = new DataInputBuffer(); splitBuffer.reset(rawSplit.getBytes(), 0, rawSplit.getLength()); SerializationFactory factory = new SerializationFactory(job.getConf()); Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<? 
extends org.apache.hadoop.mapreduce.InputSplit>) factory .getDeserializer(job.getConf().getClassByName(rawSplitClass)); deserializer.open(splitBuffer); split = deserializer.deserialize(null); // rebuild the InputFormat class according to the user configuration InputFormat inputformat = (InputFormat) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_INPUT_FORMAT_CLASS, InputFormat.class), job.getConf()); inputformat.initialize(job.getConf()); input = inputformat.createRecordReader(split, job); input.initialize(split, job.getConf()); } SuperStepReportContainer ssrc = new SuperStepReportContainer(); ssrc.setPartitionId(this.getPartition()); this.numCopy = (int) (1 / (job.getConf().getFloat(Constants.USER_BC_BSP_JOB_BALANCE_FACTOR, Constants.USER_BC_BSP_JOB_BALANCE_FACTOR_DEFAULT))); ssrc.setNumCopy(numCopy); ssrc.setCheckNum(this.staffNum); StaffSSControllerInterface lsssc = new StaffSSController(this.getJobId(), this.getSid(), workerAgent); long start = System.currentTimeMillis(); LOG.info("in BCBSP with PartitionType is: Hash" + " start time:" + start); if (this.staffNum == 1 || job.getConf().getBoolean(Constants.USER_BC_BSP_JOB_ISDIVIDE, false)) { this.partitioner = (Partitioner<Text>) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job.getConf()); this.partitioner.setNumPartition(this.staffNum); this.partitioner.intialize(job, split); WritePartition writePartition = new NotDivideWritePartition(); /* * RecordParse recordParse = (RecordParse) ReflectionUtils .newInstance( * job.getConf() .getClass( Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, * RecordParseDefault.class), job .getConf()); recordParse.init(job); * //add by chen for null bug this.recordParse = recordParse; * //this.recordParse.init(job); */ writePartition.setRecordParse(this.recordParse); writePartition.setStaff(this); writePartition.write(input); ssrc.setDirFlag(new String[] { "1" }); 
ssrc.setCheckNum(this.staffNum); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); LOG.info("The number of verteices from other staff" + " that cound not be parsed:" + this.lost); LOG.info("in BCBSP with PartitionType is:HASH" + " the number of HeadNode in this partition is:" + graphData.sizeForAll()); graphData.finishAdd(); ssrc.setCheckNum(this.staffNum * 2); ssrc.setDirFlag(new String[] { "2" }); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); } else { this.partitioner = (Partitioner<Text>) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job.getConf()); WritePartition writePartition = (WritePartition) ReflectionUtils.newInstance(job.getConf().getClass( Constants.USER_BC_BSP_JOB_WRITEPARTITION_CLASS, HashWritePartition.class), job.getConf()); int multiple = 1; if (writePartition instanceof HashWithBalancerWritePartition) { this.partitioner.setNumPartition(this.staffNum * numCopy); multiple = 2; } else { this.partitioner.setNumPartition(this.staffNum); multiple = 1; if (writePartition instanceof RangeWritePartition) { multiple = 2; } } this.partitioner.intialize(job, split); /* * RecordParse recordParse = (RecordParse) ReflectionUtils .newInstance( * job.getConf() .getClass( Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, * RecordParseDefault.class), job .getConf()); recordParse.init(job); // * this.recordParse = (RecordParse) ReflectionUtils.newInstance( // * job.getConf().getClass( // Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, * // RecordParseDefault.class), job.getConf()); // * this.recordParse.init(job); this.recordParse = recordParse; */ writePartition.setPartitioner(partitioner); writePartition.setRecordParse(this.recordParse); writePartition.setStaff(this); writePartition.setWorkerAgent(aStaffAgent); writePartition.setSsrc(ssrc); writePartition.setSssc(lsssc); writePartition.setTotalCatchSize(job.getConf().getInt(Constants.USER_BC_BSP_JOB_TOTALCACHE_SIZE, 
Constants.USER_BC_BSP_JOB_TOTALCACHE_SIZE_DEFAULT)); int threadNum = job.getConf().getInt(Constants.USER_BC_BSP_JOB_SENDTHREADNUMBER, Constants.USER_BC_BSP_JOB_SENDTHREADNUMBER_DEFAULT); if (threadNum > this.staffNum) { threadNum = this.staffNum - 1; } writePartition.setSendThreadNum(threadNum); writePartition.write(input); ssrc.setDirFlag(new String[] { "1" }); ssrc.setCheckNum(this.staffNum * multiple); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); LOG.info("The number of verteices from other staff that" + " cound not be parsed:" + this.lost); LOG.info("in BCBSP with PartitionType is:HASH" + " the number of HeadNode in this partition is:" + graphData.sizeForAll()); graphData.finishAdd(); ssrc.setCheckNum(this.staffNum * (multiple + 1)); ssrc.setDirFlag(new String[] { "2" }); lsssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); // for input graph evaluate. // this.evaluateflag = lsssc.graphEvaluateBarrier(0, this.staffNum, // this.graphData.getGlobalFactor()); } long end = System.currentTimeMillis(); LOG.info("in BCBSP with PartitionType is:HASH" + " end time:" + end); LOG.info( "in BCBSP with PartitionType is:HASH" + " using time:" + (float) (end - start) / 1000 + " seconds"); return true; } @Override public Partitioner<Text> getPartitionRule() { return this.partitioner; } private void inDegreeStatistics(BSPStaffContext context, Staff staff, BSPJob job, SuperStepReportContainer ssrc) throws IOException, InstantiationException, IllegalAccessException { this.currentSuperStepCounter = -2; LOG.info("ljn test : inDegreeStatistics currentSuperStepCounter is " + currentSuperStepCounter); // InDegreeOwnerBSP inDegreeOwnerBSP = new InDegreeOwnerBSP(); InDegreeOwnerBSP inDegreeOwnerBSP = InDegreeOwnerBSP.class.newInstance(); inDegreeOwnerBSP.setMigrateMap(this.graphData.getVertexStore().getMigrateTreeSet()); while (currentSuperStepCounter <= -1) { LOG.info("ljn test : inDegreeStatistics int while currentSuperStepCounter is " + currentSuperStepCounter); 
prepareForOneSuperstep(staff, context, inDegreeOwnerBSP, job); this.graphData.processingByBucket(this, inDegreeOwnerBSP, job, currentSuperStepCounter, context); this.communicator.noMoreMessagesForSending(); while (true) { if (this.communicator.isSendingOver()) { break; } } processSendSSRC(ssrc); this.communicator.noMoreMessagesForReceiving(); while (true) { if (this.communicator.isReceivingOver()) { break; } } processReceiveSSRC(ssrc); updateCounter(); this.communicator.exchangeIncomeQueues(); reportMessage(); setSecondBarrier(ssrc, context); sssc.setCounters(this.counters); this.counters.clearCounters(); this.ssc = sssc.secondStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); this.currentSuperStepCounter = ssc.getNextSuperStepNum(); } LOG.info("ljn test: add times is " + this.graphData.getVertexStore().getMigrateTreeSet().getI()); this.graphData.setInTravelNum(inDegreeOwnerBSP.getInTravelNum()); LOG.info("ljn report : MigrateTreeset size is " + this.graphData.getVertexStore().getMigrateTreeSet().size()); LOG.info("ljn report : IntravelNum is " + inDegreeOwnerBSP.getInTravelNum()); this.golbalIntravelRatio = sssc.graphEvaluateBarrier(0, this.staffNum, this.graphData); LOG.info("ljn report : golbalIntravelRatio is " + golbalIntravelRatio); // if (golbalIntravelRatio > 0.8) { // this.evaluateflag = true; // } else { // evaluateflag = false; // } evaluateflag = true; } private void inDegreeStatistics() { } @Override public void run(BSPJob job, Staff task, WorkerAgentProtocol umbilical, boolean recovery, boolean changeWorkerState, int migrateSuperStep, int failCounter, String hostName) throws IOException, ClassNotFoundException, InterruptedException { // TODO Auto-generated method stub } /** * Process the vertex migrate. 
* * @param migrateVertexCommand * @param sssc * @throws IOException */ private void processVerticeMigrate(MigrateVertexCommand migrateVertexCommand, StaffSSControllerInterface sssc, SuperStepReportContainer ssrc) throws IOException { int maxMigrateNum = migrateVertexCommand.getMaxMigrateNum(); // int maxMigrateNum = 50; LOG.info("ljn test before migrate migratetreeset size is " + this.graphData.getVertexStore().getMigrateTreeSet().size()); float minMigrateFactor = migrateVertexCommand.getMinMigrateFactor(); HashMap<String, Integer> migrateMap = this.graphData.chooseMigrateVertex(maxMigrateNum, minMigrateFactor, migrateVertexCommand.getMinActiveBitMap()); LOG.info("ljn report : from this partition migrate vertex number is " + migrateMap.size()); int nextMigrate = (int) ((this.graphData.getGlobalVertexNum() * ((1 - golbalIntravelRatio - 0.2))) / this.bspJob.getNumBspStaff()); LOG.info("ljn test : migrate is size is " + migrateMap.size() + " and nextMigrate is " + nextMigrate); if (migrateMap.size() > nextMigrate || this.currentSuperStepCounter > 2) { this.evaluateflag = true; // return; } // LOG.info("ljn test : processVerticeMigrate migrateMap is " + migrateMap); // HashMap<String, Integer> migrateMap = new HashMap<String, Integer>(); // migrateMap.put("1", 1); migrateVertex(migrateMap, sssc, ssrc); updatePartitionRoute(migrateMap, sssc, ssrc); // ArrayList<Vertex> migrateList = new ArrayList<Vertex>(); // migrateList.addAll(this.communicator.getMigrateVertexQueue()); // LOG.info("ljn report migrate vertex receive number not 0 is " // + migrateList.size()); // for(int i=0; i<migrateList.size(); i++ ){ // Vertex vertex = migrateList.get(i); // LOG.info("ljn test : add vertex ID " + vertex.getVertexID().toString() // + " Edge is " + vertex.getAllEdges()); // this.graphData.addForAll(vertex); // } LOG.info("ljn report migrate vertex receive number not 0 is " + this.communicator.getMigrateVertexQueue().size()); for (Vertex vertex : 
this.communicator.getMigrateVertexQueue()) { this.graphData.addForAll(vertex); // LOG.info("ljn test : add vertex ID " + vertex.getVertexID().toString() // + " Edge is " + vertex.getAllEdges()); } updateGraphOwnerMap(); // cleanMigrateData(migrateMap); // Reset the active bit map. ljn this.graphData.getVertexStore().resetMigrate(); this.communicator.getMigrateVertexQueue().clear(); } private void updateGraphOwnerMap() { // ljn add the active factor of initialThreshold. // this.graphData.getVertexStore().updateOwnermap(); } private void updatePartitionRoute(HashMap<String, Integer> migrateMap, StaffSSControllerInterface sssc, SuperStepReportContainer ssrc) throws IOException { writeMigratePartition(migrateMap, sssc); sssc.migrateBarrier(ssrc, this.currentSuperStepCounter, this.getStaffNum() * 2, false, this.getPartition()); readMigratePartition(sssc, this.currentSuperStepCounter); } private void readMigratePartition(StaffSSControllerInterface sssc, int currentSuperStepCounter) throws IOException { BufferedReader br = null; Path migratePartitionPath = new Path(migratePartitionDir); FileSystem fsFileSystem = FileSystem.get(this.getConf().getConf()); FileStatus[] fs = fsFileSystem.listStatus(migratePartitionPath); Path[] listPath = FileUtil.stat2Paths(fs); for (Path p : listPath) { FSDataInputStream fsInput = fsFileSystem.open(p); br = new BufferedReader(new InputStreamReader(fsInput)); String line = null; while (null != (line = br.readLine())) { String[] strs = line.split(":"); this.partitioner.updateMigratePartition(new Text(strs[0]), Integer.parseInt(strs[1])); } } } private void writeMigratePartition(HashMap<String, Integer> migrateMap, StaffSSControllerInterface sssc) throws IOException { String migratePartitionString = this.migrateDirBase + "/" + String.valueOf(this.currentSuperStepCounter / Constants.K) + "/" + String.valueOf(this.getPartition()); LOG.info("ljn test : writeMigratePartition " + migratePartitionString); this.migratePartitionDir = 
this.migrateDirBase + "/" + String.valueOf(this.currentSuperStepCounter / Constants.K); Path migratePartitionPath = new Path(migratePartitionString); FileSystem fsFileSystem = FileSystem.get(this.getConf().getConf()); FSDataOutputStream fsOutput = fsFileSystem.create(migratePartitionPath); for (String vid : migrateMap.keySet()) { // LOG.info("ljn test : write " + vid + " :" + migrateMap.get(vid)); String str = vid + ":" + migrateMap.get(vid) + "\n"; fsOutput.write(str.getBytes("UTF-8")); // Text a = new Text(vid + ":" + migrateMap.get(vid)); // a.write(fsOutput); } fsFileSystem.close(); } private void cleanMigrateData(HashMap<String, Integer> migrateMap) { cleanMigrateVertex(migrateMap); } /** * Migrate the vertex data. * * @param migrateMap * @param sssc * @throws IOException */ private void migrateVertex(HashMap<String, Integer> migrateMap, StaffSSControllerInterface sssc, SuperStepReportContainer ssrc) throws IOException { migrateVertexData(migrateMap, sssc); migrateMessageData(migrateMap, sssc); sssc.migrateBarrier(ssrc, this.currentSuperStepCounter, this.getStaffNum(), true, this.getPartition()); // LOG.info("ljn report migrate vertex receive number is " // + this.communicator.getMigrateVertexQueue().size()); // for (Vertex vertex : this.communicator.getMigrateVertexQueue()) { // this.graphData.addForAll(vertex); // LOG.info("ljn test : add vertex ID " + vertex.getVertexID().toString() // + " Edge is " + vertex.getAllEdges()); // } LOG.info("ljn test after migrate migratetreeset size is " + this.graphData.getVertexStore().getMigrateTreeSet().size()); } // private void syncMigrate(StaffSSControllerInterface sssc, // SuperStepReportContainer ssrc) { // sssc.migrateBarrier(ssrc, this.currentSuperStepCounter, // this.getStaffNum()); // } /** * Start migrate vertex thread and sync. 
* * @param migrateMap * @param sssc */ private void migrateVertexData(HashMap<String, Integer> migrateMap, StaffSSControllerInterface sssc) { for (String vId : migrateMap.keySet()) { // for test if (this.graphData.getVertexStore().getNormalVertexMap().get(vId) != null) { Vertex vertex = this.graphData.getVertexStore().getNormalVertexMap().get(vId).getVertex(); int targerPartition = migrateMap.get(vId); this.communicator.sendMigrateVertex(vertex, targerPartition); } else { // LOG.warn("Vertex " + vId + " is not exits in this partition "); } } this.communicator.sendAllMigrateVertice(this.currentSuperStepCounter - 1); } private void cleanMigrateVertex(HashMap<String, Integer> migrateMap) { for (String vId : migrateMap.keySet()) { this.graphData.getVertexStore().getNormalVertexMap().remove(vId); } } /** * Start message data thread and sync. * * @param migrateMap * @param sssc * @throws IOException */ private void migrateMessageData(HashMap<String, Integer> migrateMap, StaffSSControllerInterface sssc) throws IOException { for (String vId : migrateMap.keySet()) { ConcurrentLinkedQueue<IMessage> messages = this.communicator.getMessageQueue(vId); for (IMessage message : messages) { this.communicator.sendMigrateMessage(message, migrateMap.get(vId), this.currentSuperStepCounter - 1); } } } public boolean isEvaluateflag() { return evaluateflag; } public void setEvaluateflag(boolean evaluateflag) { this.evaluateflag = evaluateflag; } }