Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.giraph.bsp; import org.apache.giraph.conf.GiraphConstants; import org.apache.giraph.conf.ImmutableClassesGiraphConfiguration; import org.apache.giraph.graph.GraphTaskManager; import org.apache.giraph.graph.InputSplitEvents; import org.apache.giraph.graph.InputSplitPaths; import org.apache.giraph.partition.GraphPartitionerFactory; import org.apache.giraph.worker.WorkerInfo; import org.apache.giraph.zk.BspEvent; import org.apache.giraph.zk.PredicateLock; import org.apache.giraph.zk.ZooKeeperExt; import org.apache.giraph.zk.ZooKeeperManager; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.Mapper; import org.apache.log4j.Logger; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; import org.apache.zookeeper.Watcher.Event.EventType; import org.apache.zookeeper.Watcher.Event.KeeperState; import org.apache.zookeeper.ZooDefs.Ids; import org.json.JSONException; import org.json.JSONObject; import java.io.IOException; import java.net.UnknownHostException; import java.security.InvalidParameterException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import static org.apache.giraph.conf.GiraphConstants.CHECKPOINT_DIRECTORY; /** * Zookeeper-based implementation of {@link CentralizedService}. * * @param <I> Vertex id * @param <V> Vertex data * @param <E> Edge data * @param <M> Message data */ @SuppressWarnings("rawtypes") public abstract class BspService<I extends WritableComparable, V extends Writable, E extends Writable, M extends Writable> implements Watcher, CentralizedService<I, V, E, M> { /** Unset superstep */ public static final long UNSET_SUPERSTEP = Long.MIN_VALUE; /** Input superstep (superstep when loading the vertices happens) */ public static final long INPUT_SUPERSTEP = -1; /** Unset application attempt */ public static final long UNSET_APPLICATION_ATTEMPT = Long.MIN_VALUE; /** Base ZooKeeper directory */ public static final String BASE_DIR = "/_hadoopBsp"; /** Master job state znode above base dir */ public static final String MASTER_JOB_STATE_NODE = "/_masterJobState"; /** Vertex input split directory about base dir */ public static final String VERTEX_INPUT_SPLIT_DIR = "/_vertexInputSplitDir"; /** Vertex input split done directory about base dir */ public static final String VERTEX_INPUT_SPLIT_DONE_DIR = "/_vertexInputSplitDoneDir"; /** Denotes a reserved vertex input split */ public static final String VERTEX_INPUT_SPLIT_RESERVED_NODE = "/_vertexInputSplitReserved"; /** Denotes a finished vertex input split */ public static final String VERTEX_INPUT_SPLIT_FINISHED_NODE = "/_vertexInputSplitFinished"; /** Denotes that all the vertex input splits are are ready for consumption */ public static final String VERTEX_INPUT_SPLITS_ALL_READY_NODE = "/_vertexInputSplitsAllReady"; /** Denotes that all the vertex input splits are done. */ public static final String VERTEX_INPUT_SPLITS_ALL_DONE_NODE = "/_vertexInputSplitsAllDone"; /** Edge input split directory about base dir */ public static final String EDGE_INPUT_SPLIT_DIR = "/_edgeInputSplitDir"; /** Edge input split done directory about base dir */ public static final String EDGE_INPUT_SPLIT_DONE_DIR = "/_edgeInputSplitDoneDir"; /** Denotes a reserved edge input split */ public static final String EDGE_INPUT_SPLIT_RESERVED_NODE = "/_edgeInputSplitReserved"; /** Denotes a finished edge input split */ public static final String EDGE_INPUT_SPLIT_FINISHED_NODE = "/_edgeInputSplitFinished"; /** Denotes that all the edge input splits are are ready for consumption */ public static final String EDGE_INPUT_SPLITS_ALL_READY_NODE = "/_edgeInputSplitsAllReady"; /** Denotes that all the edge input splits are done. */ public static final String EDGE_INPUT_SPLITS_ALL_DONE_NODE = "/_edgeInputSplitsAllDone"; /** Directory of attempts of this application */ public static final String APPLICATION_ATTEMPTS_DIR = "/_applicationAttemptsDir"; /** Where the master election happens */ public static final String MASTER_ELECTION_DIR = "/_masterElectionDir"; /** Superstep scope */ public static final String SUPERSTEP_DIR = "/_superstepDir"; /** Where the merged aggregators are located */ public static final String MERGED_AGGREGATOR_DIR = "/_mergedAggregatorDir"; /** Healthy workers register here. */ public static final String WORKER_HEALTHY_DIR = "/_workerHealthyDir"; /** Unhealthy workers register here. */ public static final String WORKER_UNHEALTHY_DIR = "/_workerUnhealthyDir"; /** Workers which wrote checkpoint notify here */ public static final String WORKER_WROTE_CHECKPOINT_DIR = "/_workerWroteCheckpointDir"; /** Finished workers notify here */ public static final String WORKER_FINISHED_DIR = "/_workerFinishedDir"; /** Where the master and worker addresses and partition assignments are set */ public static final String ADDRESSES_AND_PARTITIONS_DIR = "/_addressesAndPartitions"; /** Helps coordinate the partition exchnages */ public static final String PARTITION_EXCHANGE_DIR = "/_partitionExchangeDir"; /** Denotes that the superstep is done */ public static final String SUPERSTEP_FINISHED_NODE = "/_superstepFinished"; /** Denotes which workers have been cleaned up */ public static final String CLEANED_UP_DIR = "/_cleanedUpDir"; /** JSON partition stats key */ public static final String JSONOBJ_PARTITION_STATS_KEY = "_partitionStatsKey"; /** JSON message count key */ public static final String JSONOBJ_NUM_MESSAGES_KEY = "_numMsgsKey"; /** JSON metrics key */ public static final String JSONOBJ_METRICS_KEY = "_metricsKey"; /** JSON state key */ public static final String JSONOBJ_STATE_KEY = "_stateKey"; /** JSON application attempt key */ public static final String JSONOBJ_APPLICATION_ATTEMPT_KEY = "_applicationAttemptKey"; /** JSON superstep key */ public static final String JSONOBJ_SUPERSTEP_KEY = "_superstepKey"; /** Suffix denotes a worker */ public static final String WORKER_SUFFIX = "_worker"; /** Suffix denotes a master */ public static final String MASTER_SUFFIX = "_master"; /** If at the end of a checkpoint file, indicates metadata */ public static final String CHECKPOINT_METADATA_POSTFIX = ".metadata"; /** * If at the end of a checkpoint file, indicates vertices, edges, * messages, etc. */ public static final String CHECKPOINT_VERTICES_POSTFIX = ".vertices"; /** * If at the end of a checkpoint file, indicates metadata and data is valid * for the same filenames without .valid */ public static final String CHECKPOINT_VALID_POSTFIX = ".valid"; /** * If at the end of a checkpoint file, indicates the stitched checkpoint * file prefixes. A checkpoint is not valid if this file does not exist. */ public static final String CHECKPOINT_FINALIZED_POSTFIX = ".finalized"; /** Class logger */ private static final Logger LOG = Logger.getLogger(BspService.class); /** Path to the job's root */ protected final String basePath; /** Path to the job state determined by the master (informative only) */ protected final String masterJobStatePath; /** ZooKeeper paths for vertex input splits. */ protected final InputSplitPaths vertexInputSplitsPaths; /** ZooKeeper paths for edge input splits. */ protected final InputSplitPaths edgeInputSplitsPaths; /** Vertex input split events. */ protected final InputSplitEvents vertexInputSplitsEvents; /** Edge input split events. */ protected final InputSplitEvents edgeInputSplitsEvents; /** Path to the application attempts) */ protected final String applicationAttemptsPath; /** Path to the cleaned up notifications */ protected final String cleanedUpPath; /** Path to the checkpoint's root (including job id) */ protected final String checkpointBasePath; /** Path to the master election path */ protected final String masterElectionPath; /** Private ZooKeeper instance that implements the service */ private final ZooKeeperExt zk; /** Has the Connection occurred? */ private final BspEvent connectedEvent; /** Has worker registration changed (either healthy or unhealthy) */ private final BspEvent workerHealthRegistrationChanged; /** Are the addresses and partition assignments to workers ready? */ private final BspEvent addressesAndPartitionsReadyChanged; /** Application attempt changed */ private final BspEvent applicationAttemptChanged; /** Superstep finished synchronization */ private final BspEvent superstepFinished; /** Master election changed for any waited on attempt */ private final BspEvent masterElectionChildrenChanged; /** Cleaned up directory children changed*/ private final BspEvent cleanedUpChildrenChanged; /** Registered list of BspEvents */ private final List<BspEvent> registeredBspEvents = new ArrayList<BspEvent>(); /** Immutable configuration of the job*/ private final ImmutableClassesGiraphConfiguration<I, V, E, M> conf; /** Job context (mainly for progress) */ private final Mapper<?, ?, ?, ?>.Context context; /** Cached superstep (from ZooKeeper) */ private long cachedSuperstep = UNSET_SUPERSTEP; /** Restarted from a checkpoint (manual or automatic) */ private long restartedSuperstep = UNSET_SUPERSTEP; /** Cached application attempt (from ZooKeeper) */ private long cachedApplicationAttempt = UNSET_APPLICATION_ATTEMPT; /** Job id, to ensure uniqueness */ private final String jobId; /** Task partition, to ensure uniqueness */ private final int taskPartition; /** My hostname */ private final String hostname; /** Combination of hostname '_' partition (unique id) */ private final String hostnamePartitionId; /** Graph partitioner */ private final GraphPartitionerFactory<I, V, E, M> graphPartitionerFactory; /** Mapper that will do the graph computation */ private final GraphTaskManager<I, V, E, M> graphTaskManager; /** File system */ private final FileSystem fs; /** Checkpoint frequency */ private final int checkpointFrequency; /** * Constructor. * * @param serverPortList ZooKeeper server port list * @param sessionMsecTimeout ZooKeeper session timeount in milliseconds * @param context Mapper context * @param graphTaskManager GraphTaskManager for this compute node */ public BspService(String serverPortList, int sessionMsecTimeout, Mapper<?, ?, ?, ?>.Context context, GraphTaskManager<I, V, E, M> graphTaskManager) { this.vertexInputSplitsEvents = new InputSplitEvents(context); this.edgeInputSplitsEvents = new InputSplitEvents(context); this.connectedEvent = new PredicateLock(context); this.workerHealthRegistrationChanged = new PredicateLock(context); this.addressesAndPartitionsReadyChanged = new PredicateLock(context); this.applicationAttemptChanged = new PredicateLock(context); this.superstepFinished = new PredicateLock(context); this.masterElectionChildrenChanged = new PredicateLock(context); this.cleanedUpChildrenChanged = new PredicateLock(context); registerBspEvent(connectedEvent); registerBspEvent(workerHealthRegistrationChanged); registerBspEvent(vertexInputSplitsEvents.getAllReadyChanged()); registerBspEvent(vertexInputSplitsEvents.getStateChanged()); registerBspEvent(edgeInputSplitsEvents.getAllReadyChanged()); registerBspEvent(edgeInputSplitsEvents.getStateChanged()); registerBspEvent(addressesAndPartitionsReadyChanged); registerBspEvent(applicationAttemptChanged); registerBspEvent(superstepFinished); registerBspEvent(masterElectionChildrenChanged); registerBspEvent(cleanedUpChildrenChanged); this.context = context; this.graphTaskManager = graphTaskManager; this.conf = new ImmutableClassesGiraphConfiguration<I, V, E, M>(context.getConfiguration()); this.jobId = conf.get("mapred.job.id", "Unknown Job"); this.taskPartition = conf.getTaskPartition(); this.restartedSuperstep = conf.getLong(GiraphConstants.RESTART_SUPERSTEP, UNSET_SUPERSTEP); this.cachedSuperstep = restartedSuperstep; if ((restartedSuperstep != UNSET_SUPERSTEP) && (restartedSuperstep < 0)) { throw new IllegalArgumentException("BspService: Invalid superstep to restart - " + restartedSuperstep); } try { this.hostname = conf.getLocalHostname(); } catch (UnknownHostException e) { throw new RuntimeException(e); } this.hostnamePartitionId = hostname + "_" + getTaskPartition(); this.graphPartitionerFactory = conf.createGraphPartitioner(); this.checkpointFrequency = conf.getCheckpointFrequency(); basePath = ZooKeeperManager.getBasePath(conf) + BASE_DIR + "/" + jobId; masterJobStatePath = basePath + MASTER_JOB_STATE_NODE; vertexInputSplitsPaths = new InputSplitPaths(basePath, VERTEX_INPUT_SPLIT_DIR, VERTEX_INPUT_SPLIT_DONE_DIR, VERTEX_INPUT_SPLITS_ALL_READY_NODE, VERTEX_INPUT_SPLITS_ALL_DONE_NODE); edgeInputSplitsPaths = new InputSplitPaths(basePath, EDGE_INPUT_SPLIT_DIR, EDGE_INPUT_SPLIT_DONE_DIR, EDGE_INPUT_SPLITS_ALL_READY_NODE, EDGE_INPUT_SPLITS_ALL_DONE_NODE); applicationAttemptsPath = basePath + APPLICATION_ATTEMPTS_DIR; cleanedUpPath = basePath + CLEANED_UP_DIR; checkpointBasePath = CHECKPOINT_DIRECTORY.getWithDefault(getConfiguration(), CHECKPOINT_DIRECTORY.getDefaultValue() + "/" + getJobId()); masterElectionPath = basePath + MASTER_ELECTION_DIR; if (LOG.isInfoEnabled()) { LOG.info("BspService: Connecting to ZooKeeper with job " + jobId + ", " + getTaskPartition() + " on " + serverPortList); } try { this.zk = new ZooKeeperExt(serverPortList, sessionMsecTimeout, conf.getZookeeperOpsMaxAttempts(), conf.getZookeeperOpsRetryWaitMsecs(), this, context); connectedEvent.waitForever(); this.fs = FileSystem.get(getConfiguration()); } catch (IOException e) { throw new RuntimeException(e); } } /** * Get the superstep from a ZooKeeper path * * @param path Path to parse for the superstep * @return Superstep from the path. */ public static long getSuperstepFromPath(String path) { int foundSuperstepStart = path.indexOf(SUPERSTEP_DIR); if (foundSuperstepStart == -1) { throw new IllegalArgumentException( "getSuperstepFromPath: Cannot find " + SUPERSTEP_DIR + "from " + path); } foundSuperstepStart += SUPERSTEP_DIR.length() + 1; int endIndex = foundSuperstepStart + path.substring(foundSuperstepStart).indexOf("/"); if (endIndex == -1) { throw new IllegalArgumentException("getSuperstepFromPath: Cannot find end of superstep from " + path); } if (LOG.isTraceEnabled()) { LOG.trace("getSuperstepFromPath: Got path=" + path + ", start=" + foundSuperstepStart + ", end=" + endIndex); } return Long.parseLong(path.substring(foundSuperstepStart, endIndex)); } /** * Get the hostname and id from a "healthy" worker path * * @param path Path to check * @return Hostname and id from path */ public static String getHealthyHostnameIdFromPath(String path) { int foundWorkerHealthyStart = path.indexOf(WORKER_HEALTHY_DIR); if (foundWorkerHealthyStart == -1) { throw new IllegalArgumentException( "getHealthyHostnameidFromPath: Couldn't find " + WORKER_HEALTHY_DIR + " from " + path); } foundWorkerHealthyStart += WORKER_HEALTHY_DIR.length(); return path.substring(foundWorkerHealthyStart); } /** * Generate the base superstep directory path for a given application * attempt * * @param attempt application attempt number * @return directory path based on the an attempt */ public final String getSuperstepPath(long attempt) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR; } /** * Get the input split events for edge input. * * @return InputSplitEvents for edge input. */ public InputSplitEvents getEdgeInputSplitsEvents() { return edgeInputSplitsEvents; } /** * Get the input split events for vertex input. * * @return InputSplitEvents for vertex input. */ public InputSplitEvents getVertexInputSplitsEvents() { return vertexInputSplitsEvents; } /** * Generate the worker information "healthy" directory path for a * superstep * * @param attempt application attempt number * @param superstep superstep to use * @return directory path based on the a superstep */ public final String getWorkerInfoHealthyPath(long attempt, long superstep) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR + "/" + superstep + WORKER_HEALTHY_DIR; } /** * Generate the worker information "unhealthy" directory path for a * superstep * * @param attempt application attempt number * @param superstep superstep to use * @return directory path based on the a superstep */ public final String getWorkerInfoUnhealthyPath(long attempt, long superstep) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR + "/" + superstep + WORKER_UNHEALTHY_DIR; } /** * Generate the worker "wrote checkpoint" directory path for a * superstep * * @param attempt application attempt number * @param superstep superstep to use * @return directory path based on the a superstep */ public final String getWorkerWroteCheckpointPath(long attempt, long superstep) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR + "/" + superstep + WORKER_WROTE_CHECKPOINT_DIR; } /** * Generate the worker "finished" directory path for a * superstep * * @param attempt application attempt number * @param superstep superstep to use * @return directory path based on the a superstep */ public final String getWorkerFinishedPath(long attempt, long superstep) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR + "/" + superstep + WORKER_FINISHED_DIR; } /** * Generate the "addresses and partitions" directory path for a superstep * * @param attempt application attempt number * @param superstep superstep to use * @return directory path based on the a superstep */ public final String getAddressesAndPartitionsPath(long attempt, long superstep) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR + "/" + superstep + ADDRESSES_AND_PARTITIONS_DIR; } /** * Generate the "partition exchange" directory path for a superstep * * @param attempt application attempt number * @param superstep superstep to use * @return directory path based on the a superstep */ public final String getPartitionExchangePath(long attempt, long superstep) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR + "/" + superstep + PARTITION_EXCHANGE_DIR; } /** * Based on the superstep, worker info, and attempt, get the appropriate * worker path for the exchange. * * @param attempt Application attempt * @param superstep Superstep * @param workerInfo Worker info of the exchange. * @return Path of the desired worker */ public final String getPartitionExchangeWorkerPath(long attempt, long superstep, WorkerInfo workerInfo) { return getPartitionExchangePath(attempt, superstep) + "/" + workerInfo.getHostnameId(); } /** * Generate the "superstep finished" directory path for a superstep * * @param attempt application attempt number * @param superstep superstep to use * @return directory path based on the a superstep */ public final String getSuperstepFinishedPath(long attempt, long superstep) { return applicationAttemptsPath + "/" + attempt + SUPERSTEP_DIR + "/" + superstep + SUPERSTEP_FINISHED_NODE; } /** * Generate the base superstep directory path for a given application * attempt * * @param superstep Superstep to use * @return Directory path based on the a superstep */ public final String getCheckpointBasePath(long superstep) { return checkpointBasePath + "/" + superstep; } /** * Get the checkpoint from a finalized checkpoint path * * @param finalizedPath Path of the finalized checkpoint * @return Superstep referring to a checkpoint of the finalized path */ public static long getCheckpoint(Path finalizedPath) { if (!finalizedPath.getName().endsWith(CHECKPOINT_FINALIZED_POSTFIX)) { throw new InvalidParameterException( "getCheckpoint: " + finalizedPath + "Doesn't end in " + CHECKPOINT_FINALIZED_POSTFIX); } String checkpointString = finalizedPath.getName().replace(CHECKPOINT_FINALIZED_POSTFIX, ""); return Long.parseLong(checkpointString); } /** * Get the ZooKeeperExt instance. * * @return ZooKeeperExt instance. */ public final ZooKeeperExt getZkExt() { return zk; } @Override public final long getRestartedSuperstep() { return restartedSuperstep; } /** * Set the restarted superstep * * @param superstep Set the manually restarted superstep */ public final void setRestartedSuperstep(long superstep) { if (superstep < INPUT_SUPERSTEP) { throw new IllegalArgumentException("setRestartedSuperstep: Bad argument " + superstep); } restartedSuperstep = superstep; } /** * Should checkpoint on this superstep? If checkpointing, always * checkpoint the first user superstep. If restarting, the first * checkpoint is after the frequency has been met. * * @param superstep Decide if checkpointing no this superstep * @return True if this superstep should be checkpointed, false otherwise */ public final boolean checkpointFrequencyMet(long superstep) { if (checkpointFrequency == 0) { return false; } long firstCheckpoint = INPUT_SUPERSTEP + 1; if (getRestartedSuperstep() != UNSET_SUPERSTEP) { firstCheckpoint = getRestartedSuperstep() + checkpointFrequency; } if (superstep < firstCheckpoint) { return false; } return ((superstep - firstCheckpoint) % checkpointFrequency) == 0; } /** * Get the file system * * @return file system */ public final FileSystem getFs() { return fs; } public final ImmutableClassesGiraphConfiguration<I, V, E, M> getConfiguration() { return conf; } public final Mapper<?, ?, ?, ?>.Context getContext() { return context; } public final String getHostname() { return hostname; } public final String getHostnamePartitionId() { return hostnamePartitionId; } public final int getTaskPartition() { return taskPartition; } public final GraphTaskManager<I, V, E, M> getGraphTaskManager() { return graphTaskManager; } public final BspEvent getWorkerHealthRegistrationChangedEvent() { return workerHealthRegistrationChanged; } public final BspEvent getAddressesAndPartitionsReadyChangedEvent() { return addressesAndPartitionsReadyChanged; } public final BspEvent getApplicationAttemptChangedEvent() { return applicationAttemptChanged; } public final BspEvent getSuperstepFinishedEvent() { return superstepFinished; } public final BspEvent getMasterElectionChildrenChangedEvent() { return masterElectionChildrenChanged; } public final BspEvent getCleanedUpChildrenChangedEvent() { return cleanedUpChildrenChanged; } /** * Get the master commanded job state as a JSONObject. Also sets the * watches to see if the master commanded job state changes. * * @return Last job state or null if none * @throws InterruptedException * @throws KeeperException */ public final JSONObject getJobState() { try { getZkExt().createExt(masterJobStatePath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true); } catch (KeeperException.NodeExistsException e) { LOG.info("getJobState: Job state already exists (" + masterJobStatePath + ")"); } catch (KeeperException e) { throw new IllegalStateException("Failed to create job state path " + "due to KeeperException", e); } catch (InterruptedException e) { throw new IllegalStateException("Failed to create job state path " + "due to InterruptedException", e); } String jobState = null; try { List<String> childList = getZkExt().getChildrenExt(masterJobStatePath, true, true, true); if (childList.isEmpty()) { return null; } jobState = new String(getZkExt().getData(childList.get(childList.size() - 1), true, null)); } catch (KeeperException.NoNodeException e) { LOG.info("getJobState: Job state path is empty! - " + masterJobStatePath); } catch (KeeperException e) { throw new IllegalStateException("Failed to get job state path " + "children due to KeeperException", e); } catch (InterruptedException e) { throw new IllegalStateException( "Failed to get job state path " + "children due to InterruptedException", e); } try { return new JSONObject(jobState); } catch (JSONException e) { throw new RuntimeException("getJobState: Failed to parse job state " + jobState); } } /** * Get the job id * * @return job id */ public final String getJobId() { return jobId; } /** * Get the latest application attempt and cache it. * * @return the latest application attempt */ public final long getApplicationAttempt() { if (cachedApplicationAttempt != UNSET_APPLICATION_ATTEMPT) { return cachedApplicationAttempt; } try { getZkExt().createExt(applicationAttemptsPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true); } catch (KeeperException.NodeExistsException e) { LOG.info("getApplicationAttempt: Node " + applicationAttemptsPath + " already exists!"); } catch (KeeperException e) { throw new IllegalStateException("Couldn't create application " + "attempts path due to KeeperException", e); } catch (InterruptedException e) { throw new IllegalStateException( "Couldn't create application " + "attempts path due to InterruptedException", e); } try { List<String> attemptList = getZkExt().getChildrenExt(applicationAttemptsPath, true, false, false); if (attemptList.isEmpty()) { cachedApplicationAttempt = 0; } else { cachedApplicationAttempt = Long.parseLong(Collections.max(attemptList)); } } catch (KeeperException e) { throw new IllegalStateException("Couldn't get application " + "attempts to KeeperException", e); } catch (InterruptedException e) { throw new IllegalStateException("Couldn't get application " + "attempts to InterruptedException", e); } return cachedApplicationAttempt; } /** * Get the latest superstep and cache it. * * @return the latest superstep * @throws InterruptedException * @throws KeeperException */ public final long getSuperstep() { if (cachedSuperstep != UNSET_SUPERSTEP) { return cachedSuperstep; } String superstepPath = getSuperstepPath(getApplicationAttempt()); try { getZkExt().createExt(superstepPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true); } catch (KeeperException.NodeExistsException e) { if (LOG.isInfoEnabled()) { LOG.info("getApplicationAttempt: Node " + applicationAttemptsPath + " already exists!"); } } catch (KeeperException e) { throw new IllegalStateException("getSuperstep: KeeperException", e); } catch (InterruptedException e) { throw new IllegalStateException("getSuperstep: InterruptedException", e); } List<String> superstepList; try { superstepList = getZkExt().getChildrenExt(superstepPath, true, false, false); } catch (KeeperException e) { throw new IllegalStateException("getSuperstep: KeeperException", e); } catch (InterruptedException e) { throw new IllegalStateException("getSuperstep: InterruptedException", e); } if (superstepList.isEmpty()) { cachedSuperstep = INPUT_SUPERSTEP; } else { cachedSuperstep = Long.parseLong(Collections.max(superstepList)); } return cachedSuperstep; } /** * Increment the cached superstep. Shouldn't be the initial value anymore. */ public final void incrCachedSuperstep() { if (cachedSuperstep == UNSET_SUPERSTEP) { throw new IllegalStateException("incrSuperstep: Invalid unset cached superstep " + UNSET_SUPERSTEP); } ++cachedSuperstep; } /** * Set the cached superstep (should only be used for loading checkpoints * or recovering from failure). * * @param superstep will be used as the next superstep iteration */ public final void setCachedSuperstep(long superstep) { cachedSuperstep = superstep; } /** * Set the cached application attempt (should only be used for restart from * failure by the master) * * @param applicationAttempt Will denote the new application attempt */ public final void setApplicationAttempt(long applicationAttempt) { cachedApplicationAttempt = applicationAttempt; String superstepPath = getSuperstepPath(cachedApplicationAttempt); try { getZkExt().createExt(superstepPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true); } catch (KeeperException.NodeExistsException e) { throw new IllegalArgumentException("setApplicationAttempt: Attempt already exists! - " + superstepPath, e); } catch (KeeperException e) { throw new RuntimeException("setApplicationAttempt: KeeperException - " + superstepPath, e); } catch (InterruptedException e) { throw new RuntimeException("setApplicationAttempt: InterruptedException - " + superstepPath, e); } } /** * Register a BspEvent. Ensure that it will be signaled * by catastrophic failure so that threads waiting on an event signal * will be unblocked. * * @param event Event to be registered. */ public void registerBspEvent(BspEvent event) { registeredBspEvents.add(event); } /** * Subclasses can use this to instantiate their respective partitioners * * @return Instantiated graph partitioner factory */ protected GraphPartitionerFactory<I, V, E, M> getGraphPartitionerFactory() { return graphPartitionerFactory; } /** * Derived classes that want additional ZooKeeper events to take action * should override this. * * @param event Event that occurred * @return true if the event was processed here, false otherwise */ protected boolean processEvent(WatchedEvent event) { return false; } @Override public final void process(WatchedEvent event) { // 1. Process all shared events // 2. Process specific derived class events if (LOG.isDebugEnabled()) { LOG.debug("process: Got a new event, path = " + event.getPath() + ", type = " + event.getType() + ", state = " + event.getState()); } if ((event.getPath() == null) && (event.getType() == EventType.None)) { if (event.getState() == KeeperState.Disconnected) { // Watches may not be triggered for some time, so signal all BspEvents for (BspEvent bspEvent : registeredBspEvents) { bspEvent.signal(); } LOG.warn("process: Disconnected from ZooKeeper (will automatically " + "try to recover) " + event); } else if (event.getState() == KeeperState.SyncConnected) { if (LOG.isInfoEnabled()) { LOG.info("process: Asynchronous connection complete."); } connectedEvent.signal(); } else { LOG.warn("process: Got unknown null path event " + event); } return; } boolean eventProcessed = false; if (event.getPath().startsWith(masterJobStatePath)) { // This will cause all becomeMaster() MasterThreads to notice the // change in job state and quit trying to become the master. masterElectionChildrenChanged.signal(); eventProcessed = true; } else if ((event.getPath().contains(WORKER_HEALTHY_DIR) || event.getPath().contains(WORKER_UNHEALTHY_DIR)) && (event.getType() == EventType.NodeChildrenChanged)) { if (LOG.isDebugEnabled()) { LOG.debug("process: workerHealthRegistrationChanged " + "(worker health reported - healthy/unhealthy )"); } workerHealthRegistrationChanged.signal(); eventProcessed = true; } else if (event.getPath().equals(vertexInputSplitsPaths.getAllReadyPath()) && (event.getType() == EventType.NodeCreated)) { if (LOG.isInfoEnabled()) { LOG.info("process: inputSplitsReadyChanged " + "(input splits ready)"); } vertexInputSplitsEvents.getAllReadyChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(VERTEX_INPUT_SPLIT_RESERVED_NODE) && (event.getType() == EventType.NodeCreated)) { if (LOG.isDebugEnabled()) { LOG.debug("process: vertexInputSplitsStateChanged " + "(made a reservation)"); } vertexInputSplitsEvents.getStateChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(VERTEX_INPUT_SPLIT_RESERVED_NODE) && (event.getType() == EventType.NodeDeleted)) { if (LOG.isInfoEnabled()) { LOG.info("process: vertexInputSplitsStateChanged " + "(lost a reservation)"); } vertexInputSplitsEvents.getStateChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(VERTEX_INPUT_SPLIT_FINISHED_NODE) && (event.getType() == EventType.NodeCreated)) { if (LOG.isDebugEnabled()) { LOG.debug("process: vertexInputSplitsStateChanged " + "(finished inputsplit)"); } vertexInputSplitsEvents.getStateChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(VERTEX_INPUT_SPLIT_DONE_DIR) && (event.getType() == EventType.NodeChildrenChanged)) { if (LOG.isDebugEnabled()) { LOG.debug("process: vertexInputSplitsDoneStateChanged " + "(worker finished sending)"); } vertexInputSplitsEvents.getDoneStateChanged().signal(); eventProcessed = true; } else if (event.getPath().equals(vertexInputSplitsPaths.getAllDonePath()) && (event.getType() == EventType.NodeCreated)) { if (LOG.isInfoEnabled()) { LOG.info("process: vertexInputSplitsAllDoneChanged " + "(all vertices sent from input splits)"); } vertexInputSplitsEvents.getAllDoneChanged().signal(); eventProcessed = true; } else if (event.getPath().equals(edgeInputSplitsPaths.getAllReadyPath()) && (event.getType() == EventType.NodeCreated)) { if (LOG.isInfoEnabled()) { LOG.info("process: edgeInputSplitsReadyChanged " + "(input splits ready)"); } edgeInputSplitsEvents.getAllReadyChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(EDGE_INPUT_SPLIT_RESERVED_NODE) && (event.getType() == EventType.NodeCreated)) { if (LOG.isDebugEnabled()) { LOG.debug("process: edgeInputSplitsStateChanged " + "(made a reservation)"); } edgeInputSplitsEvents.getStateChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(EDGE_INPUT_SPLIT_RESERVED_NODE) && (event.getType() == EventType.NodeDeleted)) { if (LOG.isInfoEnabled()) { LOG.info("process: edgeInputSplitsStateChanged " + "(lost a reservation)"); } edgeInputSplitsEvents.getStateChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(EDGE_INPUT_SPLIT_FINISHED_NODE) && (event.getType() == EventType.NodeCreated)) { if (LOG.isDebugEnabled()) { LOG.debug("process: edgeInputSplitsStateChanged " + "(finished inputsplit)"); } edgeInputSplitsEvents.getStateChanged().signal(); eventProcessed = true; } else if (event.getPath().endsWith(EDGE_INPUT_SPLIT_DONE_DIR) && (event.getType() == EventType.NodeChildrenChanged)) { if (LOG.isDebugEnabled()) { LOG.debug("process: edgeInputSplitsDoneStateChanged " + "(worker finished sending)"); } edgeInputSplitsEvents.getDoneStateChanged().signal(); eventProcessed = true; } else if (event.getPath().equals(edgeInputSplitsPaths.getAllDonePath()) && (event.getType() == EventType.NodeCreated)) { if (LOG.isInfoEnabled()) { LOG.info("process: edgeInputSplitsAllDoneChanged " + "(all edges sent from input splits)"); } edgeInputSplitsEvents.getAllDoneChanged().signal(); eventProcessed = true; } else if (event.getPath().contains(ADDRESSES_AND_PARTITIONS_DIR) && event.getType() == EventType.NodeCreated) { if (LOG.isInfoEnabled()) { LOG.info("process: partitionAssignmentsReadyChanged " + "(partitions are assigned)"); } addressesAndPartitionsReadyChanged.signal(); eventProcessed = true; } else if (event.getPath().contains(SUPERSTEP_FINISHED_NODE) && event.getType() == EventType.NodeCreated) { if (LOG.isInfoEnabled()) { LOG.info("process: superstepFinished signaled"); } superstepFinished.signal(); eventProcessed = true; } else if (event.getPath().endsWith(applicationAttemptsPath) && event.getType() == EventType.NodeChildrenChanged) { if (LOG.isInfoEnabled()) { LOG.info("process: applicationAttemptChanged signaled"); } applicationAttemptChanged.signal(); eventProcessed = true; } else if (event.getPath().contains(MASTER_ELECTION_DIR) && event.getType() == EventType.NodeChildrenChanged) { if (LOG.isInfoEnabled()) { LOG.info("process: masterElectionChildrenChanged signaled"); } masterElectionChildrenChanged.signal(); eventProcessed = true; } else if (event.getPath().equals(cleanedUpPath) && event.getType() == EventType.NodeChildrenChanged) { if (LOG.isInfoEnabled()) { LOG.info("process: cleanedUpChildrenChanged signaled"); } cleanedUpChildrenChanged.signal(); eventProcessed = true; } if (!(processEvent(event)) && (!eventProcessed)) { LOG.warn("process: Unknown and unprocessed event (path=" + event.getPath() + ", type=" + event.getType() + ", state=" + event.getState() + ")"); } } }