org.apache.giraph.graph.BspServiceMaster.java Source code

Introduction

Here is the source code for org.apache.giraph.graph.BspServiceMaster.java. This class is the ZooKeeper-based master side of Giraph's bulk synchronous parallel (BSP) service: it handles master election, generates and publishes the vertex input splits, assigns graph partitions to workers, coordinates each superstep, aggregates worker statistics and aggregator values, and finalizes checkpoints so a failed job can restart.
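
As a quick orientation, the sketch below strings together the public methods from the listing into the master's lifecycle. It is illustrative only, not code from Giraph: the real coordination loop lives in GraphMapper (not shown here), the vertex/edge/message type parameters are arbitrary examples, and the ALL_SUPERSTEPS_DONE terminal state name is an assumption.

    // Hypothetical driver assembled from the methods in this listing;
    // the real sequence is driven by GraphMapper.
    void runMasterLoop(Mapper<?, ?, ?, ?>.Context context,
            GraphMapper<Text, DoubleWritable, FloatWritable, DoubleWritable> graphMapper)
            throws Exception {
        BspServiceMaster<Text, DoubleWritable, FloatWritable, DoubleWritable> master =
                new BspServiceMaster<Text, DoubleWritable, FloatWritable, DoubleWritable>(
                        "zkhost:2181", 60000, context, graphMapper);
        if (!master.becomeMaster()) {
            return;                               // job finished before this task won the election
        }
        master.setup();                           // registers the "Giraph Stats" counters
        if (master.createInputSplits() < 0) {
            return;                               // not enough healthy workers; job marked FAILED
        }
        SuperstepState state;
        do {
            state = master.coordinateSuperstep();
            if (state == SuperstepState.WORKER_FAILURE) {
                // Roll back to the last finalized checkpoint and retry.
                master.restartFromCheckpoint(master.getLastGoodCheckpoint());
            }
        } while (state != SuperstepState.ALL_SUPERSTEPS_DONE);  // terminal state name assumed
    }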

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.giraph.graph;

import net.iharder.Base64;
import org.apache.giraph.bsp.ApplicationState;
import org.apache.giraph.bsp.BspInputFormat;
import org.apache.giraph.bsp.CentralizedServiceMaster;
import org.apache.giraph.bsp.SuperstepState;
import org.apache.giraph.graph.GraphMapper.MapFunctions;
import org.apache.giraph.zk.BspEvent;
import org.apache.giraph.zk.PredicateLock;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher.Event.EventType;
import org.apache.zookeeper.ZooDefs.Ids;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.giraph.graph.partition.MasterGraphPartitioner;
import org.apache.giraph.graph.partition.PartitionOwner;
import org.apache.giraph.graph.partition.PartitionStats;
import org.apache.giraph.graph.partition.PartitionUtils;
import org.apache.giraph.utils.WritableUtils;

/**
 * ZooKeeper-based implementation of {@link CentralizedServiceMaster}.
 *
 * @param <I> Vertex id
 * @param <V> Vertex data
 * @param <E> Edge data
 * @param <M> Message data
 */
@SuppressWarnings("rawtypes")
public class BspServiceMaster<I extends WritableComparable, V extends Writable, E extends Writable, M extends Writable>
        extends BspService<I, V, E, M> implements CentralizedServiceMaster<I, V, E, M> {
    /** Counter group name for the Giraph statistics */
    public static final String GIRAPH_STATS_COUNTER_GROUP_NAME = "Giraph Stats";
    /** Class logger */
    private static final Logger LOG = Logger.getLogger(BspServiceMaster.class);
    /** Superstep counter */
    private Counter superstepCounter = null;
    /** Vertex counter */
    private Counter vertexCounter = null;
    /** Finished vertex counter */
    private Counter finishedVertexCounter = null;
    /** Edge counter */
    private Counter edgeCounter = null;
    /** Sent messages counter */
    private Counter sentMessagesCounter = null;
    /** Workers on this superstep */
    private Counter currentWorkersCounter = null;
    /** Current master task partition */
    private Counter currentMasterTaskPartitionCounter = null;
    /** Last checkpointed superstep */
    private Counter lastCheckpointedSuperstepCounter = null;
    /** Am I the master? */
    private boolean isMaster = false;
    /** Max number of workers */
    private final int maxWorkers;
    /** Min number of workers */
    private final int minWorkers;
    /** Min % responded workers */
    private final float minPercentResponded;
    /** Poll period in msecs */
    private final int msecsPollPeriod;
    /** Max number of poll attempts */
    private final int maxPollAttempts;
    /** Min number of long tails before printing */
    private final int partitionLongTailMinPrint;
    /** Last finalized checkpoint */
    private long lastCheckpointedSuperstep = -1;
    /** State of the superstep changed */
    private final BspEvent superstepStateChanged = new PredicateLock();
    /** Master graph partitioner */
    private final MasterGraphPartitioner<I, V, E, M> masterGraphPartitioner;
    /** All the partition stats from the last superstep */
    private final List<PartitionStats> allPartitionStatsList = new ArrayList<PartitionStats>();
    /** Aggregator writer */
    private AggregatorWriter aggregatorWriter;
    /** Master class */
    private MasterCompute masterCompute;

    /**
     * Constructor for setting up the master.
     *
     * @param serverPortList ZooKeeper server port list
     * @param sessionMsecTimeout Msecs to timeout connecting to ZooKeeper
     * @param context Mapper context
     * @param graphMapper Graph mapper
     */
    public BspServiceMaster(String serverPortList, int sessionMsecTimeout, Mapper<?, ?, ?, ?>.Context context,
            GraphMapper<I, V, E, M> graphMapper) {
        super(serverPortList, sessionMsecTimeout, context, graphMapper);
        registerBspEvent(superstepStateChanged);

        maxWorkers = getConfiguration().getInt(GiraphJob.MAX_WORKERS, -1);
        minWorkers = getConfiguration().getInt(GiraphJob.MIN_WORKERS, -1);
        minPercentResponded = getConfiguration().getFloat(GiraphJob.MIN_PERCENT_RESPONDED, 100.0f);
        msecsPollPeriod = getConfiguration().getInt(GiraphJob.POLL_MSECS, GiraphJob.POLL_MSECS_DEFAULT);
        maxPollAttempts = getConfiguration().getInt(GiraphJob.POLL_ATTEMPTS, GiraphJob.POLL_ATTEMPTS_DEFAULT);
        partitionLongTailMinPrint = getConfiguration().getInt(GiraphJob.PARTITION_LONG_TAIL_MIN_PRINT,
                GiraphJob.PARTITION_LONG_TAIL_MIN_PRINT_DEFAULT);
        masterGraphPartitioner = getGraphPartitionerFactory().createMasterGraphPartitioner();
    }

    @Override
    public void setJobState(ApplicationState state, long applicationAttempt, long desiredSuperstep) {
        JSONObject jobState = new JSONObject();
        try {
            jobState.put(JSONOBJ_STATE_KEY, state.toString());
            jobState.put(JSONOBJ_APPLICATION_ATTEMPT_KEY, applicationAttempt);
            jobState.put(JSONOBJ_SUPERSTEP_KEY, desiredSuperstep);
        } catch (JSONException e) {
            throw new RuntimeException("setJobState: Coudn't put " + state.toString());
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("setJobState: " + jobState.toString() + " on superstep " + getSuperstep());
        }
        try {
            getZkExt().createExt(masterJobStatePath + "/jobState", jobState.toString().getBytes(),
                    Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT_SEQUENTIAL, true);
        } catch (KeeperException.NodeExistsException e) {
            throw new IllegalStateException(
                    "setJobState: Imposible that " + masterJobStatePath + " already exists!", e);
        } catch (KeeperException e) {
            throw new IllegalStateException("setJobState: Unknown KeeperException for " + masterJobStatePath, e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("setJobState: Unknown InterruptedException for " + masterJobStatePath,
                    e);
        }

        if (state == ApplicationState.FAILED) {
            failJob();
        }
    }

    /**
     * Master uses this to calculate the {@link VertexInputFormat}
     * input splits (the caller writes them to ZooKeeper).  Checked
     * exceptions are wrapped in IllegalStateException.
     *
     * @param numWorkers Number of available workers
     * @return List of input splits
     */
    private List<InputSplit> generateInputSplits(int numWorkers) {
        VertexInputFormat<I, V, E, M> vertexInputFormat = BspUtils
                .<I, V, E, M>createVertexInputFormat(getConfiguration());
        List<InputSplit> splits;
        try {
            splits = vertexInputFormat.getSplits(getContext(), numWorkers);
            float samplePercent = getConfiguration().getFloat(GiraphJob.INPUT_SPLIT_SAMPLE_PERCENT,
                    GiraphJob.INPUT_SPLIT_SAMPLE_PERCENT_DEFAULT);
            if (samplePercent != GiraphJob.INPUT_SPLIT_SAMPLE_PERCENT_DEFAULT) {
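                // Keep only the first samplePercent% of the generated splits
                // (useful for testing against a sample of the input).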
                int lastIndex = (int) (samplePercent * splits.size() / 100f);
                List<InputSplit> sampleSplits = splits.subList(0, lastIndex);
                LOG.warn("generateInputSplits: Using sampling - Processing " + "only " + sampleSplits.size()
                        + " instead of " + splits.size() + " expected splits.");
                return sampleSplits;
            } else {
                if (LOG.isInfoEnabled()) {
                    LOG.info("generateInputSplits: Got " + splits.size() + " input splits for " + numWorkers
                            + " workers");
                }
                return splits;
            }
        } catch (IOException e) {
            throw new IllegalStateException("generateInputSplits: Got IOException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("generateInputSplits: Got InterruptedException", e);
        }
    }

    /**
     * When there is no salvaging this job, fail it by killing the
     * underlying Hadoop job.
     */
    private void failJob() {
        LOG.fatal("failJob: Killing job " + getJobId());
        try {
            @SuppressWarnings("deprecation")
            org.apache.hadoop.mapred.JobClient jobClient = new org.apache.hadoop.mapred.JobClient(
                    (org.apache.hadoop.mapred.JobConf) getConfiguration());
            @SuppressWarnings("deprecation")
            org.apache.hadoop.mapred.JobID jobId = org.apache.hadoop.mapred.JobID.forName(getJobId());
            RunningJob job = jobClient.getJob(jobId);
            job.killJob();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Parse the {@link WorkerInfo} objects from a ZooKeeper path
     * (and children).
     *
     * @param workerInfosPath Path where all the workers are children
     * @param watch Watch or not?
     * @return List of workers in that path
     */
    private List<WorkerInfo> getWorkerInfosFromPath(String workerInfosPath, boolean watch) {
        List<WorkerInfo> workerInfoList = new ArrayList<WorkerInfo>();
        List<String> workerInfoPathList;
        try {
            workerInfoPathList = getZkExt().getChildrenExt(workerInfosPath, watch, false, true);
        } catch (KeeperException e) {
            throw new IllegalStateException("getWorkers: Got KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("getWorkers: Got InterruptedStateException", e);
        }
        for (String workerInfoPath : workerInfoPathList) {
            WorkerInfo workerInfo = new WorkerInfo();
            WritableUtils.readFieldsFromZnode(getZkExt(), workerInfoPath, true, null, workerInfo);
            workerInfoList.add(workerInfo);
        }
        return workerInfoList;
    }

    /**
     * Get the healthy and unhealthy {@link WorkerInfo} objects for
     * a superstep.
     *
     * @param superstep superstep to check
     * @param healthyWorkerInfoList filled in with current data
     * @param unhealthyWorkerInfoList filled in with current data
     */
    private void getAllWorkerInfos(long superstep, List<WorkerInfo> healthyWorkerInfoList,
            List<WorkerInfo> unhealthyWorkerInfoList) {
        String healthyWorkerInfoPath = getWorkerInfoHealthyPath(getApplicationAttempt(), superstep);
        String unhealthyWorkerInfoPath = getWorkerInfoUnhealthyPath(getApplicationAttempt(), superstep);

        try {
            getZkExt().createOnceExt(healthyWorkerInfoPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true);
        } catch (KeeperException e) {
            throw new IllegalStateException("getWorkers: KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("getWorkers: IllegalStateException", e);
        }

        try {
            getZkExt().createOnceExt(unhealthyWorkerInfoPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT,
                    true);
        } catch (KeeperException e) {
            throw new IllegalStateException("getWorkers: KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("getWorkers: IllegalStateException", e);
        }

        List<WorkerInfo> currentHealthyWorkerInfoList = getWorkerInfosFromPath(healthyWorkerInfoPath, true);
        List<WorkerInfo> currentUnhealthyWorkerInfoList = getWorkerInfosFromPath(unhealthyWorkerInfoPath, false);

        healthyWorkerInfoList.clear();
        if (currentHealthyWorkerInfoList != null) {
            for (WorkerInfo healthyWorkerInfo : currentHealthyWorkerInfoList) {
                healthyWorkerInfoList.add(healthyWorkerInfo);
            }
        }

        unhealthyWorkerInfoList.clear();
        if (currentUnhealthyWorkerInfoList != null) {
            for (WorkerInfo unhealthyWorkerInfo : currentUnhealthyWorkerInfoList) {
                unhealthyWorkerInfoList.add(unhealthyWorkerInfo);
            }
        }
    }

    /**
     * Check all the {@link WorkerInfo} objects to ensure that a minimum
     * number of good workers exists out of the total that have reported.
     *
     * @return List of healthy workers such that the minimum has been
     *         met, otherwise null
     */
    private List<WorkerInfo> checkWorkers() {
        boolean failJob = true;
        int pollAttempt = 0;
        List<WorkerInfo> healthyWorkerInfoList = new ArrayList<WorkerInfo>();
        List<WorkerInfo> unhealthyWorkerInfoList = new ArrayList<WorkerInfo>();
        int totalResponses = -1;
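        // Poll until enough workers have responded (healthy and unhealthy
        // both count toward the percentage).  A health registration event
        // wakes the loop early without consuming a poll attempt.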
        while (pollAttempt < maxPollAttempts) {
            getAllWorkerInfos(getSuperstep(), healthyWorkerInfoList, unhealthyWorkerInfoList);
            totalResponses = healthyWorkerInfoList.size() + unhealthyWorkerInfoList.size();
            if ((totalResponses * 100.0f / maxWorkers) >= minPercentResponded) {
                failJob = false;
                break;
            }
            getContext().setStatus(
                    getGraphMapper().getMapFunctions() + " " + "checkWorkers: Only found " + totalResponses
                            + " responses of " + maxWorkers + " needed to start superstep " + getSuperstep());
            if (getWorkerHealthRegistrationChangedEvent().waitMsecs(msecsPollPeriod)) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("checkWorkers: Got event that health "
                            + "registration changed, not using poll attempt");
                }
                getWorkerHealthRegistrationChangedEvent().reset();
                continue;
            }
            if (LOG.isInfoEnabled()) {
                LOG.info("checkWorkers: Only found " + totalResponses + " responses of " + maxWorkers
                        + " needed to start superstep " + getSuperstep() + ".  Sleeping for " + msecsPollPeriod
                        + " msecs and used " + pollAttempt + " of " + maxPollAttempts + " attempts.");
                // Find the missing workers if there are only a few
                if ((maxWorkers - totalResponses) <= partitionLongTailMinPrint) {
                    Set<Integer> partitionSet = new TreeSet<Integer>();
                    for (WorkerInfo workerInfo : healthyWorkerInfoList) {
                        partitionSet.add(workerInfo.getPartitionId());
                    }
                    for (WorkerInfo workerInfo : unhealthyWorkerInfoList) {
                        partitionSet.add(workerInfo.getPartitionId());
                    }
                    for (int i = 1; i <= maxWorkers; ++i) {
                        if (partitionSet.contains(Integer.valueOf(i))) {
                            continue;
                        } else if (i == getTaskPartition()) {
                            continue;
                        } else {
                            LOG.info("checkWorkers: No response from " + "partition " + i + " (could be master)");
                        }
                    }
                }
            }
            ++pollAttempt;
        }
        if (failJob) {
            LOG.error("checkWorkers: Did not receive enough processes in " + "time (only " + totalResponses + " of "
                    + minWorkers + " required).  This occurs if you do not "
                    + "have enough map tasks available simultaneously on "
                    + "your Hadoop instance to fulfill the number of " + "requested workers.");
            return null;
        }

        if (healthyWorkerInfoList.size() < minWorkers) {
            LOG.error("checkWorkers: Only " + healthyWorkerInfoList.size() + " available when " + minWorkers
                    + " are required.");
            return null;
        }

        getContext().setStatus(getGraphMapper().getMapFunctions() + " " + "checkWorkers: Done - Found "
                + totalResponses + " responses of " + maxWorkers + " needed to start superstep " + getSuperstep());

        return healthyWorkerInfoList;
    }

    @Override
    public int createInputSplits() {
        // Only the 'master' should be doing this.  Wait until the number of
        // processes that have reported health exceeds the minimum percentage.
        // If the minimum percentage is not met, fail the job.  Otherwise
        // generate the input splits
        try {
            if (getZkExt().exists(inputSplitsPath, false) != null) {
                LOG.info(inputSplitsPath + " already exists, no need to create");
                return Integer.parseInt(new String(getZkExt().getData(inputSplitsPath, false, null)));
            }
        } catch (KeeperException.NoNodeException e) {
            if (LOG.isInfoEnabled()) {
                LOG.info("createInputSplits: Need to create the " + "input splits at " + inputSplitsPath);
            }
        } catch (KeeperException e) {
            throw new IllegalStateException("createInputSplits: KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("createInputSplits: InterrtupedException", e);
        }

        // When creating znodes, in case the master has already run, resume
        // where it left off.
        List<WorkerInfo> healthyWorkerInfoList = checkWorkers();
        if (healthyWorkerInfoList == null) {
            setJobState(ApplicationState.FAILED, -1, -1);
            return -1;
        }

        // Note that the input splits may only be a sample if
        // INPUT_SPLIT_SAMPLE_PERCENT is set to something other than 100
        List<InputSplit> splitList = generateInputSplits(healthyWorkerInfoList.size());
        if (healthyWorkerInfoList.size() > splitList.size()) {
            LOG.warn("createInputSplits: Number of inputSplits=" + splitList.size() + " < "
                    + healthyWorkerInfoList.size() + "=number of healthy processes, "
                    + "some workers will be not used");
        }
        String inputSplitPath = null;
        for (int i = 0; i < splitList.size(); ++i) {
            try {
                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
                DataOutput outputStream = new DataOutputStream(byteArrayOutputStream);
                InputSplit inputSplit = splitList.get(i);
                Text.writeString(outputStream, inputSplit.getClass().getName());
                ((Writable) inputSplit).write(outputStream);
                inputSplitPath = inputSplitsPath + "/" + i;
                getZkExt().createExt(inputSplitPath, byteArrayOutputStream.toByteArray(), Ids.OPEN_ACL_UNSAFE,
                        CreateMode.PERSISTENT, true);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("createInputSplits: Created input split " + "with index " + i + " serialized as "
                            + byteArrayOutputStream.toString());
                }
            } catch (KeeperException.NodeExistsException e) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("createInputSplits: Node " + inputSplitPath + " already exists.");
                }
            } catch (KeeperException e) {
                throw new IllegalStateException("createInputSplits: KeeperException", e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("createInputSplits: IllegalStateException", e);
            } catch (IOException e) {
                throw new IllegalStateException("createInputSplits: IOException", e);
            }
        }

        // Let workers know they can start trying to load the input splits
        try {
            getZkExt().create(inputSplitsAllReadyPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (KeeperException.NodeExistsException e) {
            LOG.info("createInputSplits: Node " + inputSplitsAllReadyPath + " already exists.");
        } catch (KeeperException e) {
            throw new IllegalStateException("createInputSplits: KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("createInputSplits: IllegalStateException", e);
        }

        return splitList.size();
    }

    /**
     * Read the finalized checkpoint file and associated metadata files for the
     * checkpoint.  Modifies the {@link PartitionOwner} objects to get the
     * checkpoint prefixes.  It is an optimization to prevent all workers from
     * searching all the files.  Also reads in the aggregator data from the
     * finalized checkpoint file and sets it.
     *
     * @param superstep Checkpoint set to examine.
     * @param partitionOwners Partition owners to modify with checkpoint
     *        prefixes
     * @throws IOException
     * @throws InterruptedException
     * @throws KeeperException
     */
    private void prepareCheckpointRestart(long superstep, Collection<PartitionOwner> partitionOwners)
            throws IOException, KeeperException, InterruptedException {
        FileSystem fs = getFs();
        List<Path> validMetadataPathList = new ArrayList<Path>();
        String finalizedCheckpointPath = getCheckpointBasePath(superstep) + CHECKPOINT_FINALIZED_POSTFIX;
        DataInputStream finalizedStream = fs.open(new Path(finalizedCheckpointPath));
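        // The finalized file layout mirrors what finalizeCheckpoint() writes:
        // <number of file prefixes><prefix 0>...<aggregator data length>
        // <aggregator bytes><serialized MasterCompute state>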
        int prefixFileCount = finalizedStream.readInt();
        for (int i = 0; i < prefixFileCount; ++i) {
            String metadataFilePath = finalizedStream.readUTF() + CHECKPOINT_METADATA_POSTFIX;
            validMetadataPathList.add(new Path(metadataFilePath));
        }

        // Set the merged aggregator data if it exists.
        int aggregatorDataSize = finalizedStream.readInt();
        if (aggregatorDataSize > 0) {
            byte[] aggregatorZkData = new byte[aggregatorDataSize];
            int actualDataRead = finalizedStream.read(aggregatorZkData, 0, aggregatorDataSize);
            if (actualDataRead != aggregatorDataSize) {
                throw new RuntimeException("prepareCheckpointRestart: Only read " + actualDataRead + " of "
                        + aggregatorDataSize + " aggregator bytes from " + finalizedCheckpointPath);
            }
            String mergedAggregatorPath = getMergedAggregatorPath(getApplicationAttempt(), superstep - 1);
            if (LOG.isInfoEnabled()) {
                LOG.info("prepareCheckpointRestart: Reloading merged " + "aggregator " + "data '"
                        + Arrays.toString(aggregatorZkData) + "' to previous checkpoint in path "
                        + mergedAggregatorPath);
            }
            if (getZkExt().exists(mergedAggregatorPath, false) == null) {
                getZkExt().createExt(mergedAggregatorPath, aggregatorZkData, Ids.OPEN_ACL_UNSAFE,
                        CreateMode.PERSISTENT, true);
            } else {
                getZkExt().setData(mergedAggregatorPath, aggregatorZkData, -1);
            }
        }
        masterCompute.readFields(finalizedStream);
        finalizedStream.close();

        Map<Integer, PartitionOwner> idOwnerMap = new HashMap<Integer, PartitionOwner>();
        for (PartitionOwner partitionOwner : partitionOwners) {
            if (idOwnerMap.put(partitionOwner.getPartitionId(), partitionOwner) != null) {
                throw new IllegalStateException("prepareCheckpointRestart: Duplicate partition " + partitionOwner);
            }
        }
        // Reading the metadata files.  Simply assign each partition owner
        // the correct file prefix based on the partition id.
        for (Path metadataPath : validMetadataPathList) {
            String checkpointFilePrefix = metadataPath.toString();
            checkpointFilePrefix = checkpointFilePrefix.substring(0,
                    checkpointFilePrefix.length() - CHECKPOINT_METADATA_POSTFIX.length());
            DataInputStream metadataStream = fs.open(metadataPath);
            long partitions = metadataStream.readInt();
            for (long i = 0; i < partitions; ++i) {
                long dataPos = metadataStream.readLong();
                int partitionId = metadataStream.readInt();
                PartitionOwner partitionOwner = idOwnerMap.get(partitionId);
                if (LOG.isInfoEnabled()) {
                    LOG.info("prepareSuperstepRestart: File " + metadataPath + " with position " + dataPos
                            + ", partition id = " + partitionId + " assigned to " + partitionOwner);
                }
                partitionOwner.setCheckpointFilesPrefix(checkpointFilePrefix);
            }
            metadataStream.close();
        }
    }

    @Override
    public void setup() {
        // Might have to manually load a checkpoint.
        // In that case, the input splits are not set; they will be faked by
        // the checkpoint files, with each checkpoint file treated as an
        // input split.
        superstepCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME, "Superstep");
        vertexCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME, "Aggregate vertices");
        finishedVertexCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME,
                "Aggregate finished vertices");
        edgeCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME, "Aggregate edges");
        sentMessagesCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME, "Sent messages");
        currentWorkersCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME, "Current workers");
        currentMasterTaskPartitionCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME,
                "Current master task partition");
        lastCheckpointedSuperstepCounter = getContext().getCounter(GIRAPH_STATS_COUNTER_GROUP_NAME,
                "Last checkpointed superstep");
        if (getRestartedSuperstep() != UNSET_SUPERSTEP) {
            superstepCounter.increment(getRestartedSuperstep());
        }
    }

    @Override
    public boolean becomeMaster() {
        // Create my bid to become the master, then wait to win the election
        // or return false if the job finishes first.
        String myBid = null;
        try {
            myBid = getZkExt().createExt(masterElectionPath + "/" + getHostnamePartitionId(), null,
                    Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL, true);
        } catch (KeeperException e) {
            throw new IllegalStateException("becomeMaster: KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("becomeMaster: IllegalStateException", e);
        }
        while (true) {
            JSONObject jobState = getJobState();
            try {
                if ((jobState != null) && ApplicationState
                        .valueOf(jobState.getString(JSONOBJ_STATE_KEY)) == ApplicationState.FINISHED) {
                    LOG.info("becomeMaster: Job is finished, " + "give up trying to be the master!");
                    isMaster = false;
                    return isMaster;
                }
            } catch (JSONException e) {
                throw new IllegalStateException("becomeMaster: Couldn't get state from " + jobState, e);
            }
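            // Classic ZooKeeper leader election: bids are ephemeral
            // sequential znodes, so the lowest sequence number (the first
            // child) is the master; everyone else waits for a change.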
            try {
                List<String> masterChildArr = getZkExt().getChildrenExt(masterElectionPath, true, true, true);
                if (LOG.isInfoEnabled()) {
                    LOG.info("becomeMaster: First child is '" + masterChildArr.get(0) + "' and my bid is '" + myBid
                            + "'");
                }
                if (masterChildArr.get(0).equals(myBid)) {
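                    // Hadoop counters can only be incremented, so "setting"
                    // one means adding the delta from its current value.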
                    currentMasterTaskPartitionCounter
                            .increment(getTaskPartition() - currentMasterTaskPartitionCounter.getValue());
                    masterCompute = BspUtils.createMasterCompute(getConfiguration());
                    aggregatorWriter = BspUtils.createAggregatorWriter(getConfiguration());
                    try {
                        aggregatorWriter.initialize(getContext(), getApplicationAttempt());
                    } catch (IOException e) {
                        throw new IllegalStateException("becomeMaster: " + "Couldn't initialize aggregatorWriter",
                                e);
                    }
                    LOG.info("becomeMaster: I am now the master!");
                    isMaster = true;
                    return isMaster;
                }
                LOG.info("becomeMaster: Waiting to become the master...");
                getMasterElectionChildrenChangedEvent().waitForever();
                getMasterElectionChildrenChangedEvent().reset();
            } catch (KeeperException e) {
                throw new IllegalStateException("becomeMaster: KeeperException", e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("becomeMaster: IllegalStateException", e);
            }
        }
    }

    /**
     * Collect and aggregate the worker statistics for a particular superstep.
     *
     * @param superstep Superstep to aggregate on
     * @return Global statistics aggregated on all worker statistics
     */
    private GlobalStats aggregateWorkerStats(long superstep) {
        Class<? extends Writable> partitionStatsClass = masterGraphPartitioner.createPartitionStats().getClass();
        GlobalStats globalStats = new GlobalStats();
        // Get the stats from all the chosen workers' finished znodes
        String workerFinishedPath = getWorkerFinishedPath(getApplicationAttempt(), superstep);
        List<String> workerFinishedPathList = null;
        try {
            workerFinishedPathList = getZkExt().getChildrenExt(workerFinishedPath, false, false, true);
        } catch (KeeperException e) {
            throw new IllegalStateException("aggregateWorkerStats: KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("aggregateWorkerStats: InterruptedException", e);
        }

        allPartitionStatsList.clear();
        for (String finishedPath : workerFinishedPathList) {
            JSONObject workerFinishedInfoObj = null;
            try {
                byte[] zkData = getZkExt().getData(finishedPath, false, null);
                workerFinishedInfoObj = new JSONObject(new String(zkData));
                List<? extends Writable> writableList = WritableUtils.readListFieldsFromByteArray(
                        Base64.decode(workerFinishedInfoObj.getString(JSONOBJ_PARTITION_STATS_KEY)),
                        partitionStatsClass, getConfiguration());
                for (Writable writable : writableList) {
                    globalStats.addPartitionStats((PartitionStats) writable);
                    globalStats.addMessageCount(workerFinishedInfoObj.getLong(JSONOBJ_NUM_MESSAGES_KEY));
                    allPartitionStatsList.add((PartitionStats) writable);
                }
            } catch (JSONException e) {
                throw new IllegalStateException("aggregateWorkerStats: JSONException", e);
            } catch (KeeperException e) {
                throw new IllegalStateException("aggregateWorkerStats: KeeperException", e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("aggregateWorkerStats: InterruptedException", e);
            } catch (IOException e) {
                throw new IllegalStateException("aggregateWorkerStats: IOException", e);
            }
        }

        if (LOG.isInfoEnabled()) {
            LOG.info(
                    "aggregateWorkerStats: Aggregation found " + globalStats + " on superstep = " + getSuperstep());
        }
        return globalStats;
    }

    /**
     * Get the aggregator values for a particular superstep and aggregate them.
     *
     * @param superstep superstep to check
     */
    private void collectAndProcessAggregatorValues(long superstep) {
        String workerFinishedPath = getWorkerFinishedPath(getApplicationAttempt(), superstep);
        List<String> hostnameIdPathList = null;
        try {
            hostnameIdPathList = getZkExt().getChildrenExt(workerFinishedPath, false, false, true);
        } catch (KeeperException e) {
            throw new IllegalStateException("collectAndProcessAggregatorValues: KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("collectAndProcessAggregatorValues: InterruptedException", e);
        }

        for (String hostnameIdPath : hostnameIdPathList) {
            JSONObject workerFinishedInfoObj = null;
            JSONArray aggregatorArray = null;
            try {
                byte[] zkData = getZkExt().getData(hostnameIdPath, false, null);
                workerFinishedInfoObj = new JSONObject(new String(zkData));
            } catch (KeeperException e) {
                throw new IllegalStateException("collectAndProcessAggregatorValues: KeeperException", e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("collectAndProcessAggregatorValues: InterruptedException", e);
            } catch (JSONException e) {
                throw new IllegalStateException("collectAndProcessAggregatorValues: JSONException", e);
            }
            try {
                aggregatorArray = workerFinishedInfoObj.getJSONArray(JSONOBJ_AGGREGATOR_VALUE_ARRAY_KEY);
            } catch (JSONException e) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("collectAndProcessAggregatorValues: " + "No aggregators" + " for " + hostnameIdPath);
                }
                continue;
            }
            for (int i = 0; i < aggregatorArray.length(); ++i) {
                try {
                    if (LOG.isInfoEnabled()) {
                        LOG.info("collectAndProcessAggregatorValues: " + "Getting aggregators from "
                                + aggregatorArray.getJSONObject(i));
                    }
                    String aggregatorName = aggregatorArray.getJSONObject(i).getString(AGGREGATOR_NAME_KEY);
                    String aggregatorClassName = aggregatorArray.getJSONObject(i)
                            .getString(AGGREGATOR_CLASS_NAME_KEY);
                    @SuppressWarnings("unchecked")
                    Aggregator<Writable> aggregator = (Aggregator<Writable>) getAggregator(aggregatorName);
                    boolean firstTime = false;
                    if (aggregator == null) {
                        @SuppressWarnings("unchecked")
                        Class<? extends Aggregator<Writable>> aggregatorClass = (Class<? extends Aggregator<Writable>>) Class
                                .forName(aggregatorClassName);
                        aggregator = registerAggregator(aggregatorName, aggregatorClass);
                        firstTime = true;
                    }
                    Writable aggregatorValue = aggregator.createAggregatedValue();
                    InputStream input = new ByteArrayInputStream(
                            Base64.decode(aggregatorArray.getJSONObject(i).getString(AGGREGATOR_VALUE_KEY)));
                    aggregatorValue.readFields(new DataInputStream(input));
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(
                                "collectAndProcessAggregatorValues: " + "aggregator value size=" + input.available()
                                        + " for aggregator=" + aggregatorName + " value=" + aggregatorValue);
                    }
                    if (firstTime) {
                        aggregator.setAggregatedValue(aggregatorValue);
                    } else {
                        aggregator.aggregate(aggregatorValue);
                    }
                } catch (IOException e) {
                    throw new IllegalStateException("collectAndProcessAggregatorValues: "
                            + "IOException when reading aggregator data " + aggregatorArray, e);
                } catch (JSONException e) {
                    throw new IllegalStateException("collectAndProcessAggregatorValues: "
                            + "JSONException when reading aggregator data " + aggregatorArray, e);
                } catch (ClassNotFoundException e) {
                    throw new IllegalStateException("collectAndProcessAggregatorValues: "
                            + "ClassNotFoundException when reading aggregator data " + aggregatorArray, e);
                } catch (InstantiationException e) {
                    throw new IllegalStateException("collectAndProcessAggregatorValues: "
                            + "InstantiationException when reading aggregator data " + aggregatorArray, e);
                } catch (IllegalAccessException e) {
                    throw new IllegalStateException("collectAndProcessAggregatorValues: "
                            + "IOException when reading aggregator data " + aggregatorArray, e);
                }
            }
        }
    }

    /**
     * Save the supplied aggregator values.
     *
     * @param superstep superstep for which to save values
     */
    private void saveAggregatorValues(long superstep) {
        Map<String, Aggregator<Writable>> aggregatorMap = getAggregatorMap();
        if (aggregatorMap.size() > 0) {
            String mergedAggregatorPath = getMergedAggregatorPath(getApplicationAttempt(), superstep);
            byte[] zkData = null;
            JSONArray aggregatorArray = new JSONArray();
            for (Map.Entry<String, Aggregator<Writable>> entry : aggregatorMap.entrySet()) {
                try {
                    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                    DataOutput output = new DataOutputStream(outputStream);
                    entry.getValue().getAggregatedValue().write(output);

                    JSONObject aggregatorObj = new JSONObject();
                    aggregatorObj.put(AGGREGATOR_NAME_KEY, entry.getKey());
                    aggregatorObj.put(AGGREGATOR_VALUE_KEY, Base64.encodeBytes(outputStream.toByteArray()));
                    aggregatorArray.put(aggregatorObj);
                    if (LOG.isInfoEnabled()) {
                        LOG.info("saveAggregatorValues: " + "Trying to add aggregatorObj " + aggregatorObj + "("
                                + entry.getValue().getAggregatedValue() + ") to merged aggregator path "
                                + mergedAggregatorPath);
                    }
                } catch (IOException e) {
                    throw new IllegalStateException("saveAggregatorValues: " + "IllegalStateException", e);
                } catch (JSONException e) {
                    throw new IllegalStateException("saveAggregatorValues: JSONException", e);
                }
            }
            try {
                zkData = aggregatorArray.toString().getBytes();
                getZkExt().createExt(mergedAggregatorPath, zkData, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT,
                        true);
            } catch (KeeperException.NodeExistsException e) {
                LOG.warn("saveAggregatorValues: " + mergedAggregatorPath + " already exists!");
            } catch (KeeperException e) {
                throw new IllegalStateException("saveAggregatorValues: KeeperException", e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("saveAggregatorValues: IllegalStateException", e);
            }
            if (LOG.isInfoEnabled()) {
                LOG.info("saveAggregatorValues: Finished " + "loading " + mergedAggregatorPath
                        + " with aggregator values " + aggregatorArray);
            }
        }
    }

    /**
     * Finalize the checkpoint file prefixes by taking the chosen workers and
     * writing them to a finalized file.  Also write out the master
     * aggregated aggregator array from the previous superstep.
     *
     * @param superstep superstep to finalize
     * @param chosenWorkerInfoList list of chosen workers that will be finalized
     * @throws IOException
     * @throws InterruptedException
     * @throws KeeperException
     */
    private void finalizeCheckpoint(long superstep, List<WorkerInfo> chosenWorkerInfoList)
            throws IOException, KeeperException, InterruptedException {
        Path finalizedCheckpointPath = new Path(getCheckpointBasePath(superstep) + CHECKPOINT_FINALIZED_POSTFIX);
        try {
            getFs().delete(finalizedCheckpointPath, false);
        } catch (IOException e) {
            LOG.warn("finalizedValidCheckpointPrefixes: Removed old file " + finalizedCheckpointPath);
        }

        // Format:
        // <number of files>
        // <used file prefix 0><used file prefix 1>...
        // <aggregator data length><aggregators as a serialized JSON byte array>
        FSDataOutputStream finalizedOutputStream = getFs().create(finalizedCheckpointPath);
        finalizedOutputStream.writeInt(chosenWorkerInfoList.size());
        for (WorkerInfo chosenWorkerInfo : chosenWorkerInfoList) {
            String chosenWorkerInfoPrefix = getCheckpointBasePath(superstep) + "."
                    + chosenWorkerInfo.getHostnameId();
            finalizedOutputStream.writeUTF(chosenWorkerInfoPrefix);
        }
        String mergedAggregatorPath = getMergedAggregatorPath(getApplicationAttempt(), superstep - 1);
        if (getZkExt().exists(mergedAggregatorPath, false) != null) {
            byte[] aggregatorZkData = getZkExt().getData(mergedAggregatorPath, false, null);
            finalizedOutputStream.writeInt(aggregatorZkData.length);
            finalizedOutputStream.write(aggregatorZkData);
        } else {
            finalizedOutputStream.writeInt(0);
        }
        masterCompute.write(finalizedOutputStream);
        finalizedOutputStream.close();
        lastCheckpointedSuperstep = superstep;
        lastCheckpointedSuperstepCounter.increment(superstep - lastCheckpointedSuperstepCounter.getValue());
    }

    /**
     * Assign the partitions for this superstep.  If there are changes,
     * the workers will know how to do the exchange.  If this was a restarted
     * superstep, then make sure to provide information on where to find the
     * checkpoint file.
     *
     * @param allPartitionStatsList All partition stats
     * @param chosenWorkerInfoList All the chosen worker infos
     * @param masterGraphPartitioner Master graph partitioner
     */
    private void assignPartitionOwners(List<PartitionStats> allPartitionStatsList,
            List<WorkerInfo> chosenWorkerInfoList, MasterGraphPartitioner<I, V, E, M> masterGraphPartitioner) {
        Collection<PartitionOwner> partitionOwners;
        if (getSuperstep() == INPUT_SUPERSTEP || getSuperstep() == getRestartedSuperstep()) {
            partitionOwners = masterGraphPartitioner.createInitialPartitionOwners(chosenWorkerInfoList, maxWorkers);
            if (partitionOwners.isEmpty()) {
                throw new IllegalStateException("assignAndExchangePartitions: No partition owners set");
            }
        } else {
            partitionOwners = masterGraphPartitioner.generateChangedPartitionOwners(allPartitionStatsList,
                    chosenWorkerInfoList, maxWorkers, getSuperstep());

            PartitionUtils.analyzePartitionStats(partitionOwners, allPartitionStatsList);
        }

        // If restarted, prepare the checkpoint restart
        if (getRestartedSuperstep() == getSuperstep()) {
            try {
                prepareCheckpointRestart(getSuperstep(), partitionOwners);
            } catch (IOException e) {
                throw new IllegalStateException("assignPartitionOwners: IOException on preparing", e);
            } catch (KeeperException e) {
                throw new IllegalStateException("assignPartitionOwners: KeeperException on preparing", e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("assignPartitionOwners: InteruptedException on preparing", e);
            }
        }

        // There will be some exchange of partitions
        if (!partitionOwners.isEmpty()) {
            String vertexExchangePath = getPartitionExchangePath(getApplicationAttempt(), getSuperstep());
            try {
                getZkExt().createOnceExt(vertexExchangePath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT,
                        true);
            } catch (KeeperException e) {
                throw new IllegalStateException(
                        "assignPartitionOwners: KeeperException creating " + vertexExchangePath, e);
            } catch (InterruptedException e) {
                throw new IllegalStateException(
                        "assignPartitionOwners: InterruptedException creating " + vertexExchangePath, e);
            }
        }

        // Workers are waiting for these assignments
        String partitionAssignmentsPath = getPartitionAssignmentsPath(getApplicationAttempt(), getSuperstep());
        WritableUtils.writeListToZnode(getZkExt(), partitionAssignmentsPath, -1,
                new ArrayList<Writable>(partitionOwners));
    }

    /**
     * Check whether the workers chosen for this superstep are still alive
     *
     * @param chosenWorkerInfoHealthPath Path to the healthy workers in ZooKeeper
     * @param chosenWorkerInfoList List of the healthy workers
     * @return true if they are all alive, false otherwise.
     * @throws InterruptedException
     * @throws KeeperException
     */
    private boolean superstepChosenWorkerAlive(String chosenWorkerInfoHealthPath,
            List<WorkerInfo> chosenWorkerInfoList) throws KeeperException, InterruptedException {
        List<WorkerInfo> chosenWorkerInfoHealthyList = getWorkerInfosFromPath(chosenWorkerInfoHealthPath, false);
        Set<WorkerInfo> chosenWorkerInfoHealthySet = new HashSet<WorkerInfo>(chosenWorkerInfoHealthyList);
        boolean allChosenWorkersHealthy = true;
        for (WorkerInfo chosenWorkerInfo : chosenWorkerInfoList) {
            if (!chosenWorkerInfoHealthySet.contains(chosenWorkerInfo)) {
                allChosenWorkersHealthy = false;
                LOG.error("superstepChosenWorkerAlive: Missing chosen " + "worker " + chosenWorkerInfo
                        + " on superstep " + getSuperstep());
            }
        }
        return allChosenWorkersHealthy;
    }

    @Override
    public void restartFromCheckpoint(long checkpoint) {
        // Process:
        // 1. Remove all old input split data
        // 2. Increase the application attempt and set to the correct checkpoint
        // 3. Send command to all workers to restart their tasks
        try {
            getZkExt().deleteExt(inputSplitsPath, -1, true);
        } catch (InterruptedException e) {
            throw new RuntimeException("restartFromCheckpoint: InterruptedException", e);
        } catch (KeeperException e) {
            throw new RuntimeException("restartFromCheckpoint: KeeperException", e);
        }
        setApplicationAttempt(getApplicationAttempt() + 1);
        setCachedSuperstep(checkpoint);
        setRestartedSuperstep(checkpoint);
        setJobState(ApplicationState.START_SUPERSTEP, getApplicationAttempt(), checkpoint);
    }

    /**
     * Only get the finalized checkpoint files
     */
    public static class FinalizedCheckpointPathFilter implements PathFilter {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(BspService.CHECKPOINT_FINALIZED_POSTFIX);
        }
    }

    @Override
    public long getLastGoodCheckpoint() throws IOException {
        // Find the last good checkpoint if this master does not already
        // know of one
        if (lastCheckpointedSuperstep == -1) {
            FileStatus[] fileStatusArray = getFs().listStatus(new Path(checkpointBasePath),
                    new FinalizedCheckpointPathFilter());
            if (fileStatusArray == null) {
                return -1;
            }
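            // FileStatus orders by path, so the lexicographically last
            // finalized checkpoint file is taken as the most recent one.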
            Arrays.sort(fileStatusArray);
            lastCheckpointedSuperstep = getCheckpoint(fileStatusArray[fileStatusArray.length - 1].getPath());
            if (LOG.isInfoEnabled()) {
                LOG.info("getLastGoodCheckpoint: Found last good checkpoint " + lastCheckpointedSuperstep + " from "
                        + fileStatusArray[fileStatusArray.length - 1].getPath().toString());
            }
        }
        return lastCheckpointedSuperstep;
    }

    /**
     * Wait for a set of workers to signal that they are done with the
     * barrier.
     *
     * @param finishedWorkerPath Path to where the workers will register their
     *        hostname and id
     * @param workerInfoList List of the workers to wait for
     * @param event Event to wait on for a change in the finished worker list.
     * @return True if barrier was successful, false if there was a worker
     *         failure
     */
    private boolean barrierOnWorkerList(String finishedWorkerPath, List<WorkerInfo> workerInfoList,
            BspEvent event) {
        try {
            getZkExt().createOnceExt(finishedWorkerPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true);
        } catch (KeeperException e) {
            throw new IllegalStateException(
                    "barrierOnWorkerList: KeeperException - Couldn't create " + finishedWorkerPath, e);
        } catch (InterruptedException e) {
            throw new IllegalStateException(
                    "barrierOnWorkerList: InterruptedException - Couldn't create " + finishedWorkerPath, e);
        }
        List<String> hostnameIdList = new ArrayList<String>(workerInfoList.size());
        for (WorkerInfo workerInfo : workerInfoList) {
            hostnameIdList.add(workerInfo.getHostnameId());
        }
        String workerInfoHealthyPath = getWorkerInfoHealthyPath(getApplicationAttempt(), getSuperstep());
        List<String> finishedHostnameIdList;
        long nextInfoMillis = System.currentTimeMillis();
        while (true) {
            try {
                finishedHostnameIdList = getZkExt().getChildrenExt(finishedWorkerPath, true, false, false);
            } catch (KeeperException e) {
                throw new IllegalStateException("barrierOnWorkerList: KeeperException - Couldn't get "
                        + "children of " + finishedWorkerPath, e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("barrierOnWorkerList: IllegalException - Couldn't get "
                        + "children of " + finishedWorkerPath, e);
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("barrierOnWorkerList: Got finished worker list = " + finishedHostnameIdList + ", size = "
                        + finishedHostnameIdList.size() + ", worker list = " + workerInfoList + ", size = "
                        + workerInfoList.size() + " from " + finishedWorkerPath);
            }

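            // Throttle these progress reports to one every 30 seconds.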
            if (LOG.isInfoEnabled() && (System.currentTimeMillis() > nextInfoMillis)) {
                nextInfoMillis = System.currentTimeMillis() + 30000;
                LOG.info("barrierOnWorkerList: " + finishedHostnameIdList.size() + " out of "
                        + workerInfoList.size() + " workers finished on superstep " + getSuperstep() + " on path "
                        + finishedWorkerPath);
            }
            getContext().setStatus(getGraphMapper().getMapFunctions() + " - " + finishedHostnameIdList.size()
                    + " finished out of " + workerInfoList.size() + " on superstep " + getSuperstep());
            if (finishedHostnameIdList.containsAll(hostnameIdList)) {
                break;
            }

            // Wait for a signal, but no more than 60 seconds, before
            // reporting progress and re-checking worker health.
            event.waitMsecs(60 * 1000);
            event.reset();
            getContext().progress();

            // Did a worker die?
            try {
                if ((getSuperstep() > 0) && !superstepChosenWorkerAlive(workerInfoHealthyPath, workerInfoList)) {
                    return false;
                }
            } catch (KeeperException e) {
                throw new IllegalStateException(
                        "barrierOnWorkerList: KeeperException - " + "Couldn't get " + workerInfoHealthyPath, e);
            } catch (InterruptedException e) {
                throw new IllegalStateException(
                        "barrierOnWorkerList: InterruptedException - " + "Couldn't get " + workerInfoHealthyPath,
                        e);
            }
        }

        return true;
    }

    @Override
    public SuperstepState coordinateSuperstep() throws KeeperException, InterruptedException {
        // 1. Get chosen workers and set up watches on them.
        // 2. Assign partitions to the workers
        //    (possibly reloading from a superstep)
        // 3. Wait for all workers to complete
        // 4. Collect and process aggregators
        // 5. Create superstep finished node
        // 6. If the checkpoint frequency is met, finalize the checkpoint

        List<WorkerInfo> chosenWorkerInfoList = checkWorkers();
        if (chosenWorkerInfoList == null) {
            LOG.fatal("coordinateSuperstep: Not enough healthy workers for " + "superstep " + getSuperstep());
            setJobState(ApplicationState.FAILED, -1, -1);
        } else {
            for (WorkerInfo workerInfo : chosenWorkerInfoList) {
                String workerInfoHealthyPath = getWorkerInfoHealthyPath(getApplicationAttempt(), getSuperstep())
                        + "/" + workerInfo.getHostnameId();
                if (getZkExt().exists(workerInfoHealthyPath, true) == null) {
                    LOG.warn("coordinateSuperstep: Chosen worker " + workerInfoHealthyPath
                            + " is no longer valid, failing superstep");
                }
            }
        }

        currentWorkersCounter.increment(chosenWorkerInfoList.size() - currentWorkersCounter.getValue());
        assignPartitionOwners(allPartitionStatsList, chosenWorkerInfoList, masterGraphPartitioner);

        if (getSuperstep() == INPUT_SUPERSTEP) {
            // Coordinate the workers finishing sending their vertices to the
            // correct workers and signal when everything is done.
            if (!barrierOnWorkerList(inputSplitsDonePath, chosenWorkerInfoList,
                    getInputSplitsDoneStateChangedEvent())) {
                throw new IllegalStateException(
                        "coordinateSuperstep: Worker failed during input split " + "(currently not supported)");
            }
            try {
                getZkExt().create(inputSplitsAllDonePath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
            } catch (KeeperException.NodeExistsException e) {
                LOG.info("coordinateSuperstep: Node " + inputSplitsAllDonePath + " already exists.");
            } catch (KeeperException e) {
                throw new IllegalStateException("coordinateSuperstep: KeeperException", e);
            } catch (InterruptedException e) {
                throw new IllegalStateException("coordinateSuperstep: InterruptedException", e);
            }
        }

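        // Wait for all chosen workers to report that they finished this
        // superstep; a false return means a previously-healthy worker died.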
        String finishedWorkerPath = getWorkerFinishedPath(getApplicationAttempt(), getSuperstep());
        if (!barrierOnWorkerList(finishedWorkerPath, chosenWorkerInfoList, getSuperstepStateChangedEvent())) {
            return SuperstepState.WORKER_FAILURE;
        }

        // Collect aggregator values, then run the master.compute() and
        // finally save the aggregator values
        collectAndProcessAggregatorValues(getSuperstep());
        runMasterCompute(getSuperstep());
        saveAggregatorValues(getSuperstep());

        // If the master is halted or all the vertices voted to halt and there
        // are no more messages in the system, stop the computation
        GlobalStats globalStats = aggregateWorkerStats(getSuperstep());
        if (masterCompute.isHalted() || (globalStats.getFinishedVertexCount() == globalStats.getVertexCount()
                && globalStats.getMessageCount() == 0)) {
            globalStats.setHaltComputation(true);
        }

        // Let everyone know the aggregated application state through the
        // superstep finishing znode.
        String superstepFinishedNode = getSuperstepFinishedPath(getApplicationAttempt(), getSuperstep());
        WritableUtils.writeToZnode(getZkExt(), superstepFinishedNode, -1, globalStats);
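        // Same increment-by-delta trick as above: Hadoop counters can only be
        // incremented, so add the delta to match the aggregated global stats.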
        vertexCounter.increment(globalStats.getVertexCount() - vertexCounter.getValue());
        finishedVertexCounter.increment(globalStats.getFinishedVertexCount() - finishedVertexCounter.getValue());
        edgeCounter.increment(globalStats.getEdgeCount() - edgeCounter.getValue());
        sentMessagesCounter.increment(globalStats.getMessageCount() - sentMessagesCounter.getValue());

        // Finalize the valid checkpoint file prefixes and possibly
        // the aggregators.
        if (checkpointFrequencyMet(getSuperstep())) {
            try {
                finalizeCheckpoint(getSuperstep(), chosenWorkerInfoList);
            } catch (IOException e) {
                throw new IllegalStateException("coordinateSuperstep: IOException on finalizing checkpoint", e);
            }
        }

        // Clean up the old supersteps (always keep this one)
        long removableSuperstep = getSuperstep() - 1;
        if (!(getConfiguration().getBoolean(GiraphJob.KEEP_ZOOKEEPER_DATA, GiraphJob.KEEP_ZOOKEEPER_DATA_DEFAULT))
                && (removableSuperstep >= 0)) {
            String oldSuperstepPath = getSuperstepPath(getApplicationAttempt()) + "/" + removableSuperstep;
            try {
                if (LOG.isInfoEnabled()) {
                    LOG.info("coordinateSuperstep: Cleaning up old Superstep " + oldSuperstepPath);
                }
                getZkExt().deleteExt(oldSuperstepPath, -1, true);
            } catch (KeeperException.NoNodeException e) {
                LOG.warn("coordinateSuperstep: Already cleaned up " + oldSuperstepPath);
            } catch (KeeperException e) {
                throw new IllegalStateException(
                        "coordinateSuperstep: KeeperException on removing old superstep " + oldSuperstepPath, e);
            }
        }
        incrCachedSuperstep();
        // The counter already reads zero when the input superstep finishes,
        // so only increment for subsequent supersteps
        if (getSuperstep() > 0) {
            superstepCounter.increment(1);
        }
        SuperstepState superstepState;
        if (globalStats.getHaltComputation()) {
            superstepState = SuperstepState.ALL_SUPERSTEPS_DONE;
        } else {
            superstepState = SuperstepState.THIS_SUPERSTEP_DONE;
        }
        try {
            aggregatorWriter.writeAggregator(getAggregatorMap(),
                    (superstepState == SuperstepState.ALL_SUPERSTEPS_DONE) ? AggregatorWriter.LAST_SUPERSTEP
                            : getSuperstep());
        } catch (IOException e) {
            throw new IllegalStateException("coordinateSuperstep: IOException while " + "writing aggregators data",
                    e);
        }

        return superstepState;
    }

    /**
     * Run the user's MasterCompute.compute() for the given superstep.
     *
     * @param superstep Superstep for which to run master.compute()
     */
    private void runMasterCompute(long superstep) {
        GraphState<I, V, E, M> graphState = getGraphMapper().getGraphState();
        // The master.compute() should run logically before the workers, so
        // increase the superstep counter it uses by one
        graphState.setSuperstep(superstep + 1);
        graphState.setNumVertices(vertexCounter.getValue());
        graphState.setNumEdges(edgeCounter.getValue());
        graphState.setContext(getContext());
        graphState.setGraphMapper(getGraphMapper());
        masterCompute.setGraphState(graphState);
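        // The input superstep is the first time master.compute() runs, so let
        // the MasterCompute initialize itself first (typically this is where
        // aggregators are registered, hence the reflection exceptions below).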
        if (superstep == INPUT_SUPERSTEP) {
            try {
                masterCompute.initialize();
            } catch (InstantiationException e) {
                LOG.fatal("runMasterCompute: MasterCompute.initialize failed in instantiation", e);
                throw new RuntimeException("runMasterCompute: MasterCompute.initialize failed in instantiation", e);
            } catch (IllegalAccessException e) {
                LOG.fatal("runMasterCompute: MasterCompute.initialize failed in access", e);
                throw new RuntimeException("runMasterCompute: MasterCompute.initialize failed in access", e);
            }
            }
        }
        masterCompute.compute();
    }

    /**
     * Clean up ZooKeeper gracefully.  Waits until all masters and workers
     * have reported that they are done with their ZooKeeper connections
     * before removing this job's znodes.
     */
    private void cleanUpZooKeeper() {
        try {
            getZkExt().createExt(cleanedUpPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, true);
        } catch (KeeperException.NodeExistsException e) {
            if (LOG.isInfoEnabled()) {
                LOG.info("cleanUpZooKeeper: Node " + cleanedUpPath + " already exists, no need to create.");
            }
        } catch (KeeperException e) {
            throw new IllegalStateException("cleanUpZooKeeper: Got KeeperException", e);
        } catch (InterruptedException e) {
            throw new IllegalStateException("cleanUpZooKeeper: Got InterruptedException", e);
        }
        // Need to wait for the number of workers and masters to complete
        int maxTasks = BspInputFormat.getMaxTasks(getConfiguration());
        if ((getGraphMapper().getMapFunctions() == MapFunctions.ALL)
                || (getGraphMapper().getMapFunctions() == MapFunctions.ALL_EXCEPT_ZOOKEEPER)) {
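            // In these modes every task acts as both a master and a worker,
            // so each task is expected to report two cleaned-up znodes
            // (one per role).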
            maxTasks *= 2;
        }
        List<String> cleanedUpChildrenList = null;
        while (true) {
            try {
                cleanedUpChildrenList = getZkExt().getChildrenExt(cleanedUpPath, true, false, true);
                if (LOG.isInfoEnabled()) {
                    LOG.info("cleanUpZooKeeper: Got " + cleanedUpChildrenList.size() + " of " + maxTasks
                            + " desired children from " + cleanedUpPath);
                }
                if (cleanedUpChildrenList.size() == maxTasks) {
                    break;
                }
                if (LOG.isInfoEnabled()) {
                    LOG.info("cleanUpZooKeeper: Waiting for the children of " + cleanedUpPath
                            + " to change since only got " + cleanedUpChildrenList.size() + " nodes.");
                }
            } catch (KeeperException e) {
                // We are in the cleanup phase -- just log the error
                LOG.error("cleanUpZooKeeper: Got KeeperException, " + "but will continue", e);
                return;
            } catch (InterruptedException e) {
                // We are in the cleanup phase -- just log the error
                LOG.error("cleanUpZooKeeper: Got InterruptedException, " + "but will continue", e);
                return;
            }

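            // Block until the watch on the cleaned-up children fires, then
            // re-check how many tasks have reported.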
            getCleanedUpChildrenChangedEvent().waitForever();
            getCleanedUpChildrenChangedEvent().reset();
        }

        // At this point, all processes have acknowledged the cleanup,
        // and the master can do any final cleanup
        try {
            if (!getConfiguration().getBoolean(GiraphJob.KEEP_ZOOKEEPER_DATA,
                    GiraphJob.KEEP_ZOOKEEPER_DATA_DEFAULT)) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("cleanUpZooKeeper: Removing the following path and all children - " + basePath);
                }
                getZkExt().deleteExt(basePath, -1, true);
            }
        } catch (KeeperException e) {
            LOG.error("cleanUpZooKeeper: Failed to do cleanup of " + basePath + " due to KeeperException", e);
        } catch (InterruptedException e) {
            LOG.error("cleanUpZooKeeper: Failed to do cleanup of " + basePath + " due to InterruptedException", e);
        }
    }

    @Override
    public void cleanup() throws IOException {
        // All master processes should denote they are done by adding special
        // znode.  Once the number of znodes equals the number of partitions
        // for workers and masters, the master will clean up the ZooKeeper
        // znodes associated with this job.
        String masterCleanedUpPath = cleanedUpPath + "/" + getTaskPartition() + MASTER_SUFFIX;
        try {
            String finalFinishedPath = getZkExt().createExt(masterCleanedUpPath, null, Ids.OPEN_ACL_UNSAFE,
                    CreateMode.PERSISTENT, true);
            if (LOG.isInfoEnabled()) {
                LOG.info("cleanup: Notifying master it's okay to clean up with " + finalFinishedPath);
            }
        } catch (KeeperException.NodeExistsException e) {
            if (LOG.isInfoEnabled()) {
                LOG.info("cleanup: Finished node '" + masterCleanedUpPath + "' already exists");
            }
        } catch (KeeperException e) {
            LOG.error("cleanup: Got KeeperException, continuing", e);
        } catch (InterruptedException e) {
            LOG.error("cleanup: Got InterruptedException, continuing", e);
        }

        if (isMaster) {
            cleanUpZooKeeper();
            // If desired, cleanup the checkpoint directory
            if (getConfiguration().getBoolean(GiraphJob.CLEANUP_CHECKPOINTS_AFTER_SUCCESS,
                    GiraphJob.CLEANUP_CHECKPOINTS_AFTER_SUCCESS_DEFAULT)) {
                boolean success = getFs().delete(new Path(checkpointBasePath), true);
                if (LOG.isInfoEnabled()) {
                    LOG.info("cleanup: Removed HDFS checkpoint directory (" + checkpointBasePath
                            + ") with return = " + success + " since this job succeeded");
                }
            }
            aggregatorWriter.close();
        }

        try {
            getZkExt().close();
        } catch (InterruptedException e) {
            // cleanup phase -- just log the error
            LOG.error("cleanup: ZooKeeper failed to close", e);
        }
    }

    /**
     * Get the event that the master watches for superstep state changes
     * (a worker finished or a worker died).
     *
     * @return Event that denotes a superstep state change
     */
    public final BspEvent getSuperstepStateChangedEvent() {
        return superstepStateChanged;
    }

    /**
     * Check whether a healthy worker's failure affects the current superstep;
     * if so, signal the superstep state change so the superstep can fail and
     * be restarted from the last checkpoint.
     *
     * @param failedWorkerPath Full path to the failed worker
     */
    private void checkHealthyWorkerFailure(String failedWorkerPath) {
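        // Failures reported for an earlier superstep cannot affect the
        // current superstep, so ignore them.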
        if (getSuperstepFromPath(failedWorkerPath) < getSuperstep()) {
            return;
        }

        Collection<PartitionOwner> partitionOwners = masterGraphPartitioner.getCurrentPartitionOwners();
        String hostnameId = getHealthyHostnameIdFromPath(failedWorkerPath);
        for (PartitionOwner partitionOwner : partitionOwners) {
            WorkerInfo workerInfo = partitionOwner.getWorkerInfo();
            WorkerInfo previousWorkerInfo = partitionOwner.getPreviousWorkerInfo();
            if (workerInfo.getHostnameId().equals(hostnameId)
                    || ((previousWorkerInfo != null) && previousWorkerInfo.getHostnameId().equals(hostnameId))) {
                LOG.warn("checkHealthyWorkerFailure: " + "at least one healthy worker went down " + "for superstep "
                        + getSuperstep() + " - " + hostnameId + ", will try to restart from "
                        + "checkpointed superstep " + lastCheckpointedSuperstep);
                superstepStateChanged.signal();
            }
        }
    }

    @Override
    public boolean processEvent(WatchedEvent event) {
        boolean foundEvent = false;
        if (event.getPath().contains(WORKER_HEALTHY_DIR) && (event.getType() == EventType.NodeDeleted)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("processEvent: Healthy worker died (node deleted) " + "in " + event.getPath());
            }
            checkHealthyWorkerFailure(event.getPath());
            superstepStateChanged.signal();
            foundEvent = true;
        } else if (event.getPath().contains(WORKER_FINISHED_DIR)
                && event.getType() == EventType.NodeChildrenChanged) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(
                        "processEvent: Worker finished (node change) " + "event - superstepStateChanged signaled");
            }
            superstepStateChanged.signal();
            foundEvent = true;
        }

        return foundEvent;
    }

    /**
     * Use an aggregator in this superstep. Note that the master uses all
     * aggregators by default, so calling this function is not necessary.
     *
     * @param name Name of aggregator (should be unique)
     * @return boolean (always true)
     */
    public boolean useAggregator(String name) {
        return true;
    }
}