org.commoncrawl.service.pagerank.master.PageRankMaster.java — source code

Java tutorial

Introduction

Below is the complete source code for org.commoncrawl.service.pagerank.master.PageRankMaster.java.

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.pagerank.master;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.Vector;
import java.util.zip.CRC32;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.async.Timer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.db.RecordStore;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.pagerank.BaseConfig;
import org.commoncrawl.service.pagerank.IterationInfo;
import org.commoncrawl.service.pagerank.PRMasterState;
import org.commoncrawl.service.pagerank.PageRankJobConfig;
import org.commoncrawl.service.pagerank.SlaveStatus;
import org.commoncrawl.service.pagerank.SlaveStatus.State;
import org.commoncrawl.util.CCStringUtils;

/**
 * 
 * @author rana
 *
 */
public class PageRankMaster extends CommonCrawlServer {

    /** fallback iteration cap (not referenced in the visible portion of this file) */
    private static final int MAX_ITERATION_DEFAULT = 50;
    /** interval (ms) at which the poll timer re-evaluates server status */
    private static final int POLL_TIMER_INTERVAL = 1000;
    /** default slave instance count (not referenced in the visible portion of this file) */
    private static final int DEFAULT_INSTANCES_PER_SLAVE = 1;
    /** record-store key under which master state is persisted */
    private static final String PRMasterStateKey = "PRMasterState";
    /** record-store key under which job config is persisted */
    private static final String PRJobConfigKey = "PRJobConfig";
    /** zero-padded (5 digit), non-grouped formatter used to parse iteration ids from file names */
    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
        NUMBER_FORMAT.setMinimumIntegerDigits(5);
        NUMBER_FORMAT.setGroupingUsed(false);
    }

    // path (resource name or absolute file path) of the slaves file, set via --slaves
    private String _slavesFile;
    // CRC32 of the slaves file contents; -1 until parseSlavesFile runs
    private long _slavesFileCRC = -1;
    // hdfs handle — initialization is currently commented out in initServer
    private FileSystem _fileSystem;
    // base hdfs directory under which per-job work directories are created
    private String _hdfsWorkingDir = "crawl/pageRank/jobs";
    // job id, set via --jobId; -1 means "not specified"
    private long _jobId = -1;
    // set once the start-pagerank command has been broadcast to all slaves
    private boolean _pageRankStarted = false;

    // periodic timer driving potentaillyUpdateServerStatus
    private Timer _pollTimer = null;
    /** record store object used to persist state **/
    private RecordStore _recordStore = new RecordStore();
    // when true, the master parks in PAUSED instead of advancing to the next iteration
    private boolean _serverPaused = false;

    // one remote proxy per slave instance, indexed by slave id
    private Vector<PageRankRemoteSlave> _slaves = new Vector<PageRankRemoteSlave>();
    // comma-delimited list of fully-qualified slave names (trailing comma included)
    private String _slavesList = null;
    // last known status per slave, parallel to _slaves
    private SlaveStatus _slaveStates[] = null;

    // persisted master state (server status, active job config, txn id)
    private PRMasterState _serverState = null;
    private PageRankJobConfig _jobConfig = null;
    //private Vector<PageRankJobConfig>  _jobQueue  = new Vector<PageRankJobConfig>();
    //private Vector<PageRankJobConfig>  _jobList   = new Vector<PageRankJobConfig>();

    /** Constructs the master and enables asynchronous web request dispatch. */
    public PageRankMaster() {
        setAsyncWebDispatch(true);
    }

    /** Returns the active job config from the persisted master state (may be a cleared/empty config when idle). */
    public PageRankJobConfig getActiveJobConfig() {
        return _serverState.getActiveJobConfig();
    }

    /** Returns the current server status as a display string. */
    public String getMasterState() {
        return PRMasterState.ServerStatus.toString(_serverState.getServerStatus());
    }

    /** Returns the active job config, or null when the server is IDLE. */
    public PageRankJobConfig getActiveJob() {
        if (_serverState.getServerStatus() != PRMasterState.ServerStatus.IDLE) {
            return _serverState.getActiveJobConfig();
        }
        return null;
    }

    /** Returns the active job's display name, or the empty string when the server is IDLE. */
    public String getActiveJobName() {
        if (_serverState.getServerStatus() != PRMasterState.ServerStatus.IDLE) {
            return getJobName(_serverState.getActiveJobConfig());
        }
        return "";
    }

    //public Vector<PageRankJobConfig> getJobQueue() { return _jobQueue; }
    /** Builds the display name for a job, of the form {@code job-<jobId>}. */
    public static String getJobName(PageRankJobConfig jobConfig) {
        StringBuilder name = new StringBuilder("job-");
        name.append(jobConfig.getJobId());
        return name.toString();
    }
    /*
    public Vector<PageRankJobConfig> getQueueableJobList() { 
      Vector<PageRankJobConfig> listOut = new Vector<PageRankJobConfig>();
      HashSet<PageRankJobConfig> queuedJobs = new HashSet<PageRankJobConfig>();
      queuedJobs.addAll(_jobQueue);
      for (PageRankJobConfig job : _jobList) { 
        if (!queuedJobs.contains(job)) { 
    listOut.add(job);
        }
      }
      return listOut;
    }
    */

    /**
     * Builds a new PageRank job configuration from the given parameters.
     *
     * NOTE(review): persistence (serializeJobConfig / _jobList.add) and job
     * directory creation are commented out below, so the constructed config is
     * currently discarded when this method returns — confirm whether this is
     * intentional before relying on this method.
     *
     * @param inputValuePath path of the initial rank-value data
     * @param graphPath      path of the outlink graph data
     * @param maxIterations  maximum number of iterations for the job
     * @param slaveCount     number of slaves the job expects
     */
    public void createNewJob(String inputValuePath, String graphPath, int maxIterations, int slaveCount)
            throws IOException {

        PageRankJobConfig jobConfig = new PageRankJobConfig();

        // job id doubles as a (roughly) unique timestamp
        jobConfig.setJobId(System.currentTimeMillis());
        jobConfig.setIterationNumber(0);
        jobConfig.setMaxIterationNumber(maxIterations);
        jobConfig.setSlaveCount(slaveCount);
        jobConfig.setInputValuesPath(inputValuePath);
        jobConfig.setOutlinksDataPath(graphPath);
        // set up job dir ... 
        Path jobPath = new Path(_hdfsWorkingDir, Long.toString(jobConfig.getJobId()));
        // mk the job dir 
        // _fileSystem.mkdirs(jobPath);
        jobConfig.setJobWorkPath(jobPath.toString());
        jobConfig.setAlgorithmId(0);
        // standard PageRank damping factor
        jobConfig.setAlpha(.85f);

        //serializeJobConfig(jobConfig);

        //_jobList.add(jobConfig);
    }

    /*
    public void queueJob(String jobIdStr) {
          
      long jobId = Long.parseLong(jobIdStr);
          
      for (PageRankJobConfig job : _jobList) { 
        if (job.getJobId() == jobId) { 
    _jobList.remove(job);
    _jobQueue.add(job);
    break;
        }
      }
    }
    */

    /** Returns the live (unmodifiable-by-convention) list of remote slave proxies. */
    Vector<PageRankRemoteSlave> getSlaves() {
        return _slaves;
    }

    /** True when the server status is IDLE. */
    public boolean isServerIdle() {
        return _serverState.getServerStatus() == PRMasterState.ServerStatus.IDLE;
    }

    /** True whenever the server is in any non-IDLE status. */
    public boolean isPageRankActive() {
        return _serverState.getServerStatus() != PRMasterState.ServerStatus.IDLE;
    }

    /** True when the server is winding down the current job (FINISHING status). */
    public boolean isPageRankTerminating() {
        return _serverState.getServerStatus() == PRMasterState.ServerStatus.FINISHING;
    }

    /** True when an operator has paused the server (checked between iterations). */
    public boolean isServerPaused() {
        return _serverPaused;
    }

    /**
     * True when the server is inside an iteration, i.e. its status lies in the
     * range [ITERATING_DISTRIBUTING, ITERATING_CALCULATING].
     */
    public boolean isIterationActive() {
        int status = _serverState.getServerStatus();
        return status >= PRMasterState.ServerStatus.ITERATING_DISTRIBUTING
                && status <= PRMasterState.ServerStatus.ITERATING_CALCULATING;
    }

    /** Maps the master's status to the iteration phase slaves should run. */
    public int getSlaveIterationPhase() {
        return (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_CALCULATING)
                ? IterationInfo.Phase.CALCULATE
                : IterationInfo.Phase.DISTRIBUTE;
    }

    /** Returns the active job's id, or -1 when no job is active. */
    public long getCurrentJobNumber() {
        if (isPageRankActive()) {
            return _serverState.getActiveJobConfig().getJobId();
        }
        return -1;
    }

    /** Returns the active job's current iteration number, or -1 when no job is active. */
    public int getCurrentIterationNumber() {
        if (isPageRankActive()) {
            return _serverState.getActiveJobConfig().getIterationNumber();
        }
        return -1;
    }

    /** Returns the active job's maximum iteration number, or -1 when no job is active. */
    public int getMaxIteration() {
        if (isPageRankActive()) {
            return _serverState.getActiveJobConfig().getMaxIterationNumber();
        }
        return -1;
    }

    /**
     * Builds the base configuration sent to a slave: shared working directory,
     * total slave count, the slave's own id, and the full comma-delimited
     * slaves list.
     *
     * @param slave the remote slave the config is destined for
     * @return a freshly populated BaseConfig
     */
    public BaseConfig getBaseConfigForSlave(PageRankRemoteSlave slave) {

        BaseConfig baseConfig = new BaseConfig();

        baseConfig.setBaseWorkingDir(_hdfsWorkingDir);
        // baseConfig.setFileSystem(_fileSystem.getUri().toString());
        baseConfig.setSlaveCount(_slaves.size());
        baseConfig.setSlaveId(slave.getSlaveId());
        baseConfig.setSlavesList(_slavesList);

        return baseConfig;
    }

    /** Stub — intentionally empty; job creation currently happens via createNewJob. */
    public void createJob() {

    }

    // NOTE(review): @Override is commented out here but present on the sibling
    // getters below — confirm whether the base class declares this method.
    //@Override
    protected String getDefaultLogFileName() {
        return "prmaster";
    }

    /** Default data directory, taken from the shared crawl environment. */
    @Override
    protected String getDefaultDataDir() {
        return CrawlEnvironment.DEFAULT_DATA_DIR;
    }

    /** Default interface the embedded HTTP server binds to. */
    @Override
    protected String getDefaultHttpInterface() {
        return CrawlEnvironment.DEFAULT_HTTP_INTERFACE;
    }

    /** Default port for the master's HTTP server. */
    @Override
    protected int getDefaultHttpPort() {
        return CrawlEnvironment.DEFAULT_PAGERANK_MASTER_HTTP_PORT;
    }

    /** Default interface the RPC server binds to. */
    @Override
    protected String getDefaultRPCInterface() {
        return CrawlEnvironment.DEFAULT_RPC_INTERFACE;
    }

    /** Default port for the master's RPC server. */
    @Override
    protected int getDefaultRPCPort() {
        return CrawlEnvironment.DEFAULT_PAGERANK_MASTER_RPC_PORT;
    }

    /** Name of the web application served by this master. */
    @Override
    protected String getWebAppName() {
        return CrawlEnvironment.PAGERANK_MASTER_WEBAPP_NAME;
    }

    /**
     * One-time server initialization: validates required arguments, parses the
     * slaves file, loads persisted state, allocates per-slave status slots,
     * connects to every slave, and arms the periodic status-poll timer.
     *
     * NOTE(review): file system initialization is commented out here; confirm
     * whether _fileSystem is initialized elsewhere before it is used.
     *
     * @return true on successful initialization, false otherwise
     */
    @Override
    protected boolean initServer() {

        if (_slavesFile == null || _jobId == -1) {
            // BUGFIX: the old message only mentioned --slaves, even though a
            // missing --jobId also lands here — report both requirements.
            LOG.error("Missing required arguments. Specify slaves file via --slaves and job id via --jobId");
            return false;
        } else {
            try {
                // get a pointer to the hdfs file system 
                // _fileSystem = CrawlEnvironment.getDefaultFileSystem();
                // parse slaves file ..
                parseSlavesFile();
                // load database state ... 
                loadState();
                // init slave states array 
                _slaveStates = new SlaveStatus[_slaves.size()];
                // and populate it ... 
                for (int i = 0; i < _slaveStates.length; ++i) {
                    _slaveStates[i] = new SlaveStatus();
                }
                // connect to slaves ...
                connectToSlaves();
                // setup poll timer that periodically re-evaluates server status
                _pollTimer = new Timer(POLL_TIMER_INTERVAL, true, new Timer.Callback() {

                    @Override
                    public void timerFired(Timer timer) {

                        potentaillyUpdateServerStatus();

                    }

                });

                getEventLoop().setTimer(_pollTimer);

                return true;
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }
        return false;
    }

    /**
     * Parses command line arguments: {@code --slaves <file>} and
     * {@code --jobId <id>}. Unknown arguments are ignored.
     *
     * (Method name spelling matches the base-class override and is kept as-is.)
     *
     * @param argv raw command line arguments
     * @return always true; missing values simply leave the fields unset
     */
    @Override
    protected boolean parseArguements(String[] argv) {

        for (int i = 0; i < argv.length; ++i) {

            if (argv[i].equalsIgnoreCase("--slaves")) {
                if (i + 1 < argv.length) {
                    _slavesFile = argv[++i];
                }
            } else if (argv[i].equalsIgnoreCase("--jobId")) {
                // BUGFIX: bounds-check the value like --slaves does; previously a
                // trailing "--jobId" threw ArrayIndexOutOfBoundsException
                if (i + 1 < argv.length) {
                    _jobId = Long.parseLong(argv[++i]);
                }
            }

        }

        return true;
    }

    /** Stub — no usage text is printed for this server. */
    @Override
    protected void printUsage() {

    }

    /** No background daemons to start; always succeeds. */
    @Override
    protected boolean startDaemons() {
        return true;
    }

    /** No background daemons to stop. */
    @Override
    protected void stopDaemons() {

    }

    /**
     * Loads and parses the slaves file. Each non-comment line has the form
     * {@code hostname:instanceCount:localInterfacePort}; one remote slave proxy
     * is created per instance. Side effects: populates {@code _slaves}, sets
     * {@code _slavesList} (comma-delimited, trailing comma) and
     * {@code _slavesFileCRC}.
     *
     * @throws IOException if the file cannot be opened or a line is malformed
     */
    void parseSlavesFile() throws IOException {

        StringBuffer slavesListWriter = new StringBuffer();

        LOG.info("Loading Slaves File from:" + _slavesFile);
        // resolve the slaves file first as a hadoop config resource ...
        URL resourceURL = CrawlEnvironment.getHadoopConfig().getResource(_slavesFile);

        InputStream stream = openSlavesStream(resourceURL);

        BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(stream)));

        String slaveHostPlusCount = null;

        int slaveCount = 0;

        LOG.info("Loading slaves file");
        // BUGFIX: close the stream even when a parse error is thrown mid-file
        try {
            while ((slaveHostPlusCount = reader.readLine()) != null) {
                if (!slaveHostPlusCount.startsWith("#")) {
                    StringTokenizer tokenizer = new StringTokenizer(slaveHostPlusCount, ":");
                    if (tokenizer.countTokens() != 3) {
                        throw new IOException("Invalid Slave Entry:" + slaveHostPlusCount + " in slaves File");
                    } else {
                        String slaveName = tokenizer.nextToken();
                        int instanceCount = Integer.parseInt(tokenizer.nextToken());
                        String localInterfacePort = tokenizer.nextToken();

                        // one proxy per instance on this host
                        for (int i = 0; i < instanceCount; ++i) {
                            PageRankRemoteSlave state = new PageRankRemoteSlave(this, slaveCount++, slaveName, i);
                            LOG.info("Adding slave:" + slaveName + "instance:" + i);
                            _slaves.add(state);
                            slavesListWriter.append(state.getFullyQualifiedName() + ",");
                        }
                    }
                }
            }
        } finally {
            // closing the reader also closes the underlying stream
            reader.close();
        }

        // update finalized slaves list once, after all lines have been read
        // (previously this was redundantly rebuilt inside the read loop)
        _slavesList = slavesListWriter.toString();

        // now reopen the file to compute the crc ...
        CRC32 fileCRC = new CRC32();

        InputStream crcStream = openSlavesStream(resourceURL);
        try {
            byte[] buf = new byte[4096];
            int nRead = 0;
            while ((nRead = crcStream.read(buf, 0, buf.length)) > 0) {
                fileCRC.update(buf, 0, nRead);
            }
        } finally {
            // BUGFIX: guarantee the crc stream is closed on read failure
            crcStream.close();
        }

        _slavesFileCRC = fileCRC.getValue();
        LOG.info("Slaves File CRC is:" + _slavesFileCRC);

    }

    /**
     * Opens the slaves file, preferring the resolved resource URL and falling
     * back to treating {@code _slavesFile} as an absolute file system path.
     */
    private InputStream openSlavesStream(URL resourceURL) throws IOException {
        if (resourceURL != null) {
            return resourceURL.openStream();
        }
        LOG.info("Could not load resource as an URL. Trying as an absolute pathname");
        return new FileInputStream(new File(_slavesFile));
    }

    /** Initiates a connection to every registered slave, in slave-id order. */
    void connectToSlaves() throws IOException {
        LOG.info("Connecting to Slaves");
        int slaveCount = _slaves.size();
        for (int i = 0; i < slaveCount; ++i) {
            _slaves.get(i).connect();
        }
    }

    // react to a status change in a pagerank slave ..
    void slaveStatusChanged(PageRankRemoteSlave slave) {

        LOG.info("slaveStatusChanged from slave:" + slave.getFullyQualifiedName() + " NewStatus:"
                + slave.getLastKnowStatus() + " ServerState:" + _serverState.getServerStatus());

        try {
            _slaveStates[slave.getSlaveId()].merge(slave.getLastKnowStatus());
        } catch (CloneNotSupportedException e) {
        }

        if (_serverState.getServerStatus() == PRMasterState.ServerStatus.IDLE) {

            if (!_pageRankStarted) {

                int idleCount = 0;
                // check to see if all slaves are ready 
                for (SlaveStatus slaveState : _slaveStates) {
                    // if this slave is finished with current iteration .. 
                    if (slaveState.getState() == SlaveStatus.State.IDLE) {
                        ++idleCount;
                    }
                }
                // if all slaves are in idle state 
                if (idleCount == _slaveStates.length) {
                    // ready to send page rank start command 
                    LOG.info("All Slaves Online and Idle. Sending Start PageRank Cmd");
                    // and send out start page rank command
                    sendSlavesStartPageRankCmd(PRMasterState.ServerStatus.STARTED);
                    _pageRankStarted = true;
                } else {
                    LOG.info(idleCount + " Slave Idle/Online. Waiting on:" + (_slaveStates.length - idleCount));
                }
            }
        } else {
            potentiallyResyncSlaveState(slave.getSlaveId());

            potentaillyUpdateServerStatus();
        }
    }

    /**
     * Checks one slave's reported state against the master's state and, when
     * they disagree, sends the slave whichever command brings it back in sync
     * (start-pagerank, do-iteration, end-pagerank) or, when no reconciliation
     * applies, clears its status and sends a RESET command.
     *
     * @param slaveIdx index of the slave in _slaveStates / _slaves
     */
    private void potentiallyResyncSlaveState(int slaveIdx) {

        SlaveStatus status = _slaveStates[slaveIdx];

        //check to see if the slave even has a potentially valid state ... 
        if (status.isFieldDirty(SlaveStatus.Field_STATE)) {
            // do we want to reset this slave...
            boolean resetSlave = false;
            // ok now try to see if it is out of sync with the master ... 
            switch (status.getState()) {

            // slave is in an initialized but idle state ...
            case SlaveStatus.State.IDLE: {
                // if page rank is active ... send the appropriate command to the slave ... 
                if (isPageRankActive()) {
                    LOG.info("Slave is IDLE while Master has PageRankActive. Sending Start Page Rank to Slave:"
                            + _slaves.get(slaveIdx).getFullyQualifiedName());
                    _slaves.get(slaveIdx).sendStartPageRankCmd(_serverState.getServerStatus());
                }
            }
                break;

            case SlaveStatus.State.STARTED_IDLE:
            case SlaveStatus.State.DONE_CALCULATING: {
                // if page rank is active ... figure out next steps based on host state 
                if (isPageRankActive()) {
                    // if we are iterating ... 
                    if (isIterationActive()) {
                        boolean sendDoIterationCommand = true;
                        // skip the command when the slave already finished calculating
                        // the master's current iteration
                        if (status.getState() == SlaveStatus.State.DONE_CALCULATING && _serverState
                                .getServerStatus() == PRMasterState.ServerStatus.ITERATING_CALCULATING) {
                            if (status.getCurrentIteration() == _serverState.getActiveJobConfig()
                                    .getIterationNumber()) {
                                sendDoIterationCommand = false;
                                LOG.info("Slave is DONE_CALCULATING and Master is in CALCULATING... IGORING Slave:"
                                        + _slaves.get(slaveIdx).getFullyQualifiedName());
                            }
                        }
                        if (sendDoIterationCommand) {
                            LOG.info("Slave is in " + SlaveStatus.State.toString(status.getState())
                                    + " while Master is in Iteration. Sending StartIteration to Slave:"
                                    + _slaves.get(slaveIdx).getFullyQualifiedName());
                            sendSlaveDoIterationCmd(_slaves.get(slaveIdx));
                        }
                    }
                    // are we terminating ... ? 
                    else if (isPageRankTerminating()) {
                        LOG.info("Slave is in " + SlaveStatus.State.toString(status.getState())
                                + " while Master is in PageRankTerminating. Sending EndPageRank to Slave:"
                                + _slaves.get(slaveIdx).getFullyQualifiedName());
                        // send end page rank command to slave ... 
                        _slaves.get(slaveIdx).sendEndPageRankCmd();
                    }
                } else {
                    // bad slave is out of sync ... reset it ...
                    resetSlave = true;
                }
            }
                break;

            // if slave is in iteration state ... 
            case SlaveStatus.State.DISTRIBUTING:
            case SlaveStatus.State.DONE_DISTRIBUTING:
            case SlaveStatus.State.CALCULATING: {

                // a slave finishing distribution frees a deferred slave to start its turn
                if (status.getState() == State.DONE_DISTRIBUTING && _deferredSlaves.size() != 0) {
                    PageRankRemoteSlave nextSlave = _deferredSlaves.first();
                    LOG.info("Sending Do Iteration To Deferred Slave:" + nextSlave.getFullyQualifiedName());
                    _deferredSlaves.remove(nextSlave);
                    sendSlaveDoIterationCmd(nextSlave);
                }
                // validate that master is in sync ... 
                if (!isPageRankActive() && !isIterationActive()) {
                    resetSlave = true;
                }
            }
                break;
            }

            if (resetSlave) {
                LOG.error("Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName() + " out of Sync with Master."
                        + "Master State:" + _serverState + "Slave State:" + status.getState()
                        + " -- Sending RESET");

                // reset slave state 
                _slaveStates[slaveIdx].clear();
                // and send reset cmd to slave...
                _slaves.get(slaveIdx).sendResetCmd();
            }
        }
    }

    /**
     * Master state machine, driven by the poll timer and by slave status
     * changes. Advances the server through:
     * STARTED -> ITERATING_DISTRIBUTING -> ITERATING_CALCULATING ->
     * (next iteration | PAUSED | FINISHING) -> IDLE,
     * once all slaves have reached the corresponding transition state and have
     * checkpointed the current transaction id.
     *
     * (Method name spelling is kept as-is; it is referenced from initServer and
     * slaveStatusChanged.)
     */
    private void potentaillyUpdateServerStatus() {
        try {

            // if the server is in an idle state ... 
            if (_serverState.getServerStatus() == PRMasterState.ServerStatus.IDLE) {
                // start the next job potentially 
                //potentiallyStartNextPRJob();
            }

            // paused state handling ...
            /*
            if (_serverState.getServerStatus() == PRMasterState.ServerStatus.PAUSED) { 
                  
              // if server is in a paused state but we are ok to start iterating ... 
              if (!isServerPaused()) { 
                LOG.info("Server Moving from PAUSED to NextIteration State");
                advanceToNextPRIteration();
              }
            }
            */
            else if (_serverState.getServerStatus() == PRMasterState.ServerStatus.STARTED) {
                // STARTED: wait for every slave to ack the start command for this job
                int completionCount = 0;
                for (SlaveStatus slaveState : _slaveStates) {
                    // if this slave is finished with current iteration .. 
                    if (slaveState.getState() == SlaveStatus.State.STARTED_IDLE
                            && slaveState.getActiveJobId() == getActiveJobConfig().getJobId()) {
                        ++completionCount;
                    }
                }
                if (completionCount == _slaves.size()) {
                    LOG.info("Server in STARTED STATE and All Clients are STARTED_IDLE. Moving to NextIteration");
                    // and update server state ... 
                    _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_DISTRIBUTING);
                    // reset txn id 
                    _serverState.setCurrentTxnId(System.currentTimeMillis());
                    // finally serialize state 
                    //serializeServerState();
                    // and send out the next iteration command to the slaves ... 
                    sendSlavesDoIterationCmd();

                }
            }
            // if in iteration state ... 
            else if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING
                    || _serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_CALCULATING) {
                int completedSlaveCount = 0;
                // pick the slave state that marks completion of the current phase
                int desiredTransitionState = (_serverState
                        .getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING)
                                ? SlaveStatus.State.DONE_DISTRIBUTING
                                : SlaveStatus.State.DONE_CALCULATING;

                for (int slaveIdx = 0; slaveIdx < _slaveStates.length; ++slaveIdx) {

                    SlaveStatus slaveState = _slaveStates[slaveIdx];

                    // if this slave is finished with current iteration .. 
                    if (slaveState.getState() == desiredTransitionState
                            && slaveState.getCurrentIteration() == getCurrentIterationNumber()) {
                        ++completedSlaveCount;
                    }
                }
                // if all slaves are done with the current iteration ...
                if (completedSlaveCount == _slaves.size()) {

                    // second gate: every slave must also checkpoint the current txn
                    int completedCheckpointsCount = 0;

                    for (int slaveIdx = 0; slaveIdx < _slaveStates.length; ++slaveIdx) {

                        SlaveStatus slaveState = _slaveStates[slaveIdx];

                        // check to see if we need to send this slave a checkpoint command  
                        if (slaveState.getCurrentCheckpointId() != _serverState.getCurrentTxnId()) {
                            LOG.info("Sending Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName()
                                    + " Checkpoint Command - CurrentCheckpointId:"
                                    + slaveState.getCurrentCheckpointId());
                            slaveState.setCurrentCheckpointId(_serverState.getCurrentTxnId());
                            sendSlaveCheckpointCommand(slaveIdx, _serverState.getCurrentTxnId(),
                                    getSlaveIterationPhase(), getCurrentIterationNumber());
                        } else {
                            // ok the current slave completed active checkpoint command
                            if (slaveState.getCommittedCheckpointId() == _serverState.getCurrentTxnId()) {
                                // increment completion count ...
                                completedCheckpointsCount++;
                                LOG.info("Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName()
                                        + " Has Completed Checkpoint. Completed CheckpointCount:"
                                        + completedCheckpointsCount);
                            }
                        }
                    }

                    // if everyone completed active checkpoint 
                    if (completedCheckpointsCount == _slaves.size()) {
                        // OK. advance to next MASTER state ..
                        LOG.info("All Slaves Report Successfull Checkpoint Status for Txn:"
                                + _serverState.getCurrentTxnId());

                        // if we were in the distribute phase ... 
                        if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING) {
                            LOG.info("All Clients Done DISTRIBUTING. Moving to CALCULATING State");
                            // advance master state 
                            _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_CALCULATING);
                            // reset txn id 
                            _serverState.setCurrentTxnId(System.currentTimeMillis());
                            // serialize the state 
                            //               /serializeServerState();
                            // notify slaves ... 
                            sendSlavesDoIterationCmd();
                        }
                        // otherwise, if in the calculation phase ... 
                        else {
                            // if we reached the last iteration .. .we need to do a clean shutdown ... 
                            if (getCurrentIterationNumber() == getMaxIteration()) {
                                LOG.info(
                                        "All Clients Done CALCULATING and Iteration Number == Max Iteration.Moving to FINISHING STATE");
                                // set our appropriate state ... 
                                _serverState.setServerStatus(PRMasterState.ServerStatus.FINISHING);
                                // serialize the state 
                                //serializeServerState();
                                // and send shutdown command to slaves ... 
                                sendSlavesEndPageRankCmd();
                            }
                            // otherwise,  
                            else {
                                // if server is not paused ... 
                                if (!isServerPaused()) {
                                    LOG.info(
                                            "All Clients Done CALCULATING and Iteration Number < Max Iteration.Moving to Next Iteration");
                                    advanceToNextPRIteration();
                                } else {
                                    LOG.info(
                                            "All Clients Done CALCULATING and Iteration Number < Max Iteration BUT Server is PAUSED.Moving to PAUSED STATE");
                                    // set pause state 
                                    _serverState.setServerStatus(PRMasterState.ServerStatus.PAUSED);
                                    // serialize state 
                                    //serializeServerState();
                                }
                            }
                        }
                    }
                }
            }
            // shutdown state handling ...
            else if (_serverState.getServerStatus() == PRMasterState.ServerStatus.FINISHING) {
                int idledCount = 0;
                for (SlaveStatus slaveState : _slaveStates) {
                    // if this slave is finished with current iteration .. 
                    if ((slaveState.getState() == SlaveStatus.State.DONE_CALCULATING
                            || slaveState.getState() == SlaveStatus.State.STARTED_IDLE)
                            && slaveState.getCurrentIteration() == getCurrentIterationNumber()) {
                        ++idledCount;
                    }
                }

                // all slaves are done cleaning up ... 
                if (idledCount == _slaves.size()) {
                    LOG.info("SERVER in FINISHING State and ALL Clients FINISHED.Finishing PR Job");
                    // do cleanup ... 
                    //finishPageRankJob(_serverState.getActiveJobConfig(),false);

                    _serverState.getActiveJobConfig().clear();
                    _serverState.setFieldClean(PRMasterState.Field_ACTIVEJOBCONFIG);
                    // and reset state ... 
                    _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);
                    // and serialize state 
                    //serializeServerState();
                }
            }
        } catch (IOException e) {
            LOG.error("Unexpected IOException: " + CCStringUtils.stringifyException(e));
            //TODO: KILL SERVER HERE ...
        }
    }

    /**
     * Advances the active job to its next iteration: bumps the iteration
     * counter, moves the master back into the distribution phase, stamps a
     * fresh transaction id, and broadcasts the do-iteration command.
     *
     * @throws IOException if sending the iteration command fails
     */
    private void advanceToNextPRIteration() throws IOException {
        // bump iteration number on the active job config
        PageRankJobConfig activeJob = _serverState.getActiveJobConfig();
        activeJob.setIterationNumber(activeJob.getIterationNumber() + 1);
        // restart the distribution phase for the new iteration
        _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_DISTRIBUTING);
        // fresh transaction id for this iteration's checkpointing
        _serverState.setCurrentTxnId(System.currentTimeMillis());
        // fan the iteration command out to all slaves
        sendSlavesDoIterationCmd();
    }

    /*
    private void serializeServerState() throws IOException { 
      _recordStore.beginTransaction();
      _recordStore.updateRecordByKey(PRMasterStateKey, _serverState);
      if (_serverState.isFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG)) { 
        // update the disk state of the active job config
        serializeJobConfig(_serverState.getActiveJobConfig());
      }
      _recordStore.commitTransaction();
    }
    */

    /**
     * Deletes per-iteration artifacts from a job's data directory: checkpoint
     * completion markers ({@code *-CheckpointComplete-*}) and distribution
     * files ({@code OutlinkPR-*}).
     *
     * @param fs          file system holding the job data
     * @param jobDataPath the job's data directory
     * @throws IOException on glob or delete failure
     */
    private void clearAllCheckpointAndDistributionFiles(FileSystem fs, Path jobDataPath) throws IOException {
        // the two delete passes were duplicated inline; factored into a helper
        deleteMatching(fs, new Path(jobDataPath, "*-CheckpointComplete-*"));
        deleteMatching(fs, new Path(jobDataPath, "OutlinkPR-*"));
    }

    /** Deletes (non-recursively) every path matching the given glob pattern. */
    private void deleteMatching(FileSystem fs, Path globPattern) throws IOException {
        FileStatus candidates[] = fs.globStatus(globPattern);
        for (FileStatus candidate : candidates) {
            LOG.info("Deleting:" + candidate.getPath());
            fs.delete(candidate.getPath(), false);
        }
    }

    /**
     * Scans the job directory for the most recent PageRank iteration that
     * produced a complete set of value files, i.e. one file per slave
     * ({@link CrawlEnvironment#PR_NUMSLAVES}). Candidate iterations are
     * identified by their first partition file ({@code value_NNNNN-00000}).
     *
     * @param fs          the file system hosting the job data
     * @param jobDataPath root directory of the job's working files
     * @return the highest complete iteration number, or -1 if none is found
     * @throws IOException if the directory scan fails
     */
    private int findListValidIteration(FileSystem fs, Path jobDataPath) throws IOException {
        // scan job directory for best value candidate
        Path valueSearchPattern = new Path(jobDataPath, "value_*-00000");

        FileStatus candidates[] = fs.globStatus(valueSearchPattern);

        int lastValidIterationNo = -1;

        // globStatus may return null when nothing matches - treat as no iterations
        if (candidates == null) {
            return lastValidIterationNo;
        }

        for (FileStatus candidate : candidates) {
            // file names look like value_NNNNN-PPPPP; pull out the 5-digit iteration id
            String iterationStr = candidate.getPath().getName().substring("value_".length(), "value_".length() + 5);
            try {
                int iterationId = NUMBER_FORMAT.parse(iterationStr).intValue();
                // an iteration only counts when every slave wrote its partition
                Path iterationSpecificSearchPattern = new Path(jobDataPath, "value_" + iterationStr + "-*");
                FileStatus iterationSpecificEntries[] = fs.globStatus(iterationSpecificSearchPattern);

                if (iterationSpecificEntries.length == CrawlEnvironment.PR_NUMSLAVES) {
                    LOG.info("Iteration Number:" + iterationId + " has the proper number of results");

                    // keep the highest complete iteration seen so far
                    if (iterationId > lastValidIterationNo) {
                        lastValidIterationNo = iterationId;
                        LOG.info("Setting Iteration:" + iterationId + " as last valid iteration number");
                    }
                } else {
                    LOG.error("Skipping Iteration Number:" + iterationId + ". It only has:"
                            + iterationSpecificEntries.length + " results");
                }

            } catch (ParseException e) {
                // malformed file name - skip this candidate rather than abort the scan
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }
        return lastValidIterationNo;
    }

    /**
     * Builds the in-memory master state for the configured job id: counts the
     * seed value files, determines the last complete iteration on disk, purges
     * stale checkpoint/distribution files, and constructs the active
     * {@link PageRankJobConfig} ready for the next iteration.
     *
     * @throws IOException if any file-system operation fails
     */
    private void loadState() throws IOException {
        final FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

        // fresh server state, idle until the job config is attached below
        _serverState = new PRMasterState();
        _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);

        // seed data locations for this job
        final Path valuesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/values");
        final Path edgesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/edges");

        // one slave per seed value file
        final int itemCount = fs.globStatus(new Path(valuesFilePath, "value_*")).length;
        LOG.info("There are:" + itemCount + " values for job:" + _jobId);

        final Path jobsPath = new Path("crawl/pageRank/jobs/" + _jobId);
        fs.mkdirs(jobsPath);

        // resume from the iteration after the last complete one on disk
        final int lastValidIteration = findListValidIteration(fs, jobsPath);
        LOG.info("Last Valid Iteration for Job:" + _jobId + " is:" + lastValidIteration);
        final int nextIteration = lastValidIteration + 1;

        // drop any partial checkpoint / distribution output
        clearAllCheckpointAndDistributionFiles(fs, jobsPath);

        final PageRankJobConfig jobConfig = new PageRankJobConfig();
        jobConfig.setJobId(_jobId);
        jobConfig.setIterationNumber(nextIteration);
        jobConfig.setMaxIterationNumber(1000);
        jobConfig.setSlaveCount(itemCount);
        jobConfig.setInputValuesPath(valuesFilePath.toString());
        jobConfig.setOutlinksDataPath(edgesFilePath.toString());
        // the job's scratch directory lives under the master's HDFS working dir
        final Path jobPath = new Path(_hdfsWorkingDir, Long.toString(jobConfig.getJobId()));
        jobConfig.setJobWorkPath(jobPath.toString());
        jobConfig.setAlgorithmId(0);
        jobConfig.setAlpha(.85f);

        _jobConfig = jobConfig;

        // attach the config and mark the master as started
        _serverState.setActiveJobConfig(_jobConfig);
        _serverState.setFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG);
        _serverState.setServerStatus(PRMasterState.ServerStatus.STARTED);

    }
    /*
    private void loadState() throws IOException {
     // initialize database ... 
     File databasePath = new File(getDataDirectory().getAbsolutePath() + "/" + CrawlEnvironment.PRMASTER_DB);
     LOG.info("Config says PRMaster State db path is: "+databasePath);
        
         
     // initialize record store
     _recordStore.initialize(databasePath, null);
         
     // load db state ... 
     _serverState = (PRMasterState) _recordStore.getRecordByKey(PRMasterStateKey);
         
     if (_serverState == null) {
       // allocate a brand new state 
       _serverState = new PRMasterState();
       // update crc 
       _serverState.setLastKnownSlaveFileCRC(_slavesFileCRC);
       // and write to disk ... 
       _recordStore.beginTransaction();
       _recordStore.insertRecord(null, PRMasterStateKey, _serverState);
       _recordStore.commitTransaction();
           
     }
     else { 
       // validate crc 
       if (_serverState.getLastKnownSlaveFileCRC() != _slavesFileCRC) { 
    LOG.warn("Slave Config changed since last load. Discarding any pending transactions");
    if (_serverState.isFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG)) { 
      // clear disk state for last job ... 
      finishPageRankJob(_serverState.getActiveJobConfig(),true);
      // clear server state 
      _serverState.getActiveJobConfig().clear();
      // set field clean on job config
      _serverState.setFieldClean(PRMasterState.Field_ACTIVEJOBCONFIG);
      // set state to idle ... 
      _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);
      // and write new state to disk 
      serializeServerState();
    }
       }
     }
         
     // now load job configs ... 
     Vector<Long> jobRecordList = _recordStore.getChildRecordsByParentId(PRJobConfigKey);
         
     LOG.info("Found "+ jobRecordList.size() + " serialized jobs. Loading job configs...");
         
     for (long recordId : jobRecordList) { 
       PageRankJobConfig jobConfig = (PageRankJobConfig) _recordStore.getRecordById(recordId);
       _jobList.add(jobConfig);
     }
     //TODO:HACK
     _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_CALCULATING);
     _serverState.getActiveJobConfig().setIterationNumber(10);
         
    }
    */

    /*
    private void serializeJobConfig(PageRankJobConfig jobConfig) throws IOException { 
      if (jobConfig.getRecordId() != 0) { 
        _recordStore.beginTransaction();
        _recordStore.updateRecordById(jobConfig.getRecordId(), jobConfig);
        _recordStore.commitTransaction();
      }
      else { 
        _recordStore.beginTransaction();
        _recordStore.insertRecord(PRJobConfigKey, PRJobConfigKey + "_" + jobConfig.getJobId(), jobConfig);
        _recordStore.commitTransaction();
      }
    }
    */

    /**
     * Builds a per-slave output name by appending the formatted instance id to
     * the given prefix. Synchronized because the shared {@code NUMBER_FORMAT}
     * instance is not thread-safe (java.text.NumberFormat).
     */
    private static synchronized String getOutputName(String prefix, int instanceId) {
        final StringBuilder name = new StringBuilder(prefix);
        name.append(NUMBER_FORMAT.format(instanceId));
        return name.toString();
    }

    /*
    private void finishPageRankJob(PageRankJobConfig config,boolean jobFailed)throws IOException { 
      LOG.info("Finishing PageRankJob:" + config.getJobId() +  "JobFailed:" + jobFailed);
          
      Path jobBasePath = new Path(config.getJobWorkPath());
          
      LOG.info("Constructing Output Directory:" + config.getOutputValuesPath());
      if (!jobFailed) { 
        Path outputPath = new Path(config.getOutputValuesPath());
        //_fileSystem.mkdirs(outputPath);
        //_fileSystem.delete(new Path(outputPath,"*"), true);
        LOG.info("Copying data files to output directory");
            
        // iterate slaves collecting data ... 
        for (int i=0;i<_slaves.size();++i) { 
    Path jobInstancePath = new Path(jobBasePath,getOutputName("",i));
    Path jobOutputPath = new Path(jobInstancePath,Constants.PR_VALUES_FILE_DIR + "/part-0");
    Path finalDestinationPath = new Path(outputPath,getOutputName("part-",i));
    LOG.info("Moving " + jobOutputPath + " to:" + finalDestinationPath);
    //_fileSystem.rename(jobOutputPath, finalDestinationPath);
        }
      }
      LOG.info("Purging Job Dir");
      //_fileSystem.delete(jobBasePath,true);
    }
    */

    /** Renders the key parameters of a job config as a single log-friendly line. */
    private String dumpPageRankJobInfo(PageRankJobConfig config) {
        final StringBuilder info = new StringBuilder();
        info.append("InputPath:").append(config.getInputValuesPath());
        info.append(" OutputPath:").append(config.getOutputValuesPath());
        info.append(" Algorithm:").append(config.getAlgorithmId());
        info.append(" Alpha:").append(config.getAlpha());
        return info.toString();
    }

    /*
    private void potentiallyStartNextPRJob() throws IOException { 
          
      if (true) { 
        while (_jobQueue.size() != 0) { 
    // get first element off of queue ...
    PageRankJobConfig config = _jobQueue.remove(0);
        
    LOG.info("Processing Job:" + dumpPageRankJobInfo(config));
              
    /*
    // validate the input path ... 
    Path inputValuePath = new Path(config.getInputValuesPath());
    FileStatus[] valuesPaths = _fileSystem.globStatus(new Path(inputValuePath,"part-?????"));
        
    Path outlinksPath = new Path(config.getOutlinksDataPath());
    FileStatus[] outlinksPaths = _fileSystem.globStatus(new Path(inputValuePath,"part-?????"));
    */

    /*        
    if (outlinksPaths.length != _slaves.size()) { 
      LOG.error("Rejecting Job. NEED " + _slaves.size() + " Outlink Files - Found:" + outlinksPaths.length);
    }
    else
     */
    /*
        
       { 
      // set up job dir ... 
      Path jobPath = new Path(_hdfsWorkingDir,"job-" + config.getJobId());
      // mk the job dir dir 
      _fileSystem.mkdirs(jobPath);
      LOG.info("Assigning Job WorkingDir:" + jobPath);
      config.setJobWorkPath(jobPath.toString());
      */
    /*      
    // update server state ...
    _serverState.setActiveJobConfig(config);
    _serverState.setFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG);
    _serverState.setServerStatus(PRMasterState.ServerStatus.STARTED);
    // serilalize the state ... 
    serializeServerState();
        
    // and send out start page rank command
    sendSlavesStartPageRankCmd(PRMasterState.ServerStatus.STARTED);
        
    break;
    }
    }
    }
    }
    */

    /** Broadcasts the start-pagerank command (with the given status) to every slave. */
    void sendSlavesStartPageRankCmd(int serverStatus) {
        for (int idx = 0; idx < _slaves.size(); ++idx) {
            _slaves.get(idx).sendStartPageRankCmd(serverStatus);
        }
    }

    TreeSet<PageRankRemoteSlave> _deferredSlaves = new TreeSet<PageRankRemoteSlave>();

    // Sends the doIteration command to the slaves. During the distribution
    // phase only the first desiredInitialRunCount slaves are started
    // immediately; the remainder are parked in _deferredSlaves.
    //
    // NOTE(review): desiredInitialRunCount is set to _slaves.size(), so the
    // deferral branch below is currently unreachable - every slave is started
    // immediately. Presumably this count was once a throttle (< slave count);
    // confirm whether the staggered rollout is still intended.
    void sendSlavesDoIterationCmd() {
        if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING) {
            int i = 0;
            int desiredInitialRunCount = _slaves.size();
            for (PageRankRemoteSlave slave : _slaves) {
                if (i++ < desiredInitialRunCount) {
                    sendSlaveDoIterationCmd(slave);
                } else {
                    // deferred slaves are released later (dead code as written - see NOTE above)
                    _deferredSlaves.add(slave);
                }
            }
        } else {
            // outside the distribution phase every slave iterates immediately
            for (PageRankRemoteSlave slave : _slaves) {
                sendSlaveDoIterationCmd(slave);
            }
        }
    }

    /**
     * Sends the doIteration command to a single slave, refusing to command a
     * slave that was deferred during the distribution phase.
     */
    void sendSlaveDoIterationCmd(PageRankRemoteSlave slave) {
        final boolean distributing = _serverState
                .getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING;
        if (distributing && _deferredSlaves.contains(slave)) {
            LOG.error("Invalid doIteration Cmd on Deferred Slave!");
            return;
        }
        slave.sendDoIterationCmd();
    }

    /** Forwards a checkpoint request to the slave at the given index. */
    void sendSlaveCheckpointCommand(int slaveIndex, long txnId, int currentPhase, int currentIterationNumber) {
        _slaves.get(slaveIndex).sendCheckpointCommand(txnId, currentPhase, currentIterationNumber);
    }

    /** Broadcasts the end-pagerank command to every registered slave. */
    void sendSlavesEndPageRankCmd() {
        for (int idx = 0; idx < _slaves.size(); ++idx) {
            _slaves.get(idx).sendEndPageRankCmd();
        }
    }
}