org.commoncrawl.service.pagerank.slave.BeginPageRankTask.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.pagerank.slave.BeginPageRankTask.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.service.pagerank.slave;

import java.io.File;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.async.CallbackWithResult;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.service.crawler.filters.SuperDomainFilter;
import org.commoncrawl.service.pagerank.Constants;
import org.commoncrawl.service.pagerank.IterationInfo;
import org.commoncrawl.service.pagerank.PRMasterState;
import org.commoncrawl.service.pagerank.PageRankJobConfig;
import org.commoncrawl.service.pagerank.slave.PageRankUtils.PRValueMap;
import org.commoncrawl.util.CCStringUtils;

public class BeginPageRankTask extends PageRankTask<BeginPageRankTask.BeginPageRankTaskResult> {

    private static final Log LOG = LogFactory.getLog(BeginPageRankTask.class);

    private PageRankJobConfig _config;
    private int _prMasterStatus;
    private boolean _isCancelled = false;

    public BeginPageRankTask(PageRankJobConfig jobConfig, int serverStatus, PageRankSlaveServer server,
            CallbackWithResult<BeginPageRankTaskResult> completionCallback) {
        super(server, BeginPageRankTask.BeginPageRankTaskResult.class, completionCallback);

        _config = jobConfig;
        _prMasterStatus = serverStatus;
    }

    public static class BeginPageRankTaskResult extends PageRankTask.PageRankTaskResult {
        public PRValueMap _valueMap = null;
    }

    @Override
    protected void cancelTask() {
        _isCancelled = true;
    }

    @Override
    protected BeginPageRankTaskResult runTask() throws IOException {

        BeginPageRankTaskResult result = new BeginPageRankTaskResult();

        try {

            // create job local directory if necessary 
            _server.getActiveJobLocalPath().mkdirs();

            FileSystem fileSystem = _server.getFileSystem();

            // figure out if we are going to load values from base location or job config (based on iteration number)

            Path rangeRemotePath = new Path(_config.getInputValuesPath(),
                    PageRankUtils.makeUniqueFileName(Constants.PR_RANGE_FILE_PREFIX, 0, _server.getNodeIndex()));
            Path rangeLocalPath = PageRankUtils.makeRangeFilePath(_server.getActiveJobLocalPath(),
                    _server.getNodeIndex());

            Path idsRemotePath = new Path(_config.getInputValuesPath(),
                    PageRankUtils.makeUniqueFileName(Constants.PR_IDS_FILE_PREFIX, 0, _server.getNodeIndex()));
            Path idsLocalPath = new Path(PageRankUtils
                    .makeIdsFilePath(_server.getActiveJobLocalPath(), _server.getNodeIndex()).getAbsolutePath());

            Path outlinksFileRemotePath = new Path(_config.getOutlinksDataPath(),
                    PageRankUtils.makeUniqueFileName(Constants.PR_OUTLINKS_FILE_PREFIX, 0, _server.getNodeIndex()));
            Path outlinksFileLocalPath = new Path(new File(_server.getActiveJobLocalPath(),
                    PageRankUtils.makeUniqueFileName(Constants.PR_OUTLINKS_FILE_PREFIX, 0, _server.getNodeIndex()))
                            .getAbsolutePath());

            Path valuesRemotePath = null;

            if (_config.getIterationNumber() == 0) {
                // fetch values from base values path 
                valuesRemotePath = new Path(_config.getInputValuesPath(), PageRankUtils
                        .makeUniqueFileName(Constants.PR_VALUE_FILE_PREFIX, 0, _server.getNodeIndex()));
                LOG.info("Iteration Number is 0. Using Values File:" + valuesRemotePath);
            } else {
                // fetch latest values from job path (hdfs) based on last iteration number ...
                valuesRemotePath = new Path(_config.getJobWorkPath(), PageRankUtils.makeUniqueFileName(
                        Constants.PR_VALUE_FILE_PREFIX, _config.getIterationNumber() - 1, _server.getNodeIndex()));
                LOG.info("Iteration Number is:" + _config.getIterationNumber() + ". Using Values File:"
                        + valuesRemotePath);
            }

            /*
            Path localValuesFilePath = new Path(new File(_server.getActiveJobLocalPath(),PageRankUtils.makeUniqueFileName(Constants.PR_VALUE_FILE_PREFIX,_config.getIterationNumber(),_server.getNodeIndex())).getAbsolutePath());
                
                
            // copy the files to the local directory ...
            FileStatus rangeFileStatus = fileSystem.getFileStatus(rangeRemotePath);
            File       rangeLocalFile  = new File(rangeLocalPath.toString());
                
            if (rangeLocalFile.exists() == false || rangeLocalFile.length() != rangeFileStatus.getLen()) { 
              rangeLocalFile.delete();
              LOG.info("Copying Range File:" + rangeRemotePath + " to " + rangeLocalPath);
              fileSystem.copyToLocalFile(rangeRemotePath, rangeLocalPath);
            }
            else { 
              LOG.info("Skipping Copy of Range File:" + rangeRemotePath + " to " + rangeLocalPath);
            }
                
            FileStatus idFileStatus = fileSystem.getFileStatus(idsRemotePath);
            File       idLocalFile  = new File(idsLocalPath.toString());
                
            if (idLocalFile.exists() == false || idLocalFile.length() != idFileStatus.getLen()) { 
              LOG.info("Copying Ids File:" + idsRemotePath + " to " + idsLocalPath);
              fileSystem.copyToLocalFile(idsRemotePath, idsLocalPath);
            }
            else { 
              LOG.info("Skipping Copying Ids File:" + idsRemotePath + " to " + idsLocalPath);
            }
               */

            FileStatus outlinksFileStatus = fileSystem.getFileStatus(outlinksFileRemotePath);
            File outlinksLocalFile = new File(outlinksFileLocalPath.toString());

            if (outlinksLocalFile.exists() == false || outlinksLocalFile.length() != outlinksFileStatus.getLen()) {
                LOG.info("Copying outlinks File:" + outlinksFileRemotePath + " to " + outlinksLocalFile);
                fileSystem.copyToLocalFile(outlinksFileRemotePath, outlinksFileLocalPath);
            } else {
                LOG.info("Skipping Copying outlinks File:" + outlinksFileRemotePath + " to " + outlinksLocalFile);
            }

            /*
            FileStatus valuesFileStatus = fileSystem.getFileStatus(valuesRemotePath);
            File       valuesLocalFile  = new File(localValuesFilePath.toString());
                
            if (valuesLocalFile.exists() == false || valuesLocalFile.length() != valuesFileStatus.getLen()) { 
              LOG.info("Copying values File:" + valuesRemotePath + " to " + valuesLocalFile);
              fileSystem.copyToLocalFile(valuesRemotePath,localValuesFilePath);
            }
            else { 
              LOG.info("Skipping Copying values File:" + valuesRemotePath + " to " + valuesLocalFile);
            }
            */
            // now load the values map ...
            result._valueMap = new PageRankUtils.PRValueMap();
            //result._valueMap.open(fileSystem,valuesRemotePath, PageRankUtils.makeRangeFilePath(_server.getActiveJobLocalPath(), _server.getNodeIndex()));

            boolean valuesFileMissing = false;
            if (_server.getActiveJobConfig().getIterationNumber() != 0
                    && !_server.getFileSystem().exists(valuesRemotePath)) {
                LOG.error("Values File Missing for Iteration:" + _server.getActiveJobConfig().getIterationNumber());

                valuesFileMissing = true;
                // revert to iteration zero values file ... 
                valuesRemotePath = new Path(_config.getInputValuesPath(), PageRankUtils
                        .makeUniqueFileName(Constants.PR_VALUE_FILE_PREFIX, 0, _server.getNodeIndex()));
            }
            result._valueMap.open(fileSystem, valuesRemotePath, rangeRemotePath);

            // ok now if iteration number is non-zero,
            // recalculate rank from previous iteration's data ...

            if (_config.getIterationNumber() != 0 && valuesFileMissing) {

                // load data from previous iteration ... 
                int iterationNumberToLoadFrom = _config.getIterationNumber() - 1;
                // ok figure out what state master is in 
                if (_prMasterStatus == PRMasterState.ServerStatus.ITERATING_CALCULATING) {
                    // use current iteration number data 
                    iterationNumberToLoadFrom = 0;
                    LOG.info("Master is in CALCULATION PHASE. SKIP LOAD OF VALUEMAP");
                }
                // in the distribution case ... check to see if checkpoint file is present ... 
                else if (_prMasterStatus == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING) {

                    Path checkpointFilePath = PageRankUtils.getCheckpointFilePath(
                            new Path(_server.getActiveJobConfig().getJobWorkPath()), IterationInfo.Phase.DISTRIBUTE,
                            _server.getActiveJobConfig().getIterationNumber(), _server.getNodeIndex());

                    // ok checkpoint file exists, use current iteration number to load data 
                    if (_server.getFileSystem().exists(checkpointFilePath)) {
                        LOG.info("Checkpoint file exists. SKIP LOAD OF VALUEMAP");
                        iterationNumberToLoadFrom = 0;
                    }
                }

                if (iterationNumberToLoadFrom != 0) {
                    // load super domain filter
                    LOG.info("Initializing SuperDomain Filter");
                    SuperDomainFilter superDomainFilter = new SuperDomainFilter();
                    superDomainFilter.loadFromPath(_server.getDirectoryServiceAddress(),
                            CrawlEnvironment.ROOT_SUPER_DOMAIN_PATH, false);

                    LOG.info("Starting Calculate Task to load value map - Using Iteration Number:"
                            + iterationNumberToLoadFrom);

                    // first zero value map values ... 
                    result._valueMap.zeroValues();

                    PageRankUtils.calculateRank(_server.getConfig(), _server.getFileSystem(), result._valueMap,
                            _server.getActiveJobLocalPath(), _server.getActiveJobConfig().getJobWorkPath(),
                            _server.getNodeIndex(), _server.getBaseConfig().getSlaveCount(),
                            iterationNumberToLoadFrom, superDomainFilter,
                            new PageRankUtils.ProgressAndCancelCheckCallback() {

                                @Override
                                public boolean updateProgress(final float percentComplete) {
                                    _percentComplete = percentComplete;
                                    return BeginPageRankTask.this.isCancelled();
                                }
                            });
                }
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw e;
        }
        return result;
    }

    @Override
    public String getDescription() {
        return "Begin PageRank Task";
    }

    @Override
    public synchronized boolean isCancelled() {
        return _isCancelled;
    }

}