dk.netarkivet.harvester.heritrix3.controller.HeritrixController.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.harvester.heritrix3.controller.HeritrixController.java

Source

/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3.controller;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.netarchivesuite.heritrix3wrapper.EngineResult;
import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper;
import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper.CrawlControllerState;
import org.netarchivesuite.heritrix3wrapper.JobResult;
import org.netarchivesuite.heritrix3wrapper.ResultStatus;
import org.netarchivesuite.heritrix3wrapper.jaxb.JobShort;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.HeritrixLaunchException;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.NotImplementedException;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage;
import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceInfo;
import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceJobInfo;
import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlStatus;
import dk.netarkivet.harvester.harvesting.frontier.FullFrontierReport;
import dk.netarkivet.harvester.heritrix3.Heritrix3Files;

/**
 * This implementation of the HeritrixController interface starts Heritrix3 as a separate process and uses the
 * Heritrix3 REST interface (through {@link Heritrix3Wrapper}) to communicate with it. Each instance executes exactly
 * one process that runs exactly one crawl job.
 */
public class HeritrixController extends AbstractRestHeritrixController {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HeritrixController.class);

    /**
     * The name that Heritrix3 gives to the job we ask it to create. 
     */
    private String jobName;

    /**
     * The header line (legend) for the statistics report. It is handed to every CrawlProgressMessage created by
     * getCrawlProgress(); nothing in this class assigns it, so it keeps its default value (null) here.
     */
    private String progressStatisticsLegend;

    /** Retry count handed to Heritrix3Wrapper.waitForEngineReady(); assigned in initialize(). */
    private int heritrix3EngineRetries;
    /**
     * Interval in milliseconds handed to Heritrix3Wrapper.waitForEngineReady() and, in cleanup(File), to
     * waitForJobState(); assigned in initialize().
     */
    private int heritrix3EngineIntervalBetweenRetriesInMillis;

    /**
     * Create a HeritrixController object for a single Heritrix3 job.
     *
     * @param files Files that are used to set up Heritrix3.
     * @param jobName The name of the Heritrix3 job that this controller builds, launches, terminates and tears down.
     */
    public HeritrixController(Heritrix3Files files, String jobName) {
        super(files);
        this.jobName = jobName;
    }

    /**
     * Initialize the connection to Heritrix3: create the {@link Heritrix3Wrapper} for the configured host, port and
     * admin credentials, and wait until the Heritrix3 engine answers.
     *
     * @throws IOFailure If waiting for the engine fails with an exception, if the wrapper returns no engine result,
     * or if the returned status is not {@code ResultStatus.OK}.
     * @see IHeritrixController#initialize()
     */
    @Override
    public void initialize() {

        /////////////////////////////////////////////////////
        // Initialize H3 wrapper
        /////////////////////////////////////////////////////

        //TODO these numbers could be settings
        this.heritrix3EngineRetries = 60;
        this.heritrix3EngineIntervalBetweenRetriesInMillis = 1000; // 1 second

        h3wrapper = Heritrix3Wrapper.getInstance(getHostName(), getGuiPort(), null, null, getHeritrixAdminName(),
                getHeritrixAdminPassword());

        EngineResult engineResult;
        try {
            engineResult = h3wrapper.waitForEngineReady(heritrix3EngineRetries,
                    heritrix3EngineIntervalBetweenRetriesInMillis);
        } catch (Throwable e) {
            // Report through the logger (not stderr) and keep the original exception as the cause.
            String errMsg = "Heritrix3 engine not started: " + e;
            log.error(errMsg, e);
            throw new IOFailure(errMsg, e);
        }

        if (engineResult == null) {
            throw new IOFailure("Unexpected error: Heritrix3 wrapper returned null engine result.");
        }
        if (engineResult.status != ResultStatus.OK) {
            String errMsg = "Heritrix3 wrapper could not connect to Heritrix3. Resultstate = "
                    + engineResult.status;
            log.error(errMsg, engineResult.t);
            throw new IOFailure(errMsg, engineResult.t);
        }

        // POST: Heritrix3 is up and running and responds nicely
        log.info("Heritrix3 REST interface up and running");
    }

    /**
     * Create, build, launch and unpause the Heritrix3 job named {@code jobName}.
     * <p>
     * The order file and the seeds file are copied into the Heritrix3 job directory, the engine is asked to rescan
     * its job directory, and the job configuration is built. The method then waits for the NASCENT state, launches
     * the job, waits for the PAUSED state and finally unpauses the job. Not reaching NASCENT or PAUSED is only
     * logged as a warning.
     *
     * @throws IOFailure If the job directory cannot be created, the files cannot be copied, or a response from
     * Heritrix3 cannot be decoded.
     * @throws HeritrixLaunchException If the job is reported as "Unbuilt" or "Finished..." after the build.
     * @throws IllegalState If the build returns a status other than OK or an unknown status description.
     */
    @Override
    public void requestCrawlStart() {
        // Create a new job
        File cxmlFile = getHeritrixFiles().getOrderFile();
        File seedsFile = getHeritrixFiles().getSeedsFile();
        JobResult jobResult;

        File jobDir = files.getHeritrixJobDir();
        // mkdirs() also returns false when the directory was created concurrently, hence the isDirectory() check.
        if (!jobDir.exists() && !jobDir.mkdirs() && !jobDir.isDirectory()) {
            throw new IOFailure("Unable to create the heritrix3 jobdir '" + jobDir + "'");
        }

        try {
            log.info("Copying the crawler-beans.cxml file and seeds.txt to the heritrix3 jobdir '{}'", jobDir);
            Heritrix3Wrapper.copyFile(cxmlFile, jobDir);
            Heritrix3Wrapper.copyFileAs(seedsFile, jobDir, "seeds.txt");
        } catch (IOException e) {
            throw new IOFailure("Problem occurred during the copying of files to our heritrix job", e);
        }

        // PRE: h3 is running, and the job files copied to their final location
        try {
            EngineResult engineResult = h3wrapper.rescanJobDirectory();
            log.info("H3 jobs available for building: {}", knownJobsToString(engineResult));
            if (engineResult != null) {
                traceH3Response("Result of rescanJobDirectory() operation: ", engineResult.response);
            }

            jobResult = h3wrapper.buildJobConfiguration(jobName);
            traceH3Response("Result of buildJobConfiguration() operation: ", jobResult.response);
            if (jobResult.status != ResultStatus.OK) {
                throw new IllegalState(
                        "Unknown ResultStatus returned from h3wrapper: " + ResultStatus.toString(jobResult.status));
            }

            String statusDescription = jobResult.job.statusDescription;
            if (statusDescription.equalsIgnoreCase("Unbuilt")) {
                throw new HeritrixLaunchException(
                        "The job '" + jobName + "' could not be built. Last loglines are "
                                + StringUtils.join(jobResult.job.jobLogTail, "\n"));
            } else if (statusDescription.equalsIgnoreCase("Ready")) {
                log.info("Job {} built successfully", jobName);
            } else if (statusDescription.startsWith("Finished")) { // Created but not launchable
                // Capture the log tail of the failed build BEFORE tearing down; the teardown result replaces it.
                String logTail = StringUtils.join(jobResult.job.jobLogTail, "\n");
                log.warn("The job {} seems unlaunchable. Tearing down the job. Last loglines are {}", jobName,
                        logTail);
                JobResult teardownResult = h3wrapper.teardownJob(jobName);
                if (teardownResult != null) {
                    traceH3Response("Result of teardown() operation: ", teardownResult.response);
                }
                throw new HeritrixLaunchException("Job '" + jobName + "' failed to launch: " + logTail);
            } else {
                throw new IllegalState(
                        "Unknown job.statusdescription returned from h3: " + statusDescription);
            }

            jobResult = h3wrapper.waitForJobState(jobName, CrawlControllerState.NASCENT, 60, 1000);
            if (jobResult.job.crawlControllerState.equalsIgnoreCase(CrawlControllerState.NASCENT.toString())) {
                log.info("The H3 job {} in now in state CrawlControllerState.NASCENT", jobName);
            } else {
                log.warn("The job state is now {}. Should have been CrawlControllerState.NASCENT",
                        jobResult.job.crawlControllerState);
            }

            jobResult = h3wrapper.launchJob(jobName);
            traceH3Response("Result of launchJob() operation: ", jobResult.response);

            jobResult = h3wrapper.waitForJobState(jobName, CrawlControllerState.PAUSED, 60, 1000);
            if (jobResult.job.crawlControllerState.equalsIgnoreCase(CrawlControllerState.PAUSED.toString())) {
                log.info("The H3 job {} in now in state CrawlControllerState.PAUSED", jobName);
            } else {
                log.warn("The job state is now {}. Should have been CrawlControllerState.PAUSED",
                        jobResult.job.crawlControllerState);
            }

            jobResult = h3wrapper.unpauseJob(jobName);
            log.info("The job {} is now in state {}", jobName, jobResult.job.crawlControllerState);

            // POST: h3 is running, and the job with name 'jobName' is running
            if (log.isTraceEnabled() && jobResult.response != null) {
                log.trace("h3-State after unpausing job '{}': {}", jobName,
                        new String(jobResult.response, "UTF-8"));
            }

        } catch (UnsupportedEncodingException e) {
            throw new IOFailure("Unexpected error during communication with heritrix3", e);
        }
    }

    /**
     * Write a raw Heritrix3 response to the trace log, decoded as UTF-8. Nothing is decoded or logged when trace
     * logging is disabled or when there is no response body.
     *
     * @param description Text put in front of the decoded response.
     * @param response The raw response bytes; may be null.
     * @throws UnsupportedEncodingException If the UTF-8 charset is not available.
     */
    private void traceH3Response(String description, byte[] response) throws UnsupportedEncodingException {
        if (log.isTraceEnabled() && response != null) {
            log.trace(description + new String(response, "UTF-8"));
        }
    }

    /**
     * Ask Heritrix3 to terminate the job named {@code jobName}, if the job is known and running.
     * <p>
     * Nothing is thrown when the job cannot be looked up or is not running; the situation is only logged.
     *
     * @param reason Human readable reason for stopping the crawl; only used for logging.
     */
    @Override
    public void requestCrawlStop(String reason) {
        log.info("Terminating job {}. Reason: {}", this.jobName, reason);
        JobResult jobResult = h3wrapper.job(jobName);
        if (jobResult == null || jobResult.job == null) {
            log.warn("Job '{}' has maybe already been terminated and/or heritrix3 is no longer running",
                    this.jobName);
            return;
        }
        if (!jobResult.job.isRunning) {
            log.warn("Job '{}' not terminated, as it was not running", this.jobName);
            return;
        }

        JobResult result = h3wrapper.terminateJob(this.jobName);
        // A missing result is treated like a failed termination instead of causing a NullPointerException.
        if (result != null && result.job != null && !result.job.isRunning) {
            // Successful termination is the expected outcome, so it is logged at info level.
            log.info("Job '{}' terminated", this.jobName);
        } else {
            log.warn("Job '{}' not terminated correctly", this.jobName);
        }
    }

    /**
     * Stop the Heritrix3 process belonging to this job.
     * <p>
     * {@code pgrep -f jobName} is used to find out whether a matching process exists. If so, Heritrix3 is asked to
     * exit (ignoring the job named {@code jobName}). If a matching process still exists afterwards, it is killed
     * with {@code pkill -f jobName}. I/O problems are logged and not propagated.
     */
    @Override
    public void stopHeritrix() {
        log.debug("Stopping Heritrix3");
        try {
            // Check if a heritrix3 process still exists for this jobName
            ProcessBuilder processBuilder = new ProcessBuilder("pgrep", "-f", jobName);
            log.info("Looking up heritrix3 process with. " + processBuilder.command());
            if (processBuilder.start().waitFor() == 0) { // Yes, ask heritrix3 to shutdown, ignoring any jobs named jobName
                log.info("Heritrix running, requesting heritrix to stop and ignoring running job '{}'", jobName);
                h3wrapper.exitJavaProcess(Arrays.asList(jobName));
            } else {
                log.info("Heritrix3 process not running for job '{}'", jobName);
            }
            // Check again
            if (processBuilder.start().waitFor() == 0) { // The process is still alive, kill it
                log.info("Heritrix3 process still running, pkill'ing heritrix3 ");
                ProcessBuilder killerProcessBuilder = new ProcessBuilder("pkill", "-f", jobName);
                // waitFor() blocks until pkill has finished; exitValue() would throw IllegalThreadStateException
                // if pkill were still running when it is called.
                int pkillExitValue = killerProcessBuilder.start().waitFor();
                if (pkillExitValue != 0) {
                    log.warn("Non zero exit value ({}) when trying to pkill Heritrix3.", pkillExitValue);
                } else {
                    log.info("Heritrix process terminated successfully with the pkill command {}",
                            killerProcessBuilder.command());
                }
            } else {
                log.info("Heritrix3 stopped successfully.");
            }
        } catch (IOException e) {
            log.warn("Exception while trying to shutdown heritrix", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so that callers can still observe the interruption.
            Thread.currentThread().interrupt();
            log.debug("stopHeritrix call interrupted", e);
        }
    }

    /**
     * Build the address of the Heritrix3 engine page for this instance: https, the local host name, the GUI port
     * and the "/engine" path.
     *
     * @return the URL for monitoring this instance.
     */
    public String getHeritrixConsoleURL() {
        StringBuilder consoleUrl = new StringBuilder("https://");
        consoleUrl.append(SystemUtils.getLocalHostName());
        consoleUrl.append(":").append(getGuiPort());
        consoleUrl.append("/engine");
        return consoleUrl.toString();
    }

    /**
     * Cleanup after an Heritrix3 process. The job named {@code jobName} is torn down (if Heritrix3 still has a crawl
     * controller state for it and offers the "teardown" action), and the Heritrix3 process is then asked to exit.
     * If the job still has a crawl controller state after waiting up to ten polls, Heritrix3 is asked to exit while
     * ignoring that job.
     *
     * @param crawlDir the crawldir to cleanup (argument is currently not used)
     * @throws IOFailure If the job cannot be looked up or torn down, if Heritrix3 could not be shut down, or if any
     * other error occurs while communicating with Heritrix3.
     * @see IHeritrixController#cleanup()
     */
    public void cleanup(File crawlDir) {
        try {
            // Check engine status
            EngineResult engineResult = h3wrapper.rescanJobDirectory();
            if (engineResult == null) {
                log.warn("Unresponsive Heritrix3 engine. Let's try continuing the cleanup anyway");
            } else if (engineResult.engine == null || engineResult.engine.jobs == null) {
                // This check is purely informational, so a missing job list must not abort the cleanup.
                log.warn("Heritrix3 engine returned no job list. Let's try continuing the cleanup anyway");
            } else {
                List<JobShort> knownJobs = engineResult.engine.jobs;
                if (knownJobs.size() != 1) {
                    log.warn("Should be one job but there is {} jobs: {}", knownJobs.size(),
                            knownJobsToString(engineResult));
                }
            }

            // Check that job jobName still exists in H3 engine
            JobResult jobResult = h3wrapper.job(jobName);
            if (jobResult == null) {
                throw new IOFailure("Unexpected error during communication with heritrix3 during cleanup");
            }
            if (jobResult.status == ResultStatus.OK && jobResult.job.crawlControllerState != null) {
                String TEARDOWN = "teardown";
                if (!jobResult.job.availableActions.contains(TEARDOWN)) {
                    String errMsg = "Tearing down h3 job '" + jobName
                            + "' not possible. Not one of the actions available: "
                            + StringUtils.join(jobResult.job.availableActions, ",");
                    log.warn(errMsg);
                    throw new IOFailure(errMsg);
                }
                log.info("Tearing down h3 job {}", jobName);
                h3wrapper.teardownJob(jobName);
            }

            // Wait for the state: jobResult.job.crawlControllerState == null (but we only try ten times)
            jobResult = h3wrapper.waitForJobState(jobName, null, 10, heritrix3EngineIntervalBetweenRetriesInMillis);

            // Did we get the expected state? If not, shut down anyway but tell Heritrix3 to ignore the job.
            List<String> jobsToIgnore = null;
            if (jobResult.job.crawlControllerState != null) {
                log.warn("The job {} is still lurking about. Shutdown heritrix3 and ignore the job", jobName);
                jobsToIgnore = new ArrayList<String>();
                jobsToIgnore.add(jobName);
            }
            EngineResult result = h3wrapper.exitJavaProcess(jobsToIgnore);
            // Only RESPONSE_EXCEPTION and OFFLINE are accepted as the outcome of the exit request.
            if (result == null || (result.status != ResultStatus.RESPONSE_EXCEPTION
                    && result.status != ResultStatus.OFFLINE)) {
                throw new IOFailure("Heritrix3 could not be shut down");
            }
        } catch (IOFailure e) {
            // Failures raised above already carry a specific message; do not re-wrap them as "unknown".
            throw e;
        } catch (Throwable e) {
            throw new IOFailure("Unknown error during communication with heritrix3", e);
        }
    }

    /**
     * List the short names of the jobs known to the Heritrix3 engine, each followed by a single space.
     *
     * @param engineResult the engine result to read the job list from; may be null
     * @return the concatenated short names, or null if the engine result, its engine or its job list is missing
     */
    private String knownJobsToString(EngineResult engineResult) {
        boolean jobListAvailable = engineResult != null && engineResult.engine != null
                && engineResult.engine.jobs != null;
        if (!jobListAvailable) {
            return null;
        }

        StringBuilder shortNames = new StringBuilder();
        for (JobShort knownJob : engineResult.engine.jobs) {
            shortNames.append(knownJob.shortName).append(" ");
        }
        return shortNames.toString();
    }

    /**
     * Build the address of the Heritrix3 admin interface for this instance, i.e. the "/engine" page served over
     * https on the local host at the GUI port.
     *
     * @return the URL for monitoring this instance.
     */
    public String getAdminInterfaceUrl() {
        return String.format("https://%s:%s/engine", SystemUtils.getLocalHostName(), getGuiPort());
    }

    /**
     * Build a progress message for the current crawl from the job information reported by Heritrix3.
     * <p>
     * If Heritrix3 returns no job information, the message only carries the harvest id, job id, legend and host URL,
     * and warnings are logged.
     *
     * @return a message that stores the information summarizing the crawl progress.
     */
    public CrawlProgressMessage getCrawlProgress() {
        Heritrix3Files h3Files = getHeritrixFiles();
        CrawlProgressMessage progressMessage = new CrawlProgressMessage(h3Files.getHarvestID(), h3Files.getJobID(),
                progressStatisticsLegend);
        progressMessage.setHostUrl(getHeritrixConsoleURL());

        JobResult currentJob = h3wrapper.job(jobName);
        boolean jobInfoMissing = (currentJob == null);

        if (jobInfoMissing) {
            log.warn("Unable to get Heritrix3 status for job '{}'", jobName);
        } else {
            getCrawlServiceAttributes(progressMessage, currentJob);
        }

        if (progressMessage.crawlIsFinished()) {
            // No need to go further, CrawlService.Job bean does not exist
            progressMessage.setStatus(CrawlStatus.CRAWLING_FINISHED);
            return progressMessage;
        }

        if (jobInfoMissing) {
            log.warn("Unable to get JobAttributes for job '{}'", jobName);
        } else {
            fetchCrawlServiceJobAttributes(progressMessage, currentJob);
        }
        return progressMessage;
    }

    /**
     * Fill in the crawl service part of the CrawlProgressMessage being put together: alert count, current job name
     * and whether the job is running.
     *
     * @param cpm the crawlProgress message being prepared
     * @param job the job information fetched from Heritrix3
     */
    private void getCrawlServiceAttributes(CrawlProgressMessage cpm, JobResult job) {
        // TODO check job state??
        final CrawlServiceInfo serviceInfo = cpm.getHeritrixStatus();
        // Alert count comes from the job information.
        serviceInfo.setAlertCount(job.job.alertCount);
        // Note: the job name is our own, it is not taken from H3.
        serviceInfo.setCurrentJob(jobName);
        // Running flag comes from the job information.
        serviceInfo.setCrawling(job.job.isRunning);
    }

    /**
     * Retrieve the values of the crawl service job attributes and add them to the CrawlProgressMessage being put
     * together. Values that Heritrix3 does not report (null) are replaced by -1; a missing crawl controller state is
     * reported as "?".
     *
     * @param cpm the crawlProgress message being prepared
     * @param job the job information fetched from Heritrix3
     */
    private void fetchCrawlServiceJobAttributes(CrawlProgressMessage cpm, JobResult job) {
        CrawlServiceJobInfo jStatus = cpm.getJobStatus();

        // Read the URI totals once, boxed, so that a missing value is detected instead of failing on unboxing.
        Long discoveredFilesCount = job.job.uriTotalsReport.totalUriCount;
        Long downloadedCount = job.job.uriTotalsReport.downloadedUriCount;

        // The progress statistics field is filled with a simple percentage: downloaded URIs / total URIs.
        double progress = 0.0;
        if (discoveredFilesCount != null && downloadedCount != null && discoveredFilesCount != 0L) {
            progress = downloadedCount * 100.0 / discoveredFilesCount;
        }
        jStatus.setProgressStatistics(progress + "%");

        Long elapsedMilliseconds = job.job.elapsedReport.elapsedMilliseconds;
        Long elapsedSeconds = (elapsedMilliseconds == null) ? -1L : elapsedMilliseconds / 1000L;
        jStatus.setElapsedSeconds(elapsedSeconds);

        Double currentProcessedDocsPerSec = job.job.rateReport.currentDocsPerSecond;
        if (currentProcessedDocsPerSec == null) {
            currentProcessedDocsPerSec = -1.0;
        }
        jStatus.setCurrentProcessedDocsPerSec(currentProcessedDocsPerSec);

        Double processedDocsPerSec = job.job.rateReport.averageDocsPerSecond;
        if (processedDocsPerSec == null) {
            processedDocsPerSec = -1.0;
        }
        jStatus.setProcessedDocsPerSec(processedDocsPerSec);

        Integer kbRate = job.job.rateReport.currentKiBPerSec;
        if (kbRate == null) {
            kbRate = -1;
        }
        jStatus.setCurrentProcessedKBPerSec(kbRate);

        Integer processedKBPerSec = job.job.rateReport.averageKiBPerSec;
        if (processedKBPerSec == null) {
            processedKBPerSec = -1;
        }
        jStatus.setProcessedKBPerSec(processedKBPerSec);

        if (discoveredFilesCount == null) {
            discoveredFilesCount = -1L;
        }
        jStatus.setDiscoveredFilesCount(discoveredFilesCount);

        if (downloadedCount == null) {
            downloadedCount = -1L;
        }
        jStatus.setDownloadedFilesCount(downloadedCount);

        /*
        27 queues: 5 active (1 in-process; 0 ready; 4 snoozed); 0 inactive; 0 retired; 22 exhausted
        */
        String frontierShortReport = String.format(
                "%d queues: %d active (%d in-process; %d ready; %d snoozed); %d inactive; %d retired; %d exhausted",
                job.job.frontierReport.totalQueues, job.job.frontierReport.activeQueues,
                job.job.frontierReport.inProcessQueues, job.job.frontierReport.readyQueues,
                job.job.frontierReport.snoozedQueues, job.job.frontierReport.inactiveQueues,
                job.job.frontierReport.retiredQueues, job.job.frontierReport.exhaustedQueues);
        jStatus.setFrontierShortReport(frontierShortReport);

        String controllerState = job.job.crawlControllerState;
        String newStatus = (controllerState != null) ? controllerState : "?";
        jStatus.setStatus(newStatus);
        // Test the null-safe status: the raw state may be null, which previously caused a NullPointerException.
        // An unknown state ("?") is reported as active, like every other non-paused state.
        if (newStatus.contains("PAUSE")) { // FIXME this is not correct
            cpm.setStatus(CrawlStatus.CRAWLER_PAUSED);
        } else {
            cpm.setStatus(CrawlStatus.CRAWLER_ACTIVE);
        }

        Integer currentActiveToecount = job.job.loadReport.busyThreads;
        if (currentActiveToecount == null) {
            currentActiveToecount = -1;
        }
        jStatus.setActiveToeCount(currentActiveToecount);
    }

    /**
     * Placeholder for generating a full frontier report. Fetching the report from Heritrix3 is not implemented yet
     * (see the FIXME below), so this method currently always returns null.
     *
     * @return always null in the current implementation.
     */
    public FullFrontierReport getFullFrontierReport() {
        //FIXME get frontier report from H3 using an appropriate REST call.
        // Is the following OK: No!!!
        //https://localhost:8444/engine/job/testjob/jobdir/20150210135411/reports/frontier-summary-report.txt

        return null;
        /*      
         return FullFrontierReport.parseContentsAsString(
            jobName,
            (String) executeOperationNoRetry(crawlServiceJobBeanName,
                    CrawlServiceJobOperation.frontierReport.name(), "all"));
                        
         */
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public boolean atFinish() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @throws NotImplementedException always.
     */
    @Override
    public void beginCrawlStop() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller; the variant taking a crawl directory, cleanup(File), holds the cleanup
     * logic of this class.
     *
     * @throws NotImplementedException always.
     */
    @Override
    public void cleanup() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public boolean crawlIsEnded() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public int getActiveToeCount() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public int getCurrentProcessedKBPerSec() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public String getHarvestInformation() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public String getProgressStats() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public long getQueuedUriCount() {
        throw new NotImplementedException("Not implemented");
    }

    /**
     * Not implemented by this controller.
     *
     * @return never returns normally.
     * @throws NotImplementedException always.
     */
    @Override
    public boolean isPaused() {
        throw new NotImplementedException("Not implemented");
    }

}