Java tutorial: NetarchiveSuite's HarvestControllerServer
/* $Id$
 * $Revision$
 * $Date$
 * $Author$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
package dk.netarkivet.harvester.harvesting.distribute;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.distribute.ChannelID;
import dk.netarkivet.common.distribute.JMSConnection;
import dk.netarkivet.common.distribute.JMSConnectionFactory;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor;
import dk.netarkivet.common.utils.ApplicationUtils;
import dk.netarkivet.common.utils.CleanupIF;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.ExceptionUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.NotificationType;
import dk.netarkivet.common.utils.NotificationsFactory;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.JobStatus;
import dk.netarkivet.harvester.distribute.HarvesterChannels;
import dk.netarkivet.harvester.distribute.HarvesterMessageHandler;
import dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy;
import dk.netarkivet.harvester.harvesting.HarvestController;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.SeedUriDomainnameQueueAssignmentPolicy;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.PersistentJobData.HarvestDefinitionInfo;
import dk.netarkivet.harvester.harvesting.report.HarvestReport;

/**
 * This class responds to JMS doOneCrawl messages from the HarvestScheduler and
 * launches a Heritrix crawl with the received job description. The generated
 * ARC files are uploaded to the bitarchives once a harvest job has been
 * completed.
 *
 * During its operation CrawlStatus messages are sent to the
 * HarvestSchedulerMonitorServer. When starting the actual harvesting a message
 * is sent with status 'STARTED'.
 * When the harvesting has finished, a message is
 * sent with either status 'DONE' or 'FAILED'. Either a 'DONE' or 'FAILED'
 * message with a result should ALWAYS be sent if at all possible, but only
 * ever one such message per job.
 *
 * It is necessary to be able to run the Heritrix harvester on several machines
 * and in several processes on each machine. Each instance of Heritrix is
 * started and monitored by a HarvestControllerServer.
 *
 * Initially, all directories under serverdir are scanned for harvestinfo files.
 * If any are found, they are parsed for information, and an attempt is made to
 * upload all remaining files to the bitarchive. A crawlstatusmessage with
 * status FAILED is then sent back.
 *
 * A new thread is started for each actual crawl, in which the JMS listener is
 * removed. Threading is required since JMS will not let the called thread
 * remove the listener that is being handled.
 *
 * After a harvest job has terminated, either successfully or unsuccessfully,
 * the serverdir is again scanned for harvestInfo files so that files not yet
 * uploaded can be uploaded. The server then starts listening for new jobs
 * again, if there is enough room available on the machine. If not, it logs a
 * warning about this, which is also sent as a notification.
 */
public class HarvestControllerServer extends HarvesterMessageHandler implements CleanupIF {

    /** The unique instance of this class. */
    private static HarvestControllerServer instance;

    /** The logger to use. */
    private static final Log log = LogFactory.getLog(HarvestControllerServer.class);

    /**
     * The configured application instance id.
     * @see CommonSettings#APPLICATION_INSTANCE_ID
     */
    private final String applicationInstanceId = Settings.get(CommonSettings.APPLICATION_INSTANCE_ID);

    /**
     * The name of the server this <code>HarvestControllerServer</code> is
     * running on.
     */
    private final String physicalServerName = DomainUtils.reduceHostname(SystemUtils.getLocalHostName());

    /** The message to write to log when starting the server. */
    private static final String STARTING_MESSAGE = "Starting HarvestControllerServer.";
    /** The message to write to log when server is started. */
    private static final String STARTED_MESSAGE = "HarvestControllerServer started.";
    /** The message to write to log when stopping the server. */
    private static final String CLOSING_MESSAGE = "Closing HarvestControllerServer.";
    /** The message to write to log when server is stopped. */
    private static final String CLOSED_MESSAGE = "Closed down HarvestControllerServer";
    /** The message to write to log when starting a crawl. */
    private static final String STARTCRAWL_MESSAGE = "Starting crawl of job :";
    /** The message to write to log after ending a crawl. */
    private static final String ENDCRAWL_MESSAGE = "Ending crawl of job :";

    /** The max time to wait for the hosts-report.txt to be available (in secs). */
    static final int WAIT_FOR_HOSTS_REPORT_TIMEOUT_SECS = 30;

    /** Heritrix version property. */
    private static final String HERITRIX_VERSION_PROPERTY = "heritrix.version";

    /** queue-assignment-policy property. */
    private static final String HERITRIX_QUEUE_ASSIGNMENT_POLICY_PROPERTY =
            "org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy";

    /** The CHANNEL of jobs processed by this instance. */
    private static final String CHANNEL = Settings.get(HarvesterSettings.HARVEST_CONTROLLER_CHANNEL);

    /**
     * The JMS channel on which to listen for {@link HarvesterRegistrationResponse}s.
     */
    public static final ChannelID HARVEST_CHAN_VALID_RESP_ID =
            HarvesterChannels.getHarvesterRegistrationResponseChannel();

    /** The JMSConnection to use. */
    private JMSConnection jmsConnection;

    /**
     * The (singleton) HarvestController that handles the non-JMS parts of
     * a harvest.
     */
    private final HarvestController controller;

    /** Jobs are fetched from this queue. */
    private ChannelID jobChannel;

    /** Min. space required to start a job. */
    private final long minSpaceRequired;

    /** The serverdir, where the harvesting takes place. */
    private final File serverDir;

    /**
     * Tracks whether a doOneCrawl is currently running. No jobs are accepted
     * while a crawl is running.
     */
    private CrawlStatus status;

    /**
     * In this constructor, the server creates an instance of the
     * HarvestController and uploads any arc-files from incomplete harvests.
     * Then it starts listening for new doOneCrawl messages, unless there
     * is no available space. In that case, it sends a notification to
     * the administrator and pauses.
     *
     * @throws PermissionDenied
     *             If the serverdir or oldjobsdir can't be created.
     * @throws IOFailure
     *             If harvestInfoFile contains invalid data.
     * @throws UnknownID
     *             If the settings file does not specify a valid queue priority.
     */
    private HarvestControllerServer() throws IOFailure {
        log.info(STARTING_MESSAGE);
        log.info("Bound to harvest channel '" + CHANNEL + "'");

        // Make sure serverdir (where active crawl-dirs live) and oldJobsDir
        // (where old crawl dirs are stored) exist.
        serverDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_SERVERDIR));
        ApplicationUtils.dirMustExist(serverDir);
        log.info("Serverdir: '" + serverDir + "'");
        minSpaceRequired = Settings.getLong(HarvesterSettings.HARVEST_SERVERDIR_MINSPACE);
        if (minSpaceRequired <= 0L) {
            log.warn("Wrong setting of minSpaceLeft read from Settings: " + minSpaceRequired);
            throw new ArgumentNotValid("Wrong setting of minSpaceLeft read from Settings: " + minSpaceRequired);
        }
        log.info("Harvesting requires at least " + minSpaceRequired + " bytes free.");

        controller = HarvestController.getInstance();

        // Set properties "heritrix.version" and
        // "org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy"
        System.setProperty(HERITRIX_VERSION_PROPERTY, Constants.getHeritrixVersionString());
        System.setProperty(HERITRIX_QUEUE_ASSIGNMENT_POLICY_PROPERTY,
                "org.archive.crawler.frontier.HostnameQueueAssignmentPolicy,"
                + "org.archive.crawler.frontier.IPQueueAssignmentPolicy,"
                + "org.archive.crawler.frontier.BucketQueueAssignmentPolicy,"
                + "org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy,"
                + DomainnameQueueAssignmentPolicy.class.getName() + ","
                + SeedUriDomainnameQueueAssignmentPolicy.class.getName());

        // Get JMS-connection.
        // Channel THIS_CLIENT is only used for replies to store messages, so
        // do not set it as listener here. It is registered in the
        // arcrepository client.
        // Channel ANY_xxxPRIORITY_HACO is used for listening for jobs, and is
        // registered below.
        jmsConnection = JMSConnectionFactory.getInstance();
        log.debug("Obtained JMS connection.");

        status = new CrawlStatus();

        // If any unprocessed jobs are left on the server, process them now
        processOldJobs();

        // Register for listening to harvest channel validity responses
        JMSConnectionFactory.getInstance().setListener(HARVEST_CHAN_VALID_RESP_ID, this);

        // Ask if the channel this harvester is assigned to is valid
        jmsConnection.send(new HarvesterRegistrationRequest(HarvestControllerServer.CHANNEL, applicationInstanceId));
        log.info("Requested to check the validity of harvest channel '" + HarvestControllerServer.CHANNEL + "'");
    }

    /**
     * Returns or creates the unique instance of this singleton.
     * The server creates an instance of the HarvestController,
     * uploads arc-files from unfinished harvests, and
     * starts to listen to JMS messages on the incoming jms queues.
     *
     * @return The instance
     * @throws PermissionDenied
     *             If the serverdir or oldjobsdir can't be created
     * @throws IOFailure
     *             If data from old harvests exist, but contain illegal data
     */
    public static synchronized HarvestControllerServer getInstance() throws IOFailure {
        if (instance == null) {
            instance = new HarvestControllerServer();
        }
        return instance;
    }

    /**
     * Release all jms connections. Close the Controller.
     */
    public synchronized void close() {
        log.info(CLOSING_MESSAGE);
        cleanup();
        log.info(CLOSED_MESSAGE);
    }

    /**
     * Will be called on shutdown.
     * @see CleanupIF#cleanup()
     */
    public void cleanup() {
        if (controller != null) {
            controller.cleanup();
        }
        if (jmsConnection != null) {
            jmsConnection.removeListener(HARVEST_CHAN_VALID_RESP_ID, this);
            if (jobChannel != null) {
                jmsConnection.removeListener(jobChannel, this);
            }
        }
        // Stop the sending of status messages
        status.stopSending();
        instance = null;
    }

    /**
     * Looks for old job directories that await uploading.
     */
    private void processOldJobs() {
        // Search through all crawldirs and process PersistentJobData files in them
        File crawlDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_SERVERDIR));
        File[] subdirs = crawlDir.listFiles();
        for (File oldCrawlDir : subdirs) {
            if (PersistentJobData.existsIn(oldCrawlDir)) {
                // Assume that the crawl had not ended at this point, so the
                // job must be marked as failed
                final String msg = "Found old unprocessed job data in dir '"
                        + oldCrawlDir.getAbsolutePath()
                        + "'. Crawl probably interrupted by "
                        + "shutdown of HarvestController. "
                        + "Processing data.";
                log.warn(msg);
                NotificationsFactory.getInstance().notify(msg, NotificationType.WARNING);
                processHarvestInfoFile(oldCrawlDir,
                        new IOFailure("Crawl probably interrupted by shutdown of HarvestController"));
            }
        }
    }

    /**
     * Checks that we're available to do a crawl, and if so, marks us as
     * unavailable, checks that the job message is well-formed, and starts
     * the thread that the crawl happens in. If an error occurs while starting
     * the crawl, we will start listening for messages again.
     *
     * The sequence of actions involved in a crawl is:<br/>
     * 1. If we are already running, resend the job to the queue and return<br/>
     * 2. Check the job for validity<br/>
     * 3. Send a CrawlStatus message that the crawl has STARTED<br/>
     * In a separate thread:<br/>
     * 4. Unregister this HACO as listener<br/>
     * 5. Create a new crawldir (based on the JobID and a timestamp)<br/>
     * 6. Write a harvestInfoFile (using JobID and crawldir) and metadata<br/>
     * 7. Instantiate a new HeritrixLauncher<br/>
     * 8. Start a crawl<br/>
     * 9. Store the generated arc-files and metadata in the known bit-archives<br/>
     * 10. _Always_ send CrawlStatus DONE or FAILED<br/>
     * 11. Move crawldir into oldJobs dir<br/>
     *
     * @see #visit(DoOneCrawlMessage) for more details
     * @param msg The crawl job
     * @throws IOFailure On trouble harvesting, uploading or processing harvestInfo
     * @throws UnknownID if jobID is null in the message
     * @throws ArgumentNotValid if the status of the job is not valid - must be SUBMITTED
     * @throws PermissionDenied
     */
    private void onDoOneCrawl(final DoOneCrawlMessage msg)
            throws IOFailure, UnknownID, ArgumentNotValid, PermissionDenied {
        // Only one doOneCrawl at a time. Returning should almost never happen,
        // since we deregister the listener, but we may receive another message
        // before the listener is removed. Also, if the job message is
        // malformed or starting the crawl fails, we re-add the listener.
        synchronized (this) {
            if (status.isRunning()) {
                log.warn("Received crawl request, but sent it back to queue, "
                        + "as another crawl is already running: '" + msg + "'");
                jmsConnection.resend(msg, jobChannel);
                try {
                    // Wait a second before listening again, so the message has
                    // a chance of getting snatched by another harvester.
                    Thread.sleep(TimeUtils.SECOND_IN_MILLIS);
                } catch (InterruptedException e) {
                    // Since we're not waiting for anything critical, we can
                    // ignore this exception.
                }
                return;
            }
            stopAcceptingJobs();
        }

        Thread t = null;
        // This 'try' matches a finally that restores running=false if we don't
        // start a crawl after all
        try {
            final Job job = msg.getJob();

            // Every job must have an ID or we can do nothing with it, not even
            // send a proper failure message back.
            Long jobID = job.getJobID();
            if (jobID == null) {
                log.warn("DoOneCrawlMessage arrived without JobID: '" + msg.toString() + "'");
                throw new UnknownID("DoOneCrawlMessage arrived without JobID");
            }

            log.info("Received crawlrequest for job " + jobID + ": '" + msg + "'");

            // Send message to scheduler that job is started
            CrawlStatusMessage csmStarted = new CrawlStatusMessage(jobID, JobStatus.STARTED);
            jmsConnection.send(csmStarted);

            // Jobs should arrive with status "submitted". If this is not the
            // case, log the error and send a job-failed message back.
            // HarvestScheduler likes getting a STARTED message before
            // FAILED, so we oblige it here.
            if (job.getStatus() != JobStatus.SUBMITTED) {
                String message = "Message '" + msg.toString() + "' arrived with status "
                        + job.getStatus() + " for job " + jobID + ", should have been STATUS_SUBMITTED";
                log.warn(message);
                sendErrorMessage(jobID, message, message);
                throw new ArgumentNotValid(message);
            }

            final List<MetadataEntry> metadataEntries = msg.getMetadata();

            Thread t1;
            // Create thread in which harvesting will occur
            t1 = new HarvesterThread(job, msg.getOrigHarvestInfo(), metadataEntries);
            // Start thread which will remove this listener, harvest, store, and exit the VM
            t1.start();
            log.info("Started harvester thread for job " + jobID);
            // We delay assigning the thread variable until start() has
            // succeeded. Thus, if start() fails, we will resume accepting jobs.
            t = t1;
        } finally {
            // If we didn't start a thread for crawling after all, accept more messages
            if (t == null) {
                startAcceptingJobs();
            }
        }
        // Now return from this method, letting the thread do the work.
        // This is important as it allows us to receive upload-replies from
        // THIS_CLIENT in the crawl thread.
    }

    /**
     * Sends a CrawlStatusMessage for a failed job with the given short message
     * and detailed message.
     *
     * @param jobID ID of the job that failed
     * @param message A short message indicating what went wrong
     * @param detailedMessage A more detailed message detailing why it went wrong.
     */
    private void sendErrorMessage(long jobID, String message, String detailedMessage) {
        CrawlStatusMessage csm = new CrawlStatusMessage(jobID, JobStatus.FAILED, null);
        csm.setHarvestErrors(message);
        csm.setHarvestErrorDetails(detailedMessage);
        jmsConnection.send(csm);
    }

    /**
     * Stop accepting more jobs. After this is called, all crawl messages
     * received will be resent to the queue. A bit further down, we will stop
     * listening altogether, but that requires another thread.
     */
    private synchronized void stopAcceptingJobs() {
        status.setRunning(true);
        log.debug("No longer accepting jobs.");
    }

    /**
     * Start listening for new crawl requests again. This doesn't actually
     * re-add a listener, because the listener only gets removed when we're so
     * far committed that we're going to exit at the end. So to start accepting
     * jobs again, we stop resending the messages we get.
     */
    private synchronized void startAcceptingJobs() {
        // Allow this haco to receive messages
        status.setRunning(false);
    }

    /**
     * Stop listening for new crawl requests.
     */
    private void removeListener() {
        log.debug("Removing listener on CHANNEL '" + jobChannel + "'");
        jmsConnection.removeListener(jobChannel, this);
    }

    /** Start listening for crawls, if space is available. */
    private void beginListeningIfSpaceAvailable() {
        long availableSpace = FileUtils.getBytesFree(serverDir);
        if (availableSpace > minSpaceRequired) {
            log.info("Starts to listen to new jobs on queue '" + jobChannel + "'");
            jmsConnection.setListener(jobChannel, this);
            log.info(STARTED_MESSAGE);
        } else {
            String pausedMessage = "Not enough available diskspace. Only " + availableSpace
                    + " bytes available. Harvester is paused.";
            log.error(pausedMessage);
            NotificationsFactory.getInstance().notify(pausedMessage, NotificationType.ERROR);
        }
    }

    /**
     * Adds error messages from an exception to the status message errors.
     *
     * @param csm The message we're setting messages on.
     * @param crawlException The exception that got thrown from further in,
     *            possibly as far in as Heritrix.
     * @param errorMessage Description of errors that happened during upload.
     * @param missingHostsReport If true, no hosts report was found.
     * @param failedFiles Number of files that failed to upload.
     */
    private void setErrorMessages(CrawlStatusMessage csm, Throwable crawlException, String errorMessage,
            boolean missingHostsReport, int failedFiles) {
        if (crawlException != null) {
            csm.setHarvestErrors(crawlException.toString());
            csm.setHarvestErrorDetails(ExceptionUtils.getStackTrace(crawlException));
        }
        if (errorMessage.length() > 0) {
            String shortDesc = "";
            if (missingHostsReport) {
                shortDesc = "No hosts report found";
            }
            if (failedFiles > 0) {
                if (shortDesc.length() > 0) {
                    shortDesc += ", ";
                }
                shortDesc += failedFiles + " files failed to upload";
            }
            csm.setUploadErrors(shortDesc);
            csm.setUploadErrorDetails(errorMessage);
        }
    }

    /**
     * Receives a DoOneCrawlMessage and calls onDoOneCrawl.
     *
     * @param msg the message received
     * @throws IOFailure
     *             if the crawl fails, or if unable to write to harvestInfoFile
     * @throws UnknownID
     *             if jobID is null in the message
     * @throws ArgumentNotValid
     *             if the status of the job is not valid - must be SUBMITTED
     * @throws PermissionDenied
     *             if the crawldir can't be created
     */
    public void visit(DoOneCrawlMessage msg) throws IOFailure, UnknownID, ArgumentNotValid, PermissionDenied {
        onDoOneCrawl(msg);
    }

    @Override
    public void visit(HarvesterRegistrationResponse msg) {
        // If we have already started, or the message notifies for another
        // channel, resend it.
        String channelName = msg.getHarvestChannelName();
        if (status.isChannelValid() || !CHANNEL.equals(channelName)) {
            // Controller has already started
            jmsConnection.resend(msg, msg.getTo());
            if (log.isDebugEnabled()) {
                log.debug("Resending harvest channel validity message for channel '" + channelName + "'");
            }
            return;
        }

        if (!msg.isValid()) {
            log.error("Received message stating that channel '" + channelName + "' is invalid. Will stop.");
            close();
            return;
        }

        log.info("Received message stating that channel '" + channelName + "' is valid.");
        // Environment and connections are now ready for processing of messages
        jobChannel = HarvesterChannels.getHarvestJobChannelId(channelName, msg.isSnapshot());

        // Only listen for harvester jobs if enough space is available
        beginListeningIfSpaceAvailable();

        // Notify the harvest dispatcher that we are ready
        startAcceptingJobs();
        status.startSending();
    }

    /**
     * Processes an existing harvestInfoFile:<br/>
     * 1. Retrieves the jobID and crawlDir from the harvestInfoFile using class PersistentJobData<br/>
     * 2. Finds the JobId and arcsdir<br/>
     * 3. Calls storeArcFiles<br/>
     * 4. Moves the harvestdir to oldjobs and deletes crawl.log and other superfluous files.
     *
     * @param crawlDir The location of harvest-info to be processed
     * @param crawlException any exceptions thrown by the crawl which need to
     *            be reported back to the scheduler (may be null for success)
     * @throws IOFailure if the file cannot be read
     */
    private void processHarvestInfoFile(File crawlDir, Throwable crawlException) throws IOFailure {
        log.debug("Post-processing files in '" + crawlDir.getAbsolutePath() + "'");
        if (!PersistentJobData.existsIn(crawlDir)) {
            throw new IOFailure("No harvestInfo found in directory: " + crawlDir.getAbsolutePath());
        }

        PersistentJobData harvestInfo = new PersistentJobData(crawlDir);
        Long jobID = harvestInfo.getJobID();

        StringBuilder errorMessage = new StringBuilder();
        HarvestReport dhr = null;
        List<File> failedFiles = new ArrayList<File>();

        HeritrixFiles files = new HeritrixFiles(crawlDir, harvestInfo);
        try {
            log.info("Store files in directory '" + crawlDir + "' from jobID: " + jobID + ".");
            dhr = controller.storeFiles(files, errorMessage, failedFiles);
        } catch (Exception e) {
            String msg = "Trouble during postprocessing of files in '" + crawlDir.getAbsolutePath() + "'";
            log.warn(msg, e);
            errorMessage.append(e.getMessage()).append("\n");
            // Send a mail about this problem
            NotificationsFactory.getInstance().notify(
                    msg + ". Errors accumulated during the postprocessing: " + errorMessage.toString(),
                    NotificationType.ERROR, e);
        } finally {
            // Send a done or failed message back to harvest scheduler.
            // FindBugs claims a load of known null value here, but that
            // will not be the case if storeFiles() succeeds.
            CrawlStatusMessage csm;
            if (crawlException == null && errorMessage.length() == 0) {
                log.info("Job with ID " + jobID + " finished with status DONE");
                csm = new CrawlStatusMessage(jobID, JobStatus.DONE, dhr);
            } else {
                log.warn("Job with ID " + jobID + " finished with status FAILED");
                csm = new CrawlStatusMessage(jobID, JobStatus.FAILED, dhr);
                setErrorMessages(csm, crawlException, errorMessage.toString(), dhr == null, failedFiles.size());
            }
            try {
                jmsConnection.send(csm);
                if (crawlException == null && errorMessage.length() == 0) {
                    files.deleteFinalLogs();
                }
            } finally {
                // Delete superfluous files and move the rest to oldjobs.
                // Cleanup is in an extra finally, because it is large amounts
                // of data we need to remove, even on send trouble.
                log.info("Cleanup after harvesting job with id: " + jobID + ".");
                files.cleanUpAfterHarvest(new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR)));
            }
        }
        log.info("Done post-processing files for job " + jobID + " in dir: '" + crawlDir.getAbsolutePath() + "'");
    }

    /**
     * A Thread class for the actual harvesting. This is required in order to
     * stop listening while we're busy harvesting, since JMS doesn't allow the
     * called thread to remove the listener that was called.
     */
    private class HarvesterThread extends Thread {

        /** The harvester Job in this thread. */
        private final Job job;

        /** Stores documentary information about the harvest. */
        private final HarvestDefinitionInfo origHarvestInfo;

        /** The list of metadata associated with this Job. */
        private final List<MetadataEntry> metadataEntries;

        /**
         * Constructor for the HarvesterThread class.
         * @param job a harvesting job
         * @param origHarvestInfo Info about the harvestdefinition that scheduled this job
         * @param metadataEntries metadata associated with the given job
         */
        public HarvesterThread(Job job, HarvestDefinitionInfo origHarvestInfo, List<MetadataEntry> metadataEntries) {
            this.job = job;
            this.origHarvestInfo = origHarvestInfo;
            this.metadataEntries = metadataEntries;
        }

        /**
         * The thread body for the harvester thread. Removes the JMS listener,
         * sets up the files for Heritrix, then passes control to the
         * HarvestController to perform the actual harvest.
         *
         * TODO Get file writing into HarvestController as well
         * (requires some rearrangement of the message sending)
         *
         * @throws PermissionDenied if we cannot create the crawl directory.
         * @throws IOFailure if there are problems preparing or running the crawl.
         */
        public void run() {
            try {
                // We must remove the listener inside a thread, as JMS doesn't
                // allow us to remove it within the call it made.
                removeListener();

                File crawlDir = createCrawlDir();

                final HeritrixFiles files =
                        controller.writeHarvestFiles(crawlDir, job, origHarvestInfo, metadataEntries);

                log.info(STARTCRAWL_MESSAGE + " " + job.getJobID());

                Throwable crawlException = null;
                try {
                    controller.runHarvest(files);
                } catch (Throwable e) {
                    String msg = "Error during crawling. The crawl may have been only partially completed.";
                    log.warn(msg, e);
                    crawlException = e;
                    throw new IOFailure(msg, e);
                } finally {
                    // This handles some message sending, so it must live in
                    // HCS for now, but the meat of it should be in
                    // HarvestController.
                    // TODO Refactor to be able to move this out.
                    // TODO This may overwrite another exception, since this
                    // may throw exceptions.
                    processHarvestInfoFile(files.getCrawlDir(), crawlException);
                }
            } catch (Throwable e) {
                String msg = "Fatal error while operating job '" + job + "'";
                log.fatal(msg, e);
                NotificationsFactory.getInstance().notify(msg, NotificationType.ERROR, e);
            } finally {
                log.info(ENDCRAWL_MESSAGE + " " + job.getJobID());
                // Process serverdir for files not yet uploaded.
                processOldJobs();
                shutdownNowOrContinue();
                startAcceptingJobs();
                beginListeningIfSpaceAvailable();
            }
        }

        /**
         * Checks whether the operator wants us to shut down now.
         * TODO In a later implementation, the harvestControllerServer could
         * be notified over JMX. For now we just look for a "shutdown.txt"
         * file in the HARVEST_CONTROLLER_SERVERDIR.
         */
        private void shutdownNowOrContinue() {
            File shutdownFile = new File(serverDir, "shutdown.txt");
            if (shutdownFile.exists()) {
                log.info("Found shutdown-file in serverdir - shutting down the application");
                instance.cleanup();
                System.exit(0);
            }
        }

        /**
         * Create the crawl dir, but make sure a message is sent if there is a problem.
         *
         * @return The directory that the crawl will take place in.
         * @throws PermissionDenied if the directory cannot be created.
         */
        private File createCrawlDir() {
            // The directory where arcfiles are stored (crawldir in the above description)
            File crawlDir = null;
            // Create the crawldir. This is done here in order to be able
            // to send a proper message if something goes wrong.
            try {
                File baseCrawlDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_SERVERDIR));
                crawlDir = new File(baseCrawlDir, job.getJobID() + "_" + System.currentTimeMillis());
                FileUtils.createDir(crawlDir);
                log.info("Created crawl directory: '" + crawlDir + "'");
                return crawlDir;
            } catch (PermissionDenied e) {
                String message = "Couldn't create the directory for job " + job.getJobID();
                log.warn(message, e);
                sendErrorMessage(job.getJobID(), message, ExceptionUtils.getStackTrace(e));
                throw e;
            }
        }
    }

    /**
     * Used for maintaining the running status of the crawling: is it running
     * or not? Will also take care of notifying the HarvestJobManager of the
     * status.
     */
    private class CrawlStatus {

        /** Whether a crawl is currently running. */
        private boolean running = false;

        /** Whether the assigned harvest channel has been reported valid. */
        private boolean channelIsValid = false;

        /** Handles the periodic sending of status messages. */
        private PeriodicTaskExecutor statusTransmitter;

        private final int SEND_READY_DELAY = Settings.getInt(HarvesterSettings.SEND_READY_DELAY);

        /**
         * Returns <code>true</code> if a doOneCrawl is running, else <code>false</code>.
         * @return Whether a crawl is running.
         */
        public boolean isRunning() {
            return running;
        }

        /**
         * Used for changing the running state.
         * @param running The new status
         */
        public synchronized void setRunning(boolean running) {
            this.running = running;
        }

        /**
         * @return the channelIsValid
         */
        protected final boolean isChannelValid() {
            return channelIsValid;
        }

        /**
         * Starts the sending of status messages.
         */
        public void startSending() {
            this.channelIsValid = true;
            statusTransmitter = new PeriodicTaskExecutor("HarvesterStatus", new Runnable() {
                public void run() {
                    sendStatus();
                }
            }, 0, Settings.getInt(HarvesterSettings.SEND_READY_INTERVAL));
        }

        /**
         * Stops the sending of status messages.
         */
        public void stopSending() {
            if (statusTransmitter != null) {
                statusTransmitter.shutdown();
                statusTransmitter = null;
            }
        }

        /**
         * Send a HarvesterReadyMessage to the HarvestJobManager.
         */
        private synchronized void sendStatus() {
            try {
                Thread.sleep(SEND_READY_DELAY);
            } catch (Exception e) {
                log.error("Unable to sleep", e);
            }
            if (!running) {
                jmsConnection.send(new HarvesterReadyMessage(
                        applicationInstanceId + " on " + physicalServerName,
                        HarvestControllerServer.CHANNEL));
            }
        }
    }
}
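
To see how the lifecycle methods above fit together, here is a minimal, hypothetical bootstrap sketch. It is not part of the NetarchiveSuite source; the class name and the shutdown-hook wiring are illustrative assumptions. It only relies on the two public entry points shown above: getInstance(), whose constructor processes leftover job directories, registers the JMS listeners and requests validation of the configured harvest channel, and close(), which removes the listeners and stops the status sender.

package dk.netarkivet.harvester.harvesting.distribute;

/**
 * Hypothetical bootstrap sketch (NOT part of NetarchiveSuite): one way an
 * application could start and stop the HarvestControllerServer singleton.
 */
public class HarvestControllerServerBootstrapExample {

    public static void main(String[] args) {
        // Creating the instance uploads leftover data from old jobs, registers
        // the JMS listeners and asks for validation of the configured harvest
        // channel (see the private constructor above).
        final HarvestControllerServer server = HarvestControllerServer.getInstance();

        // Make sure listeners are removed and the status sender is stopped
        // when the JVM exits.
        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {
                server.close();
            }
        });

        // From here on the server reacts to DoOneCrawlMessages delivered over
        // JMS; there is nothing more for the bootstrap thread to do.
    }
}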
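
The shutdownNowOrContinue() method above checks for a file named "shutdown.txt" in the serverdir, and only does so after a crawl has finished. The following sketch, again hypothetical and not part of the source, shows how an operator-side script or test might request such a shutdown; the serverdir path is a placeholder and must match the HarvesterSettings.HARVEST_CONTROLLER_SERVERDIR of the running harvester.

import java.io.File;
import java.io.IOException;

/**
 * Hypothetical snippet: request a graceful shutdown of a running
 * HarvestControllerServer by creating the "shutdown.txt" marker file
 * that shutdownNowOrContinue() looks for after each crawl.
 */
public class RequestHarvesterShutdownExample {

    public static void main(String[] args) throws IOException {
        // Placeholder path; must be the serverdir of the running harvester.
        File serverDir = new File("/somepath/server");
        File shutdownFile = new File(serverDir, "shutdown.txt");
        if (shutdownFile.createNewFile()) {
            System.out.println("Shutdown requested: " + shutdownFile.getAbsolutePath());
        }
    }
}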