edu.lternet.pasta.portal.search.BrowseCrawlerServlet.java Source code

Introduction

Here is the source code for edu.lternet.pasta.portal.search.BrowseCrawlerServlet.java
Source

/*
 *
 * $Date: 2012-06-22 12:23:25 -0700 (Fri, 22 June 2012) $
 * $Author: dcosta $
 * $Revision: 2145 $
 *
 * Copyright 2011,2012 the University of New Mexico.
 *
 * This work was supported by National Science Foundation Cooperative
 * Agreements #DEB-0832652 and #DEB-0936498.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific
 * language governing permissions and limitations under the License.
 *
 */

package edu.lternet.pasta.portal.search;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import javax.servlet.*;
import javax.servlet.http.*;

import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.log4j.Logger;

import edu.lternet.pasta.portal.ConfigurationListener;

/**
 * BrowseCrawlerServlet class allows the BrowseCrawler to run as a background
 * process.
 */
public class BrowseCrawlerServlet extends HttpServlet implements Runnable {

    /*
     * Class fields
     */;
    private static final Logger logger = Logger.getLogger(BrowseCrawlerServlet.class);
    static final long serialVersionUID = 0; // Needed for Eclipse warning.

    /*
     * Instance fields
     */
    private boolean keepRunning = true; // stops thread if false
    private Thread browseCrawlerThread; // background thread
    private int crawlPeriod; // minimum hours between crawls
    ServletContext servletContext = null;
    private int windowBeginsAt = 0; // first hour of crawlwindow (0-23)
    private int windowEndsAt = 23; // last hour of crawl window (0-23) 

    /*
     * Class methods
     */

    /*
     * Instance methods
     */

    /**
     * Determine whether this is a good time to run the crawler. Currently the
     * only criterion is whether the current time is within the crawl window
     * hours. For example, if windowBeginsAt is 3, and windowEndsAt is 4, then
     * the crawler will only run between 3 a.m. and 4 a.m.  (Note that the logic
     * here is limited, because it only works when windowBeginsAt is less than
     * windowEndsAt.)
     */
    private boolean checkCrawlCriteria() {
        boolean crawlCriteria = true;

        crawlCriteria = crawlCriteria && isWithinCrawlWindow();

        return crawlCriteria;
    }

    /**
     * Stops the BrowseCrawlerServlet thread when the servlet shuts down.
     */
    public void destroy() {
        keepRunning = false;
    }

    /**
     * Initializes the servlet by starting a separate thread in which to
     * run the BrowseCrawler main program.
     * 
     * @param config   the ServletConfig object, holding servlet configuration
     *                 info
     * @throws         ServletException
     */
    public void init(ServletConfig config) throws ServletException {
        super.init(config);
        PropertiesConfiguration options = ConfigurationListener.getOptions();
        String browseDirPath = options.getString("browse.dir");
        BrowseSearch.setBrowseCacheDir(browseDirPath);
        Integer crawlPeriodInt;
        this.servletContext = getServletContext();

        crawlPeriodInt = new Integer("24");
        this.crawlPeriod = crawlPeriodInt.intValue();

        browseCrawlerThread = new Thread(this);
        browseCrawlerThread.setPriority(Thread.MIN_PRIORITY); // be a good citizen
        browseCrawlerThread.start();
    }

    /**
     * Initiates a new crawl by constructing a new BrowseCrawler object and
     * calling its crawl() method.
     */
    private void initiateNewCrawl() {
        BrowseCrawler browseCrawler = new BrowseCrawler();
        BrowseGroup browseCache = browseCrawler.crawlKeywordTerms();

        if (browseCache != null) {
            /* Lock the servlet context object to guarantee that only one thread at a
             * time can be getting or setting the context attribute. 
             */
            synchronized (servletContext) {
                servletContext.setAttribute("browseKeywordHTML", browseCache.toHTML());
            }
        }
    }

    /**
     * Boolean to determine whether the current time is within the crawl window.
     * In other words, is this a good time to crawl? For example, if 
     * windowBeginsAt is 3, and windowEndsAt is 4, then the crawler
     * will only run between 3 a.m. and 4 a.m.  (Note that the logic
     * here is limited, because it only works when windowBeginsAt is less than
     * windowEndsAt.)
     * 
     * @return  true if the current time is within the crawl window, else false
     */
    private boolean isWithinCrawlWindow() {
        Calendar now = Calendar.getInstance();
        int hourOfDay = now.get(Calendar.HOUR_OF_DAY);
        boolean isWithin = false;

        if ((hourOfDay >= windowBeginsAt) && (hourOfDay <= windowEndsAt)) {
            isWithin = true;
        }

        logger.debug("windowBeginsAt:" + windowBeginsAt);
        logger.debug("windowEndsAt:  " + windowEndsAt);

        return isWithin;
    }

    /**
     * Runs the BrowseCrawler main program in a separate thread. 
     */
    public void run() {
        long delta; // endTime - startTime
        long endTime; // time that a crawl completes
        final long oneHour = (60 * 60 * 1000); // milliseconds in one hour
        long periodSleepTime; // time to sleep between crawls
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("E yyyy.MM.dd 'at' hh:mm:ss a zzz");
        long startTime; // time that a crawl starts
        final long tenMinutes = (10 * 60 * 1000); // milliseconds in ten minutes

        while (keepRunning) {
            Date now = new Date();
            logger.debug("Checking crawl critieria at time: " + simpleDateFormat.format(now));
            if (checkCrawlCriteria()) {
                logger.info("Initiating new crawl.");
                startTime = System.currentTimeMillis();
                initiateNewCrawl();
                endTime = System.currentTimeMillis();
                delta = endTime - startTime;

                try {
                    periodSleepTime = (crawlPeriod * oneHour) - delta;
                    logger.info("Crawl completed. Sleeping for " + periodSleepTime + " milliseconds.");
                    Thread.sleep(periodSleepTime);
                } catch (InterruptedException e) {
                    logger.warn("InterruptedException: " + e.getMessage());
                }
            } else {
                logger.debug("Crawl criteria failed. Sleeping for ten minutes.");
                try {
                    Thread.sleep(tenMinutes);
                } catch (InterruptedException e) {
                    logger.error("InterruptedException: " + e.getMessage());
                }
            }
        }
    }

}