dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer.java

Source

/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.indexserver.distribute;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.distribute.Channels;
import dk.netarkivet.common.distribute.JMSConnection;
import dk.netarkivet.common.distribute.JMSConnectionFactory;
import dk.netarkivet.common.distribute.RemoteFile;
import dk.netarkivet.common.distribute.RemoteFileFactory;
import dk.netarkivet.common.distribute.RemoteFileSettings;
import dk.netarkivet.common.distribute.indexserver.RequestType;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.ChecksumCalculator;
import dk.netarkivet.common.utils.CleanupIF;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.distribute.HarvesterMessageHandler;
import dk.netarkivet.harvester.distribute.IndexReadyMessage;
import dk.netarkivet.harvester.indexserver.FileBasedCache;
import dk.netarkivet.harvester.indexserver.IndexRequestServerInterface;

/**
 * Index request server singleton.
 * <p>
 * This class contains a singleton that handles requesting an index over JMS.
 * <p>
 * It will ALWAYS reply to such messages: either with the index, with a message telling that only a subset is
 * available (and which subset), or with an error message.
 */
public final class IndexRequestServer extends HarvesterMessageHandler
        implements CleanupIF, IndexRequestServerInterface {

    /** The class logger. */
    private static final Logger log = LoggerFactory.getLogger(IndexRequestServer.class);

    /** The unique instance. Created on demand by {@link #getInstance()} and reset to null by {@link #cleanup()}. */
    private static IndexRequestServer instance;
    /** The handlers for index request types, keyed by request type. Populated through {@link #setHandler}. */
    private Map<RequestType, FileBasedCache<Set<Long>>> handlers;

    /** The connection to the JMSBroker. Used to add and remove this server as listener on the index server channel. */
    private static JMSConnection conn;
    /**
     * A map with the current indexing jobs in progress, keyed by the ID of the request message. Entries are added and
     * removed while synchronizing on the map itself.
     */
    private static Map<String, IndexRequestMessage> currentJobs;

    /**
     * The max number of concurrent jobs. When the number of current jobs reaches this limit, the server stops
     * listening for new requests.
     */
    private static long maxConcurrentJobs;
    /** Are we listening, now. True while this server is registered as listener on the index server channel. */
    private static AtomicBoolean isListening = new AtomicBoolean();

    /** Interval in milliseconds between listening checks. */
    private static long listeningInterval;
    /** The timer that initiates the checkIflisteningTask. Cancelled by {@link #cleanup()}. */
    private Timer checkIflisteningTimer = new Timer();

    /**
     * satisfactoryThreshold percentage as an integer. A partial index is accepted as a full index when the percentage
     * of found jobs is strictly greater than this value.
     */
    private int satisfactoryThresholdPercentage;

    /**
     * The directory to store backup copies of the currentJobs. In case of the indexserver crashing. Each request
     * message is serialized to a file named after the message ID.
     */
    private File requestDir;

    /**
     * Initialise index request server with no handlers. Reads the indexing settings (max number of concurrent
     * clients, request directory, listening interval and satisfactory threshold) and obtains the JMS connection.
     * <p>
     * The server does not listen to the index JMS channel until {@link #start()} has been called.
     */
    private IndexRequestServer() {
        maxConcurrentJobs = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_MAXCLIENTS);
        requestDir = Settings.getFile(HarvesterSettings.INDEXSERVER_INDEXING_REQUESTDIR);
        listeningInterval = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_LISTENING_INTERVAL);
        satisfactoryThresholdPercentage = Settings
                .getInt(HarvesterSettings.INDEXSERVER_INDEXING_SATISFACTORYTHRESHOLD_PERCENTAGE);

        currentJobs = new HashMap<String, IndexRequestMessage>();
        handlers = new EnumMap<RequestType, FileBasedCache<Set<Long>>>(RequestType.class);
        conn = JMSConnectionFactory.getInstance();
        // The listening timer is created by the field initializer of checkIflisteningTimer. Creating a second Timer
        // here would orphan the first one together with its background thread, which is never cancelled.
    }

    /**
     * Restore old requests from requestDir.
     * <p>
     * If the directory does not exist, it is created and nothing is restored. Otherwise every regular file in the
     * directory is deserialized to an {@link IndexRequestMessage}, registered among the current jobs (unless a job
     * with the same ID is already registered) and processed in a thread of its own.
     *
     * @throws IOFailure if the request directory can neither be created nor listed, or if a stored message cannot
     * be read.
     */
    private void restoreRequestsfromRequestDir() {
        if (!requestDir.exists()) {
            log.info("requestdir not found: creating request dir");
            if (!requestDir.mkdirs()) {
                throw new IOFailure("Unable to create requestdir '" + requestDir.getAbsolutePath() + "'");
            }
            return; // requestdir was just created, so nothing to do
        }

        File[] requests = requestDir.listFiles();
        if (requests == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            throw new IOFailure("Unable to list the contents of requestdir '" + requestDir.getAbsolutePath() + "'");
        }

        // Fill up the currentJobs
        for (File request : requests) {
            if (!request.isFile()) {
                log.debug("Ignoring directory in requestdir: {}", request.getAbsolutePath());
                continue;
            }
            final IndexRequestMessage msg = restoreMessage(request);
            synchronized (currentJobs) {
                if (currentJobs.containsKey(msg.getID())) {
                    log.debug("Skipped message w/id='{}'. Already among current jobs", msg.getID());
                    continue;
                }
                currentJobs.put(msg.getID(), msg);
            }
            // Start a new thread to handle the actual request.
            new Thread() {
                public void run() {
                    doProcessIndexRequestMessage(msg);
                }
            }.start();
            log.info("Restarting indexjob w/ ID={}", msg.getID());
        }
    }

    /**
     * Return the singleton index request server, creating it on the first call.
     * <p>
     * The method is synchronized on the class, so concurrent first calls create a single instance only.
     *
     * @return The one and only index request server.
     */
    public static synchronized IndexRequestServer getInstance() {
        if (instance != null) {
            return instance;
        }
        instance = new IndexRequestServer();
        return instance;
    }

    /**
     * Register the cache that serves index requests of the given type. A later registration for the same type
     * replaces the earlier one.
     *
     * @param t The type of index requested
     * @param handler The handler that should handle requests of this type.
     * @throws ArgumentNotValid if either argument is null
     */
    public void setHandler(RequestType t, FileBasedCache<Set<Long>> handler) {
        // Validate in declaration order, so the reported argument is the first offending one.
        ArgumentNotValid.checkNotNull(t, "RequestType t");
        ArgumentNotValid.checkNotNull(handler, "FileBasedCache<Set<Long>> handler");

        log.info("Setting handler for RequestType: {}", t);
        this.handlers.put(t, handler);
    }

    /**
     * Given a request for an index over a set of job ids, use a cache to try to create the index, then reply result.
     * <p>
     * The message is first stored in the request directory and registered among the current jobs; the actual
     * indexing is then done in a separate thread, see {@link #doProcessIndexRequestMessage(IndexRequestMessage)}.
     * When the number of current jobs reaches the configured maximum, this server stops listening for new requests.
     * <p>
     * If for any reason not all requested jobs can be indexed, return the subset. The client can then retry with this
     * subset, in order to get index of that subset.
     * <p>
     * Values read from the message in order to handle this: - Type of index requested - will use the index cache of
     * this type - Set of job IDs - which jobs to generate index for
     * <p>
     * Values written to message before replying: - The subset indexed - may be the entire set. ALWAYS set unless reply
     * !OK - File with index - ONLY if subset is entire set, the index requested.
     * <p>
     * This method should ALWAYS reply. May reply with not OK message if: - Message received was not OK - Request type
     * is null or unknown in message - Set of job ids is null in message - Cache generation throws exception - The
     * message cannot be stored in the request directory - A message with the same ID is already being processed
     *
     * @param irMsg A message requesting an index.
     * @throws ArgumentNotValid on null parameter
     */
    public synchronized void visit(final IndexRequestMessage irMsg) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(irMsg, "IndexRequestMessage irMsg");
        try {
            // save new msg to requestDir, so the job can be restarted after a crash
            saveMsg(irMsg);

            // Register the job and read the job count under the same lock: worker threads remove their entries
            // from the (unsynchronized) map concurrently.
            final int jobsInProgress;
            synchronized (currentJobs) {
                if (currentJobs.containsKey(irMsg.getID())) {
                    final String errMsg = "Should not happen. Skipping msg w/ id= '" + irMsg.getID()
                            + "' because already among current jobs. "
                            + "Unable to initiate indexing. Sending failed message back to sender";
                    log.warn(errMsg);
                    irMsg.setNotOk(errMsg);
                    JMSConnectionFactory.getInstance().reply(irMsg);
                    return;
                }
                currentJobs.put(irMsg.getID(), irMsg);
                jobsInProgress = currentJobs.size();
            }

            // Limit the number of concurrently indexing job
            if (jobsInProgress >= maxConcurrentJobs && isListening.get()) {
                conn.removeListener(Channels.getTheIndexServer(), this);
                isListening.set(false);
            }

            // Start a new thread to handle the actual request.
            new Thread() {
                public void run() {
                    doProcessIndexRequestMessage(irMsg);
                }
            }.start();
            log.debug("Now {} indexing jobs in progress", jobsInProgress);
        } catch (IOException e) {
            final String errMsg = "Unable to initiate indexing. Send failed message back to sender: " + e;
            log.warn(errMsg, e);
            irMsg.setNotOk(errMsg);
            JMSConnectionFactory.getInstance().reply(irMsg);
        }
    }

    /**
     * Save a IndexRequestMessage to disk. The message is serialized to a file in the request directory, named after
     * the ID of the message. An existing file with that name is overwritten.
     *
     * @param irMsg A message to store to disk
     * @throws IOException Throws IOException, if unable to save message
     */
    private void saveMsg(IndexRequestMessage irMsg) throws IOException {
        File dest = new File(requestDir, irMsg.getID());
        log.debug("Storing message to {}", dest.getAbsolutePath());
        // Writing message to file
        FileOutputStream fos = new FileOutputStream(dest);
        ObjectOutputStream oos = null;
        try {
            oos = new ObjectOutputStream(fos);
            oos.writeObject(irMsg);
            // Flush explicitly, so a failing write is reported to the caller instead of being swallowed by the
            // quiet close below.
            oos.flush();
        } finally {
            IOUtils.closeQuietly(oos);
            // Also close the file stream itself: oos is still null if the ObjectOutputStream constructor failed.
            IOUtils.closeQuietly(fos);
        }
    }

    /**
     * Restore message from serialized state.
     *
     * @param serializedObject the object stored as a file.
     * @return the restored message.
     * @throws IOFailure if the file cannot be read.
     * @throws IllegalState if the class of the stored object cannot be found, or if the file does not contain an
     * {@link IndexRequestMessage}.
     */
    private IndexRequestMessage restoreMessage(File serializedObject) {
        Object obj = null;
        FileInputStream fis = null;
        ObjectInputStream ois = null;
        try {
            // Read the message from disk.
            fis = new FileInputStream(serializedObject);
            ois = new ObjectInputStream(fis);

            obj = ois.readObject();
        } catch (ClassNotFoundException e) {
            throw new IllegalState("Not possible to read the stored message from file '"
                    + serializedObject.getAbsolutePath() + "':", e);
        } catch (IOException e) {
            throw new IOFailure("Not possible to read the stored message from file '"
                    + serializedObject.getAbsolutePath() + "':", e);
        } finally {
            IOUtils.closeQuietly(ois);
            // Also close the file stream itself: ois is still null if the ObjectInputStream constructor failed,
            // e.g. on a truncated or corrupt file.
            IOUtils.closeQuietly(fis);
        }

        if (obj instanceof IndexRequestMessage) {
            return (IndexRequestMessage) obj;
        }
        // A serialized null reference deserializes to null; do not dereference it for the error message.
        String actualType = (obj == null) ? "null reference" : obj.getClass().getName();
        throw new IllegalState("The serialized message is not a " + IndexRequestMessage.class.getName()
                + " but a " + actualType);
    }

    /**
     * Method that handles the processing of an indexRequestMessage. Returns the requested index immediately, if already
     * available, otherwise proceeds with the index generation of the requested index. Must be run in its own thread,
     * because it blocks while the index is generated.
     * <p>
     * Three outcomes are possible when the handler has been asked to cache the requested jobs:
     * <ul>
     * <li>Data was found for all jobs: the index is packaged with the reply (if the sender wants the index).</li>
     * <li>Data was found for a subset only, but the satisfactory threshold is reached: the index of the subset is
     * copied to the location of the full index, and the reply claims that all jobs were found.</li>
     * <li>Otherwise: the found subset is reported in the message, and a warning is logged.</li>
     * </ul>
     * Whatever happens, the job is removed from the current jobs, the stored copy of the message is deleted, and
     * either a reply or an {@link IndexReadyMessage} is sent.
     *
     * @param irMsg A message requesting an index
     * @see #visit(IndexRequestMessage)
     */
    private void doProcessIndexRequestMessage(final IndexRequestMessage irMsg) {
        final boolean mustReturnIndex = irMsg.mustReturnIndex();
        try {
            checkMessage(irMsg);
            RequestType type = irMsg.getRequestType();
            Set<Long> jobIDs = irMsg.getRequestedJobs();

            if (log.isInfoEnabled()) {
                log.info("Request received for an index of type '{}' for the {} jobs [{}]", type, jobIDs.size(),
                        StringUtils.conjoin(",", jobIDs));
            }
            FileBasedCache<Set<Long>> handler = handlers.get(type);

            // Here we need to make sure that we don't accidentally process more than
            // one message at the time before the whole process is over.
            // The IDs are sorted, so that equal job sets always give the same checksum (and thereby the same
            // lock), independently of the iteration order of the set implementation.
            List<Long> sortedList = new ArrayList<Long>(jobIDs);
            Collections.sort(sortedList);
            String allIDsString = StringUtils.conjoin("-", sortedList);
            String checksum = ChecksumCalculator.calculateMd5(allIDsString.getBytes());
            log.debug(
                    "Waiting to enter the synchronization zone for the indexing job of size {} with checksum '{}'",
                    jobIDs.size(), checksum);
            // Begin synchronization. The interned checksum string serves as a lock shared by all requests for
            // the same set of jobs.
            synchronized (checksum.intern()) {
                log.debug("The indexing job of size {} with checksum '{}' is now in the synchronization zone",
                        jobIDs.size(), checksum);
                Set<Long> foundIDs = handler.cache(jobIDs);
                irMsg.setFoundJobs(foundIDs);
                if (foundIDs.equals(jobIDs)) {
                    if (log.isInfoEnabled()) {
                        log.info("Retrieved successfully index of type '{}' for the {} jobs [{}]", type,
                                jobIDs.size(), StringUtils.conjoin(",", jobIDs));
                    }
                    File cacheFile = handler.getCacheFile(jobIDs);
                    if (mustReturnIndex) { // return index now!
                        packageResultFiles(irMsg, cacheFile);
                    }
                } else if (satisfactoryTresholdReached(foundIDs, jobIDs)) {
                    log.info(
                            "Data for full index w/ {} jobs not available. Only found data for {} jobs - "
                                    + "but satisfactoryTreshold reached, so assuming presence of all data",
                            jobIDs.size(), foundIDs.size());
                    // Make sure that the index of the data available is generated
                    Set<Long> theFoundIDs = handler.cache(foundIDs);
                    // TheFoundIDS should be identical to foundIDs
                    // Lets make sure of that
                    Set<Long> diffSet = new HashSet<Long>(foundIDs);
                    diffSet.removeAll(theFoundIDs);
                    if (!diffSet.isEmpty()) {
                        log.warn("The index generated for the found jobs unexpectedly lacks data for the jobs [{}]",
                                StringUtils.conjoin(",", diffSet));
                    }

                    // Make a copy of the index available, and give it the name of
                    // the index cache file wanted.
                    File cacheFileWanted = handler.getCacheFile(jobIDs);
                    File cacheFileCreated = handler.getCacheFile(foundIDs);

                    log.info("Satisfactory threshold reached - copying index {} '{}' to full index: {}",
                            (cacheFileCreated.isDirectory() ? "dir" : "file"), cacheFileCreated.getAbsolutePath(),
                            cacheFileWanted.getAbsolutePath());
                    if (cacheFileCreated.isDirectory()) {
                        // create destination cacheFileWanted, and
                        // copy all files in cacheFileCreated to cacheFileWanted.
                        cacheFileWanted.mkdirs();
                        FileUtils.copyDirectory(cacheFileCreated, cacheFileWanted);
                    } else {
                        FileUtils.copyFile(cacheFileCreated, cacheFileWanted);
                    }

                    // The index in cacheFileCreated is deliberately NOT deleted: it is deemed too dangerous,
                    // as the cachedir represented by cacheFileCreated may still be used.
                    log.info("We keep the index '{}', as we don't know if anybody is using it",
                            cacheFileCreated.getAbsolutePath());

                    // Information needed by recipient to store index in local cache
                    irMsg.setFoundJobs(jobIDs);
                    if (mustReturnIndex) { // return index now.
                        packageResultFiles(irMsg, cacheFileWanted);
                    }
                } else {
                    Set<Long> missingJobIds = new HashSet<Long>(jobIDs);
                    missingJobIds.removeAll(foundIDs);
                    log.warn("Failed generating index of type '{}' for the jobs [{}]. Missing data for jobs [{}].",
                            type, StringUtils.conjoin(",", jobIDs), StringUtils.conjoin(",", missingJobIds));
                }

            } // End of synchronization block
        } catch (Throwable t) {
            // Mark the message as failed before anything else, so the reply sent in the finally-clause reports
            // the failure even if building the log message should fail.
            irMsg.setNotOk(t);
            // The requested jobs may be null: that is one of the conditions checkMessage() rejects.
            Set<Long> requestedJobs = irMsg.getRequestedJobs();
            log.warn("Unable to generate index for jobs ["
                    + (requestedJobs == null ? "" : StringUtils.conjoin(",", requestedJobs)) + "]", t);
        } finally {
            // Remove job from currentJobs Set
            synchronized (currentJobs) {
                currentJobs.remove(irMsg.getID());
            }
            // delete stored message
            deleteStoredMessage(irMsg);
            final boolean succeeded = irMsg.isOk();
            final String state = succeeded ? "successful" : "failed";
            if (mustReturnIndex) {
                log.info("Sending {} reply for IndexRequestMessage back to sender '{}'.", state,
                        irMsg.getReplyTo());
                JMSConnectionFactory.getInstance().reply(irMsg);
            } else {
                log.info("Sending {} IndexReadyMessage to Scheduler for harvest {}", state, irMsg.getHarvestId());
                IndexReadyMessage irm = new IndexReadyMessage(irMsg.getHarvestId(), succeeded,
                        irMsg.getReplyTo(), Channels.getTheIndexServer());
                JMSConnectionFactory.getInstance().send(irm);
            }
        }
    }

    /**
     * Package the result files with the message reply. If the cache is a directory, every file in it is attached to
     * the message; otherwise the single cache file is attached. The remote file settings supplied in the message
     * (possibly null) are used when creating the remote files.
     *
     * @param irMsg the message being answered
     * @param cacheFile The location of the result on disk.
     * @throws IOFailure if the cache is a directory whose contents cannot be listed
     */
    private void packageResultFiles(IndexRequestMessage irMsg, File cacheFile) {
        RemoteFileSettings connectionParams = irMsg.getRemoteFileSettings();

        if (connectionParams != null) {
            log.debug("Trying to use client supplied RemoteFileServer: {}", connectionParams.getServerName());
        }
        if (!cacheFile.isDirectory()) {
            irMsg.setResultFile(RemoteFileFactory.getCopyfileInstance(cacheFile, connectionParams));
            return;
        }

        // This cache uses multiple files stored in a directory,
        // so transfer them all.
        File[] cacheFiles = cacheFile.listFiles();
        if (cacheFiles == null) {
            // listFiles() returns null on I/O errors, or if the directory disappeared after the check above.
            throw new IOFailure("Unable to list the files in the cache directory '" + cacheFile.getAbsolutePath()
                    + "'");
        }
        List<RemoteFile> resultFiles = new ArrayList<RemoteFile>(cacheFiles.length);
        for (File f : cacheFiles) {
            resultFiles.add(RemoteFileFactory.getCopyfileInstance(f, connectionParams));
        }
        irMsg.setResultFiles(resultFiles);
    }

    /**
     * Decide whether an index covering only some of the requested jobs holds enough data to be accepted as a
     * satisfactory index. The percentage of found jobs is computed with integer division and compared with the
     * {@link IndexRequestServer#satisfactoryThresholdPercentage}.
     *
     * @param foundIDs The list of IDs contained in the index
     * @param requestedIDs The list of IDs requested in the index.
     * @return true, if the percentage of found jobs is strictly greater than the
     * {@link IndexRequestServer#satisfactoryThresholdPercentage}; false otherwise.
     */
    private boolean satisfactoryTresholdReached(Set<Long> foundIDs, Set<Long> requestedIDs) {
        final int requestedCount = requestedIDs.size();
        final int foundCount = foundIDs.size();
        final int foundPercentage = (foundCount * 100) / requestedCount;
        return foundPercentage > satisfactoryThresholdPercentage;
    }

    /**
     * Remove the serialized copy of the given message from the request directory. A missing file is logged as a
     * warning; a file that could not be removed is only logged at debug level.
     *
     * @param irMsg a given IndexRequestMessage
     */
    private void deleteStoredMessage(IndexRequestMessage irMsg) {
        final File storedMessage = new File(requestDir, irMsg.getID());
        log.debug("Trying to delete stored serialized message: {}", storedMessage.getAbsolutePath());
        if (storedMessage.exists()) {
            if (!FileUtils.remove(storedMessage)) {
                log.debug("The file '{}' was not deleted", storedMessage);
            }
        } else {
            log.warn("The file does not exist any more.");
        }
    }

    /**
     * Validate an incoming request before it is processed. The message must be OK, must name a request type and a
     * set of jobs, and a handler must have been registered for the request type.
     *
     * @param irMsg The message to check.
     * @throws ArgumentNotValid If message is not OK, or if the list of jobs or the index request type is null.
     * @throws UnknownID If no handler is registered for the index request type.
     */
    private void checkMessage(final IndexRequestMessage irMsg) throws UnknownID, ArgumentNotValid {
        ArgumentNotValid.checkTrue(irMsg.isOk(), "Message was not OK");

        final RequestType requestedType = irMsg.getRequestType();
        ArgumentNotValid.checkNotNull(requestedType, "RequestType type");
        ArgumentNotValid.checkNotNull(irMsg.getRequestedJobs(), "Set<Long> jobIDs");

        final FileBasedCache<Set<Long>> registeredHandler = handlers.get(requestedType);
        if (registeredHandler == null) {
            throw new UnknownID("No handler known for requesttype " + requestedType);
        }
    }

    /**
     * Shut down this server. Delegates to {@link #cleanup()}, which releases the JMS-connection and resets the
     * singleton.
     */
    public void close() {
        this.cleanup();
    }

    /**
     * Releases the JMS-connection and resets the singleton. Cancels the listening timer, removes this server as
     * listener on the index server channel, forgets all registered handlers, and clears the singleton reference, so
     * that the next call of {@link #getInstance()} creates a new server.
     */
    public void cleanup() {
        // shutdown listening timer.
        checkIflisteningTimer.cancel();
        conn.removeListener(Channels.getTheIndexServer(), this);
        // Keep the flag in step with the listener registration, as is done when visit() stops listening.
        isListening.set(false);
        handlers.clear();

        // getInstance() is synchronized on the class; reset the reference under the same lock.
        synchronized (IndexRequestServer.class) {
            instance = null;
        }
    }

    /**
     * Start the server. Requests stored in the request directory are restored and their processing is resumed;
     * after that a timer task is scheduled that, at the configured listening interval, checks whether the server
     * should begin listening for index requests.
     */
    public void start() {
        restoreRequestsfromRequestDir();
        log.info("{} indexing jobs in progress that was stored in requestdir: {}", currentJobs.size(),
                requestDir.getAbsolutePath());

        // We are not listening yet. The scheduled task looks at the isListening flag only, and starts listening
        // as soon as there is room for more jobs. Its first run is immediate.
        isListening.set(false);
        checkIflisteningTimer.schedule(new ListeningTask(this), 0L, listeningInterval);
    }

    /**
     * Timer task that repeatedly checks the listening status. When the server is not listening and fewer jobs than
     * the allowed maximum are in progress, the server is registered as listener on the index server channel again.
     */
    private static class ListeningTask extends TimerTask {
        /** The indexrequestserver to register as listener. */
        private IndexRequestServer server;

        /**
         * Create a task for the given server.
         *
         * @param irs The indexrequestserver this task should be associated with
         */
        ListeningTask(IndexRequestServer irs) {
            this.server = irs;
        }

        @Override
        public void run() {
            log.trace("Checking if we should be listening again");
            if (isListening.get()) {
                return; // already listening, nothing to do
            }
            if (currentJobs.size() >= maxConcurrentJobs) {
                return; // still no room for more jobs
            }
            log.info("Enabling listening to the indexserver channel '{}'", Channels.getTheIndexServer());
            conn.setListener(Channels.getTheIndexServer(), server);
            isListening.set(true);
        }

    }

}