dk.netarkivet.harvester.indexserver.FileBasedCache.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.indexserver.FileBasedCache.java
Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 
 *  USA
 */

package dk.netarkivet.harvester.indexserver;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.indexserver.Index;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;

/**
 * A generic cache that stores items in files.  This abstract superclass handles
 * placement of the cache directory and adding/getting files using the
 * subclasses' methods for generating filenames.
 *
 * @param <I> The type of the ids that identify items in the cache.
 */
public abstract class FileBasedCache<I> {
    /** Cache directory. */
    protected File cacheDir;
    /** Logger, named after the concrete (sub)class. */
    private final Log log = LogFactory.getLog(getClass().getName());

    /**
     * Creates a new FileBasedCache object.  This creates a directory under the
     * main cache directory holding cached files.
     *
     * @param cacheName Name of this cache (enabling sharing among processes).
     *                  The directory created in the cachedir will have this
     *                  name.
     */
    public FileBasedCache(String cacheName) {
        ArgumentNotValid.checkNotNullOrEmpty(cacheName, "cacheName");
        this.cacheDir = new File(new File(Settings.get(CommonSettings.CACHE_DIR)), cacheName).getAbsoluteFile();
        // getCacheDir() is deliberately used here (rather than the field) so
        // that a subclass overriding it gets its own directory created.
        log.info("Metadata cache for '" + cacheName + "' uses directory '" + getCacheDir().getAbsolutePath() + "'");
        FileUtils.createDir(getCacheDir());
    }

    /**
     * Get the directory that the files are cached in.  Subclasses should
     * override this to create their own directory with this directory.  The
     * full directory structure will be created if required in the constructor.
     *
     * @return A directory that cache files can reside in.
     */
    public File getCacheDir() {
        return cacheDir;
    }

    /**
     * Get the file that caches content for the given ID.
     *
     * @param id Some sort of id that uniquely identifies the item within the
     *           cache.
     *
     * @return A file (possibly nonexistent or empty) that can cache the data
     *         for the id.
     */
    public abstract File getCacheFile(I id);

    /**
     * Fill in actual data in the file in the cache.  This is the workhorse
     * method that is allowed to modify the cache.  When this method is called,
     * the cache can assume that getCacheFile(id) does not exist.
     *
     * @param id Some identifier for the item to be cached.
     *
     * @return An id of content actually available.  In most cases, this will be
     *         the same as id, but for complex I it could be a subset (or null
     *         if the type argument I is a simple type).  If the return value is
     *         not the same as id, the file will not contain cached data, and
     *         may not even exist.
     */
    protected abstract I cacheData(I id);

    /**
     * Ensure that a file containing the appropriate content exists for the ID.
     * If the content cannot be found, this method may return null (if I is a
     * simple type) or an appropriate subset (if I is, say, a Set) indicating
     * the data that is actually available.  In the latter case, calling cache
     * on the returned set should always fill the file for that subset (barring
     * catastrophic failure).
     *
     * Locking:  If the file is not immediately found, we enter a file-creation
     * state.  To avoid corrupted data, we must ensure that only one cache
     * instance, and only one thread within any instance, creates the file. Thus
     * as long as somebody else seems to be creating the file, we wait and see
     * if they finish.  This is checked by having an exclusive lock on a
     * ".working" file (we cannot use the result file, as it has to be created
     * to be locked, and we may end up with a different cached file than we
     * thought, see above).  The .working file itself is irrelevant, only the
     * lock on it matters.
     *
     * @param id Some sort of id that uniquely identifies the item within the
     *           cache.
     *
     * @return The id given if it was successfully fetched, otherwise null if
     *         the type parameter I does not allow subsets, or a subset of id if
     *         it does.  This subset should be immediately cacheable.
     *
     * @throws ArgumentNotValid if id is null.
     * @throws IOFailure if the ".working" file cannot be opened, locked,
     *                   unlocked or closed.
     */
    public I cache(I id) {
        ArgumentNotValid.checkNotNull(id, "id");
        File cachedFile = getCacheFile(id);
        try {
            File fileBehindLockFile = new File(cachedFile.getAbsolutePath() + ".working");
            FileOutputStream lockFile = new FileOutputStream(fileBehindLockFile);
            FileLock lock = null;
            // The interned path is the monitor shared by all threads in this
            // JVM that work on the same cache file.
            // FIXME Potential memory leak. intern() remembers all strings until JVM exits.
            final String lockKey = fileBehindLockFile.getAbsolutePath().intern();
            // Make sure no other thread tries to create this
            log.debug("Waiting to enter synchronization on " + lockKey);
            synchronized (lockKey) {
                try {
                    // Make sure no other process tries to create this.
                    log.debug("locking filechannel for file '" + fileBehindLockFile.getAbsolutePath()
                            + "' (thread = " + Thread.currentThread().getName() + ")");
                    try {
                        lock = lockFile.getChannel().lock();
                    } catch (OverlappingFileLockException e) {
                        // Exception is logged below
                        throw new IOException(e.getMessage(), e);
                    }
                    // Now we know nobody else touches the file.
                    // If the file already exists, just return it.
                    if (cachedFile.exists()) {
                        return id;
                    }
                    return cacheData(id);
                } finally {
                    // Release and close while still holding the monitor, so the
                    // next thread never sees our lock or channel.  The nested
                    // finally guarantees the stream is closed even if releasing
                    // the lock fails; otherwise the file descriptor would leak.
                    try {
                        if (lock != null) {
                            log.debug("release lock on filechannel " + lockFile.getChannel());
                            lock.release();
                        }
                    } finally {
                        lockFile.close();
                    }
                }
            }
        } catch (IOException e) {
            String errMsg = "Error obtaining lock for file '" + cachedFile.getAbsolutePath() + "'.";
            log.warn(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
    }

    /**
     * Utility method to get a number of cache entries at a time.
     * Implementations of FileBasedCache may override this to perform the
     * caching more efficiently, if caching overhead per file is large.
     *
     * @param ids Set of IDs that uniquely identify a set of items within the
     *            cache.
     *
     * @return A map from ID to the files containing cached data for those IDs.
     *         Every requested ID is present as a key; if caching failed, even
     *         partially, for an ID, that ID is mapped to null.
     */
    public Map<I, File> get(Set<I> ids) {
        ArgumentNotValid.checkNotNull(ids, "Set<I> ids");
        Map<I, File> result = new HashMap<I, File>(ids.size());
        for (I id : ids) {
            // Only a complete hit (cache() hands back an equal id) counts.
            if (id.equals(cache(id))) {
                result.put(id, getCacheFile(id));
            } else {
                result.put(id, null);
            }
        }
        return result;
    }

    /**
     * Forgiving index generating method, that returns a file with an index, of
     * the greatest possible subset of a given id, and the subset.
     *
     * If the type I for instance is a Set, you may get an index of only a
     * subset. If I is a File, null may be seen as a subset.
     *
     * Note that when cache() ends up returning null, getCacheFile() is called
     * with null, so implementations used this way must accept a null id.
     *
     * @param id The requested index.
     *
     * @return An index over the greatest possible subset, and the subset.
     *
     * @see #cache for more information.
     */
    public Index<I> getIndex(I id) {
        I response = id;
        I lastResponse = null;
        // Keep retrying with whatever subset cache() reports as available
        // until it either returns exactly what was asked for, or null.
        while (response != null && !response.equals(lastResponse)) {
            if (lastResponse != null) {
                log.info("Requested index of type '" + this.getCacheDir().getName() + "' data '" + lastResponse
                        + "' not available. Retrying with available subset '" + response + "'");
            }
            lastResponse = response;
            response = cache(lastResponse);
        }
        File cacheFile = getCacheFile(response);
        log.info("Generated index '" + cacheFile + "' of id '" + response + "', request was for '" + id + "'");
        return new Index<I>(cacheFile, response);
    }
}