au.edu.jcu.fascinator.plugin.harvester.directory.DerbyCache.java Source code

Java tutorial

Introduction

Here is the source code for au.edu.jcu.fascinator.plugin.harvester.directory.DerbyCache.java

Source

/*
 * The Fascinator - Plugin - File System Harvester - Derby Cache
 * Copyright (C) 2011 University of Southern Queensland
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package au.edu.jcu.fascinator.plugin.harvester.directory;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.codec.binary.Hex;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.googlecode.fascinator.common.JsonSimpleConfig;

/**
 * <p>
 * This class is designed to encapsulate all logic required to run a cache
 * backed into a derby database in support of the file system harvester. There
 * are two caches available.
 * </p>
 * 
 * <ul>
 * <li><b>Basic</b>: The file is considered 'cached' if the last modified date
 * matches the database entry. On some operating systems (like linux) this can
 * provide a minimum of around 2 seconds of granularity. For most purposes this
 * is sufficient, and this cache is the most efficient.</li>
 * <li><b>Hashed</b>: The entire contents of the file are SHA hashed and the
 * hash is stored in the database. The file is considered cached if the old hash
 * matches the new hash. This approach will only trigger a harvest if the
 * contents of the file really change, but it is quite slow across large data
 * sets and large files.</li>
 * </ul>
 * 
 * <p>
 * Some form of caching must be enabled to support deletion detection for this
 * plugin.
 * </p>
 * 
 * @author Greg Pendlebury
 */
public class DerbyCache {
    /** Logging */
    private final Logger log = LoggerFactory.getLogger(DerbyCache.class);

    /** JDBC Driver */
    private static final String DERBY_DRIVER = "org.apache.derby.jdbc.EmbeddedDriver";

    /** Connection string prefix */
    private static final String DERBY_PROTOCOL = "jdbc:derby:";

    /** Database name */
    private static final String DATABASE_NAME = "fsHarvestCache";

    /** Basic table */
    private static final String BASIC_TABLE = "basic";

    /** Hash table */
    private static final String HASH_TABLE = "hashed";

    /** Database home directory */
    private String derbyHome;

    /** Database connection */
    private Connection connection;

    /** Are we using the database cache */
    private boolean useCache;

    /** Configured cache type; caching is only used for "basic" or "hashed" */
    private String cacheType;

    /** Configured cache identifier, stored with every row in the cache tables */
    private String cacheId;

    public DerbyCache(JsonSimpleConfig config) throws Exception {
        cacheType = config.getString(null, "harvester", "file-system", "caching");
        cacheId = config.getString(null, "harvester", "file-system", "cacheId");
        if (cacheType != null && cacheId != null && (cacheType.equals("basic") || cacheType.equals("hashed"))) {
            useCache = true;
            startDatabase(config);
        } else {
            log.error("Caching is either disabled or not configured properly:");
            log.error("Cache Type: '{}'", cacheType);
            log.error("Cache ID: '{}'", cacheId);
        }
    }

    /**
     * Resolve Derby's home directory and make sure both cache tables exist.
     *
     * <p>
     * An already set 'derby.system.home' system property always wins over the
     * configured value. If neither is available the cache cannot be prepared,
     * so caching is switched off for this instance.
     * </p>
     *
     * @param config The configuration holding 'database-service/derbyHome'
     * @throws Exception if the home directory is unusable or the tables could
     *             not be prepared
     */
    private void startDatabase(JsonSimpleConfig config) throws Exception {
        // Find data directory
        derbyHome = config.getString(null, "database-service", "derbyHome");
        String oldHome = System.getProperty("derby.system.home");

        if (oldHome != null) {
            // Derby's data directory has already been configured
            if (derbyHome != null) {
                // Use the existing one, but throw a warning
                log.warn("Using previously specified data directory:"
                        + " '{}', provided value has been ignored: '{}'", oldHome, derbyHome);
            } else {
                // This is ok, no configuration conflicts
                log.info("Using existing data directory: '{}'", oldHome);
            }

        } else if (derbyHome == null) {
            // We don't have one, and neither does the config. The tables are
            // never prepared on this path, so leaving the cache enabled would
            // make every later lookup fail; disable it instead.
            log.error("No database home directory configured!");
            useCache = false;
            return;

        } else {
            // Establish its validity and existence, create if necessary
            File file = new File(derbyHome);
            if (file.exists()) {
                if (!file.isDirectory()) {
                    throw new Exception("Database home '" + derbyHome + "' is not a directory!");
                }
            } else {
                file.mkdirs();
                if (!file.exists()) {
                    throw new Exception(
                            "Database home '" + derbyHome + "' does not exist and could not be created!");
                }
            }
            System.setProperty("derby.system.home", derbyHome);
        }

        // Database prep work
        try {
            checkTable(BASIC_TABLE);
            checkTable(HASH_TABLE);
        } catch (SQLException ex) {
            log.error("Error during database preparation:", ex);
            throw new Exception("Error during database preparation:", ex);
        }
    }

    /**
     * Provide the shared database connection, (re)creating it when there is
     * none yet or the current one no longer validates.
     *
     * @return Connection : A connection to the cache database
     * @throws SQLException if the driver cannot be loaded or the connection
     *             cannot be established
     */
    private Connection connection() throws SQLException {
        // Reuse the current connection for as long as it validates
        if (connection != null && connection.isValid(1)) {
            return connection;
        }

        // A non-null connection at this point is a broken one; make an
        // attempt to close it before replacing it
        if (connection != null) {
            log.error("!!! Database connection has failed, recreating.");
            try {
                connection.close();
            } catch (SQLException ex) {
                log.error("Error closing invalid connection, ignoring: {}", ex.getMessage());
            }
        }

        // Load the JDBC driver
        try {
            Class.forName(DERBY_DRIVER).newInstance();
        } catch (Exception ex) {
            log.error("Driver load failed: ", ex);
            throw new SQLException("Driver load failed: ", ex);
        }

        // Establish a database connection, creating the database if needed
        String url = DERBY_PROTOCOL + DATABASE_NAME + ";create=true";
        connection = DriverManager.getConnection(url, new Properties());
        return connection;
    }

    /**
     * <p>
     * Decide whether a file should be harvested, based on the configured
     * cache.
     * </p>
     *
     * <ul>
     * <li>A missing file, or any error while consulting the cache, gives
     * <b>false</b>.</li>
     * <li>With no cache configured the answer is always <b>true</b>.</li>
     * <li>With a cache configured the answer is <b>true</b> when the file
     * appears to have changed and <b>false</b> when it does not.</li>
     * </ul>
     *
     * @param oid : The object ID the file is cached under.
     * @param file : The file to test.
     * @return boolean : <b>True</b> if the harvest should proceed, otherwise
     *         <b>false</b>.
     */
    public boolean hasChanged(String oid, File file) {
        // Sanity check
        boolean usable = file != null && file.exists();
        if (!usable) {
            return false;
        }

        // Nothing to consult, just approve it
        if (!useCache) {
            return true;
        }

        // Cache check
        try {
            if (cacheType.equals("basic")) {
                return checkBasicCache(oid, file);
            }
            if (cacheType.equals("hashed")) {
                return checkHashedCache(oid, file);
            }
        } catch (Exception ex) {
            log.error("Error during cache process: ", ex);
            return false;
        }

        // Fallback for an unrecognised cache type, just approve it
        return true;
    }

    /**
     * <p>
     * Used to support deletion detection. Should be called at the beginning of
     * a harvest to reset the change flag of every row belonging to this cache
     * ID in the table of the configured cache type.
     * </p>
     *
     * <p>
     * Does nothing when caching is disabled. Database errors are logged and
     * not propagated.
     * </p>
     */
    public void resetFlags() {
        if (!useCache) {
            return;
        }

        PreparedStatement sql = null;
        try {
            // Caching is only enabled for the "basic" and "hashed" types
            String table = cacheType.equals("basic") ? BASIC_TABLE : HASH_TABLE;

            // Bind the cache ID rather than splicing it into the SQL text, so
            // quotes in the configured value cannot break the statement
            sql = connection().prepareStatement("UPDATE " + table + " SET changeFlag = 0 WHERE cacheId = ?");
            sql.setString(1, cacheId);
            sql.executeUpdate();
        } catch (Exception ex) {
            log.error("Error updating cache to reset flags: ", ex);
        } finally {
            // Release the statement even if execution failed
            close(sql);
        }
    }

    /**
     * <p>
     * Used to support deletion detection. This method is called after the
     * harvest, and will return the object IDs of all rows for this cache ID
     * whose change flag is still unset.
     * </p>
     *
     * <p>
     * Database errors are logged and not propagated; whatever was collected
     * before the error is returned.
     * </p>
     *
     * @return Set<String>: All object IDs in the cache which have not been
     *         'touched', or <b>null</b> if caching is disabled or the query
     *         could not be prepared.
     */
    public Set<String> getUnsetFlags() {
        Set<String> response = null;
        if (!useCache) {
            return response;
        }

        PreparedStatement sql = null;
        ResultSet result = null;
        try {
            // Caching is only enabled for the "basic" and "hashed" types
            String table = cacheType.equals("basic") ? BASIC_TABLE : HASH_TABLE;

            // Bind the cache ID rather than splicing it into the SQL text
            sql = connection()
                    .prepareStatement("SELECT oid FROM " + table + " WHERE changeFlag = 0 AND cacheId = ?");
            sql.setString(1, cacheId);

            // Build response
            response = new HashSet<String>();
            result = sql.executeQuery();
            while (result.next()) {
                String oid = result.getString("oid");
                if (oid != null) {
                    response.add(oid);
                }
            }
        } catch (Exception ex) {
            log.error("Error querying cache for unset flags: ", ex);
        } finally {
            // Release database resources even if the query failed
            close(result);
            close(sql);
        }

        return response;
    }

    /**
     * <p>
     * Used to support deletion detection. This method is just for cleanup after
     * the process completes: it deletes every row for this cache ID whose
     * change flag is still unset.
     * </p>
     *
     * <p>
     * Does nothing when caching is disabled. Database errors are logged and
     * not propagated.
     * </p>
     */
    public void purgeUnsetFlags() {
        if (!useCache) {
            return;
        }

        PreparedStatement sql = null;
        try {
            // Caching is only enabled for the "basic" and "hashed" types
            String table = cacheType.equals("basic") ? BASIC_TABLE : HASH_TABLE;

            // Bind the cache ID rather than splicing it into the SQL text
            sql = connection().prepareStatement("DELETE FROM " + table + " WHERE changeFlag = 0 AND cacheId = ?");
            sql.setString(1, cacheId);
            sql.executeUpdate();
        } catch (Exception ex) {
            log.error("Error updating cache to delete data: ", ex);
        } finally {
            // Release the statement even if execution failed
            close(sql);
        }
    }

    /**
     * Compare the file's last modified time against the 'basic' cache and
     * refresh the cache entry.
     *
     * <p>
     * The timestamp is sampled from the file exactly once, and that same value
     * is both compared and stored. Sampling again for the store could record a
     * modification that was never reported as a change.
     * </p>
     *
     * @param oid The object ID the file is cached under
     * @param file The file to test
     * @return boolean : <b>true</b> if the file is new to the cache or newer
     *         than its cached timestamp, otherwise <b>false</b>
     * @throws Exception if the cache entry could not be written
     */
    private boolean checkBasicCache(String oid, File file) throws Exception {
        // What do we know?
        long lastCached = getLastModified(oid);
        long lastModified = file.lastModified();

        if (lastCached == -1L) {
            // First time... insert and return true
            insertLastModified(oid, lastModified);
            return true;
        }

        // Force an update... even if unchanged, the flag avoids deletes
        updateLastModified(oid, lastModified);

        // NOTE(review): only a newer timestamp counts as a change; a file
        // replaced by an older copy is not reported - confirm this is intended
        return lastModified > lastCached;
    }

    /**
     * Compare the file's current content hash against the 'hashed' cache and
     * refresh the cache entry.
     *
     * @param oid The object ID the file is cached under
     * @param file The file to test
     * @return boolean : <b>true</b> if the file is new to the cache or its
     *         hash differs from the cached one, otherwise <b>false</b>
     * @throws Exception if the file could not be hashed or the cache entry
     *             could not be written
     */
    private boolean checkHashedCache(String oid, File file) throws Exception {
        // Gather both sides of the comparison up front
        final String previous = getHash(oid);
        final String current = hashFile(file);

        // Never seen before: record it and report a change
        if (previous == null) {
            insertHash(oid, current);
            return true;
        }

        // Always rewrite the row; setting the flag protects it from deletion
        updateHash(oid, current);

        // Changed only if the content hash moved
        return !current.equals(previous);
    }

    /**
     * Shutdown the cache database (but not the Derby engine) and close this
     * instance's connection.
     *
     * <p>
     * The shutdown request is expected to answer with an SQLException carrying
     * error code 45000 and SQL state "08006"; that response is treated as
     * success. Any other response is ignored as well, but is logged at debug
     * level so it is no longer invisible.
     * </p>
     *
     * @throws Exception if the connection could not be closed
     */
    public void shutdown() throws Exception {
        try {
            // Shutdown just this database (but not the engine)
            DriverManager.getConnection(DERBY_PROTOCOL + DATABASE_NAME + ";shutdown=true");
        } catch (SQLException ex) {
            // These test values are used if the engine is NOT shutdown.
            // Constant-first comparison: the SQL state may be null.
            boolean expected = ex.getErrorCode() == 45000 && "08006".equals(ex.getSQLState());
            if (!expected) {
                // Derby can only be shutdown from one thread, so failures
                // from the rest are tolerated rather than rethrown
                log.debug("Ignoring unexpected response to database shutdown: {}", ex.getMessage());
            }
        } finally {
            // Close our connection
            if (connection != null) {
                try {
                    connection.close();
                } catch (SQLException ex) {
                    throw new Exception("Error closing connection:", ex);
                } finally {
                    // Drop the reference even if closing failed, so a later
                    // call starts from a fresh connection
                    connection = null;
                }
            }
        }
    }

    /**
     * Look up the cached last modified time of an object in the 'basic' table.
     *
     * @param oid The object ID to look up
     * @return long : The cached time in milliseconds, or -1 if there is no
     *         entry or the lookup failed (the error is logged)
     */
    private long getLastModified(String oid) {
        PreparedStatement sql = null;
        ResultSet result = null;
        try {
            sql = connection()
                    .prepareStatement("SELECT lastModified FROM " + BASIC_TABLE + " WHERE oid = ? AND cacheId = ?");

            // Prepare and execute
            sql.setString(1, oid);
            sql.setString(2, cacheId);
            result = sql.executeQuery();

            // Build response
            Timestamp ts = null;
            if (result.next()) {
                ts = result.getTimestamp("lastModified");
            }
            return (ts == null) ? -1 : ts.getTime();
        } catch (SQLException ex) {
            log.error("Error querying last modified date: ", ex);
            return -1;
        } finally {
            // Release database resources even if the query failed
            close(result);
            close(sql);
        }
    }

    /**
     * Add a new entry to the 'basic' table with its change flag set.
     *
     * @param oid The object ID to store
     * @param lastModified The last modified time in milliseconds
     * @throws Exception if the insert failed
     */
    private void insertLastModified(String oid, long lastModified) throws Exception {
        PreparedStatement sql = connection().prepareStatement("INSERT INTO " + BASIC_TABLE
                + " (oid, cacheId, lastModified, changeFlag)" + " VALUES (?, ?, ?, 1)");
        try {
            // Prepare and execute
            sql.setString(1, oid);
            sql.setString(2, cacheId);
            sql.setTimestamp(3, new Timestamp(lastModified));
            sql.executeUpdate();
        } finally {
            // Release the statement even if the insert failed
            close(sql);
        }
    }

    /**
     * Overwrite the timestamp of an existing 'basic' table entry and set its
     * change flag.
     *
     * @param oid The object ID to update
     * @param lastModified The last modified time in milliseconds
     * @throws Exception if the update failed
     */
    private void updateLastModified(String oid, long lastModified) throws Exception {
        PreparedStatement sql = connection().prepareStatement("UPDATE " + BASIC_TABLE
                + " SET lastModified = ?, changeFlag = 1" + " WHERE oid = ? and cacheId = ?");
        try {
            // Prepare and execute
            sql.setTimestamp(1, new Timestamp(lastModified));
            sql.setString(2, oid);
            sql.setString(3, cacheId);
            sql.executeUpdate();
        } finally {
            // Release the statement even if the update failed
            close(sql);
        }
    }

    /**
     * Look up the cached hash of an object in the 'hashed' table.
     *
     * @param oid The object ID to look up
     * @return String : The cached hash, or <b>null</b> if there is no entry or
     *         the lookup failed (the error is logged)
     */
    private String getHash(String oid) {
        PreparedStatement sql = null;
        ResultSet result = null;
        try {
            sql = connection()
                    .prepareStatement("SELECT hash FROM " + HASH_TABLE + " WHERE oid = ? AND cacheId = ?");

            // Prepare and execute
            sql.setString(1, oid);
            sql.setString(2, cacheId);
            result = sql.executeQuery();

            // Build response
            String response = null;
            if (result.next()) {
                response = result.getString("hash");
            }
            return response;
        } catch (SQLException ex) {
            log.error("Error querying last hash: ", ex);
            return null;
        } finally {
            // Release database resources even if the query failed
            close(result);
            close(sql);
        }
    }

    /**
     * Add a new entry to the 'hashed' table with its change flag set.
     *
     * @param oid The object ID to store
     * @param hash The content hash to store
     * @throws Exception if the insert failed
     */
    private void insertHash(String oid, String hash) throws Exception {
        PreparedStatement sql = connection().prepareStatement(
                "INSERT INTO " + HASH_TABLE + " (oid, cacheId, hash, changeFlag) VALUES (?, ?, ?, 1)");
        try {
            // Prepare and execute
            sql.setString(1, oid);
            sql.setString(2, cacheId);
            sql.setString(3, hash);
            sql.executeUpdate();
        } finally {
            // Release the statement even if the insert failed
            close(sql);
        }
    }

    /**
     * Overwrite the hash of an existing 'hashed' table entry and set its
     * change flag.
     *
     * @param oid The object ID to update
     * @param hash The content hash to store
     * @throws Exception if the update failed
     */
    private void updateHash(String oid, String hash) throws Exception {
        PreparedStatement sql = connection().prepareStatement(
                "UPDATE " + HASH_TABLE + " SET hash = ?, changeFlag = 1" + " WHERE oid = ? AND cacheId = ?");
        try {
            // Prepare and execute
            sql.setString(1, hash);
            sql.setString(2, oid);
            sql.setString(3, cacheId);
            sql.executeUpdate();
        } finally {
            // Release the statement even if the update failed
            close(sql);
        }
    }

    /**
     * Make sure the named table is present, creating it when it is missing
     * and verifying afterwards that the creation took effect.
     *
     * @param table The table to look for and create.
     * @throws SQLException if there was an error, or the table is still
     *             missing after the creation attempt.
     */
    private void checkTable(String table) throws SQLException {
        // Already there, nothing to do
        if (findTable(table)) {
            return;
        }

        log.debug("Table '{}' not found, creating now!", table);
        createTable(table);

        // Double check it was created
        boolean created = findTable(table);
        if (created) {
            return;
        }
        log.error("Unknown error creating table '{}'", table);
        throw new SQLException("Could not find or create table '" + table + "'");
    }

    /**
     * Check if the given table exists in the database. Table names are
     * compared ignoring case.
     *
     * @param table The table to look for
     * @return boolean flag if the table was found or not
     * @throws SQLException if there was an error accessing the database
     */
    private boolean findTable(String table) throws SQLException {
        DatabaseMetaData meta = connection().getMetaData();
        ResultSet result = meta.getTables(null, null, null, null);
        try {
            while (result.next()) {
                // Compare from the known non-null side
                if (table.equalsIgnoreCase(result.getString("TABLE_NAME"))) {
                    return true;
                }
            }
            return false;
        } finally {
            // Release the result set even if reading the metadata failed
            close(result);
        }
    }

    /**
     * Create the given table in the database. Only the 'basic' and 'hashed'
     * cache tables are known.
     *
     * @param table The table to create
     * @throws SQLException if there was an error during creation, or an unknown
     *             table was specified.
     */
    private void createTable(String table) throws SQLException {
        // Pick the definition first, so there is a single execution path
        String ddl;
        if (table.equals(BASIC_TABLE)) {
            ddl = "CREATE TABLE " + BASIC_TABLE + "(oid VARCHAR(255) NOT NULL, "
                    + "cacheId VARCHAR(255) NOT NULL, " + "lastModified TIMESTAMP NOT NULL, "
                    + "changeFlag SMALLINT NOT NULL, " + "PRIMARY KEY (oid, cacheId))";
        } else if (table.equals(HASH_TABLE)) {
            ddl = "CREATE TABLE " + HASH_TABLE + "(oid VARCHAR(255) NOT NULL, "
                    + "cacheId VARCHAR(255) NOT NULL, " + "hash VARCHAR(50) NOT NULL, "
                    + "changeFlag SMALLINT NOT NULL, " + "PRIMARY KEY (oid, cacheId))";
        } else {
            throw new SQLException("Unknown table '" + table + "' requested!");
        }

        Statement sql = connection().createStatement();
        try {
            sql.execute(ddl);
        } finally {
            // Release the statement even if the creation failed
            close(sql);
        }
    }

    /**
     * Close a ResultSet without letting a failure escape: a null argument is
     * ignored and an SQLException raised while closing is only logged.
     * 
     * @param resultSet The ResultSet to try and close.
     */
    private void close(ResultSet resultSet) {
        if (resultSet == null) {
            return;
        }
        try {
            resultSet.close();
        } catch (SQLException ex) {
            log.error("Error closing result set: ", ex);
        }
    }

    /**
     * Close a Statement without letting a failure escape: a null argument is
     * ignored and an SQLException raised while closing is only logged.
     * 
     * @param statement The Statement to try and close.
     */
    private void close(Statement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (SQLException ex) {
            log.error("Error closing statement: ", ex);
        }
    }

    /*
     * Sourced (and adapted) from commons-codec-1.4, required since 1.4
     * contains bug which affects httpclient request headers
     */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Hash the entire contents of a file with the "SHA" message digest and
     * return the result hex encoded.
     *
     * <p>
     * The file stream is always closed before returning, whether or not
     * hashing succeeded.
     * </p>
     *
     * @param file The file to hash
     * @return String : The hex encoded digest of the file contents
     * @throws IOException if the file could not be opened or read
     */
    private String hashFile(File file) throws IOException {
        InputStream data = new FileInputStream(file);

        try {
            MessageDigest digest = MessageDigest.getInstance("SHA");
            byte[] buffer = new byte[BUFFER_SIZE];
            int read = data.read(buffer, 0, BUFFER_SIZE);
            while (read > -1) {
                digest.update(buffer, 0, read);
                read = data.read(buffer, 0, BUFFER_SIZE);
            }
            return new String(Hex.encodeHex(digest.digest()));
        } catch (NoSuchAlgorithmException nsae) {
            // Keep the original exception as the cause for diagnosis
            throw new RuntimeException(nsae.getMessage(), nsae);
        } finally {
            // Always release the file handle. A failure to close is logged
            // rather than thrown so it cannot mask an error from the read.
            try {
                data.close();
            } catch (IOException ex) {
                log.error("Error closing file stream: ", ex);
            }
        }
    }
}