au.edu.jcu.fascinator.plugin.harvester.directory.DirectoryNameHarvester.java Source code

Introduction

Here is the source code for au.edu.jcu.fascinator.plugin.harvester.directory.DirectoryNameHarvester.java
Source

/* 
 * The Fascinator - Directory Name Harvester Plugin
 * Copyright (C) 2012 James Cook University
 * 
 * Acknowledgements: This Harvester plugin is based on two plugins 
 * - CSVHarvester - com.googlecode.fascinator.harvester.csv 
 *      - author Greg Pendlebury
 *      - Queensland Cyber Infrastructure Foundation (http://www.qcif.edu.au/)
 * - FileSystemHarvester - com.googlecode.fascinator.harvester.filesystem
 *      - author - Oliver Lucido
 *      - University of Southern Queensland (http://www.usq.edu.au/)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 */
package au.edu.jcu.fascinator.plugin.harvester.directory;

import com.googlecode.fascinator.api.harvester.HarvesterException;
import com.googlecode.fascinator.api.storage.DigitalObject;
import com.googlecode.fascinator.api.storage.Payload;
import com.googlecode.fascinator.api.storage.StorageException;
import com.googlecode.fascinator.common.JsonObject;
import com.googlecode.fascinator.common.JsonSimple;
import com.googlecode.fascinator.common.harvester.impl.GenericHarvester;
import com.googlecode.fascinator.common.storage.StorageUtils;

import java.io.File;
import java.io.InputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.json.simple.JSONArray;

/**
 *    <p>
 *        This plugin harvests subdirectory names in a specified 
 *        directory in the file system. It can use a cache to do 
 *        incremental harvests, which only harvest directories that have been
 *        added since the last time it was run.
 *
 *        The directory name harvested is used in conjunction with a default 
 *        metadata file (default-metadata.json) to create metadata about 
 *        the directory.
 *
 *        Each harvested directory may contain an override metadata file 
 *        (metadata.json) to replace the default metadata or even add to the
 *        default metadata.
 *
 *        This harvester was built to harvest bird species where standard 
 *        metadata applied to each species, but an override file may exist to 
 *        provide replacement or additional metadata details.
 *
 *        Both the default metadata and override metadata files can contain 
 *        multiple types of metadata to be created for each species. A unique
 *        record is created for each metadata type detected.
 *
 *        The caching used is the same as that from the File System Harvester.
 *    </p>
 *
 *    <h3>Configuration</h3>
 *    <p>
 *    <table border="1">
 *        <tr>
 *            <th>Option</th>
 *            <th>Description</th>
 *            <th>Required</th>
 *            <th>Default</th>
 *        </tr>
 *
 *        <tr>
 *            <td>baseDir</td>
 *            <td>Path of subdirectories to be harvested</td>
 *            <td><b>Yes</b></td>
 *            <td>${user.home}/Documents/public/</td>
 *        </tr>
 *
 *        <tr>
 *            <td>recordIDPrefix</td>
 *            <td>A prefix applied to all records created</td>
 *            <td><b>No</b></td>
 *            <td></td>
 *        </tr>
 *
 *        <tr>
 *            <td>force</td>
 *            <td>Force harvest the specified directory or file again even when it's not
 *                modified (ignore cache)</td>
 *            <td>No</td>
 *            <td>false</td>
 *        </tr>
 *
 *        <tr>
 *            <td>caching</td>
 *            <td>Caching method to use. Valid entries are 'basic' and 'hashed'</td>
 *            <td>No</td>
 *            <td>null</td>
 *        </tr>
 *        <tr>
 *            <td>cacheId</td>
 *            <td>The cache ID to use in the database if caching is in use.</td>
 *            <td>Yes (if valid 'caching' value is provided)</td>
 *            <td>null</td>
 *        </tr>
 *        <tr><td>payloadId</td> 
 *            <td>The payload identifier used to store the JSON data</td>
 *            <td>No</td>
 *            <td>defaults to "metadata.json"</td>
 *        </tr>
 *        <tr>
 *            <td>derbyHome</td>
 *            <td>Path to use for the file store of the database. Should match other Derby
 *                paths provided in the configuration file for the application.</td>
 *            <td>Yes (if valid 'caching' value is provided)</td>
 *            <td>null</td>
 *        </tr>
 *    </table>
 *
 *    <h3>Caching</h3>
 *    With regards to the underlying cache you have three options for configuration:
 *    <ol>
 *        <li>No caching: All files will always be harvested. Be aware that
 *            without caching there is no support for deletion.</li>
 *        <li><b>Basic</b> caching: The file is considered 'cached' if the last
 *            modified date matches the database entry. On some operating systems (like
 *            linux) this can provide a minimum of around 2 seconds of granularity. For
 *            most purposes this is sufficient, and this cache is the most efficient.</li>
 *        <li><b>Hashed</b> caching: The entire contents of the file are SHA hashed
 *            and the hash is stored in the database. The file is considered cached if the
 *            old hash matches the new hash. This approach will only trigger a harvest if
 *            the contents of the file really change, but it is quite slow across large
 *            data sets and large files.</li>
 *    </ol>
 *    Deletion support is not provided.
 *
 *    <h3>Examples</h3>
 *    <ol>
 *        <li>
 *            Harvesting ${user.home}/Documents/public/Downloads directory. The
 *            harvest includes the sub directories in the Downloads directory, and does 
 *            not re-harvest unmodified directories if they exist in the cache 
 *            database under the 'default' cache. Below is a sample configuration 
 *            file: dir-config.json. This configuration specifies a single target 
 *            directory, has caching setup, specifies the names of the default files 
 *            and that each directory being harvested contains two types of metadata.
 *            This means that two records will be created for each directory (bird species)
 *            processed.
 *
 *            <pre>
 *    "harvester": {
 *        "type": "directory",
 *        "directory": {
 *            "targets": [ {
 *                "baseDir": "${test.dir}/Downloads",
 *                "recordIDPrefix": "jcu.edu.au/tdh/collection/"
 *            } ]
 *        },
 *        "file-system": {
 *            "caching": "basic",
 *            "cacheId": "config-caching"
 *        },
 *        "default-files": {
 *            "default-metadata-filename" : "default-metadata.json",
 *            "override-metadata-filename": "metadata.json"
 *        },
 *        "metadata-types": [ 
 *            {"type" : "occurrences"},
 *            {"type" : "suitability"}
 *        ]
 *    },
 *    "database-service": {
 *        "derbyHome" : "${test.cache.dir}"
 *    }
 *            </pre>
 *
 *        </li>
 *        <li>
 *            Below is an example directory structure with the default and override metadata files listed. Actions taken by the harvester are detailed.
 *            <pre>
 *Downloads                                                     (dir)
 *|----default-metadata.json                                    (file) The metadata types and the metadata contained 
 *|                                                                    in this file is applied to all birds species
 *|----Australian Raven - Corvus coronoides                     (dir)  A record is created for this folder name with 
 *|                                                                    the default metadata for each type.
 *|----Crimson Rosella - Platycercus (Platycercus) elegans      (dir)  A record is created for this folder name with 
 *|    |                                                               the default metadata overridden by metadata.json for each type.
 *|    ----metadata.json                                        (file) Override metadata file for Crimson Rosella
 *|----Pale-headed Rosella - Platycercus (Violania) adscitus    (dir)  A record is created for this folder name with 
 *|                                                                    the default metadata for each type.
 *|----Southern Cassowary - Casuaris casuaris                   (dir)  A record is created for this folder name with 
 *     |                                                               the default metadata overridden by metadata.json for each type.
 *     ----metadata.json                                        (file) Override metadata file for Southern Cassowary
 *            </pre>
 *        </li>
 *        <li>
 *        Below is a sample default metadata file.
 *        <pre>
 *{
 *    "harvester": {
 *        "type": "directory",
 *        "default-metadata":  {
 *            "occurrences": [ {
 *                "metadata1": "Observed Occurrences",
 *                "metadata2": "Predicted Occurrences",
 *                "metadata3": "Conservation",
 *                "metadata4": "Biodiversity",
 *                "metadata5": "Ornithologists rules"
 *            } ],
 *            "suitability": [ {
 *                "metadata1": "Habitat not suitable",
 *                "metadata2": "Species, present, no opinion on habitat quality available",
 *                "metadata3": "Good habitat in a vegetation zone core for that species",
 *                "metadata4": "The Bird Observers Council of Australia, have great social gatherings around the country",
 *                "metadata5": "These metadata are all configurable"
 *            } ]
 *        }
 *    }
 *}
 *        </pre>
 *        </li>
 *        <li>
 *            Below is a sample override metadata file.
 *            <pre>
 *{
 *    "harvester": {
 *        "type": "directory",
 *        "metadata": {
 *            "occurrences": [ {
 *                "metadata2": "Override default value: The cassowary can be dangerous. BEWARE!!.",
 *                "metadata6": "This metadata should be added to the default ones, as it's new",
 *                "metadata7": "New, should also be added."
 *                } ],
 *            "suitability": [ {
 *                "metadata1": "Override default value: cassowaries are cute.",
 *                "metadata3": "This metadata should be added to the default ones, as it's new",
 *                "metadata6": "New, should also be added.",
 *                "metadataTony" :"The habitat is not suitable in the long term for this species"
 *               } ]
 *        }
 *   }
 *}
 *            </pre>
 *        </li>
 *    </ol>
 *
 *    <h3>Wiki Link</h3>
 *    <p>
 *        <b>None</b>
 *    </p>
 *
 *   @author Jay van Schyndel
 *
 */
public class DirectoryNameHarvester extends GenericHarvester {

    /**
     * Payload ID used when a target does not configure "payloadId"
     */
    private static final String DEFAULT_PAYLOAD_ID = "metadata.json";
    /**
     * logging
     */
    private static final Logger log = LoggerFactory.getLogger(DirectoryNameHarvester.class);
    /**
     * Harvesting targets
     */
    private List<JsonSimple> targets;
    /**
     * Index of the target currently being processed, null before the first
     * target has been requested
     */
    private Integer targetIndex;
    /**
     * The next file (target base directory) due to be harvested
     */
    private File nextFile;
    /**
     * Stack of queued files to harvest
     */
    private Stack<File> fileStack;
    /**
     * whether or not there are more files to harvest
     */
    private boolean hasMore;
    /**
     * force harvesting all directories
     */
    private boolean force;
    /**
     * Caching
     */
    private DerbyCache cache;
    /**
     * A prefix for generating the object's ID
     */
    private String idPrefix;
    /**
     * Payload ID
     */
    private String payloadId;
    /**
     * Default metadata filename
     */
    private String defaultMetadataFilename;
    /**
     * Override metadata filename
     */
    private String overrideMetadataFilename;
    /**
     * Default metadata of the current target, null if it could not be loaded
     */
    private JsonSimple defaultMetadata;
    /**
     * Override metadata of the directory most recently processed, null if
     * that directory had no readable override file
     */
    private JsonSimple overrideMetadata;
    /**
     * Metadata types; one record is created per type for each directory
     */
    private List<JsonSimple> metadataTypes;

    /**
     * Directory Harvester Constructor
     */
    public DirectoryNameHarvester() {
        super("directory", "Directory Harvester");
    }

    /**
     * Initialisation of Directory harvester plugin
     *
     * @throws HarvesterException if the configuration is incomplete or the
     *                            cache cannot be instantiated
     */
    @Override
    public void init() throws HarvesterException {
        // Check for valid targets
        targets = getJsonConfig().getJsonSimpleList("harvester", "directory", "targets");
        if (targets == null || targets.isEmpty()) {
            throw new HarvesterException("No targets specified");
        }

        //obtain the metadata filenames
        JsonSimple defaultFilenames = new JsonSimple(getJsonConfig().getObject("harvester", "default-files"));

        //this filename is expected to exist in the main folder, it provides the
        //standard metadata for all directories.
        //The default passed here must be a real null (not the text "null"),
        //otherwise the missing-configuration checks below can never trigger.
        defaultMetadataFilename = defaultFilenames.getString(null, "default-metadata-filename");
        //this filename is optional, it may exist in the directories being
        //processed. Its contents can override and add to the default metadata
        overrideMetadataFilename = defaultFilenames.getString(null, "override-metadata-filename");

        if (defaultMetadataFilename == null) {
            throw new HarvesterException("No default metadata filename specified");
        }
        if (overrideMetadataFilename == null) {
            throw new HarvesterException("No override metadata filename specified");
        }

        //obtain the metadata types
        metadataTypes = getJsonConfig().getJsonSimpleList("harvester", "metadata-types");
        if (metadataTypes == null || metadataTypes.isEmpty()) {
            throw new HarvesterException("No metadata types specified");
        }

        // Loop processing variables
        fileStack = new Stack<File>();
        targetIndex = null;
        hasMore = true;

        // Caching
        try {
            cache = new DerbyCache(getJsonConfig());
            // Reset flags for deletion support
            cache.resetFlags();
        } catch (Exception ex) {
            log.error("Error instantiating cache: ", ex);
            throw new HarvesterException(ex);
        }

        // Prep the first file
        nextFile = getNextFile();
    }

    /**
     * Get the next file due to be harvested
     *
     * @return The next file to harvest, null if none
     */
    private File getNextFile() {
        File next = fileStack.empty() ? getNextTarget() : fileStack.pop();
        if (next == null) {
            hasMore = false;
        }
        return next;
    }

    /**
     * Retrieve the next file specified as a target in configuration,
     * skipping targets without a path or whose path does not exist.
     *
     * @return The next target file, null if none
     */
    private File getNextTarget() {
        while (true) {
            // First execution starts at the first target, later ones advance
            if (targetIndex == null) {
                targetIndex = 0;
            } else {
                targetIndex++;
            }

            // We're finished
            if (targetIndex >= targets.size()) {
                return null;
            }

            // Get the next target
            JsonSimple target = targets.get(targetIndex);
            String path = target.getString(null, "baseDir");
            if (path == null) {
                log.warn("No path provided for target, skipping!");
                continue;
            }

            File file = new File(path);
            if (!file.exists()) {
                log.warn("Path '{}' does not exist, skipping!", path);
                continue;
            }

            log.info("Target file/directory found: '{}'", path);
            updateConfig(target, path);
            return file;
        }
    }

    /**
     * Update harvest configuration when switching target path
     *
     * @param tConfig The target configuration
     * @param path    The path to the target
     */
    private void updateConfig(JsonSimple tConfig, String path) {
        force = tConfig.getBoolean(false, "force");
        idPrefix = tConfig.getString("", "recordIDPrefix");
        payloadId = tConfig.getString(DEFAULT_PAYLOAD_ID, "payloadId");

        //load the default-metadata file
        File defaultFile = new File(path, this.defaultMetadataFilename);
        try {
            this.defaultMetadata = new JsonSimple(defaultFile);
        } catch (IOException ex) {
            // Never keep the metadata of a previous target around; records
            // for this target must not be built from another target's defaults
            this.defaultMetadata = null;
            log.error("Unable to open default metadata file '" + defaultFile + "'", ex);
        }
    }

    /**
     * Shutdown the plugin
     *
     * @throws HarvesterException if there are errors
     */
    @Override
    public void shutdown() throws HarvesterException {
        if (cache != null) {
            try {
                cache.shutdown();
            } catch (Exception ex) {
                log.error("Error shutting down cache: ", ex);
                throw new HarvesterException(ex);
            }
        }
    }

    /**
     * Harvest the next set of directories, and return their Object IDs
     *
     * @return Set<String> The set of object IDs just harvested
     * @throws HarvesterException if there are errors
     */
    @Override
    public Set<String> getObjectIdList() throws HarvesterException {
        Set<String> fileObjectIdList = new HashSet<String>();

        // We had no valid targets
        if (nextFile == null) {
            hasMore = false;
            return fileObjectIdList;
        }

        // Normal logic
        while (nextFile != null) {
            if (nextFile.isDirectory()) {
                // listFiles() returns null when the directory cannot be read
                File[] children = nextFile.listFiles();
                if (children == null) {
                    log.warn("Unable to list contents of '{}', skipping!", nextFile);
                } else {
                    for (File child : children) {
                        if (child.isDirectory()) {
                            harvestDirectory(fileObjectIdList, child);
                        }
                    }
                }
            }

            // Progress the stack and return
            nextFile = getNextFile();
        }

        return fileObjectIdList;
    }

    /**
     * Harvest a directory based on configuration; one record is attempted
     * for every configured metadata type.
     *
     * @param list The set of harvested IDs to add to
     * @param file The directory to harvest
     * @throws HarvesterException if there are errors
     */
    private void harvestDirectory(Set<String> list, File file) throws HarvesterException {

        for (JsonSimple metadataType : this.metadataTypes) {

            // The OID that will be used to store the record. It is derived
            // from the directory path and the string form of the metadata type
            // configuration entry; this derivation must stay stable so that
            // previously harvested records keep their identifiers.
            String oid = DigestUtils.md5Hex(file.getPath() + metadataType);
            // Check if it is in the cache, make sure the cache call come before
            // 'force' in the boolean OR so that the cache entry is 'touched'
            if (cache.hasChanged(oid, file) || force) {
                try {
                    createDigitalObject(list, file, oid, metadataType);
                } catch (StorageException se) {
                    log.warn("Directory not harvested {}: {}", file, se.getMessage());
                }
            }
        }
    }

    /**
     * Check if there are more objects to harvest
     *
     * @return
     * <code>true</code> if there are more,
     * <code>false</code> otherwise
     */
    @Override
    public boolean hasMoreObjects() {
        return hasMore;
    }

    /**
     * Create digital object
     *
     * @param list         The set of harvested IDs to add to
     * @param file         File (directory name) to be transformed to be digital object
     * @param oid          Unique identifier for the object
     * @param metadataType the metadata type being processed based on configuration setup.
     * @throws HarvesterException if fail to create the object, or if no
     *                            default metadata is loaded for the current target
     * @throws StorageException if fail to save the file to the storage
     */
    private void createDigitalObject(Set<String> list, File file, String oid, JsonSimple metadataType)
            throws HarvesterException, StorageException {

        if (this.defaultMetadata == null) {
            throw new HarvesterException(
                    "Default metadata has not been loaded, unable to harvest '" + file + "'");
        }

        //obtaining directory name to use as part of the metadata
        String directory = file.getName();
        //obtaining the metadata type
        String type = metadataType.getString("", "type");

        //assembling the data
        JsonSimple data = new JsonSimple();
        //adding the species (the directory name) to the data
        data.getJsonObject().put("species", directory);

        //for the metadata type, obtain the default metadata and add it
        JsonSimple defaults = new JsonSimple(
                this.defaultMetadata.getObject("harvester", "default-metadata"));
        if (!mergeFirstEntry(data.getJsonObject(), defaults.getArray(type))) {
            log.warn("No default metadata found for type '{}'", type);
        }

        //merging the override metadata (if any) over the default metadata
        JsonSimple overrideMeta = loadOverrideMetadata(file);
        if (overrideMeta != null) {
            mergeFirstEntry(data.getJsonObject(), overrideMeta.getArray(type));
        }

        JsonObject meta = new JsonObject();
        meta.put("dc.identifier", idPrefix + directory + "/" + type);

        storeJsonInObject(data.getJsonObject(), meta, oid);
        list.add(oid);
    }

    /**
     * Read the override metadata file of a directory, if it has one.
     *
     * @param directory the directory being harvested
     * @return the "harvester" / "metadata" node of the override file, or
     *         null if the file is missing or cannot be read
     */
    private JsonSimple loadOverrideMetadata(File directory) {
        File overrideFile = new File(directory, this.overrideMetadataFilename);
        // Forget the override data of the directory processed before this one
        this.overrideMetadata = null;

        if (!overrideFile.isFile()) {
            log.info("Metadata override file does not exist: {}", overrideFile);
            return null;
        }

        try {
            this.overrideMetadata = new JsonSimple(overrideFile);
            return new JsonSimple(this.overrideMetadata.getObject("harvester", "metadata"));
        } catch (IOException ex) {
            log.warn("Unable to read metadata override file '" + overrideFile + "'", ex);
            return null;
        }
    }

    /**
     * Copy every property of the first element of a metadata array into the
     * target object, replacing properties that already exist.
     *
     * @param target the object receiving the properties
     * @param source the metadata array, may be null or empty
     * @return <code>true</code> if properties were copied,
     *         <code>false</code> if the array had no usable first element
     */
    private boolean mergeFirstEntry(JsonObject target, JSONArray source) {
        if (source == null || source.isEmpty()) {
            return false;
        }
        Object first = source.get(0);
        if (!(first instanceof JsonObject)) {
            return false;
        }
        target.putAll((JsonObject) first);
        return true;
    }

    /**
     * Store the processed data and metadata in the system.
     *
     * @param dataJson an instantiated JSON object containing data to store
     * @param metaJson an instantiated JSON object containing metadata to store
     * @param oid      the identifier of the digital object to store into
     * @throws HarvesterException if an error occurs
     */
    private void storeJsonInObject(JsonObject dataJson, JsonObject metaJson, String oid) throws HarvesterException {

        //Does the object already exists ?
        DigitalObject object;
        try {
            object = getStorage().getObject(oid);
        } catch (StorageException ex) {
            //This is going to be brand new
            try {
                object = StorageUtils.getDigitalObject(getStorage(), oid);
            } catch (StorageException ex2) {
                throw new HarvesterException("Error creating new digital object:", ex2);
            }
        }

        boolean stored = false;
        try {
            storeJsonInPayload(dataJson, metaJson, object);
            stored = true;
        } finally {
            // Always close the object, even when storing the payload failed;
            // the pending flag is only set after a successful store
            if (object != null) {
                try {
                    if (stored) {
                        object.getMetadata().setProperty("render-pending", "true");
                    }
                    object.close();
                } catch (Exception ex) {
                    log.error("Error setting 'render-pending' flag: ", ex);
                }
            }
        }
    }

    /**
     * Write the data and metadata into the configured payload of an object.
     * When the payload can be retrieved, its JSON is read and merged with the
     * new values before the payload is updated; when retrieving it raises a
     * StorageException, a new stored payload is created instead.
     *
     * @param dataJson an instantiated JSON object containing data to store
     * @param metaJson an instantiated JSON object containing metadata to store
     * @param object   the digital object holding the payload
     * @throws HarvesterException if the JSON cannot be parsed or encoded, or
     *                            the payload cannot be updated or created
     */
    private void storeJsonInPayload(JsonObject dataJson, JsonObject metaJson, DigitalObject object)
            throws HarvesterException {

        Payload payload = null;
        JsonSimple json = new JsonSimple();
        try {
            //Existing payload
            payload = object.getPayload(payloadId);

            //Get the old JSON to merge
            try {
                json = new JsonSimple(payload.open());
            } catch (IOException ex) {
                log.error("Error parsing existing JSON: '{}' => '{}'", object.getId(), payloadId);
                throw new HarvesterException("Error parsing existing JSON: ", ex);
            } finally {
                payload.close();
            }

            //Update storage
            try {
                InputStream in = streamMergedJson(dataJson, metaJson, json);
                object.updatePayload(payloadId, in);
            } catch (IOException ex2) {
                throw new HarvesterException("Error processing JSON data: ", ex2);
            } catch (StorageException ex2) {
                throw new HarvesterException("Error updating payload: ", ex2);
            }

        } catch (StorageException ex) {
            //Create a new Payload
            try {
                InputStream in = streamMergedJson(dataJson, metaJson, json);
                payload = object.createStoredPayload(payloadId, in);
            } catch (IOException ex2) {
                throw new HarvesterException("Error parsing JSON encoding: ", ex2);
            } catch (StorageException ex2) {
                throw new HarvesterException("Error creating new payload: ", ex2);
            }
        }

        //Tidy up before we finish
        if (payload != null) {
            try {
                payload.setContentType("application/json");
                payload.close();
            } catch (Exception ex) {
                log.error("Error setting Payload MIME type and closing:", ex);
            }
        }
    }

    /**
     * Merge the newly processed data with any (possible) existing data already
     * present, also convert the completed JSON merge into a Stream for storage.
     *
     * @param dataJson an instantiated JSON object containing data to store
     * @param metaJson an instantiated JSON object containing metadata to store
     * @param existing an instantiated JsonSimple object with any existing data
     * @return a UTF-8 stream of the merged JSON
     * @throws IOException if any character encoding issues effect the Stream
     */
    private InputStream streamMergedJson(JsonObject dataJson, JsonObject metaJson, JsonSimple existing)
            throws IOException {
        //Overwrite and/or create only nodes we consider new data
        existing.getJsonObject().put("recordIDPrefix", idPrefix);
        JsonObject existingData = existing.writeObject("data");
        existingData.putAll(dataJson);
        JsonObject existingMeta = existing.writeObject("metadata");
        existingMeta.putAll(metaJson);

        //Turn into a stream and return
        String jsonString = existing.toString(true);
        return IOUtils.toInputStream(jsonString, "UTF-8");
    }
}