edu.cornell.mannlib.vitro.webapp.controller.harvester.FileHarvestController.java Source code

Here is the source code for edu.cornell.mannlib.vitro.webapp.controller.harvester.FileHarvestController.java

Source

/* $This file is distributed under the terms of the license in /doc/license.txt$ */

package edu.cornell.mannlib.vitro.webapp.controller.harvester;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.ServletOutputStream;
import javax.servlet.UnavailableException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.servlet.ServletFileUpload;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONException;
import org.json.JSONObject;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import edu.cornell.mannlib.vitro.webapp.config.ConfigurationProperties;
import edu.cornell.mannlib.vitro.webapp.controller.VitroRequest;
import edu.cornell.mannlib.vitro.webapp.controller.freemarker.FreemarkerHttpServlet;
import edu.cornell.mannlib.vitro.webapp.controller.freemarker.UrlBuilder;
import edu.cornell.mannlib.vitro.webapp.controller.freemarker.responsevalues.ExceptionResponseValues;
import edu.cornell.mannlib.vitro.webapp.controller.freemarker.responsevalues.ResponseValues;
import edu.cornell.mannlib.vitro.webapp.controller.freemarker.responsevalues.TemplateResponseValues;
import edu.cornell.mannlib.vitro.webapp.filestorage.backend.FileStorageSetup;
import edu.cornell.mannlib.vitro.webapp.filestorage.uploadrequest.FileUploadServletRequest;

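/**
 * Controller behind the VIVO File Harvest page.  It renders the fileharvest.ftl form, accepts
 * multipart file uploads, launches the harvester script in a background thread, reports progress
 * back to the browser as JSON, and serves downloadable template files.
 */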
public class FileHarvestController extends FreemarkerHttpServlet {

    private static final long serialVersionUID = 1L;
    private static final Log log = LogFactory.getLog(FileHarvestController.class);
    private static final String TEMPLATE_DEFAULT = "fileharvest.ftl";

    private static final String NORMAL_TERMINATION_LAST_OUTPUT = "File Harvest completed successfully";

    private static final String PARAMETER_FIRST_UPLOAD = "firstUpload";
    private static final String PARAMETER_UPLOADED_FILE = "uploadedFile";
    private static final String PARAMETER_MODE = "mode";
    private static final String PARAMETER_JOB = "job";

    private static final String POST_TO = UrlBuilder.getUrl("/harvester/harvest");

    private static final String MODE_HARVEST = "harvest";
    private static final String MODE_CHECK_STATUS = "checkStatus";
    private static final String MODE_DOWNLOAD_TEMPLATE = "template";

    /**
     * Stores information about the Harvester thread for a particular user session.
     */
    private Map<String, SessionInfo> sessionIdToSessionInfo = new Hashtable<String, SessionInfo>(); //Hashtable is threadsafe, HashMap is not

    /**
     * A list of known job parameters (that is, "job=" values from the query string which we will accept from the browser).
     * This should be filled in the static initializer and then never written to again.
     */
    private static final List<String> knownJobs = new ArrayList<String>();

    /**
     * Relative path from the VIVO Uploads directory to the root location where user-uploaded files will be stored.  Include
     * final slash.
     */
    private static final String PATH_TO_UPLOADS = "harvester/";

    /**
     * Relative path from the Harvester root directory to the main area reserved for the VIVO File Harvest feature.  Include
     * final slash.
     */
    private static final String PATH_TO_FILE_HARVEST_ROOT = "vivo/";

    /**
     * Relative path from the Harvester root directory to the directory where user-downloadable template files are stored.
     * Include final slash.
     */
    public static final String PATH_TO_TEMPLATE_FILES = PATH_TO_FILE_HARVEST_ROOT + "templates/";

    /**
     * Relative path from the Harvester root directory to the directory containing the script templates.  Include final slash.
     */
    public static final String PATH_TO_HARVESTER_SCRIPTS = PATH_TO_FILE_HARVEST_ROOT + "scripts/";

    /**
     * Relative path from the Harvester root directory to the directory where harvested data is placed.  Include final slash.
     */
    public static final String PATH_TO_HARVESTED_DATA = PATH_TO_FILE_HARVEST_ROOT + "harvested-data/";

    static {
        fillKnownJobTypesList();
    }

    /**
     * Fill the known job types list.  Whenever a new job type is added, this method must be updated to add it to the list.
     * A "new job type" means a new "job=" parameter that we recognize in the query string; adding one typically also means
     * handling that parameter in this class's getJob() method.
     *
     * The exception is a new CSV job, which is handled entirely by adding a new CsvFileHarvestJob.JobType enum value; this
     * method and getJob() already handle the rest.
     */
    private static void fillKnownJobTypesList() {

        //fill known CSV job types
        CsvFileHarvestJob.JobType[] csvFileHarvestJobTypes = CsvFileHarvestJob.JobType.values();
        for (CsvFileHarvestJob.JobType csvFileHarvestJobType : csvFileHarvestJobTypes) {
            knownJobs.add(csvFileHarvestJobType.httpParameterName.toLowerCase());
        }
    }

    @Override
    protected ResponseValues processRequest(VitroRequest vreq) {
        try {
            cleanUpOldSessions();

            String job = vreq.getParameter(PARAMETER_JOB);
            String jobKnown = "false";
            if ((job != null) && FileHarvestController.knownJobs.contains(job.toLowerCase()))
                jobKnown = "true";

            FileHarvestJob jobObject = getJob(vreq, job);

            Map<String, Object> body = new HashMap<String, Object>();
            String harvesterPath = getHarvesterPath(vreq);
            //body.put("uploadPostback", "false");
            body.put("paramFirstUpload", PARAMETER_FIRST_UPLOAD);
            body.put("paramUploadedFile", PARAMETER_UPLOADED_FILE);
            body.put("paramMode", PARAMETER_MODE);
            body.put("paramJob", PARAMETER_JOB);
            body.put("modeHarvest", MODE_HARVEST);
            body.put("modeCheckStatus", MODE_CHECK_STATUS);
            body.put("modeDownloadTemplate", MODE_DOWNLOAD_TEMPLATE);
            body.put("job", job);
            body.put("jobKnown", jobKnown);
            body.put("harvesterLocation", harvesterPath);
            body.put("postTo", POST_TO + "?" + PARAMETER_JOB + "=" + job);
            body.put("jobSpecificHeader", (jobObject != null) ? jobObject.getPageHeader() : "");
            body.put("jobSpecificLinkHeader", (jobObject != null) ? jobObject.getLinkHeader() : "");
            body.put("jobSpecificDownloadHelp", (jobObject != null) ? jobObject.getTemplateDownloadHelp() : "");
            body.put("jobSpecificFillInHelp", (jobObject != null) ? jobObject.getTemplateFillInHelp() : "");
            body.put("jobSpecificNoNewDataMessage", (jobObject != null) ? jobObject.getNoNewDataMessage() : "");
            return new TemplateResponseValues(TEMPLATE_DEFAULT, body);
        } catch (Throwable e) {
            log.error(e, e);
            return new ExceptionResponseValues(e);
        }
    }

    @Override
    protected String getTitle(String siteName, VitroRequest vreq) {
        return "VIVO Harvester Test";
    }

    /**
     * Returns the root location of the VIVO Harvester on this machine.
     * @return the root location of the VIVO Harvester on this machine
     */
    public static String getHarvesterPath(HttpServletRequest req) {
        String pathToHarvester = ConfigurationProperties.getBean(req).getProperty("harvester.location");
        if (pathToHarvester == null) {
            log.error("The deploy.properties file does not contain a value for 'harvester.location'");
            return "";
        }
        return pathToHarvester;
    }

    /**
     * Returns the path on this machine of the area within Harvester reserved for File Harvest.
     * @return the path on this machine of the area within Harvester reserved for File Harvest
     */
    public static String getFileHarvestRootPath(HttpServletRequest req) {
        String fileHarvestRootPath = getHarvesterPath(req) + PATH_TO_FILE_HARVEST_ROOT;
        return fileHarvestRootPath;
    }

    /**
     * Returns the base directory used for all File Harvest uploads.
     * @param context the current servlet context
     * @return the base directory for file harvest uploads
     * @throws Exception if the Vitro home directory could not be found
     */
    private static String getUploadPathBase(ServletContext context) throws Exception {
        String vitroHomeDirectoryName = ConfigurationProperties.getBean(context)
                .getProperty(FileStorageSetup.PROPERTY_VITRO_HOME_DIR);
        if (vitroHomeDirectoryName == null) {
            throw new Exception("Vitro home directory name could not be found.");
        }

        String pathBase = vitroHomeDirectoryName + "/" + FileStorageSetup.FILE_STORAGE_SUBDIRECTORY + "/"
                + PATH_TO_UPLOADS;
        return pathBase;
    }

    /**
     * Gets the FileHarvestJob implementation that is needed to handle the specified request.  This
     * will depend on the type of harvest being performed (CSV, RefWorks, etc.)
     * @param vreq the request from the browser
     * @param jobParameter the POST or GET parameter "job".  Might not be available in vreq at this point,
     *                     thus we are requiring that it be sent in.
     * @return the FileHarvestJob that will provide harvest-type-specific services for this request
     */
    private FileHarvestJob getJob(VitroRequest vreq, String jobParameter) {
        String namespace = vreq.getWebappDaoFactory().getDefaultNamespace();

        FileHarvestJob job = null;

        if (jobParameter == null)
            log.error("No job specified.");
        else if (CsvFileHarvestJob.JobType.containsTypeWithHttpParameterName(jobParameter)) //check if this is a CSV job
            job = CsvFileHarvestJob.createJob(CsvFileHarvestJob.JobType.getByHttpParameterName(jobParameter), vreq,
                    namespace);
        else
            log.error("Invalid job: " + jobParameter);

        return job;
    }

    /**
     * Gets the location where we want to save uploaded files.  This location is in the VIVO uploads directory under
     * "harvester", and then in a directory named by the user's session ID as retrieved from the request.  The path
     * returned by this method will end in a slash (/).
     * @param vreq the request from which to get the session ID
     * @return the path to the location where uploaded files will be saved.  This path will end in a slash (/)
     */
    public static String getUploadPath(VitroRequest vreq) {
        try {
            String path = getUploadPathBase(vreq.getSession().getServletContext()) + getSessionId(vreq) + "/";
            return path;
        } catch (Exception e) {
            log.error(e, e);
            throw new RuntimeException(e);
        }
    }

    @Override
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws IOException, ServletException {

        try {
            boolean isMultipart = ServletFileUpload.isMultipartContent(request);
            String mode = request.getParameter(PARAMETER_MODE);
            if (isMultipart)
                doFileUploadPost(request, response);
            else if (MODE_HARVEST.equals(mode)) //compare against the constants so a missing mode parameter cannot cause a NullPointerException
                doHarvestPost(request, response);
            else if (MODE_CHECK_STATUS.equals(mode))
                doCheckHarvestStatusPost(request, response);
            else if (MODE_DOWNLOAD_TEMPLATE.equals(mode))
                doDownloadTemplatePost(request, response);
            else
                throw new Exception("Unrecognized post mode: " + mode);
        } catch (Exception e) {
            log.error(e, e);
        }
    }

    /**
     * This is for when the user clicks the "Upload" button on the form, sending a file to the server.  An HTTP post is
     * redirected here when it is determined that the request was multipart (as this will identify the post as a file
     * upload click).
     * @param request the HTTP request
     * @param response the HTTP response
     * @throws IOException if an IO error occurs
     * @throws ServletException if a servlet error occurs
     */
    private void doFileUploadPost(HttpServletRequest request, HttpServletResponse response)
            throws IOException, ServletException {

        JSONObject json = generateJson(false);
        try {
            VitroRequest vreq = new VitroRequest(request);

            //parse request for uploaded file
            int maxFileSize = 1024 * 1024;
            FileUploadServletRequest req = FileUploadServletRequest.parseRequest(vreq, maxFileSize);
            if (req.hasFileUploadException()) {
                Exception e = req.getFileUploadException();
                throw new ExceptionVisibleToUser(e);
            }

            //get the job parameter
            String jobParameter = req.getParameter(PARAMETER_JOB);

            //get the location where we want to save the files (it will end in a slash), then create a File object out of it
            String path = getUploadPath(vreq);
            File directory = new File(path);

            //if this is a page refresh, we do not want to save stale files that the user doesn't want anymore, but we
            //  still have the same session ID and therefore the upload directory is unchanged.  Thus we must clear the
            //  upload directory if it exists (a "first upload" parameter, initialized to "true" but which gets set to
            //  "false" once the user starts uploading stuff is used for this).
            String firstUpload = req.getParameter(PARAMETER_FIRST_UPLOAD); //clear directory on first upload
            if (firstUpload != null && firstUpload.equalsIgnoreCase("true")) {
                if (directory.exists()) {
                    File[] children = directory.listFiles();
                    for (File child : children) {
                        child.delete();
                    }
                }
            }

            //if the upload directory does not exist then create it
            if (!directory.exists())
                directory.mkdirs();

            //get the file harvest job for this request (this will determine what type of harvest is run)
            FileHarvestJob job = getJob(vreq, jobParameter);

            //get the files out of the parsed request (there should only be one)
            Map<String, List<FileItem>> fileStreams = req.getFiles();
            if (fileStreams.get(PARAMETER_UPLOADED_FILE) != null
                    && fileStreams.get(PARAMETER_UPLOADED_FILE).size() > 0) {

                //get the individual file data from the request
                FileItem csvStream = fileStreams.get(PARAMETER_UPLOADED_FILE).get(0);
                String name = csvStream.getName();

                //if another uploaded file exists with the same name, alter the name so that it is unique
                name = handleNameCollision(name, directory);

                //write the file from the request to the upload directory
                File file = new File(path + name);
                try {
                    csvStream.write(file);
                } finally {
                    csvStream.delete();
                }

                //ask the file harvest job to validate that it's okay with what was uploaded; if not delete the file
                String errorMessage = job.validateUpload(file);
                boolean success;
                if (errorMessage != null) {
                    success = false;
                    file.delete();
                } else {
                    success = true;
                    errorMessage = "success";
                }

                //prepare the results which will be sent back to the browser for display
                try {
                    json.put("success", success);
                    json.put("fileName", name);
                    json.put("errorMessage", errorMessage);
                } catch (JSONException e) {
                    log.error(e, e);
                    return;
                }

            } else {

                //if for some reason no file was included with the request, send an error back
                try {
                    json.put("success", false);
                    json.put("fileName", "(none)");
                    json.put("errorMessage", "No file uploaded");
                } catch (JSONException e) {
                    log.error(e, e);
                    return;
                }

            }
        } catch (ExceptionVisibleToUser e) {
            log.error(e, e);

            //handle exceptions whose message is for the user
            try {
                json.put("success", false);
                json.put("filename", "(none)");
                json.put("errorMessage", e.getMessage());
            } catch (JSONException f) {
                log.error(f, f);
                return;
            }
        } catch (Exception e) {
            log.error(e, e);
            json = generateJson(true);
        }

        //write the prepared response
        response.getWriter().write(json.toString());
    }

    /**
     * This is for when the user clicks the "Harvest" button on the form, sending a file to the server.  An HTTP post is
     * redirected here when an isHarvestClick parameter is contained in the post data and set to "true".
     * @param request the HTTP request
     * @param response the HTTP response
     */
    private void doHarvestPost(HttpServletRequest request, HttpServletResponse response) {

        JSONObject json;
        try {
            VitroRequest vreq = new VitroRequest(request);
            FileHarvestJob job = getJob(vreq, vreq.getParameter(PARAMETER_JOB));

            //String path = getUploadPath(vreq);

            String script = job.getScript();
            String additionsFilePath = job.getAdditionsFilePath();
            String scriptFileLocation = getScriptFileLocation(vreq);
            runScript(getSessionId(request), script, additionsFilePath, scriptFileLocation, job);

            json = generateJson(false);
            json.put("progressSinceLastCheck", "");
            json.put("scriptText", script);
            json.put("finished", false);

        } catch (Exception e) {
            json = generateJson(true);
            log.error(e, e);
        }

        try {
            response.getWriter().write(json.toString());
        } catch (IOException e) {
            log.error(e, e);
        }
    }

    /**
     * Handles posts sent automatically by the client during the harvest to check on its status, returning updated log
     * data and whether the harvest is complete or still running.  An HTTP post is routed here when its mode parameter
     * is set to "checkStatus" (MODE_CHECK_STATUS).
     * @param request the HTTP request
     * @param response the HTTP response
     */
    private void doCheckHarvestStatusPost(HttpServletRequest request, HttpServletResponse response) {

        JSONObject json;
        try {
            String newline = "\n";

            String sessionId = getSessionId(request);
            SessionInfo sessionInfo = sessionIdToSessionInfo.get(sessionId);

            //if we have started a thread, check the status and return it to the user
            if (sessionInfo != null) {

                String[] unsentLogLines;
                ArrayList<String> unsentLogLinesList = sessionInfo.unsentLogLines;

                //don't let the harvester thread add data to the unsent log lines list until we have both copied it and cleared it
                synchronized (unsentLogLinesList) {
                    unsentLogLines = unsentLogLinesList.toArray(new String[unsentLogLinesList.size()]);
                    unsentLogLinesList.clear();
                }

                String progressSinceLastCheck = "";
                for (int i = 0; i < unsentLogLines.length; i++) {
                    progressSinceLastCheck += unsentLogLines[i] + newline;
                }

                boolean finished = sessionInfo.isFinished();
                boolean abnormalTermination = false;

                VitroRequest vreq = new VitroRequest(request);
                ArrayList<String> newlyAddedUrls = new ArrayList<String>();
                ArrayList<String> newlyAddedUris = new ArrayList<String>();
                if (finished) {
                    newlyAddedUris = sessionInfo.newlyAddedUris;
                    if (newlyAddedUris != null) {
                        for (String uri : newlyAddedUris) {

                            newlyAddedUrls.add(UrlBuilder.getIndividualProfileUrl(uri, vreq));
                        }
                    }

                    //remove all entries in "sessionIdTo..." mappings for this session ID
                    clearSessionInfo(sessionId);

                    if (sessionInfo.getAbnormalTermination())
                        abnormalTermination = true;
                }

                if (!abnormalTermination) {
                    json = generateJson(false);
                    json.put("progressSinceLastCheck", progressSinceLastCheck);
                    json.put("finished", finished);
                    json.put("newlyAddedUris", newlyAddedUris);
                    json.put("newlyAddedUrls", newlyAddedUrls);
                } else {
                    json = generateJson(true);
                    log.error("File harvest terminated abnormally.");
                }
            } else { //if we have not started a harvest thread, the browser should not have made this request to begin with.  Bad browser, very bad browser.
                json = generateJson(true);
                log.error("Attempt to check status of a harvest that was never started!  (Session ID " + sessionId
                        + ")");
            }
        } catch (Exception e) {
            json = generateJson(true);
            log.error(e, e);
        }

        try {
            response.getWriter().write(json.toString());
        } catch (IOException e) {
            log.error(e, e);
        }
    }

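    /**
     * Sends the job-specific template file back to the browser as an "application/octet-stream"
     * attachment.  An HTTP post is routed here when its mode parameter is set to "template"
     * (MODE_DOWNLOAD_TEMPLATE).
     * @param request the HTTP request
     * @param response the HTTP response
     */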
    private void doDownloadTemplatePost(HttpServletRequest request, HttpServletResponse response) {

        VitroRequest vreq = new VitroRequest(request);
        FileHarvestJob job = getJob(vreq, vreq.getParameter(PARAMETER_JOB));
        File fileToSend = new File(job.getTemplateFilePath());

        response.setContentType("application/octet-stream");
        response.setContentLength((int) (fileToSend.length()));
        response.setHeader("Content-Disposition", "attachment; filename=\"" + fileToSend.getName() + "\"");

        try {
            byte[] byteBuffer = new byte[(int) (fileToSend.length())];
            DataInputStream inStream = new DataInputStream(new FileInputStream(fileToSend));

            ServletOutputStream outputStream = response.getOutputStream();
            for (int length = inStream.read(byteBuffer); length != -1; length = inStream.read(byteBuffer)) {
                outputStream.write(byteBuffer, 0, length);
            }

            inStream.close();
            outputStream.flush();
            outputStream.close();
        } catch (IOException e) {
            log.error(e, e);
        }
    }

    /**
     * Returns the location in which the ready-to-run scripts, after having template replacements made on them, will be
     * placed.  Final slash included.
     * @return the location in which the ready-to-run scripts will be placed
     */
    private static String getScriptFileLocation(HttpServletRequest req) {
        return getHarvesterPath(req) + PATH_TO_HARVESTER_SCRIPTS + "temp/";
    }

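    /**
     * Writes the given script text to a new temporary ".sh" file in the given location, creating
     * the directory first if it does not already exist.
     * @param scriptFileLocation the directory in which to create the script file
     * @param script the script text to write
     * @return the newly created script file
     * @throws IOException if the directory or file could not be written
     */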
    private File createScriptFile(String scriptFileLocation, String script) throws IOException {
        File scriptDirectory = new File(scriptFileLocation);
        if (!scriptDirectory.exists()) {
            scriptDirectory.mkdirs();
        }

        File tempFile = File.createTempFile("harv", ".sh", scriptDirectory);

        FileWriter writer = new FileWriter(tempFile);
        writer.write(script);
        writer.close();

        return tempFile;
    }

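    /**
     * Starts a harvest run for a session: any existing session info is cleared, a new ScriptRunner
     * thread is registered under the session ID, and the thread is started.
     * @param sessionId the ID of the session this harvest belongs to
     * @param script the harvest script to run
     * @param additionsFilePath path to the additions file the harvest is expected to produce
     * @param scriptFileLocation directory in which the ready-to-run script file will be written
     * @param job the job providing harvest-type-specific behavior
     */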
    private void runScript(String sessionId, String script, String additionsFilePath, String scriptFileLocation,
            FileHarvestJob job) {
        clearSessionInfo(sessionId);

        ScriptRunner runner = new ScriptRunner(sessionId, script, additionsFilePath, scriptFileLocation, job);
        SessionInfo info = new SessionInfo(sessionId, runner);
        sessionIdToSessionInfo.put(sessionId, info);
        runner.start();
    }

    /**
     * Handles a name conflict in a directory by providing a new name that does not conflict with the
     * name of a file already uploaded.
     * @param filename the name of the file to be added to the directory
     * @param directory the directory where the file should be added, in which to check for files of the
     *                  same name
     * @return a filename that does not conflict with any files in the directory.  If the filename parameter
     *         works, then that is returned.  Otherwise a number is appended in parentheses to the part of
     *         the file name prior to the final "." symbol (if one exists).
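     *         For example, a second upload of "data.csv" will be stored as "data (1).csv".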
     */
    private String handleNameCollision(String filename, File directory) {
        String base = filename;
        String extension = "";
        if (filename.contains(".")) {
            base = filename.substring(0, filename.lastIndexOf("."));
            extension = filename.substring(filename.lastIndexOf("."));
        }

        String renamed = filename;

        for (int i = 1; new File(directory, renamed).exists(); i++) {
            renamed = base + " (" + String.valueOf(i) + ")" + extension;
        }

        return renamed;
    }

    /**
     * Returns the ID of the current session between server and browser.
     * @param request the request coming in from the browser
     * @return the session ID
     */
    private static String getSessionId(HttpServletRequest request) {
        return request.getSession().getId();
    }

    /**
     * Parse an additions file (RDF/XML) to get the URIs of newly-harvested data, which will be sent to the browser and
     * displayed to the user as links.
     * @param additionsFile the file containing the newly-added RDF/XML
     * @param newlyAddedUris a list in which to place the newly added URIs
     * @param job the job whose rdf types (from getRdfTypesForLinks()) determine which URIs are included
     */
    private void extractNewlyAddedUris(File additionsFile, List<String> newlyAddedUris, FileHarvestJob job) {

        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);
            Document document = factory.newDocumentBuilder().parse(additionsFile);
            //Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(additionsFile);
            NodeList descriptionNodes = document
                    .getElementsByTagNameNS("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "Description");

            int numNodes = descriptionNodes.getLength();
            for (int i = 0; i < numNodes; i++) {
                Node node = descriptionNodes.item(i);

                ArrayList<String> types = getRdfTypes(node);

                boolean match = false;
                String[] validRdfTypesForJob = job.getRdfTypesForLinks();
                for (String rdfType : validRdfTypesForJob) {
                    if (types.contains(rdfType)) {
                        match = true;
                        break; //stop as soon as one of the job's rdf types matches
                    }
                }

                if (match) {

                    NamedNodeMap attributes = node.getAttributes();
                    Node aboutAttribute = attributes.getNamedItemNS("http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                            "about");
                    if (aboutAttribute != null) {
                        String value = aboutAttribute.getNodeValue();
                        newlyAddedUris.add(value);
                    }
                }
            }

        } catch (Exception e) {
            log.error(e, e);
        }
    }

    /**
     * Parse an XML node for all subnodes with qualified name "rdf:type", and return each one's "rdf:resource" value in a list.
     * @param descriptionNode the RDF description node
     * @return a list of rdf:types of the given description node
     */
    private ArrayList<String> getRdfTypes(Node descriptionNode) {
        ArrayList<String> rdfTypesList = new ArrayList<String>();

        NodeList children = descriptionNode.getChildNodes();
        int numChildren = children.getLength();
        for (int i = 0; i < numChildren; i++) {
            Node child = children.item(i);

            String namespace = child.getNamespaceURI();
            String name = child.getLocalName();
            String fullName = namespace + name;
            if (fullName.equals("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")) {
                NamedNodeMap attributes = child.getAttributes();
                Node resourceAttribute = attributes.getNamedItemNS("http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                        "resource");
                if (resourceAttribute != null) {
                    //String attributeNamespace = resourceAttribute.getNamespaceURI();
                    String value = resourceAttribute.getNodeValue();
                    //rdfTypesList.add(attributeNamespace + value);
                    rdfTypesList.add(value);
                }
            }
        }

        return rdfTypesList;
    }

    /**
     * If a session info object exists for this session ID, abort the thread if it is still running and remove the object.
     * @param sessionId the session ID for which to clear info
     */
    private void clearSessionInfo(String sessionId) {
        SessionInfo sessionInfo = this.sessionIdToSessionInfo.get(sessionId);
        if (sessionInfo != null) {
            if (!sessionInfo.isFinished()) {
                if (sessionInfo.harvestThread.isAlive()) {
                    sessionInfo.harvestThread.abortRun();
                }
            }
            this.sessionIdToSessionInfo.remove(sessionId);
        }
    }

    /**
     * If all goes according to plan, clearSessionInfo() should be called once the client gets the last bit of information from the
     * harvest.  However, if the client doesn't request it (because the browser was closed, etc.) then the method will never get called.
     * This method gets called every time the page is initially loaded, to look for session data that is 6 hours old or more, and remove
     * it.
     */
    private void cleanUpOldSessions() {
        int minutesToAllowSession = 360;
        long millisecondsToAllowSession = minutesToAllowSession * 60 * 1000;

        Date now = new Date();
        Set<String> keySet = this.sessionIdToSessionInfo.keySet();
        //iterate over a copy of the keys, since clearSessionInfo() removes entries from the underlying map
        for (String sessionId : new ArrayList<String>(keySet)) {
            SessionInfo info = this.sessionIdToSessionInfo.get(sessionId);
            Date startTime = info.createTime;
            long differenceInMilliseconds = now.getTime() - startTime.getTime();
            if (differenceInMilliseconds > millisecondsToAllowSession) {
                log.debug("Removing old session: " + sessionId);
                clearSessionInfo(sessionId);
            }
        }
    }

    /**
     * Create a new JSON object
     * @param fatalError whether the fatal error flag should be set on this object
     * @return the new JSON object
     */
    private JSONObject generateJson(boolean fatalError) {
        JSONObject json = null;
        try {
            json = new JSONObject();
            json.put("fatalError", fatalError);
        } catch (JSONException e) {
            log.error(e.getMessage(), e);
        }
        return json;
    }

    /**
     * Information relating to a particular user session, created just before the harvester thread is starting.
     * @author mbarbieri
     */
    private class SessionInfo {

        /**
         * The session ID for this user session.
         */
        @SuppressWarnings("unused")
        public final String sessionId;

        /**
         * The time this object was created.
         */
        public final Date createTime;

        /**
         * The Harvester thread for this user session.
         */
        public final ScriptRunner harvestThread;

        /**
         * Harvester output that has not yet been sent back to the browser, for this user session.
         */
        public final ArrayList<String> unsentLogLines = new ArrayList<String>();

        /**
         * Flag indicating that the thread has finished.
         */
        private boolean finished = false;

        /**
         * Flag indicating that the thread finished abnormally.
         */
        private boolean abnormalTermination = false;

        /**
         * Newly added entries to VIVO, for this user session.
         */
        public final ArrayList<String> newlyAddedUris = new ArrayList<String>();

        public SessionInfo(String sessionId, ScriptRunner harvestThread) {

            this.createTime = new Date();

            this.sessionId = sessionId;
            this.harvestThread = harvestThread;
        }

        public void setAbnormalTermination() {
            abnormalTermination = true;
        }

        public boolean getAbnormalTermination() {
            return abnormalTermination;
        }

        public void finish() {
            finished = true;
        }

        public boolean isFinished() {
            return finished;
        }
    }

    /**
     * Provides a way of throwing an exception whose message it is OK to display unedited to the user.
     */
    private class ExceptionVisibleToUser extends Exception {
        private static final long serialVersionUID = 1L;

        public ExceptionVisibleToUser(Throwable cause) {
            super(cause);
        }
    }

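    /**
     * Thread that writes the harvest script to disk, runs it with /bin/bash, streams its output
     * into the session's unsent log lines, and, once the script finishes, extracts the newly
     * added URIs from the additions file.
     */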
    private class ScriptRunner extends Thread {

        private final String sessionId;
        private final String script;
        private final String additionsFilePath;
        private final String scriptFileLocation;
        private final FileHarvestJob job;

        private volatile boolean abort = false;

        public ScriptRunner(String sessionId, String script, String additionsFilePath, String scriptFileLocation,
                FileHarvestJob job) {
            this.sessionId = sessionId;
            this.script = script;
            this.additionsFilePath = additionsFilePath;
            this.scriptFileLocation = scriptFileLocation;
            this.job = job;
        }

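        /**
         * Ask the running script to stop: the output-reading loops check this flag, and the
         * external process is destroyed once it is set.
         */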
        public void abortRun() {
            abort = true;
        }

        @Override
        public void run() {
            SessionInfo sessionInfo = sessionIdToSessionInfo.get(sessionId);
            boolean normalTerminationLineFound = false;
            if (sessionInfo != null) {
                try {
                    ArrayList<String> unsentLogLines = sessionInfo.unsentLogLines;

                    File scriptFile = createScriptFile(this.scriptFileLocation, this.script);

                    String command = "/bin/bash " + this.scriptFileLocation + scriptFile.getName();

                    log.info("Running command: " + command);
                    Process pr = Runtime.getRuntime().exec(command);

                    //try { Thread.sleep(15000); } catch(InterruptedException e) {log.error(e, e);}

                    BufferedReader processOutputReader = new BufferedReader(
                            new InputStreamReader(pr.getInputStream()));
                    for (String line = processOutputReader.readLine(); line != null; line = processOutputReader
                            .readLine()) {

                        normalTerminationLineFound = line.endsWith(NORMAL_TERMINATION_LAST_OUTPUT); //set every read to ensure it's the last line

                        //don't add stuff to this list if the main thread is running a "transaction" of copying out the data to send to client and then clearing the list
                        synchronized (unsentLogLines) {
                            unsentLogLines.add(line);
                        }
                        log.info("Harvester output: " + line);

                        if (this.abort)
                            break;
                    }

                    if (!this.abort) {
                        BufferedReader processErrorReader = new BufferedReader(
                                new InputStreamReader(pr.getErrorStream()));
                        for (String line = processErrorReader.readLine(); line != null; line = processErrorReader
                                .readLine()) {
                            log.info("Harvester error: " + line);

                            if (this.abort)
                                break;
                        }
                    }

                    if (this.abort) {
                        log.debug("Aborting harvester script for session " + this.sessionId + ".");
                        pr.destroy();
                    } else {
                        int exitVal;

                        try {
                            exitVal = pr.waitFor();
                        } catch (InterruptedException e) {
                            throw new IOException(e.getMessage(), e);
                        }

                        log.debug("Harvester script for session " + this.sessionId + " exited with error code "
                                + exitVal);

                        File additionsFile = new File(this.additionsFilePath);
                        if (additionsFile.exists())
                            extractNewlyAddedUris(additionsFile, sessionInfo.newlyAddedUris, this.job);
                        else
                            log.error("Additions file not found: " + this.additionsFilePath);
                    }

                    log.info("Harvester script execution complete");
                } catch (IOException e) {
                    log.error(e, e);
                } finally {
                    sessionInfo.finish();
                    if (!normalTerminationLineFound)
                        sessionInfo.setAbnormalTermination();
                }
            }
        }
    }
}
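
Example

The File Harvest page normally drives this controller from client-side JavaScript, but the sketch below shows roughly how its "checkStatus" mode could be polled directly.  This is a minimal sketch under stated assumptions, not part of the class above: the host, context path, and session cookie value are placeholders, and the request must reuse the HTTP session that started the harvest, because the controller keys running harvests by session ID.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class HarvestStatusPoller {

    public static void main(String[] args) throws Exception {
        // Assumed values: adjust the host and context path for your deployment, and reuse the
        // JSESSIONID of the session that started the harvest.
        String statusUrl = "http://localhost:8080/vivo/harvester/harvest";
        String sessionCookie = "JSESSIONID=<session id that started the harvest>";

        HttpURLConnection conn = (HttpURLConnection) new URL(statusUrl).openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Cookie", sessionCookie);
        conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");

        // mode=checkStatus asks doCheckHarvestStatusPost() for log lines produced since the last poll.
        try (OutputStream out = conn.getOutputStream()) {
            out.write("mode=checkStatus".getBytes(StandardCharsets.UTF_8));
        }

        // On success the reply is a JSON object with "fatalError", "progressSinceLastCheck",
        // "finished", "newlyAddedUris" and "newlyAddedUrls"; if no harvest is running for this
        // session, only "fatalError" is set.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            StringBuilder response = new StringBuilder();
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                response.append(line);
            }
            System.out.println(response);
        }
    }
}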