edu.lternet.pasta.portal.Harvester.java Source code

Java tutorial

Introduction

Here is the source code for edu.lternet.pasta.portal.Harvester.java

Source

/*
 *
 * $Date: 2012-04-02 11:10:19 -0700 (Mon, 02 Apr 2012) $
 * $Author: dcosta $
 * $Revision: $
 *
 * Copyright 2011,2012 the University of New Mexico.
 *
 * This work was supported by National Science Foundation Cooperative
 * Agreements #DEB-0832652 and #DEB-0936498.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific
 * language governing permissions and limitations under the License.
 *
 */

package edu.lternet.pasta.portal;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;

import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;

import edu.lternet.pasta.client.DataPackageManagerClient;
import edu.lternet.pasta.client.LoginClient;
import edu.lternet.pasta.client.PastaAuthenticationException;
import edu.lternet.pasta.client.PastaIdleTimeException;
import edu.lternet.pasta.common.EmlPackageId;
import edu.lternet.pasta.common.EmlPackageIdFormat;
import edu.lternet.pasta.common.EmlUtility;
import edu.lternet.pasta.common.ResourceNotFoundException;

/**
 * 
 * @author dcosta
 *
 */
public class Harvester implements Runnable {

    /*
     * Class variables
     */

    private static final Logger logger = Logger.getLogger(edu.lternet.pasta.portal.Harvester.class);
    private static final String PACKAGEID_PARSING_ERROR = "Unable to parse packageId from document. The document may not be well-formed XML.";

    /*
     * Instance variables
     */

    private int dummyPackageIdCounter = 1;
    private String harvesterPath = null;
    private String harvestReportId = null;
    private String harvestDirPath = null;
    private boolean evaluate;
    // List of EML documents URLs to batch process
    private ArrayList<String> documentURLs = null;
    private String uid = null;

    /*
     * Constructors
     */

    /**
     * Constructs a Harvester object with the specified values.
     * 
     * @param harvesterPath  the directory under which harvest results are stored
     * @param harvestReportId  the harvest report identifier
     * @param uid  the user identifier, e.g. "ucarroll"
     * @param isEvaluate  true if evaluate, false if upload
     */
    public Harvester(String harvesterPath, String harvestReportId, String uid, boolean isEvaluate) {
        this.harvesterPath = harvesterPath;
        this.harvestReportId = harvestReportId;
        this.harvestDirPath = String.format("%s/%s", harvesterPath, harvestReportId);
        this.uid = uid;
        this.evaluate = isEvaluate;
    }

    /*
     * Class methods
     */

    /**
     * Create a directory if it does not already exist.  This will attempt
     * to create any parent directories if necessary.
     * 
     * @param dirPath the full pathname of the directory to create
     * @returns boolean representing success or failure of directory creation
     */
    public static void createDirectory(String dirPath) throws IOException {
        File file = new File(dirPath);
        if (file.exists() && file.isDirectory()) {
            return;
        }
        if (!file.mkdirs()) {
            throw new IOException("Could not create directory: " + dirPath);
        }
    }

    /*
     * A main program that exercises much of the HarvesterServlet's functionality.\
     * No command arguments are needed.
     */
    public static void main(String[] args) {
        String testURL = "http://trachyte.lternet.edu:8080/testData/NoneSuchBugCountDuplicateEntity.xml";
        //String testMetacatHarvestURL = "http://trachyte.lternet.edu:8080/testData/trachyteHarvestList.xml";
        String uid = null;
        String password = null;

        ConfigurationListener.configure();
        PropertiesConfiguration options = ConfigurationListener.getOptions();

        if (options == null) {
            logger.error("Failed to load the DataPortal properties file: 'dataportal.properties'");
        } else {
            String harvesterPath = options.getString("harvester.path");
            uid = options.getString("eventservice.uid");
            if (uid == null) {
                logger.error("No value found for property: 'eventservice.uid'");
            } else {
                uid = "ucarroll";
            }
            boolean isEvaluate = true;
            String harvestReportId = uid + "-evaluate";
            Harvester harvester = new Harvester(harvesterPath, harvestReportId, uid, isEvaluate);
            password = options.getString("eventservice.password");
            if (password == null) {
                logger.error("No value found for property: 'eventservice.password'");
            }

            /*
             * Authenticate the test user
             */
            try {
                LoginClient loginClient = new LoginClient(uid, password);
            } catch (PastaAuthenticationException e) {
                logger.error("User '" + uid + "' failed to authenticate.");
            }

            harvester.documentURLs = new ArrayList<String>();
            harvester.documentURLs.add(testURL);
            try {
                harvester.processDocumentURLs();
            } catch (Exception e) {
                e.printStackTrace();
                logger.error(e.getMessage());
            }
        }
    }

    /*
     * Instance methods
     */

    /*
     * Check whether too many harvests are already executing.
     */
    private boolean checkForTooManyHarvests() {
        boolean tooMany = false;

        return tooMany;
    }

    /*
     * Reads the contents of the document URL into a string.
     */
    private String emlStringFromURL(String documentURL) throws IOException {
        String emlString = null;

        try {
            URL url = new URL(documentURL);
            // Use Java 7 try-with-resources. It closes the stream automatically.
            try (InputStream inputStream = url.openStream()) {
                InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
                emlString = getAsString(inputStreamReader, true);
            }
        } catch (IOException e) {
            logger.error(e.getMessage());
            throw (e);
        }

        return emlString;
    }

    /**
     *  Reads character data from the <code>Reader</code> provided, using a 
     *  buffered read. Returns data as a <code>StringBufer</code>
     *
     *  @param  reader              <code>Reader</code> object to be read
     *
     *  @param  closeWhenFinished   <code>boolean</code> value to indicate 
     *                              whether Reader should be closed when reading
     *                              finished
     *
     *  @return                     <code>StringBuffer</code> containing  
     *                              characters read from the <code>Reader</code>
     *
     *  @throws IOException if there are problems accessing or using the Reader.
     */
    public StringBuffer getAsStringBuffer(Reader reader, boolean closeWhenFinished) throws IOException {
        if (reader == null)
            return null;

        StringBuffer sb = new StringBuffer();

        try {
            char[] buff = new char[4096];
            int numCharsRead;

            while ((numCharsRead = reader.read(buff, 0, buff.length)) != -1) {
                sb.append(buff, 0, numCharsRead);
            }
        } catch (IOException ioe) {
            throw ioe;
        } finally {
            if (closeWhenFinished) {
                try {
                    if (reader != null)
                        reader.close();
                } catch (IOException ce) {
                    ce.printStackTrace();
                }
            }
        }

        return sb;
    }

    /**
     *  Reads character data from the <code>Reader</code> provided, using a 
     *  buffered read. Returns data as a <code>String</code>
     *
     *  @param  reader              <code>Reader</code> object to be read
     *
     *  @param  closeWhenFinished   <code>boolean</code> value to indicate 
     *                              whether Reader should be closed when reading
     *                              finished
     *
     *  @return                     <code>String</code> containing  
     *                              characters read from the <code>Reader</code>
     *
     *  @throws IOException if there are problems accessing or using the Reader.
     */
    public String getAsString(Reader reader, boolean closeWhenFinished) throws IOException {
        StringBuffer sb = getAsStringBuffer(reader, closeWhenFinished);
        return sb.toString();
    }

    /**
     * Accesses the harvesterPath value.
     * 
     * @return  harvesterPath
     */
    public String getHarvesterPath() {
        return harvesterPath;
    }

    /**
     * Accesses the harvestReportId value.
     * 
     * @return  harvestReportId
     */
    public String getHarvestReportId() {
        return harvestReportId;
    }

    /**
     * Access the evaluate boolean value.
     * 
     * @return  evaluate: true if this is an evaluate operation, 
     *          false if this is an upload operation
     */
    public boolean isEvaluate() {
        return evaluate;
    }

    /*
     * Harvests or evaluates a single EML document.
     */
    private void processEMLFile(String harvestDirPath, String uid, File emlFile, boolean isEvaluate) {
        String filename = "serviceMessage.txt";
        String packageId = "";
        EmlPackageId emlPackageId = null;

        try {
            DataPackageManagerClient dpmClient = new DataPackageManagerClient(uid);
            String serviceMessage = null;

            /*
             * Parse the packageId from the EML document.
             * 
             * If we fail to determine the packageId value by parsing the EML
             * document, then assign a dummy value. Increment the integer part
             * of the dummy value to ensure uniqueness during this harvest.
             */
            try {
                emlPackageId = EmlUtility.emlPackageIdFromEML(emlFile);
                EmlPackageIdFormat epif = new EmlPackageIdFormat();
                packageId = epif.format(emlPackageId);
            } catch (Exception e) {
                packageId = "Unknown-Package-ID-" + dummyPackageIdCounter++;
            }

            String packageIdPath = harvestDirPath + "/" + packageId;
            String verb = isEvaluate ? "Evaluating" : "Uploading";
            logger.info(String.format("%s data package: %s", verb, packageId));

            Harvester.createDirectory(packageIdPath);

            /* 
             * Process upload (insert or update) operation 
             */
            if (!isEvaluate) {
                if (emlPackageId != null) {
                    String scope = emlPackageId.getScope();
                    Integer identifier = emlPackageId.getIdentifier();
                    boolean isUpdate = isUpdate(dpmClient, scope, identifier);

                    String resourceMap = null;
                    try {
                        if (isUpdate) {
                            resourceMap = dpmClient.updateDataPackage(scope, identifier, emlFile);
                        } else {
                            resourceMap = dpmClient.createDataPackage(emlFile);
                        }

                        if (resourceMap != null) {
                            String resourceMapPath = packageIdPath + "/resourceMap.txt";
                            File resourceMapFile = new File(resourceMapPath);
                            FileUtils.writeStringToFile(resourceMapFile, resourceMap);

                            /*
                             * Store a local copy of the quality report so that
                             * quality check statistics can be displayed in the
                             * harvest report.
                             */
                            String revision = emlPackageId.getRevision().toString();
                            String qualityReportStr = dpmClient.readDataPackageReport(scope, identifier, revision);
                            String qualityReportPath = packageIdPath + "/qualityReport.xml";
                            File qualityReportFile = new File(qualityReportPath);
                            boolean append = false;
                            FileUtils.writeStringToFile(qualityReportFile, qualityReportStr, append);
                        }
                    } catch (PastaIdleTimeException e) {
                        writeServiceMessage(packageIdPath, filename, e.getMessage());
                    } catch (Exception e) {
                        String eMessage = e.getMessage();

                        if (isQualityReport(eMessage)) {
                            filename = "qualityReport.xml";
                            serviceMessage = eMessage;
                        } else {
                            serviceMessage = String.format("Error uploading packageId '%s': %s", packageId,
                                    eMessage);
                            logger.error(serviceMessage);
                        }

                        writeServiceMessage(packageIdPath, filename, serviceMessage);
                    }
                } else {
                    serviceMessage = PACKAGEID_PARSING_ERROR;
                    logger.error(serviceMessage);
                    writeServiceMessage(packageIdPath, filename, serviceMessage);
                }
            }
            /* 
             * Process evaluate operation 
             */
            else {
                if (emlPackageId != null) {
                    String qualityReportXML = null;

                    try {
                        qualityReportXML = dpmClient.evaluateDataPackage(emlFile);
                        String qualityReportPath = packageIdPath + "/qualityReport.xml";
                        File qualityReportFile = new File(qualityReportPath);
                        FileUtils.writeStringToFile(qualityReportFile, qualityReportXML);
                    } catch (PastaIdleTimeException e) {
                        writeServiceMessage(packageIdPath, filename, e.getMessage());
                    } catch (Exception e) {
                        logger.error(serviceMessage);
                        writeServiceMessage(packageIdPath, filename, e.getMessage());
                    }
                } else {
                    serviceMessage = PACKAGEID_PARSING_ERROR;
                    logger.error(serviceMessage);
                    writeServiceMessage(packageIdPath, filename, serviceMessage);
                }
            }
        } catch (PastaAuthenticationException e) {
            logger.error(e.getMessage());
        } catch (Exception e) {
            logger.error(e.getMessage());
        }
    }

    private void writeServiceMessage(String packageIdPath, String filename, String serviceMessage) {
        String serviceMessagePath = packageIdPath + "/" + filename;

        try {
            File serviceMessageFile = new File(serviceMessagePath);
            boolean append = true;
            FileUtils.writeStringToFile(serviceMessageFile, serviceMessage, append);
        } catch (IOException e) {
            logger.error(String.format("Error writing service message to file %s: %s", serviceMessagePath,
                    e.getMessage()));
        }
    }

    /*
     * Boolean to determine whether the entity body returned by the
     * DataPackageManager service is a quality report XML.
     */
    private boolean isQualityReport(String serviceMessage) {
        boolean isQualityReport = false;

        if (serviceMessage != null) {
            if (serviceMessage.contains("<qr:qualityReport")) {
                if (serviceMessage.trim().endsWith("</qr:qualityReport>")) {
                    isQualityReport = true;
                }
            }
        }

        return isQualityReport;
    }

    /*
     * Boolean to determine whether a data package should be
     * updated versus created.
     * 
     * @return  true if the data package should be updated, false if
     *          it should be created.
     */
    private boolean isUpdate(DataPackageManagerClient dpmClient, String scope, Integer identifier)
            throws Exception {
        boolean isUpdate = false;

        try {
            String revisionsStr = dpmClient.listDataPackageRevisions(scope, identifier, null);
            if (revisionsStr != null) {
                String[] revisionsArray = revisionsStr.split("\n");
                for (int i = 0; i < revisionsArray.length; i++) {
                    String revision = revisionsArray[i];
                    if (revision != null && !revision.equals("")) {
                        isUpdate = true;
                        break;
                    }
                }
            }
        } catch (ResourceNotFoundException e) {
            // A 404 status means that no revisions were found.
            // No action needed.
        }

        return isUpdate;
    }

    /**
     * Runs the processDocumentURLs() code in a separate thread for batch
     * processing.  
     */
    public void run() {
        try {
            processDocumentURLs();
            logger.debug("Launched batch processing thread to process " + documentURLs.size() + " EML documents.");
        } catch (Exception e) {
            logger.error("Exception while batch processing document URLs: " + e.getMessage());
        }
    }

    /*
     * Inserts or evaluates a list of EML documents. The document
     * URLs must first be stored in the this.documentURLs
     * instance variable.
     */
    private void processDocumentURLs() throws Exception {
        boolean tooManyHarvests = checkForTooManyHarvests();

        if (!tooManyHarvests) {
            // Directory for storing harvester files for this harvest
            Harvester.createDirectory(harvestDirPath);

            // Sub-directory for temporary EML files
            String harvestEMLPath = harvestDirPath + "/eml";
            Harvester.createDirectory(harvestEMLPath);

            for (String documentURL : documentURLs) {
                if (documentURL != null && !documentURL.equals("")) {
                    String urlErrorMessage = null;
                    String emlString = null;

                    try {
                        emlString = emlStringFromURL(documentURL);
                    } catch (IOException e) {
                        urlErrorMessage = "IOException : " + e.getMessage();
                        logger.error(urlErrorMessage);
                    }

                    /*
                     * Save the EML to file then process it for evaluation or upload
                     */
                    if (emlString != null) {
                        File emlFile = saveEmlToFile(harvestEMLPath, emlString, evaluate);
                        processEMLFile(harvestDirPath, uid, emlFile, evaluate);
                        // Sleep to allow DAS database connection recovery
                        Thread.sleep(30000);
                    } else if (urlErrorMessage != null) {
                        writeUrlIoMessage(harvestDirPath, documentURL, urlErrorMessage);
                    }
                }
            }
        }
    }

    /**
     * Inserts or evaluates a single EML document, passed in as an XML string.
     * 
     * @param emlString  the EML XML document string
     */
    public void processSingleDocument(String emlString) throws Exception {
        // Directory for storing harvester files for this harvest
        Harvester.createDirectory(harvestDirPath);

        // Sub-directory for temporary EML files
        String harvestEMLPath = harvestDirPath + "/eml";
        Harvester.createDirectory(harvestEMLPath);

        /*
         * Save the EML to file then process it for evaluation or upload
         */
        if (emlString != null) {
            File emlFile = saveEmlToFile(harvestEMLPath, emlString, evaluate);
            processEMLFile(harvestDirPath, uid, emlFile, evaluate);
        }
    }

    /**
     * Inserts or evaluates a single EML document, passed in as an XML file.
     * 
     * @param emlFile  the EML XML document file
     */
    public void processSingleDocument(File emlFile) throws Exception {
        // Directory for storing harvester files for this harvest
        Harvester.createDirectory(harvestDirPath);

        // Sub-directory for temporary EML files
        String harvestEMLPath = harvestDirPath + "/eml";
        Harvester.createDirectory(harvestEMLPath);

        /*
         * Process the EML file for evaluation or upload
         */
        if (emlFile != null) {
            processEMLFile(harvestDirPath, uid, emlFile, evaluate);
        }
    }

    /*
     * Write the text of a URL IO error message to file for subsequent use in
     * harvest reports.
     */
    private void writeUrlIoMessage(String path, String url, String message) {
        boolean append = true;
        String urlMessagesPath = path + "/urlMessages.txt";

        File urlMessagesFile = new File(urlMessagesPath);

        try {
            if (urlMessagesFile != null) {
                String messageText = "URL: " + url + " ; " + message + "\n";
                FileUtils.writeStringToFile(urlMessagesFile, messageText, append);
            }
        } catch (IOException e) {
            e.printStackTrace();
            logger.error(e.getMessage());
        }
    }

    /*
     * Saves an EML document to the file system for subsequent
     * processing.
     */
    private File saveEmlToFile(String harvestEMLPath, String xml, boolean isEvaluate) {
        File tempFile = null;
        Date now = new Date();
        Long mili = now.getTime();
        String tempFileName = mili.toString() + ".xml";
        logger.debug("NOW: " + mili.toString());
        StringBuffer xmlBuffer = new StringBuffer(xml);

        String tempFilePath = harvestEMLPath + "/" + tempFileName;
        tempFile = new File(tempFilePath);

        try {
            FileWriter fileWriter = new FileWriter(tempFile);
            writeToWriter(xmlBuffer, fileWriter, true);
        } catch (IOException e) {
            logger.error("IOException:\n" + e.getMessage());
            e.printStackTrace();
        }

        return tempFile;
    }

    /**
     * Sets the documentURLs instance variable to the specified urlList.
     * 
     * @param urlList    A list of documentURL string. Each URL should reference
     *                   an EML document to be processed.
     */
    public void setDocumentURLs(ArrayList<String> urlList) {
        this.documentURLs = urlList;
    }

    /**
     *  Reads character data from the <code>StringBuffer</code> provided, and 
     *  writes it to the <code>Writer</code> provided, using a buffered write. 
     *
     *  @param  buffer              <code>StringBuffer</code> whose contents are 
     *                              to be written to the <code>Writer</code>
     *
     *  @param  writer              <code>java.io.Writer</code> where contents 
     *                              of StringBuffer are to be written
     *
     *  @param  closeWhenFinished   <code>boolean</code> value to indicate 
     *                              whether Reader should be closed when reading
     *                              finished
     *
     *  @return                     <code>StringBuffer</code> containing  
     *                              characters read from the <code>Reader</code>
     *
     *  @throws IOException if there are problems accessing or using the Writer.
     */
    public void writeToWriter(StringBuffer buffer, Writer writer, boolean closeWhenFinished) throws IOException {
        if (writer == null) {
            throw new IOException("writeToWriter(): Writer is null");
        }

        char[] bufferChars = new char[buffer.length()];
        buffer.getChars(0, buffer.length(), bufferChars, 0);

        try {
            writer.write(bufferChars);
            writer.flush();
        } catch (IOException ioe) {
            throw ioe;
        } finally {
            if (closeWhenFinished) {
                try {
                    if (writer != null)
                        writer.close();
                } catch (IOException ce) {
                    ce.printStackTrace();
                }
            }
        }
    }

}