/*******************************************************************************
 * Copyright (C) 2008 Global Biodiversity Information Facility Secretariat.
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ******************************************************************************/
package org.gbif.harvest.digir;

import org.gbif.harvest.AbstractHarvester;
import org.gbif.harvest.NameRangeHandler;
import org.gbif.harvest.constants.ProtocolTypeEnum;
import org.gbif.harvest.core.Constants;
import org.gbif.harvest.exception.HarvesterException;
import org.gbif.harvest.exception.OperationStoppedException;
import org.gbif.harvest.exception.WrappedSaxException;
import org.gbif.harvest.log.CommonGBIFLogEvent;
import org.gbif.harvest.util.Diagnostics;
import org.gbif.harvest.util.FileUtils;
import org.gbif.harvest.util.GbifLogger;
import org.gbif.harvest.util.RequestUtils;
import org.gbif.harvest.util.TemplateUtils;
import org.gbif.harvest.writers.RequestResponseWriterManager;
import org.gbif.harvest.writers.constants.HarvestActionEnum;
import org.gbif.harvest.xml.DigesterUtils;
import org.gbif.harvest.xml.SimpleXml2Tab;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.SocketException;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Level;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * A harvester that harvests DwC data using the DiGIR protocol.
 * Note that this class can be run independently for testing purposes, using
 * the DigirHarvesterTest. The entry points are:
 *
 *   inventory(Map<String, String> params)
 *   processInventoried(Map<String, String> params)
 *   search(Map<String, String> params)
 *   processHarvested(Map<String, String> params)
 *
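 * A minimal usage sketch (hypothetical parameter values; in production the
 * user interface supplies these BioDatasource parameters, and "harvester" is
 * a DigirHarvester constructed with its utility collaborators):
 *
 *   Map<String, String> params = new HashMap<String, String>();
 *   params.put("url", "http://example.org/digir/DiGIR.php"); // hypothetical endpoint
 *   params.put("directory", "digir/example_resource");       // relative to Constants.BASE_DIR
 *   params.put("resource_name", "example_resource");
 *   params.put("protocol", "digir");
 *   params.put("maxInventoryResponse", "200");
 *   params.put("maxSearchResponse", "900");
 *   params.put("minQueryTermLength", "3");
 *   params.put("targetCount", "100000");                     // optional, used by search()
 *   params.put("mappingFile", "indexMapping");               // hypothetical name
 *
 *   harvester.inventory(params);          // 1. list all distinct names
 *   harvester.processInventoried(params); // 2. build sorted name ranges
 *   harvester.search(params);             // 3. fetch full records per range
 *   harvester.processHarvested(params);   // 4. flatten responses into a tab file
 *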
 * @author timrobertson
 * @author kbraak
 */
public class DigirHarvester extends AbstractHarvester {

  private RequestResponseWriterManager inventoryWriter;
  private RequestResponseWriterManager searchWriter;

  private static final String BASE_LOCATION = "org/gbif/harvest/digir";
  private static final String MAPPING_DIRECTORY_NAME = "mapping";
  private static final String TEMPLATE_DIRECTORY_NAME = "template";

  // the DiGIR template for the inventory
  private static final String INVENTORY_TEMPLATE_FILENAME = "inventory";
  // the DiGIR template for the search
  private static final String SEARCH_TEMPLATE_FILENAME = "search";

  // max number of records returned in an inventory response
  private static final int MAX_INVENTORY_RESPONSE_SIZE = 200;
  // max number of records returned in a search response
  private static final int MAX_RESPONSE_SIZE = 900;
  // how many times we want to divide the search response limit
  private static final int DIVISOR = 9;

  // general mapping file names
  private static final String REQUEST_NAMESPACE_MAPPING_FILENAME = "requestNamespaceMapping";
  // default content namespace regulating the response
  private static final String DEFAULT_CONTENT_NAMESPACE =
      "http://digir.sourceforge.net/schema/conceptual/darwin/full/2003/1.0/darwin2full.xsd";

  private static final String scientificNameXPathElement = "*/ScientificName";
  private static final String endOfRecordsDiagnosticCode = "END_OF_RECORDS";
  private static final String queryProducedNoResultsDiagnosticCode = "QUERY_PRODUCED_NO_RESULTS";

  // repeating element for processing harvested records
  private static final String HARVESTED_REPEATING_ELEMENT = "*/record";

  // on connection reset (SocketException), time to wait: 3 minutes
  protected static final int WAIT_TIME = 180000;

  // xpaths of the elements we are interested in processing/extracting
  private List<String> inventoried_elements_of_interest;
  private Map<String, String> requestNamespaceMappings;
  private Map<String, Integer> errorCodes;

  private final TemplateUtils templateUtils;
  private final FileUtils fileUtils;
  private final RequestUtils requestUtils;
  private final DigesterUtils digesterUtils;
  private final GbifLogger gbifLogger;

  /**
   * Constructor.
   *
   * @param templateUtils TemplateUtils
   * @param fileUtils FileUtils
   * @param requestUtils RequestUtils
   * @param digesterUtils DigesterUtils
   * @param gbifLogger GbifLogger
   */
  public DigirHarvester(TemplateUtils templateUtils, FileUtils fileUtils, RequestUtils requestUtils,
      DigesterUtils digesterUtils, GbifLogger gbifLogger) {
    this.templateUtils = templateUtils;
    this.fileUtils = fileUtils;
    this.requestUtils = requestUtils;
    this.digesterUtils = digesterUtils;
    this.gbifLogger = gbifLogger;
    init();
  }

  /**
   * Runs the iterative inventory requests: executes a request and loops (if
   * necessary) until the full record set has been retrieved.
   *
   * @param destination of the DiGIR server to issue against
   * @param templateParams Map used to hold the inventory template dictionary
   * @param directoryAsString where files of interest are located
   * @param protocol name of the harvester
   * @param bw buffered writer to the GBIF harvest log message file
   *
   * @throws HarvesterException thrown if the operation failed
   * @throws OperationStoppedException thrown if the operation was stopped
   */
  private void fireInventory(String destination, Map<String, String> templateParams, String directoryAsString,
      String protocol, BufferedWriter bw) throws HarvesterException, OperationStoppedException {
    StringBuilder sb = new StringBuilder(DigirHarvester.BASE_LOCATION);
    sb.append("/");
    sb.append(protocol);
    sb.append("/");
    sb.append(DigirHarvester.TEMPLATE_DIRECTORY_NAME);
    sb.append("/");
    sb.append(DigirHarvester.INVENTORY_TEMPLATE_FILENAME);
    sb.append(Constants.VELOCITY_FILENAME_EXTENSION);
    String templateLocation = sb.toString();
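
    // For orientation, an illustrative (not verbatim) sketch of the inventory
    // request the Velocity template produces; the element names are
    // assumptions based on the DiGIR protocol, the template itself is
    // authoritative:
    //
    //   <request>
    //     <header>... <destination resource="${resource}">${destination}</destination> ...</header>
    //     <inventory>
    //       ${concept}   <!-- e.g. a darwin:ScientificName element -->
    //       <!-- paging driven by ${startAt} and ${maxResults} -->
    //     </inventory>
    //   </request>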
    boolean finished = false;
    int loopCount = 0;
    int startInventoryIndex = 0;
    Set<String> duplicateNames = new HashSet<String>();
    while (!finished) {
      String query;
      String request;
      try {
        query = templateUtils.getAndMerge(templateLocation, templateParams);
        request = requestUtils.buildURL(destination, "request", query);
      } catch (Exception e) {
        log.error("Inventory request could not be constructed.", e);
        throw new HarvesterException(e.getMessage(), e);
      }

      // save the request
      try {
        inventoryWriter.writeRequest(query.getBytes());
      } catch (IOException e) {
        log.warn("Inventory request file could not be written: " + e.getMessage(), e);
        throw new HarvesterException(e);
      }

      // fire the request
      if (debug) {
        log.debug("digirharvester.inventory.execute");
      }
      Diagnostics diagnostics;
      try {
        diagnostics = requestUtils.executeGetRequestAndReturnDiagnostics(request, ProtocolTypeEnum.DIGIR,
            inventoryWriter);
        if (diagnostics == null) {
          String msg = "Failure in populating diagnostics object from response";
          log.error(msg);
          throw new HarvesterException(msg);
        }
      } catch (IOException e) {
        log.error("Failure executing inventory request", e);
        // if no response was received, write a GBIF log message
        gbifLogger.openAndWriteToGbifLogMessageFile(directoryAsString,
            CommonGBIFLogEvent.COMMON_MESSAGES_HARVEST_BAD_REQUEST.getName(),
            CommonGBIFLogEvent.COMMON_MESSAGES_HARVEST_BAD_REQUEST.getValue(), Level.ERROR_INT,
            ("Request failed to url: " + destination), 1, false);
        throw new HarvesterException(e.getMessage(), e);
      }
      logDiagnosticsInfo(diagnostics.getRecords(), bw, Constants.INVENTORY_OPERATIONNAME);
      finished = diagnostics.isFinished();

      // It could be that the DiGIR tool hasn't been configured to recognise the
      // start and limit params. In that case END_OF_RECORDS = true would never
      // occur, so we need to test whether we are getting duplicate responses.
      if (!finished) {
        try {
          File directory = new File(directoryAsString);
          duplicateNames = digesterUtils.checkForDuplicateElements(directory, Constants.INVENTORY_RESPONSE_FILENAME,
              scientificNameXPathElement);
        } catch (OperationStoppedException e) {
          throw new HarvesterException(e.getMessage(), e);
        }
      }

      // There is a chance that a single request will not be enough to retrieve
      // all the records. Therefore, if the end has not been reached, re-fire
      // with the start parameter pointing at the next iteration index.
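      // Worked example of the paging (assuming maxResults = 200): the loop
      // issues requests with startAt = 0, 200, 400, ... until the provider's
      // diagnostics report END_OF_RECORDS = true.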
      if (!finished) {
        // increment the start index
        startInventoryIndex += Integer.valueOf(templateParams.get("maxResults"));
        // replace the previous start index
        templateParams.put("startAt", Integer.toString(startInventoryIndex));
        loopCount++;
        log.info("Requesting next " + templateParams.get("maxResults") + " records, starting at: "
            + String.valueOf(startInventoryIndex));
      }
    }

    // log any duplicates found!
    if (!duplicateNames.isEmpty()) {
      for (String duplicate : duplicateNames) {
        log.warn("Duplicate name found in inventory response(s), check leading/trailing whitespace: " + duplicate);
      }
    }
  }

  /**
   * Builds and issues a search request and stores the result in a gzipped
   * file. The returned boolean indicates whether the entire requested result
   * set is contained in the response.
   *
   * @param destination of the DiGIR server to issue against
   * @param templateParams Map used to hold the search template dictionary
   * @param protocol name of the harvester
   * @param bw buffered writer to the GBIF harvest log message file
   * @param requestCount running count of search requests (used for logging)
   *
   * @return boolean search completed
   *
   * @throws HarvesterException if the operation fails
   * @throws SAXException if the response could not be parsed
   * @throws OperationStoppedException if the operation was manually terminated
   * @throws SocketException if the connection was reset
   */
  private boolean fireSearch(String destination, Map<String, String> templateParams, String protocol,
      BufferedWriter bw, Integer requestCount)
      throws HarvesterException, SAXException, OperationStoppedException, SocketException {
    StringBuilder sb = new StringBuilder(DigirHarvester.BASE_LOCATION);
    sb.append("/");
    sb.append(protocol);
    sb.append("/");
    sb.append(DigirHarvester.TEMPLATE_DIRECTORY_NAME);
    sb.append("/");
    sb.append(DigirHarvester.SEARCH_TEMPLATE_FILENAME);
    sb.append(Constants.VELOCITY_FILENAME_EXTENSION);
    String templateLocation = sb.toString();
    if (debug) {
      log.debug("using templateLocation [" + templateLocation + "]");
    }
    if (debug) {
      log.debug("starting search request [" + requestCount + "]");
    }

    String query;
    String request;
    String lower = templateParams.get("lower");
    String upper = templateParams.get("upper");
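
    // For orientation: the merged search request asks for full records whose
    // ScientificName falls within [lower, upper], paged by startAt/maxResults
    // and serialised under the content namespace from templateParams. This is
    // an assumption about the filter the "search" Velocity template builds;
    // the template itself is authoritative.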
    try {
      query = templateUtils.getAndMerge(templateLocation, templateParams);
      request = requestUtils.buildURL(destination, "request", query);
    } catch (Exception e) {
      log.error("Search request could not be constructed for range [" + lower + "] - [" + upper + "]", e);
      throw new HarvesterException(e.getMessage(), e);
    }
    if (debug) {
      log.debug("Got query [" + query + "]");
    }
    if (debug) {
      log.debug("Got request [" + request + "]");
    }

    // save the request
    try {
      searchWriter.writeRequest(query.getBytes());
    } catch (IOException e) {
      // failing to write the request is not the end of the world, but a bad sign...
      log.warn("Search request file could not be written for range [" + lower + "] - [" + upper + "]", e);
    }

    // execute the request to retrieve the result set
    if (debug) {
      log.debug("Executing search request");
    }
    Diagnostics diagnostics;
    try {
      diagnostics = requestUtils.executeGetRequestAndReturnDiagnostics(request, ProtocolTypeEnum.DIGIR, searchWriter);
      if (diagnostics == null) {
        String msg = "Failure in populating diagnostics object from response";
        log.error(msg);
        throw new HarvesterException(msg);
      }
      if (debug) {
        log.debug("got diagnostics: [" + diagnostics + "]");
      }
    } catch (FileNotFoundException e) {
      log.error("Search response file could not be found for range [" + lower + "] - [" + upper + "]", e);
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      // WrappedSaxException and SocketException both surface as IOException here
      if (e instanceof WrappedSaxException) {
        log.info("SAXException in parsing search response");
        throw ((WrappedSaxException) e).getWrappedException();
      } else if (e instanceof SocketException) {
        log.error("The connection may have been reset, so take a break and re-issue the same request for range ["
            + lower + "] - [" + upper + "]", e);
        try {
          Thread.sleep(WAIT_TIME);
        } catch (InterruptedException ie) {
          // ignore: the SocketException is re-thrown below either way
        }
        throw (SocketException) e;
      } else {
        log.error("Search response file could not be written for range [" + lower + "] - [" + upper + "]", e);
        throw new HarvesterException(e.getMessage(), e);
      }
    }

    // log all diagnostics information (and GBIF messages) and report whether we're finished
    logDiagnosticsInfo(diagnostics.getRecords(), bw, Constants.SEARCH_OPERATIONNAME);
    return diagnostics.isFinished();
  }

  private void init() {
    inventoryWriter = new RequestResponseWriterManager(HarvestActionEnum.INVENTORY, fileUtils);
    searchWriter = new RequestResponseWriterManager(HarvestActionEnum.SEARCH, fileUtils);

    requestNamespaceMappings = new HashMap<String, String>();

    inventoried_elements_of_interest = new ArrayList<String>();
    inventoried_elements_of_interest.add(scientificNameXPathElement);
    inventoried_elements_of_interest.add("*/ScientificName@count");

    // map each DiGIR diagnostic code to the log4j severity it is logged at
    errorCodes = new HashMap<String, Integer>();
    errorCodes.put("FAILED_CONFIGURATION_LOAD", Level.FATAL_INT);
    errorCodes.put("INVALID_QUERY", Level.ERROR_INT);
    errorCodes.put("INTERNAL_DATABASE_ERROR", Level.ERROR_INT);
    errorCodes.put("DATABASE_ERROR", Level.ERROR_INT);
    errorCodes.put("BAD_QUERY", Level.ERROR_INT);
    errorCodes.put("UNSUPPORTED_ACCESSPOINT", Level.ERROR_INT);
    errorCodes.put("COMPARISON_TYPE_NOT_SUPPORTED", Level.ERROR_INT);
    errorCodes.put("QUERY_TERM_TOO_SHORT", Level.ERROR_INT);
    errorCodes.put("INVALID_QUERY_TERM", Level.ERROR_INT);
    errorCodes.put("BAD_CONCEPT_MAP", Level.ERROR_INT);
    errorCodes.put("RESOURCE_NOT_FOUND", Level.ERROR_INT);
    errorCodes.put("INVALID_REQUEST", Level.ERROR_INT);
    errorCodes.put("LOAD_FORMAT_FAILED", Level.ERROR_INT);
    errorCodes.put("NO_FILTER_IN_REQUEST", Level.ERROR_INT);
    errorCodes.put("COULD_NOT_LOAD_RESOURCES_FILE", Level.ERROR_INT);
    errorCodes.put("REMOTE_JOINS_NOT_ALLOWED", Level.ERROR_INT);
    errorCodes.put("REMOTE_JOIN_FAILED", Level.ERROR_INT);
    errorCodes.put("TOO_MANY_ERRORS", Level.ERROR_INT);
    // errorCodes.put("LOG_ERROR", Level.ERROR_INT);
    errorCodes.put("GENERAL_ERROR", Level.ERROR_INT);
    errorCodes.put("DIGIR_NOT_ALLOWED", Level.ERROR_INT);
    errorCodes.put("DIGIRM_RECURSIVE_OPERATION", Level.ERROR_INT);
    errorCodes.put("FILTER_TOO_SHORT", Level.WARN_INT);
    errorCodes.put(queryProducedNoResultsDiagnosticCode, Level.WARN_INT);
    errorCodes.put("METADATA_FORMAT_PROBLEM", Level.WARN_INT);
    errorCodes.put("REMOTE_JOIN_INFO", Level.WARN_INT);
    errorCodes.put("NO_CONTENT_REQUESTED", Level.WARN_INT);
    errorCodes.put("NO_CHARACTER_CONVERSION", Level.WARN_INT);
    errorCodes.put("MATCH_COUNT", Level.INFO_INT);
    errorCodes.put("RECORD_COUNT", Level.INFO_INT);
    errorCodes.put(endOfRecordsDiagnosticCode, Level.INFO_INT);
    // errorCodes.put("STATUS_INTERVAL", Level.INFO_INT);
    // errorCodes.put("STATUS_DATA", Level.INFO_INT);
    errorCodes.put("SQL_DEBUG_INFO", Level.DEBUG_INT);
  }

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException thrown if the method fails
   * @see org.gbif.harvest.digir.DigirHarvester#inventory(String, String, String, String, String, String)
   */
  public void inventory(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    inventory(params.get("url"), Constants.CONCEPT, outputDirectory, params.get("resource_name"),
        params.get("protocol"), params.get("maxInventoryResponse"));
  }

  /**
   * Issues the inventory request repeatedly until finished.
   *
   * @param destination of the DiGIR server to issue against
   * @param concept to inventory for
   * @param outputDirectory to which the response is saved
   * @param resource name to query
   * @param protocol name
   * @param maxInventoryResponse maximum size of an inventory response
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void inventory(String destination, String concept, String outputDirectory, String resource, String protocol,
      String maxInventoryResponse) throws HarvesterException {
    log.info("Start inventory");
    if (debug) {
      log.debug(">> inventory");
    }

    // set the request response writer to the right directory
    inventoryWriter.setFileOutputDirectory(outputDirectory);

    // build the parameters required for the template into a map
    Map<String, String> templateParams = new HashMap<String, String>();
    templateParams.put("destination", destination);
    templateParams.put("concept", concept);
    templateParams.put("resource", resource);

    // determine the inventory response page size
    int recordsPerResponse = -1;
    try {
      recordsPerResponse = Integer.valueOf(maxInventoryResponse);
    } catch (NumberFormatException e) {
      log.warn("Error interpreting parameter maxInventoryResponse [" + maxInventoryResponse
          + "] taken from the metadata response:", e);
    }
    if (recordsPerResponse > MAX_INVENTORY_RESPONSE_SIZE || recordsPerResponse <= 0) {
      recordsPerResponse = MAX_INVENTORY_RESPONSE_SIZE;
      log.info("Inventory response size defaulting to " + String.valueOf(MAX_INVENTORY_RESPONSE_SIZE) + " records.");
    } else {
      log.info("Inventory response size set to " + String.valueOf(recordsPerResponse) + " records.");
    }
    templateParams.put("maxResults", String.valueOf(recordsPerResponse));

    // grab the gbif log message output file
    File gbifLogMessageFile =
        new File(outputDirectory, Constants.GBIF_LOG_MESSAGE_FILENAME + Constants.TEXT_FILENAME_EXTENSION);
    // create a buffered writer
    BufferedWriter gbifLogMessageFileBW = null;
    try {
      // create the file if it didn't already exist
      if (!gbifLogMessageFile.exists()) {
        gbifLogMessageFile.createNewFile();
      }
      gbifLogMessageFileBW =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(gbifLogMessageFile, true), "UTF8"));

      // fire the inventory request(s)
      fireInventory(destination, templateParams, outputDirectory, protocol, gbifLogMessageFileBW);
    } catch (IOException e) {
      log.error("An error occurred during inventory: " + e.getMessage(), e);
    } catch (OperationStoppedException e) {
      throw new HarvesterException(e.getMessage(), e);
    } finally {
      if (gbifLogMessageFileBW != null) {
        // close the buffered writer on the GBIF log event message file
        try {
          gbifLogMessageFileBW.close();
        } catch (IOException e) {
          log.error("Could not close buffered writer for file " + gbifLogMessageFile.getAbsolutePath(), e);
        }
      }
    }

    if (debug) {
      log.debug("<< inventory");
    }
    log.info("End inventory");
  }

  /**
   * In any DiGIR response there is a set of diagnostics in which error codes
   * are returned. This method takes a Map containing all Error_Code/Info
   * key/value pairs extracted from the diagnostics elements and logs each one
   * according to its severity, as registered in init(). It also writes to the
   * file that collects GBIF log messages for harvesting: for every error code
   * with a matching GBIF log event, the necessary information is written to
   * the file. The actual work is delegated to
   * AbstractHarvester.logBasicDiagnostics.
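   *
   * An illustrative diagnostics map (the codes and severities come from
   * init(); the values are invented):
   *
   *   "RECORD_COUNT"              -> "900"   logged at INFO
   *   "END_OF_RECORDS"            -> "false" logged at INFO
   *   "QUERY_PRODUCED_NO_RESULTS" -> "..."   logged at WARN
   *   "INTERNAL_DATABASE_ERROR"   -> "..."   logged at ERROR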
   *
   * @param records Map with Error_Code/Info key/value entries
   * @param bw BufferedWriter to the GBIF harvesting log message file
   * @param operationName name of the operation
   */
  private void logDiagnosticsInfo(Map<String, String> records, BufferedWriter bw, String operationName) {
    super.logBasicDiagnostics(records, bw, operationName, errorCodes, gbifLogger, ProtocolTypeEnum.DIGIR);
  }

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException if the operation fails
   * @see org.gbif.harvest.digir.DigirHarvester#processHarvested(String, String, String)
   */
  public void processHarvested(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    processHarvested(outputDirectory, params.get("protocol"), params.get("mappingFile"));
  }

  /**
   * Processes the harvested records into a single tab file, extracting only
   * the elements of interest for each record.
   *
   * @param outputDirectory to work in
   * @param protocol name of the harvester
   * @param mappingFile name to use
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void processHarvested(String outputDirectory, String protocol, String mappingFile)
      throws HarvesterException {
    log.info("Start process harvested");
    Map<String, String> harvestedElementsOfInterest = new HashMap<String, String>();

    // ensure the output directory exists
    File directory = new File(outputDirectory);
    if (!directory.exists()) {
      if (debug) {
        log.debug("Creating new directory [" + directory.getAbsolutePath() + "]");
      }
      directory.mkdirs(); // including parents
    }

    // the properties we harvest are read from file
    Properties mapping = new Properties();
    String mappingFilePath = fileUtils.constructMappingFilePath(DigirHarvester.BASE_LOCATION, protocol,
        DigirHarvester.MAPPING_DIRECTORY_NAME, mappingFile);
    if (debug) {
      log.debug("Attempting mapping file load from [" + mappingFilePath + "]");
    }
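    // The mapping file is a standard Java properties file keyed by output
    // column name. Illustrative (invented) entries, where each value is the
    // element path extracted relative to the repeating */record element:
    //   ScientificName = */ScientificName
    //   Collector = */Collector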
    InputStream mappingFileStream = null;
    try {
      mappingFileStream = this.getClass().getResourceAsStream(mappingFilePath);
      if (mappingFileStream == null) {
        String msg = "Mapping file resource [" + mappingFilePath + "] does not exist";
        log.warn(msg);
        throw new HarvesterException(msg);
      }
      mapping.load(mappingFileStream);
      for (Object key : mapping.keySet()) {
        harvestedElementsOfInterest.put((String) key, mapping.getProperty((String) key));
      }
    } catch (IOException e) {
      log.error("Problem loading index mapping file", e);
      throw new HarvesterException(e.getMessage(), e);
    } finally {
      if (mappingFileStream != null) {
        try {
          mappingFileStream.close();
        } catch (IOException e) {
          log.error("An error occurred closing input stream on " + mappingFilePath + ": " + e.getMessage(), e);
        }
      }
    }

    // prepare the directory
    if (debug) {
      log.debug("Start preparing directory for process harvested");
    }
    if (directory.isDirectory()) {
      try {
        // remove the previously processed harvested file
        fileUtils.prepareDirectory(outputDirectory, Constants.HARVESTED_FILENAME);
        if (debug) {
          log.debug("Finished preparing directory for process harvested");
        }
      } catch (Exception e) {
        log.error("Error preparing directory for process harvested", e);
        throw new HarvesterException(e.getMessage(), e);
      }
    }

    // create the tab delimited file
    File output = new File(outputDirectory + "/" + Constants.HARVESTED_FILENAME + Constants.TEXT_FILENAME_EXTENSION);

    // create a buffered writer on the output file
    BufferedWriter bw = null;
    try {
      bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF8"));
    } catch (IOException e) {
      log.error("File writer to harvested file could not be created", e);
      throw new HarvesterException(e.getMessage(), e);
    }

    // Write the header line to the tab file: composed of the various
    // keys taken from the mapping file.
    try {
      Iterator<String> iter = harvestedElementsOfInterest.keySet().iterator();
      while (iter.hasNext()) {
        String property = iter.next();
        if (StringUtils.isNotBlank(property)) {
          bw.write(property);
        }
        if (iter.hasNext()) {
          bw.write("\t");
        } else {
          bw.write("\n");
        }
      }
      log.info("Header line of harvested tab file has been written successfully");
    } catch (Exception e) {
      log.error("Header line of harvested tab file could not be written", e);
    } finally {
      try {
        bw.close();
      } catch (IOException e) {
        log.warn("Writer on harvested tab file could not be closed", e);
      }
    }

    // use the utility class to process the records
    SimpleXml2Tab harvestedXml2Tab = new SimpleXml2Tab(fileUtils, digesterUtils, gbifLogger, requestUtils);
    try {
      harvestedXml2Tab.run(directory, output, Constants.SEARCH_RESPONSE_FILENAME,
          harvestedElementsOfInterest.values(), DigirHarvester.HARVESTED_REPEATING_ELEMENT);
      // for the console
      log.info("Writing to file: " + output.getAbsolutePath());
    } catch (OperationStoppedException e) {
      // was the operation stopped?
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      log.warn("Error reading harvested records file", e);
      throw new HarvesterException(e.getMessage(), e);
    }
    log.info("End process harvested");
  }

  /*
   * Entry point methods
   */

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException thrown if the operation fails
   * @see org.gbif.harvest.digir.DigirHarvester#processInventoried(String, String, String)
   */
  public void processInventoried(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    processInventoried(outputDirectory, params.get("maxSearchResponse"), params.get("minQueryTermLength"));
  }

  /**
   * Processes the inventory file(s) from XML to a tab delimited file and
   * saves it in the output directory.
   *
   * @param outputDirectory to which the response is saved
   * @param maxSearchResponse maximum size of a search response
   * @param minQueryTermLength minimum number of characters in a query term
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void processInventoried(String outputDirectory, String maxSearchResponse, String minQueryTermLength)
      throws HarvesterException {
    if (debug) {
      log.debug(">> processInventoried");
    }

    // ensure the directory exists
    File directory = new File(outputDirectory);
    if (!directory.exists()) {
      if (debug) {
        log.debug("Creating new directory: " + directory.getAbsolutePath());
      }
      directory.mkdirs(); // including parents
    }

    // prepare the directory: remove the inventoried tab file
    if (debug) {
      log.debug("Start preparing directory for process inventoried");
    }
    if (directory.isDirectory()) {
      try {
        // remove the inventoried tab file
        fileUtils.prepareDirectory(outputDirectory, Constants.INVENTORIED_FILENAME);
        if (debug) {
          log.debug("Finished preparing directory for process inventoried");
        }
      } catch (Exception e) {
        log.error("Error preparing directory for process inventoried.", e);
        throw new HarvesterException(e.getMessage(), e);
      }
    }

    // create the output tab delimited file
    File outputTabFile =
        new File(directory + "/" + Constants.INVENTORIED_FILENAME + Constants.TEXT_FILENAME_EXTENSION);

    // parse the inventory by elements of interest and extract to the output
    // file in tab format
    SimpleXml2Tab inventoryXml2Tab = new SimpleXml2Tab(fileUtils, digesterUtils, gbifLogger, requestUtils);
    try {
      inventoryXml2Tab.run(directory, outputTabFile, Constants.INVENTORY_RESPONSE_FILENAME,
          inventoried_elements_of_interest, "*/record");
      // so that the file can be opened via the console page
      log.info("Writing to file: " + outputTabFile.getAbsolutePath());
    }
    // was the operation stopped?
    catch (OperationStoppedException e) {
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      log.warn("Error reading inventoried records file.", e);
      throw new HarvesterException(e.getMessage(), e);
    }

    // determine the minimum possible size of a name
    int minNameLength = -1;
    if (StringUtils.isBlank(minQueryTermLength) || minQueryTermLength.equalsIgnoreCase("NULL")) {
      log.warn("Error converting minQueryTermLength [" + minQueryTermLength
          + "], please check that it has been set properly by the provider");
    } else {
      try {
        minNameLength = Integer.valueOf(minQueryTermLength);
      } catch (Exception e) {
        minNameLength = Constants.MIN_QUERY_TERM_LENGTH;
      }
    }
    // ensure it's always larger than 0
    if (minNameLength <= 0) {
      minNameLength = Constants.MIN_QUERY_TERM_LENGTH;
    }

    // sort the inventoried.txt file
    try {
      log.info("Sorting inventoried list");
      fileUtils.sortInventoried(outputDirectory, minNameLength);
    } catch (Exception e) {
      // worst case, the inventoried list isn't sorted
      log.error("inventoried list could not be sorted", e);
    }

    // determine the size of the name ranges
    int namesPerRange = -1;
    try {
      namesPerRange = Integer.valueOf(maxSearchResponse);
    } catch (Exception e) {
      log.warn("Error converting maxSearchResponse [" + maxSearchResponse
          + "], please check that it has been set properly by the provider", e);
    }
    if (namesPerRange < Constants.MIN_QUERY_TERM_LENGTH || namesPerRange > MAX_RESPONSE_SIZE) {
      namesPerRange = MAX_RESPONSE_SIZE;
    }
    log.info("Using name ranges of size [" + namesPerRange + "]");

    // now read in the file and process it into chunks of namesPerRange
    try {
      fileUtils.createHarvestNameRanges(outputDirectory, namesPerRange, minNameLength);
    } catch (CharacterCodingException e) {
      log.error("Name ranges file could not be written because one of the names couldn't be UTF-8 encoded.", e);
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      log.error("Name ranges file could not be written", e);
      throw new HarvesterException(e.getMessage(), e);
    }
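
    // Illustrative (invented) contents of the resulting name ranges file; each
    // line holds a tab-separated lower/upper pair consumed by search():
    //   "Aaa aaa"    "Baa baa"
    //   "Baa baa"    "Caa caa"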
    if (debug) {
      log.debug("<< processInventoried");
    }
  }

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException if the method fails
   * @see org.gbif.harvest.digir.DigirHarvester#search(String, String, String, String, String, int)
   */
  public void search(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    int target = 0;
    try {
      String targetCount = params.get("targetCount");
      target = Integer.valueOf(targetCount);
    } catch (Exception e) {
      // there was no target count, or it was invalid, so just leave it at 0
    }
    search(params.get("resource_name"), params.get("url"), outputDirectory, params.get("protocol"),
        params.get("maxSearchResponse"), target);
  }

  /**
   * Iterates over the nameRanges file, executing a search for each range. A
   * single search retrieves all records for the concept of interest that fall
   * in the specified range. If the concept were scientific name, for example,
   * the searches would collectively retrieve all the destination's records
   * for all scientific names contained in the resource of interest. This
   * information, collected as raw XML responses, is saved in the output
   * directory. The files written this way depend on the assumption that at
   * most one harvester will be targeting a given resource/name range pair at
   * a time.
   *
   * @param resource to query
   * @param destination of the DiGIR server to issue against
   * @param outputDirectory where files of interest are located
   * @param protocol name of the harvester
   * @param maxSearchResponse maximum number of records returned in a response
   * @param targetCount target count of the resource
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void search(String resource, String destination, String outputDirectory, String protocol,
      String maxSearchResponse, int targetCount) throws HarvesterException {
    if (debug) {
      log.debug(">> search");
    }

    // set the request response writer to the right directory
    searchWriter.setFileOutputDirectory(outputDirectory);

    // set up the NameRangeHandler for this search
    NameRangeHandler nameRangeHandler = new NameRangeHandler(outputDirectory, fileUtils);
    // get an iterator over the name ranges
    LineIterator nameRangeIter = nameRangeHandler.getNameRangeIterator();

    // build the parameters required for the template into a map
    Map<String, String> templateParams = new HashMap<String, String>();
    templateParams.put("resource", resource);
    templateParams.put("destination", destination);

    // gather the request content namespaces
    Properties mapping = new Properties();
    String mappingFilePath = fileUtils.constructMappingFilePath(BASE_LOCATION, protocol,
        DigirHarvester.MAPPING_DIRECTORY_NAME, DigirHarvester.REQUEST_NAMESPACE_MAPPING_FILENAME);
    if (debug) {
      log.debug("Reading mappingFile from [" + mappingFilePath + "]");
    }
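    // The request namespace mapping is a standard properties file. An
    // illustrative entry (the key names a record schema, the value is the
    // content namespace sent in the request; "full" corresponds to
    // DEFAULT_CONTENT_NAMESPACE above):
    //   full = http://digir.sourceforge.net/schema/conceptual/darwin/full/2003/1.0/darwin2full.xsd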
    InputStream is = null;
    try {
      is = getClass().getResourceAsStream(mappingFilePath);
      if (is == null) {
        throw new HarvesterException("Namespace mapping file not found, aborting search [" + mappingFilePath + "]");
      }
      mapping.load(is);
      for (Object key : mapping.keySet()) {
        requestNamespaceMappings.put((String) key, mapping.getProperty((String) key));
      }
    } catch (IOException e) {
      log.error("Problem loading request namespace mapping file", e);
      throw new HarvesterException(e.getMessage(), e);
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.error("An error occurred closing input stream on " + mappingFilePath + ": " + e.getMessage(), e);
        }
      }
    }

    // set the default DwC request namespace
    String contentNamespace = DigirHarvester.DEFAULT_CONTENT_NAMESPACE;
    for (String property : requestNamespaceMappings.keySet()) {
      if (StringUtils.equalsIgnoreCase("full", property)) {
        contentNamespace = requestNamespaceMappings.get(property);
      }
    }
    if (debug) {
      log.debug("The request content namespace that will be used is [" + contentNamespace + "]");
    }
    templateParams.put("contentNamespace", contentNamespace);

    // determine the max search response size
    int maxResponseSize = -1;
    try {
      maxResponseSize = Integer.valueOf(maxSearchResponse);
    } catch (Exception e) {
      log.warn("Error reading maxResponseSize [" + maxSearchResponse
          + "], please check that it has been set properly by the provider", e);
    }
    if (maxResponseSize < Constants.MIN_QUERY_TERM_LENGTH || maxResponseSize > MAX_RESPONSE_SIZE) {
      maxResponseSize = MAX_RESPONSE_SIZE;
      log.info("Defaulting maximum number of search records returned in a single response ["
          + String.valueOf(MAX_RESPONSE_SIZE) + "]");
    } else {
      log.info("The maximum number of search records returned in a single response has been set to ["
          + String.valueOf(maxResponseSize) + "]");
    }

    // grab the gbif log message output file
    File gbifLogMessageFile =
        new File(outputDirectory, Constants.GBIF_LOG_MESSAGE_FILENAME + Constants.TEXT_FILENAME_EXTENSION);
    // create a buffered writer
    BufferedWriter gbifLogMessageFileBW = null;
    try {
      gbifLogMessageFileBW =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(gbifLogMessageFile, true), "UTF8"));
    } catch (IOException e) {
      log.error("Couldn't open GBIF 'Harvesting' log event file", e);
    }

    // each line in the file is a name range
    int requestCount = 0;
    int sameRequestCount = 0;
    while (nameRangeIter.hasNext()) {
      // for each line, extract the lower and upper name ranges
      String line = nameRangeIter.nextLine();
      String lower = StringUtils.strip(fileUtils.getDelimitedPart(line, "\t", 0));
      String upper = StringUtils.strip(fileUtils.getDelimitedPart(line, "\t", 1));
      if (StringUtils.isBlank(lower) || StringUtils.isBlank(upper)) {
        // something is wrong with that range - log and ignore
        log.warn("Found invalid name range lower [" + lower + "] and upper [" + upper
            + "] - ignoring and continuing with next range");
        continue;
      }
      log.info("Start harvesting range [" + lower + " - " + upper + "]");

      // add the range parameters to templateParams
      templateParams.put("lower", lower);
      templateParams.put("upper", upper);

      Integer startAt = 0;
      Integer innerMaxResults = maxResponseSize;
      boolean finished = false;
      boolean fireSameSearch = false;
      Integer endOfErrorRange = null;
      while (!finished) {
        templateParams.put("startAt", startAt.toString());
        templateParams.put("maxResults", innerMaxResults.toString());
        boolean inError = false;
        try {
          finished = fireSearch(destination, templateParams, protocol, gbifLogMessageFileBW, requestCount);
        } catch (SAXParseException saxe) {
          log.info("SAX parse exception in parsing search response starting from [" + startAt + "] with maxResults ["
              + innerMaxResults + "]", saxe);
          // The chances are good that this error is in only one of the records coming back,
          // so we can try again and start narrowing the size of the response until we find
          // the one causing the problem. It also may have been a transient error, so just
          // retrying could be enough to fix it. In that case we watch for a startAt beyond
          // this error range and, when that is hit, bump the search back up to maxResponseSize.
          if (innerMaxResults == 1) {
            // we've found exactly one record that is in error - log in detail and move on
            log.warn("Search record in error - record num [" + startAt + "] in range lower [" + lower + "] upper ["
                + upper + "] from destination [" + destination + "]", saxe);
            // assume that most errors are bad chars in a single record, so now set max results back to full
            endOfErrorRange = null;
            innerMaxResults = maxResponseSize;
            startAt++;
            // start should never exceed targetCount (avoid endless looping - skip the range)
            if (startAt > targetCount) {
              log.error("Inside retry loop: request parameter startAt exceeded targetCount: "
                  + String.valueOf(targetCount) + " Skipping range lower [" + lower + "] upper [" + upper + "]");
              break;
            } else {
              continue;
            }
          } else {
            inError = true;
            endOfErrorRange = startAt + innerMaxResults;
          }
        } catch (SAXException e) {
          // we don't know enough to recover, so write this range out as failed, and move on to the next range
          nameRangeHandler.appendFailedRange(lower, upper);
          break;
        } catch (SocketException se) {
          // The connection was reset, so re-issue the exact same request.
          // We have already waited 3 minutes in fireSearch() to give the provider software the chance to reset.
          sameRequestCount++;
          // re-issue the same request on SocketException at most 4 times
          if (sameRequestCount <= 4) {
            log.info("Re-issue same request (" + String.valueOf(sameRequestCount) + ")");
            inError = true;
            fireSameSearch = true;
          } else {
            log.info("Exceeded max number of possible re-issues of the same request");
            nameRangeHandler.appendFailedRange(lower, upper);
            break;
          }
        } catch (OperationStoppedException e) {
          // was the operation stopped?
          throw new HarvesterException(e.getMessage(), e);
        }
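        // Worked example of the narrowing below (with the defaults above,
        // maxResponseSize = 900 and DIVISOR = 9): a parse error shrinks the
        // window 900 -> 100 -> 11 -> 1; at size 1 the offending record is
        // logged and skipped, and the window is reset to 900 once startAt
        // passes endOfErrorRange.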
        // set up for the next pass
        if (!finished) {
          if (inError && !fireSameSearch) {
            innerMaxResults = Math.max(1, innerMaxResults / DIVISOR);
          } else if (inError && fireSameSearch) {
            // parameters stay the same
          } else {
            requestCount++;
            if (endOfErrorRange != null && (startAt > endOfErrorRange)) {
              if (debug) {
                log.debug("StartAt passed endOfErrorRange, resetting max results");
              }
              innerMaxResults = maxResponseSize;
              endOfErrorRange = null;
            }
            startAt = startAt + innerMaxResults;
          }
        } else {
          // successful completion of the range
          nameRangeHandler.registerSuccessfulRange(lower);
        }
      }
    }
    LineIterator.closeQuietly(nameRangeIter);

    // a successful search has been completed, so do the name range cleanup (i.e. delete the pendingNameRanges file)
    nameRangeHandler.close();

    // close the buffered writer on the GBIF log event message file
    if (gbifLogMessageFileBW != null) {
      try {
        gbifLogMessageFileBW.close();
      } catch (IOException e) {
        log.error("Could not close buffered writer for file [" + gbifLogMessageFile.getAbsolutePath() + "]", e);
      }
    }
    log.info("End search");
  }
}