/*******************************************************************************
 * Copyright (C) 2008 Global Biodiversity Information Facility Secretariat.
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ******************************************************************************/
package org.gbif.harvest.digir;

import org.gbif.harvest.AbstractHarvester;
import org.gbif.harvest.NameRangeHandler;
import org.gbif.harvest.constants.ProtocolTypeEnum;
import org.gbif.harvest.core.Constants;
import org.gbif.harvest.exception.HarvesterException;
import org.gbif.harvest.exception.OperationStoppedException;
import org.gbif.harvest.exception.WrappedSaxException;
import org.gbif.harvest.log.CommonGBIFLogEvent;
import org.gbif.harvest.util.Diagnostics;
import org.gbif.harvest.util.FileUtils;
import org.gbif.harvest.util.GbifLogger;
import org.gbif.harvest.util.RequestUtils;
import org.gbif.harvest.util.TemplateUtils;
import org.gbif.harvest.writers.RequestResponseWriterManager;
import org.gbif.harvest.writers.constants.HarvestActionEnum;
import org.gbif.harvest.xml.DigesterUtils;
import org.gbif.harvest.xml.SimpleXml2Tab;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.SocketException;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Level;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * A harvester that harvests DwC data using the DiGIR protocol.
 * Note that this class can be run independently for testing purposes, using
 * the DigirHarvesterTest. The entry points are:
 *
 *   inventory(Map<String, String> params)
 *   processInventoried(Map<String, String> params)
 *   search(Map<String, String> params)
 *   processHarvested(Map<String, String> params)
 *
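 * A minimal usage sketch (hypothetical parameter values; in production the
 * user interface supplies these BioDatasource parameters, and "harvester" is
 * a DigirHarvester constructed with its utility collaborators):
 *
 *   Map<String, String> params = new HashMap<String, String>();
 *   params.put("url", "http://example.org/digir/DiGIR.php"); // hypothetical endpoint
 *   params.put("directory", "digir/example_resource");       // relative to Constants.BASE_DIR
 *   params.put("resource_name", "example_resource");
 *   params.put("protocol", "digir");
 *   params.put("maxInventoryResponse", "200");
 *   params.put("maxSearchResponse", "900");
 *   params.put("minQueryTermLength", "3");
 *   params.put("targetCount", "100000");                     // optional, used by search()
 *   params.put("mappingFile", "indexMapping");               // hypothetical name
 *
 *   harvester.inventory(params);          // 1. list all distinct names
 *   harvester.processInventoried(params); // 2. build sorted name ranges
 *   harvester.search(params);             // 3. fetch full records per range
 *   harvester.processHarvested(params);   // 4. flatten responses into a tab file
 *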
 * @author timrobertson
 * @author kbraak
 */
public class DigirHarvester extends AbstractHarvester {

  private RequestResponseWriterManager inventoryWriter;
  private RequestResponseWriterManager searchWriter;

  private static final String BASE_LOCATION = "org/gbif/harvest/digir";
  private static final String MAPPING_DIRECTORY_NAME = "mapping";
  private static final String TEMPLATE_DIRECTORY_NAME = "template";

  // the DiGIR template for the inventory
  private static final String INVENTORY_TEMPLATE_FILENAME = "inventory";
  // the DiGIR template for the search
  private static final String SEARCH_TEMPLATE_FILENAME = "search";

  // max number of records returned in an inventory response
  private static final int MAX_INVENTORY_RESPONSE_SIZE = 200;
  // max number of records returned in a search response
  private static final int MAX_RESPONSE_SIZE = 900;
  // how many times we want to divide the search response limit
  private static final int DIVISOR = 9;

  // general mapping file names
  private static final String REQUEST_NAMESPACE_MAPPING_FILENAME = "requestNamespaceMapping";
  // default content namespace regulating the response
  private static final String DEFAULT_CONTENT_NAMESPACE =
      "http://digir.sourceforge.net/schema/conceptual/darwin/full/2003/1.0/darwin2full.xsd";

  private static final String scientificNameXPathElement = "*/ScientificName";
  private static final String endOfRecordsDiagnosticCode = "END_OF_RECORDS";
  private static final String queryProducedNoResultsDiagnosticCode = "QUERY_PRODUCED_NO_RESULTS";

  // repeating element for processing harvested records
  private static final String HARVESTED_REPEATING_ELEMENT = "*/record";

  // on connection reset (SocketException), time to wait: 3 minutes
  protected static final int WAIT_TIME = 180000;

  // xpaths of the elements we are interested in processing/extracting
  private List<String> inventoried_elements_of_interest;
  private Map<String, String> requestNamespaceMappings;
  private Map<String, Integer> errorCodes;

  private final TemplateUtils templateUtils;
  private final FileUtils fileUtils;
  private final RequestUtils requestUtils;
  private final DigesterUtils digesterUtils;
  private final GbifLogger gbifLogger;

  /**
   * Constructor.
   *
   * @param templateUtils TemplateUtils
   * @param fileUtils FileUtils
   * @param requestUtils RequestUtils
   * @param digesterUtils DigesterUtils
   * @param gbifLogger GbifLogger
   */
  public DigirHarvester(TemplateUtils templateUtils, FileUtils fileUtils, RequestUtils requestUtils,
      DigesterUtils digesterUtils, GbifLogger gbifLogger) {
    this.templateUtils = templateUtils;
    this.fileUtils = fileUtils;
    this.requestUtils = requestUtils;
    this.digesterUtils = digesterUtils;
    this.gbifLogger = gbifLogger;
    init();
  }

  /**
   * Runs the iterative inventory requests: executes a request and loops (if
   * necessary) until the full record set has been retrieved.
   *
   * @param destination of the DiGIR server to issue against
   * @param templateParams Map used to hold the inventory template dictionary
   * @param directoryAsString where files of interest are located
   * @param protocol name of the harvester
   * @param bw buffered writer to the GBIF harvest log message file
   *
   * @throws HarvesterException thrown if the operation failed
   * @throws OperationStoppedException thrown if the operation was stopped
   */
  private void fireInventory(String destination, Map<String, String> templateParams, String directoryAsString,
      String protocol, BufferedWriter bw) throws HarvesterException, OperationStoppedException {
    StringBuilder sb = new StringBuilder(DigirHarvester.BASE_LOCATION);
    sb.append("/");
    sb.append(protocol);
    sb.append("/");
    sb.append(DigirHarvester.TEMPLATE_DIRECTORY_NAME);
    sb.append("/");
    sb.append(DigirHarvester.INVENTORY_TEMPLATE_FILENAME);
    sb.append(Constants.VELOCITY_FILENAME_EXTENSION);
    String templateLocation = sb.toString();
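
    // For orientation, an illustrative (not verbatim) sketch of the inventory
    // request the Velocity template produces; the element names are
    // assumptions based on the DiGIR protocol, the template itself is
    // authoritative:
    //
    //   <request>
    //     <header>... <destination resource="${resource}">${destination}</destination> ...</header>
    //     <inventory>
    //       ${concept}   <!-- e.g. a darwin:ScientificName element -->
    //       <!-- paging driven by ${startAt} and ${maxResults} -->
    //     </inventory>
    //   </request>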
    boolean finished = false;
    int loopCount = 0;
    int startInventoryIndex = 0;
    Set<String> duplicateNames = new HashSet<String>();
    while (!finished) {
      String query;
      String request;
      try {
        query = templateUtils.getAndMerge(templateLocation, templateParams);
        request = requestUtils.buildURL(destination, "request", query);
      } catch (Exception e) {
        log.error("Inventory request could not be constructed.", e);
        throw new HarvesterException(e.getMessage(), e);
      }

      // save the request
      try {
        inventoryWriter.writeRequest(query.getBytes());
      } catch (IOException e) {
        log.warn("Inventory request file could not be written: " + e.getMessage(), e);
        throw new HarvesterException(e);
      }

      // fire the request
      if (debug) {
        log.debug("digirharvester.inventory.execute");
      }
      Diagnostics diagnostics;
      try {
        diagnostics = requestUtils.executeGetRequestAndReturnDiagnostics(request, ProtocolTypeEnum.DIGIR,
            inventoryWriter);
        if (diagnostics == null) {
          String msg = "Failure in populating diagnostics object from response";
          log.error(msg);
          throw new HarvesterException(msg);
        }
      } catch (IOException e) {
        log.error("Failure executing inventory request", e);
        // if no response was received, write a GBIF log message
        gbifLogger.openAndWriteToGbifLogMessageFile(directoryAsString,
            CommonGBIFLogEvent.COMMON_MESSAGES_HARVEST_BAD_REQUEST.getName(),
            CommonGBIFLogEvent.COMMON_MESSAGES_HARVEST_BAD_REQUEST.getValue(), Level.ERROR_INT,
            ("Request failed to url: " + destination), 1, false);
        throw new HarvesterException(e.getMessage(), e);
      }
      logDiagnosticsInfo(diagnostics.getRecords(), bw, Constants.INVENTORY_OPERATIONNAME);
      finished = diagnostics.isFinished();

      // It could be that the DiGIR tool hasn't been configured to recognise the
      // start and limit params. In that case END_OF_RECORDS = true would never
      // occur, so we need to test whether we are getting duplicate responses.
      if (!finished) {
        try {
          File directory = new File(directoryAsString);
          duplicateNames = digesterUtils.checkForDuplicateElements(directory, Constants.INVENTORY_RESPONSE_FILENAME,
              scientificNameXPathElement);
        } catch (OperationStoppedException e) {
          throw new HarvesterException(e.getMessage(), e);
        }
      }

      // There is a chance that a single request will not be enough to retrieve
      // all the records. Therefore, if the end has not been reached, re-fire
      // with the start parameter pointing at the next iteration index.
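      // Worked example of the paging (assuming maxResults = 200): the loop
      // issues requests with startAt = 0, 200, 400, ... until the provider's
      // diagnostics report END_OF_RECORDS = true.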
      if (!finished) {
        // increment the start index
        startInventoryIndex += Integer.valueOf(templateParams.get("maxResults"));
        // replace the previous start index
        templateParams.put("startAt", Integer.toString(startInventoryIndex));
        loopCount++;
        log.info("Requesting next " + templateParams.get("maxResults") + " records, starting at: "
            + String.valueOf(startInventoryIndex));
      }
    }

    // log any duplicates found!
    if (!duplicateNames.isEmpty()) {
      for (String duplicate : duplicateNames) {
        log.warn("Duplicate name found in inventory response(s), check leading/trailing whitespace: " + duplicate);
      }
    }
  }

  /**
   * Builds and issues a search request and stores the result in a gzipped
   * file. The returned boolean indicates whether the entire requested result
   * set is contained in the response.
   *
   * @param destination of the DiGIR server to issue against
   * @param templateParams Map used to hold the search template dictionary
   * @param protocol name of the harvester
   * @param bw buffered writer to the GBIF harvest log message file
   * @param requestCount running count of search requests (used for logging)
   *
   * @return boolean search completed
   *
   * @throws HarvesterException if the operation fails
   * @throws SAXException if the response could not be parsed
   * @throws OperationStoppedException if the operation was manually terminated
   * @throws SocketException if the connection was reset
   */
  private boolean fireSearch(String destination, Map<String, String> templateParams, String protocol,
      BufferedWriter bw, Integer requestCount)
      throws HarvesterException, SAXException, OperationStoppedException, SocketException {
    StringBuilder sb = new StringBuilder(DigirHarvester.BASE_LOCATION);
    sb.append("/");
    sb.append(protocol);
    sb.append("/");
    sb.append(DigirHarvester.TEMPLATE_DIRECTORY_NAME);
    sb.append("/");
    sb.append(DigirHarvester.SEARCH_TEMPLATE_FILENAME);
    sb.append(Constants.VELOCITY_FILENAME_EXTENSION);
    String templateLocation = sb.toString();
    if (debug) {
      log.debug("using templateLocation [" + templateLocation + "]");
    }
    if (debug) {
      log.debug("starting search request [" + requestCount + "]");
    }

    String query;
    String request;
    String lower = templateParams.get("lower");
    String upper = templateParams.get("upper");
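
    // For orientation: the merged search request asks for full records whose
    // ScientificName falls within [lower, upper], paged by startAt/maxResults
    // and serialised under the content namespace from templateParams. This is
    // an assumption about the filter the "search" Velocity template builds;
    // the template itself is authoritative.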
    try {
      query = templateUtils.getAndMerge(templateLocation, templateParams);
      request = requestUtils.buildURL(destination, "request", query);
    } catch (Exception e) {
      log.error("Search request could not be constructed for range [" + lower + "] - [" + upper + "]", e);
      throw new HarvesterException(e.getMessage(), e);
    }
    if (debug) {
      log.debug("Got query [" + query + "]");
    }
    if (debug) {
      log.debug("Got request [" + request + "]");
    }

    // save the request
    try {
      searchWriter.writeRequest(query.getBytes());
    } catch (IOException e) {
      // failing to write the request is not the end of the world, but a bad sign...
      log.warn("Search request file could not be written for range [" + lower + "] - [" + upper + "]", e);
    }

    // execute the request to retrieve the result set
    if (debug) {
      log.debug("Executing search request");
    }
    Diagnostics diagnostics;
    try {
      diagnostics = requestUtils.executeGetRequestAndReturnDiagnostics(request, ProtocolTypeEnum.DIGIR, searchWriter);
      if (diagnostics == null) {
        String msg = "Failure in populating diagnostics object from response";
        log.error(msg);
        throw new HarvesterException(msg);
      }
      if (debug) {
        log.debug("got diagnostics: [" + diagnostics + "]");
      }
    } catch (FileNotFoundException e) {
      log.error("Search response file could not be found for range [" + lower + "] - [" + upper + "]", e);
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      // WrappedSaxException and SocketException both surface as IOException here
      if (e instanceof WrappedSaxException) {
        log.info("SAXException in parsing search response");
        throw ((WrappedSaxException) e).getWrappedException();
      } else if (e instanceof SocketException) {
        log.error("The connection may have been reset, so take a break and re-issue the same request for range ["
            + lower + "] - [" + upper + "]", e);
        try {
          Thread.sleep(WAIT_TIME);
        } catch (InterruptedException ie) {
          // ignore: the SocketException is re-thrown below either way
        }
        throw (SocketException) e;
      } else {
        log.error("Search response file could not be written for range [" + lower + "] - [" + upper + "]", e);
        throw new HarvesterException(e.getMessage(), e);
      }
    }

    // log all diagnostics information (and GBIF messages) and report whether we're finished
    logDiagnosticsInfo(diagnostics.getRecords(), bw, Constants.SEARCH_OPERATIONNAME);
    return diagnostics.isFinished();
  }

  private void init() {
    inventoryWriter = new RequestResponseWriterManager(HarvestActionEnum.INVENTORY, fileUtils);
    searchWriter = new RequestResponseWriterManager(HarvestActionEnum.SEARCH, fileUtils);

    requestNamespaceMappings = new HashMap<String, String>();

    inventoried_elements_of_interest = new ArrayList<String>();
    inventoried_elements_of_interest.add(scientificNameXPathElement);
    inventoried_elements_of_interest.add("*/ScientificName@count");

    // map each DiGIR diagnostic code to the log4j severity it is logged at
    errorCodes = new HashMap<String, Integer>();
    errorCodes.put("FAILED_CONFIGURATION_LOAD", Level.FATAL_INT);
    errorCodes.put("INVALID_QUERY", Level.ERROR_INT);
    errorCodes.put("INTERNAL_DATABASE_ERROR", Level.ERROR_INT);
    errorCodes.put("DATABASE_ERROR", Level.ERROR_INT);
    errorCodes.put("BAD_QUERY", Level.ERROR_INT);
    errorCodes.put("UNSUPPORTED_ACCESSPOINT", Level.ERROR_INT);
    errorCodes.put("COMPARISON_TYPE_NOT_SUPPORTED", Level.ERROR_INT);
    errorCodes.put("QUERY_TERM_TOO_SHORT", Level.ERROR_INT);
    errorCodes.put("INVALID_QUERY_TERM", Level.ERROR_INT);
    errorCodes.put("BAD_CONCEPT_MAP", Level.ERROR_INT);
    errorCodes.put("RESOURCE_NOT_FOUND", Level.ERROR_INT);
    errorCodes.put("INVALID_REQUEST", Level.ERROR_INT);
    errorCodes.put("LOAD_FORMAT_FAILED", Level.ERROR_INT);
    errorCodes.put("NO_FILTER_IN_REQUEST", Level.ERROR_INT);
    errorCodes.put("COULD_NOT_LOAD_RESOURCES_FILE", Level.ERROR_INT);
    errorCodes.put("REMOTE_JOINS_NOT_ALLOWED", Level.ERROR_INT);
    errorCodes.put("REMOTE_JOIN_FAILED", Level.ERROR_INT);
    errorCodes.put("TOO_MANY_ERRORS", Level.ERROR_INT);
    // errorCodes.put("LOG_ERROR", Level.ERROR_INT);
    errorCodes.put("GENERAL_ERROR", Level.ERROR_INT);
    errorCodes.put("DIGIR_NOT_ALLOWED", Level.ERROR_INT);
    errorCodes.put("DIGIRM_RECURSIVE_OPERATION", Level.ERROR_INT);
    errorCodes.put("FILTER_TOO_SHORT", Level.WARN_INT);
    errorCodes.put(queryProducedNoResultsDiagnosticCode, Level.WARN_INT);
    errorCodes.put("METADATA_FORMAT_PROBLEM", Level.WARN_INT);
    errorCodes.put("REMOTE_JOIN_INFO", Level.WARN_INT);
    errorCodes.put("NO_CONTENT_REQUESTED", Level.WARN_INT);
    errorCodes.put("NO_CHARACTER_CONVERSION", Level.WARN_INT);
    errorCodes.put("MATCH_COUNT", Level.INFO_INT);
    errorCodes.put("RECORD_COUNT", Level.INFO_INT);
    errorCodes.put(endOfRecordsDiagnosticCode, Level.INFO_INT);
    // errorCodes.put("STATUS_INTERVAL", Level.INFO_INT);
    // errorCodes.put("STATUS_DATA", Level.INFO_INT);
    errorCodes.put("SQL_DEBUG_INFO", Level.DEBUG_INT);
  }

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException thrown if the method fails
   * @see org.gbif.harvest.digir.DigirHarvester#inventory(String, String, String, String, String, String)
   */
  public void inventory(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    inventory(params.get("url"), Constants.CONCEPT, outputDirectory, params.get("resource_name"),
        params.get("protocol"), params.get("maxInventoryResponse"));
  }

  /**
   * Issues the inventory request repeatedly until finished.
   *
   * @param destination of the DiGIR server to issue against
   * @param concept to inventory for
   * @param outputDirectory to which the response is saved
   * @param resource name to query
   * @param protocol name
   * @param maxInventoryResponse maximum size of an inventory response
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void inventory(String destination, String concept, String outputDirectory, String resource, String protocol,
      String maxInventoryResponse) throws HarvesterException {
    log.info("Start inventory");
    if (debug) {
      log.debug(">> inventory");
    }

    // set the request response writer to the right directory
    inventoryWriter.setFileOutputDirectory(outputDirectory);

    // build the parameters required for the template into a map
    Map<String, String> templateParams = new HashMap<String, String>();
    templateParams.put("destination", destination);
    templateParams.put("concept", concept);
    templateParams.put("resource", resource);

    // determine the inventory response page size
    int recordsPerResponse = -1;
    try {
      recordsPerResponse = Integer.valueOf(maxInventoryResponse);
    } catch (NumberFormatException e) {
      log.warn("Error interpreting parameter maxInventoryResponse [" + maxInventoryResponse
          + "] taken from the metadata response:", e);
    }
    if (recordsPerResponse > MAX_INVENTORY_RESPONSE_SIZE || recordsPerResponse <= 0) {
      recordsPerResponse = MAX_INVENTORY_RESPONSE_SIZE;
      log.info("Inventory response size defaulting to " + String.valueOf(MAX_INVENTORY_RESPONSE_SIZE) + " records.");
    } else {
      log.info("Inventory response size set to " + String.valueOf(recordsPerResponse) + " records.");
    }
    templateParams.put("maxResults", String.valueOf(recordsPerResponse));

    // grab the gbif log message output file
    File gbifLogMessageFile =
        new File(outputDirectory, Constants.GBIF_LOG_MESSAGE_FILENAME + Constants.TEXT_FILENAME_EXTENSION);
    // create a buffered writer
    BufferedWriter gbifLogMessageFileBW = null;
    try {
      // create the file if it didn't already exist
      if (!gbifLogMessageFile.exists()) {
        gbifLogMessageFile.createNewFile();
      }
      gbifLogMessageFileBW =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(gbifLogMessageFile, true), "UTF8"));

      // fire the inventory request(s)
      fireInventory(destination, templateParams, outputDirectory, protocol, gbifLogMessageFileBW);
    } catch (IOException e) {
      log.error("An error occurred during inventory: " + e.getMessage(), e);
    } catch (OperationStoppedException e) {
      throw new HarvesterException(e.getMessage(), e);
    } finally {
      if (gbifLogMessageFileBW != null) {
        // close the buffered writer on the GBIF log event message file
        try {
          gbifLogMessageFileBW.close();
        } catch (IOException e) {
          log.error("Could not close buffered writer for file " + gbifLogMessageFile.getAbsolutePath(), e);
        }
      }
    }

    if (debug) {
      log.debug("<< inventory");
    }
    log.info("End inventory");
  }

  /**
   * In any DiGIR response there is a set of diagnostics in which error codes
   * are returned. This method takes a Map containing all Error_Code/Info
   * key/value pairs extracted from the diagnostics elements and logs each one
   * according to its severity, as registered in init(). It also writes to the
   * file that collects GBIF log messages for harvesting: for every error code
   * with a matching GBIF log event, the necessary information is written to
   * the file. The actual work is delegated to
   * AbstractHarvester.logBasicDiagnostics.
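   *
   * An illustrative diagnostics map (the codes and severities come from
   * init(); the values are invented):
   *
   *   "RECORD_COUNT"              -> "900"   logged at INFO
   *   "END_OF_RECORDS"            -> "false" logged at INFO
   *   "QUERY_PRODUCED_NO_RESULTS" -> "..."   logged at WARN
   *   "INTERNAL_DATABASE_ERROR"   -> "..."   logged at ERROR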
   *
   * @param records Map with Error_Code/Info key/value entries
   * @param bw BufferedWriter to the GBIF harvesting log message file
   * @param operationName name of the operation
   */
  private void logDiagnosticsInfo(Map<String, String> records, BufferedWriter bw, String operationName) {
    super.logBasicDiagnostics(records, bw, operationName, errorCodes, gbifLogger, ProtocolTypeEnum.DIGIR);
  }

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException if the operation fails
   * @see org.gbif.harvest.digir.DigirHarvester#processHarvested(String, String, String)
   */
  public void processHarvested(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    processHarvested(outputDirectory, params.get("protocol"), params.get("mappingFile"));
  }

  /**
   * Processes the harvested records into a single tab file, extracting only
   * the elements of interest for each record.
   *
   * @param outputDirectory to work in
   * @param protocol name of the harvester
   * @param mappingFile name to use
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void processHarvested(String outputDirectory, String protocol, String mappingFile)
      throws HarvesterException {
    log.info("Start process harvested");
    Map<String, String> harvestedElementsOfInterest = new HashMap<String, String>();

    // ensure the output directory exists
    File directory = new File(outputDirectory);
    if (!directory.exists()) {
      if (debug) {
        log.debug("Creating new directory [" + directory.getAbsolutePath() + "]");
      }
      directory.mkdirs(); // including parents
    }

    // the properties we harvest are read from file
    Properties mapping = new Properties();
    String mappingFilePath = fileUtils.constructMappingFilePath(DigirHarvester.BASE_LOCATION, protocol,
        DigirHarvester.MAPPING_DIRECTORY_NAME, mappingFile);
    if (debug) {
      log.debug("Attempting mapping file load from [" + mappingFilePath + "]");
    }
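    // The mapping file is a standard Java properties file keyed by output
    // column name. Illustrative (invented) entries, where each value is the
    // element path extracted relative to the repeating */record element:
    //   ScientificName = */ScientificName
    //   Collector = */Collector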
    InputStream mappingFileStream = null;
    try {
      mappingFileStream = this.getClass().getResourceAsStream(mappingFilePath);
      if (mappingFileStream == null) {
        String msg = "Mapping file resource [" + mappingFilePath + "] does not exist";
        log.warn(msg);
        throw new HarvesterException(msg);
      }
      mapping.load(mappingFileStream);
      for (Object key : mapping.keySet()) {
        harvestedElementsOfInterest.put((String) key, mapping.getProperty((String) key));
      }
    } catch (IOException e) {
      log.error("Problem loading index mapping file", e);
      throw new HarvesterException(e.getMessage(), e);
    } finally {
      if (mappingFileStream != null) {
        try {
          mappingFileStream.close();
        } catch (IOException e) {
          log.error("An error occurred closing input stream on " + mappingFilePath + ": " + e.getMessage(), e);
        }
      }
    }

    // prepare the directory
    if (debug) {
      log.debug("Start preparing directory for process harvested");
    }
    if (directory.isDirectory()) {
      try {
        // remove the previously processed harvested file
        fileUtils.prepareDirectory(outputDirectory, Constants.HARVESTED_FILENAME);
        if (debug) {
          log.debug("Finished preparing directory for process harvested");
        }
      } catch (Exception e) {
        log.error("Error preparing directory for process harvested", e);
        throw new HarvesterException(e.getMessage(), e);
      }
    }

    // create the tab delimited file
    File output = new File(outputDirectory + "/" + Constants.HARVESTED_FILENAME + Constants.TEXT_FILENAME_EXTENSION);

    // create a buffered writer on the output file
    BufferedWriter bw = null;
    try {
      bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF8"));
    } catch (IOException e) {
      log.error("File writer to harvested file could not be created", e);
      throw new HarvesterException(e.getMessage(), e);
    }

    // Write the header line to the tab file: composed of the various
    // keys taken from the mapping file.
    try {
      Iterator<String> iter = harvestedElementsOfInterest.keySet().iterator();
      while (iter.hasNext()) {
        String property = iter.next();
        if (StringUtils.isNotBlank(property)) {
          bw.write(property);
        }
        if (iter.hasNext()) {
          bw.write("\t");
        } else {
          bw.write("\n");
        }
      }
      log.info("Header line of harvested tab file has been written successfully");
    } catch (Exception e) {
      log.error("Header line of harvested tab file could not be written", e);
    } finally {
      try {
        bw.close();
      } catch (IOException e) {
        log.warn("Writer on harvested tab file could not be closed", e);
      }
    }

    // use the utility class to process the records
    SimpleXml2Tab harvestedXml2Tab = new SimpleXml2Tab(fileUtils, digesterUtils, gbifLogger, requestUtils);
    try {
      harvestedXml2Tab.run(directory, output, Constants.SEARCH_RESPONSE_FILENAME,
          harvestedElementsOfInterest.values(), DigirHarvester.HARVESTED_REPEATING_ELEMENT);
      // for the console
      log.info("Writing to file: " + output.getAbsolutePath());
    } catch (OperationStoppedException e) {
      // was the operation stopped?
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      log.warn("Error reading harvested records file", e);
      throw new HarvesterException(e.getMessage(), e);
    }
    log.info("End process harvested");
  }

  /*
   * Entry point methods
   */

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException thrown if the operation fails
   * @see org.gbif.harvest.digir.DigirHarvester#processInventoried(String, String, String)
   */
  public void processInventoried(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    processInventoried(outputDirectory, params.get("maxSearchResponse"), params.get("minQueryTermLength"));
  }

  /**
   * Processes the inventory file(s) from XML to a tab delimited file and
   * saves it in the output directory.
   *
   * @param outputDirectory to which the response is saved
   * @param maxSearchResponse maximum size of a search response
   * @param minQueryTermLength minimum number of characters in a query term
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void processInventoried(String outputDirectory, String maxSearchResponse, String minQueryTermLength)
      throws HarvesterException {
    if (debug) {
      log.debug(">> processInventoried");
    }

    // ensure the directory exists
    File directory = new File(outputDirectory);
    if (!directory.exists()) {
      if (debug) {
        log.debug("Creating new directory: " + directory.getAbsolutePath());
      }
      directory.mkdirs(); // including parents
    }

    // prepare the directory: remove the inventoried tab file
    if (debug) {
      log.debug("Start preparing directory for process inventoried");
    }
    if (directory.isDirectory()) {
      try {
        // remove the inventoried tab file
        fileUtils.prepareDirectory(outputDirectory, Constants.INVENTORIED_FILENAME);
        if (debug) {
          log.debug("Finished preparing directory for process inventoried");
        }
      } catch (Exception e) {
        log.error("Error preparing directory for process inventoried.", e);
        throw new HarvesterException(e.getMessage(), e);
      }
    }

    // create the output tab delimited file
    File outputTabFile =
        new File(directory + "/" + Constants.INVENTORIED_FILENAME + Constants.TEXT_FILENAME_EXTENSION);

    // parse the inventory by elements of interest and extract to the output
    // file in tab format
    SimpleXml2Tab inventoryXml2Tab = new SimpleXml2Tab(fileUtils, digesterUtils, gbifLogger, requestUtils);
    try {
      inventoryXml2Tab.run(directory, outputTabFile, Constants.INVENTORY_RESPONSE_FILENAME,
          inventoried_elements_of_interest, "*/record");
      // so that the file can be opened via the console page
      log.info("Writing to file: " + outputTabFile.getAbsolutePath());
    }
    // was the operation stopped?
    catch (OperationStoppedException e) {
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      log.warn("Error reading inventoried records file.", e);
      throw new HarvesterException(e.getMessage(), e);
    }

    // determine the minimum possible size of a name
    int minNameLength = -1;
    if (StringUtils.isBlank(minQueryTermLength) || minQueryTermLength.equalsIgnoreCase("NULL")) {
      log.warn("Error converting minQueryTermLength [" + minQueryTermLength
          + "], please check that it has been set properly by the provider");
    } else {
      try {
        minNameLength = Integer.valueOf(minQueryTermLength);
      } catch (Exception e) {
        minNameLength = Constants.MIN_QUERY_TERM_LENGTH;
      }
    }
    // ensure it's always larger than 0
    if (minNameLength <= 0) {
      minNameLength = Constants.MIN_QUERY_TERM_LENGTH;
    }

    // sort the inventoried.txt file
    try {
      log.info("Sorting inventoried list");
      fileUtils.sortInventoried(outputDirectory, minNameLength);
    } catch (Exception e) {
      // worst case, the inventoried list isn't sorted
      log.error("inventoried list could not be sorted", e);
    }

    // determine the size of the name ranges
    int namesPerRange = -1;
    try {
      namesPerRange = Integer.valueOf(maxSearchResponse);
    } catch (Exception e) {
      log.warn("Error converting maxSearchResponse [" + maxSearchResponse
          + "], please check that it has been set properly by the provider", e);
    }
    if (namesPerRange < Constants.MIN_QUERY_TERM_LENGTH || namesPerRange > MAX_RESPONSE_SIZE) {
      namesPerRange = MAX_RESPONSE_SIZE;
    }
    log.info("Using name ranges of size [" + namesPerRange + "]");

    // now read in the file and process it into chunks of namesPerRange
    try {
      fileUtils.createHarvestNameRanges(outputDirectory, namesPerRange, minNameLength);
    } catch (CharacterCodingException e) {
      log.error("Name ranges file could not be written because one of the names couldn't be UTF-8 encoded.", e);
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      log.error("Name ranges file could not be written", e);
      throw new HarvesterException(e.getMessage(), e);
    }
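
    // Illustrative (invented) contents of the resulting name ranges file; each
    // line holds a tab-separated lower/upper pair consumed by search():
    //   "Aaa aaa"    "Baa baa"
    //   "Baa baa"    "Caa caa"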
    if (debug) {
      log.debug("<< processInventoried");
    }
  }

  /**
   * The entry point required for the user interface integration.
   *
   * @param params BioDatasource parameters as a Map
   *
   * @throws HarvesterException if the method fails
   * @see org.gbif.harvest.digir.DigirHarvester#search(String, String, String, String, String, int)
   */
  public void search(Map<String, String> params) throws HarvesterException {
    String outputDirectory = Constants.BASE_DIR.concat(File.separator).concat(params.get("directory"));
    int target = 0;
    try {
      String targetCount = params.get("targetCount");
      target = Integer.valueOf(targetCount);
    } catch (Exception e) {
      // there was no target count, or it was invalid, so just leave it at 0
    }
    search(params.get("resource_name"), params.get("url"), outputDirectory, params.get("protocol"),
        params.get("maxSearchResponse"), target);
  }

  /**
   * Iterates over the nameRanges file, executing a search for each range. A
   * single search retrieves all records for the concept of interest that fall
   * in the specified range. If the concept were scientific name, for example,
   * the searches would collectively retrieve all the destination's records
   * for all scientific names contained in the resource of interest. This
   * information, collected as raw XML responses, is saved in the output
   * directory. The files written this way depend on the assumption that at
   * most one harvester will be targeting a given resource/name range pair at
   * a time.
   *
   * @param resource to query
   * @param destination of the DiGIR server to issue against
   * @param outputDirectory where files of interest are located
   * @param protocol name of the harvester
   * @param maxSearchResponse maximum number of records returned in a response
   * @param targetCount target count of the resource
   *
   * @throws HarvesterException thrown if the method fails
   */
  public void search(String resource, String destination, String outputDirectory, String protocol,
      String maxSearchResponse, int targetCount) throws HarvesterException {
    if (debug) {
      log.debug(">> search");
    }

    // set the request response writer to the right directory
    searchWriter.setFileOutputDirectory(outputDirectory);

    // set up the NameRangeHandler for this search
    NameRangeHandler nameRangeHandler = new NameRangeHandler(outputDirectory, fileUtils);
    // get an iterator over the name ranges
    LineIterator nameRangeIter = nameRangeHandler.getNameRangeIterator();

    // build the parameters required for the template into a map
    Map<String, String> templateParams = new HashMap<String, String>();
    templateParams.put("resource", resource);
    templateParams.put("destination", destination);

    // gather the request content namespaces
    Properties mapping = new Properties();
    String mappingFilePath = fileUtils.constructMappingFilePath(BASE_LOCATION, protocol,
        DigirHarvester.MAPPING_DIRECTORY_NAME, DigirHarvester.REQUEST_NAMESPACE_MAPPING_FILENAME);
    if (debug) {
      log.debug("Reading mappingFile from [" + mappingFilePath + "]");
    }
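    // The request namespace mapping is a standard properties file. An
    // illustrative entry (the key names a record schema, the value is the
    // content namespace sent in the request; "full" corresponds to
    // DEFAULT_CONTENT_NAMESPACE above):
    //   full = http://digir.sourceforge.net/schema/conceptual/darwin/full/2003/1.0/darwin2full.xsd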
    InputStream is = null;
    try {
      is = getClass().getResourceAsStream(mappingFilePath);
      if (is == null) {
        throw new HarvesterException("Namespace mapping file not found, aborting search [" + mappingFilePath + "]");
      }
      mapping.load(is);
      for (Object key : mapping.keySet()) {
        requestNamespaceMappings.put((String) key, mapping.getProperty((String) key));
      }
    } catch (IOException e) {
      log.error("Problem loading request namespace mapping file", e);
      throw new HarvesterException(e.getMessage(), e);
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.error("An error occurred closing input stream on " + mappingFilePath + ": " + e.getMessage(), e);
        }
      }
    }

    // set the default DwC request namespace
    String contentNamespace = DigirHarvester.DEFAULT_CONTENT_NAMESPACE;
    for (String property : requestNamespaceMappings.keySet()) {
      if (StringUtils.equalsIgnoreCase("full", property)) {
        contentNamespace = requestNamespaceMappings.get(property);
      }
    }
    if (debug) {
      log.debug("The request content namespace that will be used is [" + contentNamespace + "]");
    }
    templateParams.put("contentNamespace", contentNamespace);

    // determine the max search response size
    int maxResponseSize = -1;
    try {
      maxResponseSize = Integer.valueOf(maxSearchResponse);
    } catch (Exception e) {
      log.warn("Error reading maxResponseSize [" + maxSearchResponse
          + "], please check that it has been set properly by the provider", e);
    }
    if (maxResponseSize < Constants.MIN_QUERY_TERM_LENGTH || maxResponseSize > MAX_RESPONSE_SIZE) {
      maxResponseSize = MAX_RESPONSE_SIZE;
      log.info("Defaulting maximum number of search records returned in a single response ["
          + String.valueOf(MAX_RESPONSE_SIZE) + "]");
    } else {
      log.info("The maximum number of search records returned in a single response has been set to ["
          + String.valueOf(maxResponseSize) + "]");
    }

    // grab the gbif log message output file
    File gbifLogMessageFile =
        new File(outputDirectory, Constants.GBIF_LOG_MESSAGE_FILENAME + Constants.TEXT_FILENAME_EXTENSION);
    // create a buffered writer
    BufferedWriter gbifLogMessageFileBW = null;
    try {
      gbifLogMessageFileBW =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(gbifLogMessageFile, true), "UTF8"));
    } catch (IOException e) {
      log.error("Couldn't open GBIF 'Harvesting' log event file", e);
    }

    // each line in the file is a name range
    int requestCount = 0;
    int sameRequestCount = 0;
    while (nameRangeIter.hasNext()) {
      // for each line, extract the lower and upper name ranges
      String line = nameRangeIter.nextLine();
      String lower = StringUtils.strip(fileUtils.getDelimitedPart(line, "\t", 0));
      String upper = StringUtils.strip(fileUtils.getDelimitedPart(line, "\t", 1));
      if (StringUtils.isBlank(lower) || StringUtils.isBlank(upper)) {
        // something is wrong with that range - log and ignore
        log.warn("Found invalid name range lower [" + lower + "] and upper [" + upper
            + "] - ignoring and continuing with next range");
        continue;
      }
      log.info("Start harvesting range [" + lower + " - " + upper + "]");

      // add the range parameters to templateParams
      templateParams.put("lower", lower);
      templateParams.put("upper", upper);

      Integer startAt = 0;
      Integer innerMaxResults = maxResponseSize;
      boolean finished = false;
      boolean fireSameSearch = false;
      Integer endOfErrorRange = null;
      while (!finished) {
        templateParams.put("startAt", startAt.toString());
        templateParams.put("maxResults", innerMaxResults.toString());
        boolean inError = false;
        try {
          finished = fireSearch(destination, templateParams, protocol, gbifLogMessageFileBW, requestCount);
        } catch (SAXParseException saxe) {
          log.info("SAX parse exception in parsing search response starting from [" + startAt + "] with maxResults ["
              + innerMaxResults + "]", saxe);
          // The chances are good that this error is in only one of the records coming back,
          // so we can try again and start narrowing the size of the response until we find
          // the one causing the problem. It also may have been a transient error, so just
          // retrying could be enough to fix it. In that case we watch for a startAt beyond
          // this error range and, when that is hit, bump the search back up to maxResponseSize.
          if (innerMaxResults == 1) {
            // we've found exactly one record that is in error - log in detail and move on
            log.warn("Search record in error - record num [" + startAt + "] in range lower [" + lower + "] upper ["
                + upper + "] from destination [" + destination + "]", saxe);
            // assume that most errors are bad chars in a single record, so now set max results back to full
            endOfErrorRange = null;
            innerMaxResults = maxResponseSize;
            startAt++;
            // start should never exceed targetCount (avoid endless looping - skip the range)
            if (startAt > targetCount) {
              log.error("Inside retry loop: request parameter startAt exceeded targetCount: "
                  + String.valueOf(targetCount) + " Skipping range lower [" + lower + "] upper [" + upper + "]");
              break;
            } else {
              continue;
            }
          } else {
            inError = true;
            endOfErrorRange = startAt + innerMaxResults;
          }
        } catch (SAXException e) {
          // we don't know enough to recover, so write this range out as failed, and move on to the next range
          nameRangeHandler.appendFailedRange(lower, upper);
          break;
        } catch (SocketException se) {
          // The connection was reset, so re-issue the exact same request.
          // We have already waited 3 minutes in fireSearch() to give the provider software the chance to reset.
          sameRequestCount++;
          // re-issue the same request on SocketException at most 4 times
          if (sameRequestCount <= 4) {
            log.info("Re-issue same request (" + String.valueOf(sameRequestCount) + ")");
            inError = true;
            fireSameSearch = true;
          } else {
            log.info("Exceeded max number of possible re-issues of the same request");
            nameRangeHandler.appendFailedRange(lower, upper);
            break;
          }
        } catch (OperationStoppedException e) {
          // was the operation stopped?
          throw new HarvesterException(e.getMessage(), e);
        }
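        // Worked example of the narrowing below (with the defaults above,
        // maxResponseSize = 900 and DIVISOR = 9): a parse error shrinks the
        // window 900 -> 100 -> 11 -> 1; at size 1 the offending record is
        // logged and skipped, and the window is reset to 900 once startAt
        // passes endOfErrorRange.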
        // set up for the next pass
        if (!finished) {
          if (inError && !fireSameSearch) {
            innerMaxResults = Math.max(1, innerMaxResults / DIVISOR);
          } else if (inError && fireSameSearch) {
            // parameters stay the same
          } else {
            requestCount++;
            if (endOfErrorRange != null && (startAt > endOfErrorRange)) {
              if (debug) {
                log.debug("StartAt passed endOfErrorRange, resetting max results");
              }
              innerMaxResults = maxResponseSize;
              endOfErrorRange = null;
            }
            startAt = startAt + innerMaxResults;
          }
        } else {
          // successful completion of the range
          nameRangeHandler.registerSuccessfulRange(lower);
        }
      }
    }
    LineIterator.closeQuietly(nameRangeIter);

    // a successful search has been completed, so do the name range cleanup (i.e. delete the pendingNameRanges file)
    nameRangeHandler.close();

    // close the buffered writer on the GBIF log event message file
    if (gbifLogMessageFileBW != null) {
      try {
        gbifLogMessageFileBW.close();
      } catch (IOException e) {
        log.error("Could not close buffered writer for file [" + gbifLogMessageFile.getAbsolutePath() + "]", e);
      }
    }
    log.info("End search");
  }
}