// GBIF DiGIR metadata handler (source; the stray "Java tutorial" heading was not valid Java)
/******************************************************************************* * Copyright (C) 2008 Global Biodiversity Information Facility Secretariat. * All Rights Reserved. * * The contents of this file are subject to the Mozilla Public * License Version 1.1 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or * implied. See the License for the specific language governing * rights and limitations under the License. ******************************************************************************/ package org.gbif.harvest.digir; import org.gbif.harvest.AbstractHarvester; import org.gbif.harvest.core.AbstractSynchroniserFactory; import org.gbif.harvest.core.Constants; import org.gbif.harvest.exception.HarvesterException; import org.gbif.harvest.exception.OperationStoppedException; import org.gbif.harvest.log.CommonGBIFLogEvent; import org.gbif.harvest.log.I18nLog; import org.gbif.harvest.log.I18nLogFactory; import org.gbif.harvest.model.BioDatasource; import org.gbif.harvest.service.BioDatasourceManager; import org.gbif.harvest.util.FileUtils; import org.gbif.harvest.util.GbifLogger; import org.gbif.harvest.util.JSONUtils; import org.gbif.harvest.util.RequestUtils; import org.gbif.harvest.util.TemplateUtils; import org.gbif.util.BioDatasourceUtils; import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.RandomAccessFile; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.regex.Matcher; import 
java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Level;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.dom4j.tree.DefaultDocument;
import org.dom4j.xpath.DefaultXPath;

/**
 * This is a special handler for DiGIR where it will issue a metadata request
 * against a DiGIR endpoint, and create a new BioDatasource for each resource
 * behind it having a recognized schemaLocation.
 * In the event that a BioDatasource already exists, its metadata is updated.
 *
 * @author timrobertson
 * @author kbraak
 */
public class DigirMetadataHandler extends AbstractHarvester {

  // commons logging
  private I18nLog log = I18nLogFactory.getLog(this.getClass());

  // classpath root for DiGIR templates and mapping files
  private static final String BASE_LOCATION = "org/gbif/harvest/digir";
  private static final String MAPPING_DIRECTORY_NAME = "mapping";
  private static final String TEMPLATE_DIRECTORY_NAME = "template";
  // the DiGIR template for the metadata request
  private static final String REQUEST_TEMPLATE_FILENAME = "metadata";
  private static final String METADATA_MAPPING_FILE_NAME = "metadataMapping";
  // the mapping file to determine which index mapping file to use
  private static final String SCHEMA_LOCATION_MAPPING_FILENAME = "schemaLocationMapping";
  // the mapping file to determine which protocol to use
  private static final String PROTOCOL_MAPPING_FILENAME = "protocolMapping";
  // base name of the per-provider "resource name + record count" output file
  private static final String RESOURCES_WITH_COUNT_FILENAME = "resources_with_count";
  // fallbacks used when the schemaLocation cannot be matched in the mapping files
  private static final String DEFAULT_MAPPING_FILE = "indexMapping_dwc_1_0";
  private static final String DEFAULT_PROTOCOL = "digir_1_0";

  // Maps with xpaths to those elements we're interested in processing/extracting
  private static Map<String, String> metadataElementsOfInterest = new HashMap<String, String>();
  // Maps with xpaths to those contact related elements we're interested in processing/extracting
  private static Map<String, String> metadataResourceContactElementsOfInterest = new HashMap<String, String>();
  // NOTE: currently provider contacts not being gathered - taken from UDDI instead
  private static Map<String, String> metadataProviderContactElementsOfInterest = new HashMap<String, String>();

  // pre-compiled splitter for the tab-delimited metadata files written below
  private static Pattern tabPattern = Pattern.compile("\t");

  // key names: these names must be used in all metadata mappings files
  private static String resourceNameKeyName = "code";
  private static String schemaLocationKeyName = "schemaLocation";
  private static String minQueryTermLengthKeyName = "minQueryTermLength";
  private static String maxInventoryResponseKeyName = "maxInventoryResponse";
  private static String maxSearchResponseKeyName = "maxSearchResponse";
  private static String recordCountKeyName = "recordCount";
  private static String conceptualSchemaKeyName = "conceptualSchema";

  // File writers
  // NOTE(review): static mutable writers shared across instances make this class
  // non-thread-safe; two concurrent metadata harvests would interleave output.
  // Confirm the harvester runs these single-threaded.
  private static BufferedWriter resourceContactsBW = null;
  private static BufferedWriter resourcesBW = null;

  // The name of the keys used in the repeating elements / xpath map.
// These MUST be the same as the keys in the metadataMapping properties file // that identify the repeating elements' XPath expressions private static final String resourceEntityRepeatingElementName = "reXPath"; private static final String contactEntityRepeatingElementName = "ceXPath"; private Map<String, DefaultXPath> metadataRepeatingElementsXpath; // for use in setting xpaths private Map<String, String> namespaceMap; private Set<String> conceptualSchemaWhitelist; private int lineNumber; private TemplateUtils templateUtils; private RequestUtils requestUtils; private FileUtils fileUtils; private GbifLogger gbifLogger; private BioDatasourceManager bioDatasourceManager; private List<AbstractSynchroniserFactory> synchroniserFactories = new LinkedList<AbstractSynchroniserFactory>(); public DigirMetadataHandler(TemplateUtils templateUtils, RequestUtils requestUtils, FileUtils fileUtils, GbifLogger gbifLogger, BioDatasourceManager bioDatasourceManager, List<AbstractSynchroniserFactory> synchroniserFactories) { this.templateUtils = templateUtils; this.requestUtils = requestUtils; this.fileUtils = fileUtils; this.gbifLogger = gbifLogger; this.bioDatasourceManager = bioDatasourceManager; this.synchroniserFactories = synchroniserFactories; init(); } /** * Construct a new BioDatasource, or update a pre-existing one. 
*
 * @param name of BioDatasource
 * @param url access point URL
 * @param resourceName code
 * @param resourceCount count
 * @param uddiKey registry service UUID
 * @param params map of BioDatasource params
 * @param contentNamespace contentNamespace
 * @param mappingFile name
 * @param protocol name
 * @param parentDirectoryName parent directory name
 *
 * @throws HarvesterException thrown if method fails
 */
private void createOrUpdateBioDatasource(String name, String url, String resourceName, String resourceCount,
  String uddiKey, Map<String, Object> params, String contentNamespace, String mappingFile, String protocol,
  String parentDirectoryName) throws HarvesterException {
  // Whether we're creating/updating, we always need to update params:
  params.put("url", url);
  params.put("resource_name", resourceName);
  params.put("contentNamespace", contentNamespace);
  params.put("mappingFile", mappingFile);
  params.put("protocol", protocol);
  params.put("harvesterFactory", Constants.DIGIR_HARVESTER_FACTORY);

  // construct the new, validated directory name
  String newValidDirectoryName =
    BioDatasourceUtils.constructBioDatasourceOperatorDirectoryName(resourceName, parentDirectoryName);
  params.put("directory", newValidDirectoryName);

  // get country name
  String country = null;
  if (params.containsKey("country")) {
    country = (String) params.get("country");
    // "country":null is converted to "country":"\"null\""
    if (StringUtils.equalsIgnoreCase(country, "\"null\"")) {
      country = null;
    }
  }

  // get provider name
  String dataProviderName = null;
  if (params.containsKey("providerName")) {
    dataProviderName = params.get("providerName").toString();
  }

  // add synchroniserFactories list to params (stored as fully-qualified class names)
  synchroniserFactories = getSynchroniserFactories();
  List<String> factories = new LinkedList<String>();
  Iterator<AbstractSynchroniserFactory> iter = synchroniserFactories.iterator();
  while (iter.hasNext()) {
    // NOTE(review): raw Class type; Class<?> would avoid an unchecked-warning here
    Class cls = (iter.next().getClass());
    String clsName = cls.getName();
    factories.add(clsName);
  }
  params.put("synchroniserFactories", factories);

  // construct BioDatasource's name
  String newName = BioDatasourceUtils.constructBioDatasourceName(name, resourceName);
  // returns -1 when no BioDatasource with this name/uddiKey exists yet
  Long id = bioDatasourceManager.checkIfBioDatasourceExists(newName, uddiKey);
  try {
    // if this is a new BioDatasource
    if (id.compareTo(-1L) == 0) {
      // default count to 0 when resourceCount is not a parseable integer
      int count = 0;
      try {
        count = Integer.valueOf(resourceCount);
      } catch (NumberFormatException e) {
        count = 0;
        log.info("defaultCount", String.valueOf(count));
      }
      // update params
      Map<String, Object> newParams = new HashMap<String, Object>();
      newParams.putAll(params);
      newParams.put("name", newName);
      newParams.put("uddiKey", uddiKey);
      newParams.put("targetCount", String.valueOf(count));
      String parametersAsJson = JSONUtils.jsonFromMap(newParams);
      // create new BioDatasource
      BioDatasource datasource = new BioDatasource(newName, dataProviderName, Constants.DIGIR_HARVESTER_FACTORY,
        parametersAsJson, count, uddiKey, country, url);
      bioDatasourceManager.save(datasource);
      log.info("createBioDatasource", newName);
      log.info("setCount", resourceCount);
    } else {
      // pre-existing BioDatasource: merge the new params over the stored ones
      BioDatasource bioDatasource = bioDatasourceManager.get(id);
      // update params
      Map<String, Object> oldParams = JSONUtils.mapFromJSON(bioDatasource.getParametersAsJSON());
      oldParams.putAll(params);
      // update its target count
      oldParams.put("targetCount", resourceCount);
      bioDatasource.setParametersAsJSON(JSONUtils.jsonFromMap(oldParams));
      // NOTE(review): unlike the creation branch, a non-numeric resourceCount here
      // throws NumberFormatException and is reported as a HarvesterException below
      bioDatasource.setTargetCount(Integer.parseInt(resourceCount));
      // in case the url has changed
      bioDatasource.setUrl(url);
      // in case the country has changed
      bioDatasource.setCountry(country);
      // in case the provider name has changed
      bioDatasource.setProviderName(BioDatasourceUtils.prepareStringForUI(dataProviderName));
      bioDatasourceManager.save(bioDatasource);
      log.info("createBioDatasource.exists", bioDatasource.getName());
      log.info("updateCount", resourceCount);
    }
  } catch (Exception e) {
    log.error("error.createBioDatasource", e.getMessage(), e);
    throw new HarvesterException(e.getMessage(), e);
  }
}

/**
 * @return the current parse-line counter
 */
public int getLineNumber() {
  return lineNumber;
}

/**
 * Determine the mapping file.
 * If there is a problem loading the file, or no match exists for the
 * contentNamespace, the default is used.
 *
 * @param contentNamespace contentNamespace
 * @param directory as String
 * @param resourceName code
 *
 * @return mappingFile name
 *
 * @throws HarvesterException thrown if method fails
 */
private String getMappingFile(String contentNamespace, String directory, String resourceName)
  throws HarvesterException {
  // Initially, set the mapping file to the default
  String mappingFile = DEFAULT_MAPPING_FILE;
  if (StringUtils.isNotBlank(contentNamespace)) {
    Properties mapping = new Properties();
    String mappingFilePath =
      fileUtils.constructMappingFilePath(BASE_LOCATION, MAPPING_DIRECTORY_NAME, SCHEMA_LOCATION_MAPPING_FILENAME);
    InputStream is = null;
    try {
      // getResourceAsStream returns null when the file is missing, which surfaces
      // as the NullPointerException caught below
      is = DigirMetadataHandler.class.getResourceAsStream(mappingFilePath);
      mapping.load(is);
      boolean found = false;
      for (Object key : mapping.keySet()) {
        if (StringUtils.equals(contentNamespace, (String) key)) {
          mappingFile = mapping.getProperty((String) key);
          found = true;
        }
      }
      // if not found, alert operator
      if (!found) {
        log.error("digirmetadatahandler.default.conceptualMappingNotFound",
          new String[] { resourceName, contentNamespace });
        // and write GBIF Log Message
        gbifLogger.openAndWriteToGbifLogMessageFile(directory,
          CommonGBIFLogEvent.COMMON_MESSAGES_UNKNOWN_SCHEMA_LOCATION.getName(),
          CommonGBIFLogEvent.COMMON_MESSAGES_UNKNOWN_SCHEMA_LOCATION.getValue(), Level.ERROR_INT,
          "For resource=" + resourceName + ": the schemaLocation " + contentNamespace
            + " was not found in the DiGIR conceptualMapping.properties file. If this is a valid schemaLocation, please update this file and try again. Defaulting to DwC 1.0",
          1, false);
      }
    } catch (NullPointerException e) {
      log.info("error.mappingFileExists", new String[] { mappingFilePath, e.getMessage() }, e);
      throw new HarvesterException(e.getMessage(), e);
    } catch (IOException e) {
      // unreadable mapping file: fall back to the default mapping, do not abort
      log.error("digirmetadatahandler.error.getMappingFile", e.getMessage(), e);
      log.error("digirmetadatahandler.default.getMappingFile", mappingFile);
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.error("An error occurred closing input stream on " + mappingFilePath + ": " + e.getMessage(), e);
        }
      }
    }
  } else {
    log.error("No schemaLocation attribute was specified in element conceptualSchema: defaulting to DwC 1.0");
  }
  return mappingFile;
}

/**
 * Determine the protocol.
 * If there is a problem loading the file, or no match exists for the
 * contentNamespace, the default is used.
 *
 * @param contentNamespace contentNamespace
 *
 * @return protocol name
 *
 * @throws HarvesterException thrown if method fails
 */
private String getProtocol(String contentNamespace) throws HarvesterException {
  // Initially, set the protocol to the default
  String protocol = DEFAULT_PROTOCOL;
  Properties mapping = new Properties();
  String mappingFilePath =
    fileUtils.constructMappingFilePath(BASE_LOCATION, MAPPING_DIRECTORY_NAME, PROTOCOL_MAPPING_FILENAME);
  InputStream is = null;
  try {
    is = DigirMetadataHandler.class.getResourceAsStream(mappingFilePath);
    mapping.load(is);
    boolean found = false;
    for (Object key : mapping.keySet()) {
      if (StringUtils.equals(contentNamespace, (String) key)) {
        protocol = mapping.getProperty((String) key);
        found = true;
      }
    }
    // if not found, alert operator
    if (!found) {
      log.error("digirmetadatahandler.default.protocolMappingNotFound", contentNamespace);
    }
  } catch (NullPointerException e) {
    log.info("error.mappingFileExists", new String[] { mappingFilePath, e.getMessage() }, e);
    throw new HarvesterException(e.getMessage(), e);
  } catch (IOException e) {
    log.error("digirmetadatahandler.error.getProtocol",
e.getMessage(), e);
    // unreadable mapping file: fall back to the default protocol, do not abort
    log.debug("digirmetadatahandler.default.getProtocol", protocol);
  } finally {
    if (is != null) {
      try {
        is.close();
      } catch (IOException e) {
        log.error("An error occurred closing input stream on " + mappingFilePath + ": " + e.getMessage(), e);
      }
    }
  }
  return protocol;
}

/**
 * @return the synchroniserFactories
 */
public List<AbstractSynchroniserFactory> getSynchroniserFactories() {
  return synchroniserFactories;
}

/**
 * Seeds the repeating-element XPaths, the namespace map and the
 * conceptualSchema whitelist with DiGIR 1.0 defaults.
 */
private void init() {
  // with default (digir 1.0) values as place holders
  metadataRepeatingElementsXpath = new HashMap<String, DefaultXPath>();
  metadataRepeatingElementsXpath.put(resourceEntityRepeatingElementName, new DefaultXPath("//digir_1_0:resource"));
  metadataRepeatingElementsXpath
    .put(contactEntityRepeatingElementName, new DefaultXPath("//digir_1_0:resource/digir_1_0:contact"));
  // when more versions of DiGIR become available, these can no longer be hard-coded.
  namespaceMap = new HashMap<String, String>();
  namespaceMap.put("digir_1_0", "http://digir.net/schema/protocol/2003/1.0");
  // load conceptualSchema whiteList list
  conceptualSchemaWhitelist = new HashSet<String>();
  conceptualSchemaWhitelist.add("http://digir.net/schema/conceptual/darwin/2003/1.0");
  conceptualSchemaWhitelist.add("http://www.iobis.org/obis");
  conceptualSchemaWhitelist.add("OBIS Schema Version 1.0");
}

/**
 * The entry point required for the user interface integration.
 *
 * @param params map of the datasource to whom the operation belongs
 *
 * @throws HarvesterException thrown if method fails
 */
public void issueMetadata(Map<String, String> params) throws HarvesterException {
  Map<String, Object> paramsCopy = new HashMap<String, Object>();
  paramsCopy.putAll(params);
  // delegate with the individual values unpacked; the directory is made
  // absolute by prefixing the harvester base dir
  issueMetadata(params.get("name"), params.get("url"), params.get("uddiKey"),
    Constants.BASE_DIR.concat(File.separator).concat(params.get("directory")), paramsCopy);
}

/**
 * Issues a metadata request to a DiGIR provider.
* It then collects metadata
 * about the different resources located behind that provider's access
 * point.
 * The collected metadata for each resource, like the name (code), count,
 * etc. is written to a file.
 * Iterating over this file, a new BioDatasource is created for each
 * resource, with its name, count, and other attributes all set accordingly.
 * Note that the name of each new BioDatasource is the concatenation of the
 * Provider and resource code.
 *
 * @param name of the datasource
 * @param url of the datasource
 * @param uddiKey of the datasource
 * @param directory to save files to
 * @param params map of the datasource
 *
 * @throws HarvesterException thrown if method fails
 */
public void issueMetadata(String name, String url, String uddiKey, String directory, Map<String, Object> params)
  throws HarvesterException {
  log.info("start.issueMetadata");

  // Determine the protocol
  // For now use default protocol as this is the only one needed at metadata level
  String protocol = DEFAULT_PROTOCOL;

  // populate element of interest maps from the mapping file's properties
  populateElementOfInterestsMapsFromMappingFile(METADATA_MAPPING_FILE_NAME, protocol);

  // send metadata request and get response as ByteArrayInputStream
  ByteArrayInputStream metadataResponse = metadataRequest(url, directory, protocol);

  // collect resources metadata, including contact metadata, into
  // separate output files
  processAllMetadata(metadataResponse, directory);

  // recover our metadata files
  File resourcesFile =
    new File(directory, DigirMetadataHandler.RESOURCES_WITH_COUNT_FILENAME.concat(Constants.TEXT_FILENAME_EXTENSION));
  File contactsFile = new File(directory, Constants.CONTACT_FILENAME.concat(Constants.TEXT_FILENAME_EXTENSION));

  // Iterate over resource metadata file, and resource contact metadata file
  // For each resource's contacts, write a new contact metadata file
  // For each resource create a new BioDatasource
  // NOTE: it is in the BioDatasource's directory that the contact metadata
  // file is saved
  RandomAccessFile contactsRaf = null;
  RandomAccessFile resourcesRaf = null;
  try {
    contactsRaf = new RandomAccessFile(contactsFile, "r");
    contactsRaf.seek(0L);
    String contactLine = fileUtils.readUTFLine(contactsRaf);
    // put the header column properties into a list (minus line number)
    List<String> contactPropertiesList = retrieveStringListFromLine(contactLine);
    contactLine = fileUtils.readUTFLine(contactsRaf);
    // line-number of the first contact record; -1 when there are no contacts
    int contactLineNumber = -1;
    if (StringUtils.isNotBlank(contactLine)) {
      contactLineNumber = Integer.valueOf(fileUtils.getDelimitedPart(contactLine, "\t", 0));
    }

    // Open a file cursor to the resources and contacts files
    resourcesRaf = new RandomAccessFile(resourcesFile, "r");
    resourcesRaf.seek(0L);
    String resourceLine = fileUtils.readUTFLine(resourcesRaf);
    int resourceLineNumber = 1;
    // put the header column properties into an array
    String[] resourceProperties = tabPattern.split(resourceLine);
    // remove all line breaking characters
    resourceProperties = fileUtils.removeLineBreakingCharacters(resourceProperties);

    while ((resourceLine = fileUtils.readUTFLine(resourcesRaf)) != null) {
      // set the position of the cursor (column 0 holds the record's line number)
      resourceLineNumber = Integer.valueOf(fileUtils.getDelimitedPart(resourceLine, "\t", 0));
      Map<String, Object> newParams = new HashMap<String, Object>();
      newParams.putAll(params);
      String schemaLocation = null;
      String conceptualSchema = "";
      String recordCount = "0";
      // copy each column value into newParams keyed by its header name
      for (int columnIndex = 1; columnIndex < resourceProperties.length; columnIndex++) {
        String property = resourceProperties[columnIndex];
        String value = fileUtils.getDelimitedPart(resourceLine, "\t", columnIndex);
        // NOTE(review): debug leftover - should use log.debug instead of stdout
        System.out.println("the value for property: " + property + " is: " + value);
        // ignore the value if it's null
        // NOTE(review): with '||' this condition is true for every value this loop
        // can see (a value cannot be both blank and equal to "null"), so the
        // else-if defaults below are unreachable; '&&' looks intended - confirm
        // before changing, as downstream consumers may rely on current behaviour
        if (StringUtils.isNotBlank(value) || !StringUtils.equalsIgnoreCase(value, "null")) {
          newParams.put(property, value);
        } else if (StringUtils.isBlank(value) && property.equals(minQueryTermLengthKeyName)) {
          newParams.put(property, "0");
        } else if (StringUtils.isBlank(value) && property.equals(maxInventoryResponseKeyName)) {
          newParams.put(property, "0");
        } else if (StringUtils.isBlank(value) && property.equals(maxSearchResponseKeyName)) {
          newParams.put(property, "0");
        }
      }

      // Determine the schema location
      if (newParams.containsKey(schemaLocationKeyName)) {
        schemaLocation = (String) newParams.get(schemaLocationKeyName);
      }
      // Get the resourceName
      String resourceName = null;
      if (newParams.containsKey(resourceNameKeyName)) {
        resourceName = (String) newParams.get(resourceNameKeyName);
      }
      String mappingFile = getMappingFile(schemaLocation, directory, resourceName);
      // Determine the protocol
      protocol = getProtocol(schemaLocation);
      // Determine the count
      if (newParams.containsKey(recordCountKeyName)) {
        recordCount = (String) newParams.get(recordCountKeyName);
      }
      // ensure resource relates to a recognized conceptualSchema
      boolean recognizedConceptualSchema = false;
      if (newParams.containsKey(conceptualSchemaKeyName)) {
        conceptualSchema = (String) newParams.get(conceptualSchemaKeyName);
        if (StringUtils.isNotBlank(conceptualSchema)) {
          recognizedConceptualSchema = conceptualSchemaWhitelist.contains(conceptualSchema);
        }
      }

      // only proceed with biodatasource creation if we have a resourceName
      if (StringUtils.isNotBlank(StringUtils.trimToNull(resourceName)) && recognizedConceptualSchema) {
        // Get location where we'll save the contact file
        String validatedResourceName = fileUtils.validateDirectoryName(resourceName);
        // create new directory if necessary
        File resourceDirectory = new File(directory, validatedResourceName);
        if (!resourceDirectory.exists()) {
          log.debug("Creating new directory: " + resourceDirectory.getAbsolutePath());
          resourceDirectory.mkdirs(); // including parents
        }
        // delete pre-existing contact file
        fileUtils.prepareDirectory(resourceDirectory.getAbsolutePath(), Constants.CONTACT_FILENAME);
        // create new contact file
        File newContactsFile =
          new File(resourceDirectory + "/" + Constants.CONTACT_FILENAME + Constants.TEXT_FILENAME_EXTENSION);
        newContactsFile.createNewFile();
        // create bufferedWriter on contact file
        BufferedWriter newContactsBW =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(newContactsFile, true), "UTF8"));
        // write header line
        fileUtils.writeValuesToFile(newContactsBW, contactPropertiesList);

        // Move over contacts as far as position of current cursor
        while (contactLineNumber <= resourceLineNumber && StringUtils.isNotBlank(contactLine)) {
          // retrieve all values, minus line number
          List<String> contactValues = retrieveStringListFromLine(contactLine);
          // write values to file
          fileUtils.writeValuesToFile(newContactsBW, contactValues);
          // move to next line
          contactLine = fileUtils.readUTFLine(contactsRaf);
          if (StringUtils.isNotBlank(contactLine)) {
            // NOTE(review): local 'lineNumber' shadows the instance field of the same name
            int lineNumber = Integer.valueOf(fileUtils.getDelimitedPart(contactLine, "\t", 0));
            if (lineNumber > contactLineNumber) {
              contactLineNumber = lineNumber;
            }
          }
        }
        // close BW
        newContactsBW.close();
        // log having written the files so that they appear in the console
        log.info("Writing to file: " + newContactsFile.getAbsolutePath());

        // Construct the new BioDatasource
        createOrUpdateBioDatasource(name, url, resourceName, recordCount, uddiKey, newParams, schemaLocation,
          mappingFile, protocol, (String) params.get("directory"));
      } else if (StringUtils.isBlank(StringUtils.trimToNull(resourceName))) {
        log.error("error.issueMetadata.noName");
      } else if (!recognizedConceptualSchema) {
        log.error("conceptualSchema (" + conceptualSchema + ") was not recognized for resource=" + resourceName
          + " therefore NO BioDatasource will be created");
      }
    }
  } catch (IOException e) {
    log.error("An IOException occurred during issueMetadata(): " + e.getMessage(), e);
  } finally {
    // close both cursors independently so one failure doesn't leak the other
    try {
      if (contactsRaf != null) {
        contactsRaf.close();
      }
    } catch (IOException e) {
      log.error("digirmetadatahandler.error.issueMetadata.closingCursors");
    }
    try {
      if (resourcesRaf != null) {
        resourcesRaf.close();
      }
    } catch (IOException e) {
      log.error("digirmetadatahandler.error.issueMetadata.closingCursors");
    }
  }
  log.info("end.issueMetadata");
}

/**
 * Executes a metadata request that retrieves information about the
 * resources behind a given access point and saves it in the output
 * directory.
 *
 * @param destination of the DiGIR access point to request against
 * @param outputDirectory to which the response will be saved
 * @param protocol name
 *
 * @return metadata response as ByteArrayInputStream
 *
 * @throws HarvesterException thrown if method fails
 */
public ByteArrayInputStream metadataRequest(String destination, String outputDirectory, String protocol)
  throws HarvesterException {
  log.info("start.metadataRequest");
  // build the parameters required for the template into a map
  Map<String, String> templateParams = new HashMap<String, String>();
  templateParams.put("destination", destination);

  // Prepare directory
  File directory = new File(outputDirectory);
  log.debug("start.metadataRequest.prepareDirectory");
  if (directory.isDirectory()) {
    try {
      // remove all metadata requests and responses
      fileUtils.prepareDirectory(outputDirectory, Constants.METADATA_PREFIX);
      fileUtils.prepareDirectory(outputDirectory, DigirMetadataHandler.RESOURCES_WITH_COUNT_FILENAME);
      log.debug("end.metadataRequest.prepareDirectory");
    } catch (Exception e) {
      log.error("error.metadataRequest.prepareDirectory", e.getMessage(), e);
      throw new HarvesterException(e.getMessage(), e);
    }
  }

  // build the DiGIR metadata request from the velocity template
  String query;
  String request;
  String templateLocation = BASE_LOCATION.concat("/").concat(protocol).concat("/").concat(TEMPLATE_DIRECTORY_NAME)
    .concat("/").concat(REQUEST_TEMPLATE_FILENAME).concat(Constants.VELOCITY_FILENAME_EXTENSION);
  try {
    query = templateUtils.getAndMerge(templateLocation, templateParams);
    request = requestUtils.buildURL(destination, "request", query);
  } catch (Exception e) {
    log.error("error.metadataRequest.buildUrl", e.getMessage());
    throw new HarvesterException(e.getMessage(), e);
  }

  // save the request
  try {
    // NOTE(review): query.getBytes() uses the platform default charset - confirm
    // UTF-8 is intended here as it is elsewhere in this class
    fileUtils.writeSequentiallyNamedGzippedFile(outputDirectory, Constants.METADATA_REQUEST_FILENAME,
      query.getBytes());
  } catch (IOException e) {
    // request archiving is best-effort; the harvest itself continues
    log.warn("error.metadataRequest.writeRequest", e.getMessage());
  }

  // fire the request
  ByteArrayInputStream is;
  byte[] array;
  try {
    // get response as byte array
    array = requestUtils.executePersistentGetRequestAndReturnByteArray(request, outputDirectory, destination);
    // save the response and return the newly created file
    fileUtils.writeSequentiallyNamedGzippedFile(outputDirectory, Constants.METADATA_RESPONSE_FILENAME, array);
    // set input stream
    is = new ByteArrayInputStream(array);
  }
  // was the operation stopped?
  catch (OperationStoppedException e) {
    throw new HarvesterException(e.getMessage(), e);
  } catch (IOException e) {
    log.error("error.metadataRequest.writeResponse", e.getMessage());
    throw new HarvesterException(e.getMessage(), e);
  }
  log.info("end.metadataRequest");
  return is;
}

/**
 * Parse the response file and write the parsed values to their
 * appropriate file.
*
 * @param inputStream representing harvested xml response
 *
 * @throws DocumentException thrown if parsing error occurred
 * @throws IOException thrown
 */
private void parseResponseFile(ByteArrayInputStream inputStream) throws DocumentException, IOException {
  // create a DOM4J tree, reading a Document from the given File
  // NOTE(review): SAXReader is used with default settings; if responses can come
  // from untrusted endpoints, external entity resolution (XXE) should be disabled
  SAXReader reader = new SAXReader();
  reader.setEncoding("UTF-8");
  Document document = reader.read(inputStream);
  document.setXMLEncoding("UTF-8");

  // get all resource Elements
  List<Node> resourceEntities =
    (metadataRepeatingElementsXpath.get(resourceEntityRepeatingElementName)).selectNodes(document);
  // iterate over resource Elements
  for (Node resourceEntity : resourceEntities) {
    // Detach resource Element and create new Document with it
    DefaultDocument doc1 = new DefaultDocument();
    doc1.setRootElement((Element) resourceEntity.detach());

    // get all resource contact Elements
    List<Node> resourceContacts =
      (metadataRepeatingElementsXpath.get(contactEntityRepeatingElementName)).selectNodes(doc1);
    // iterate over contact Elements
    for (Node resourceContact : resourceContacts) {
      // Detach relatedEntity Element and create new Document with it
      DefaultDocument doc2 = new DefaultDocument();
      doc2.setRootElement((Element) resourceContact.detach());
      // write hasContact elements-of-interest to file, tagged with the current
      // line number so contacts can later be matched back to their resource
      fileUtils.writeValuesToFile(resourceContactsBW, metadataResourceContactElementsOfInterest.values(), doc2,
        namespaceMap, String.valueOf(getLineNumber()));
    }
    // write relatedEntity elements-of-interest to file
    fileUtils.writeValuesToFile(resourcesBW, metadataElementsOfInterest.values(), doc1, namespaceMap,
      String.valueOf(getLineNumber()));
    setLineNumber(getLineNumber() + 1);
  }
}

/**
 * Iterates over the metadata mapping file, populating the various
 * elements-of-interest maps. Regular expressions divide the mapping file's
 * properties into the appropriate element-of-interest map.
* Note: The mapping file's properties are in the following format:
 * [element-of-interest name] + [property name] = [XPath expression]
 * The regular expression matches according to the [element-of-interest
 * name]
 * The corresponding element-of-interest map is then populated with: key =
 * [property name] & value = [XPath expression]
 *
 * @param mappingFile name
 * @param protocol name
 *
 * @throws HarvesterException thrown if method fails
 */
private void populateElementOfInterestsMapsFromMappingFile(String mappingFile, String protocol)
  throws HarvesterException {
  // Create regex patterns
  // we're interested in all non-contact related properties
  Pattern metadataKeyPattern = Pattern.compile("metadata([\\S]*)");
  Pattern providerContactKeyPattern = Pattern.compile("providerContact([\\S]*)");
  Pattern resourceContactKeyPattern = Pattern.compile("resourceContact([\\S]*)");

  // properties we harvest are read from file
  Properties mapping = new Properties();
  String mappingFilePath = fileUtils.constructMappingFilePath(BASE_LOCATION, protocol, MAPPING_DIRECTORY_NAME,
    mappingFile);
  InputStream is = null;
  try {
    // a missing mapping file surfaces as the NullPointerException caught below
    is = DigirMetadataHandler.class.getResourceAsStream(mappingFilePath);
    mapping.load(is);

    // Divide the mapping properties into various element-of-interest maps
    for (Object key : mapping.keySet()) {
      // NOTE(review): boxed Boolean where primitive boolean would do
      Boolean matched = false;
      // Matchers matching keys belonging to repeating element groups
      Matcher metadataKeyMatcher = metadataKeyPattern.matcher((String) key);
      if (metadataKeyMatcher.matches()) {
        String property = metadataKeyMatcher.group(1);
        metadataElementsOfInterest.put(property, mapping.getProperty((String) key));
        matched = true;
      }
      if (!matched) {
        Matcher providerContactKeyMatcher = providerContactKeyPattern.matcher((String) key);
        if (providerContactKeyMatcher.matches()) {
          String property = providerContactKeyMatcher.group(1);
          metadataProviderContactElementsOfInterest.put(property, mapping.getProperty((String) key));
          matched = true;
        }
        if (!matched) {
          Matcher resourceContactKeyMatcher = resourceContactKeyPattern.matcher((String) key);
          if (resourceContactKeyMatcher.matches()) {
            String property = resourceContactKeyMatcher.group(1);
            metadataResourceContactElementsOfInterest.put(property, mapping.getProperty((String) key));
            matched = true;
          }
          if (!matched) {
            // Determines the XPath expressions used to isolate repeating elements in a
            // metadata xml response.
            if (metadataRepeatingElementsXpath.keySet().contains(key)) {
              // construct an XPath expression for repeating Element
              DefaultXPath xpath = new DefaultXPath(mapping.getProperty((String) key));
              xpath.setNamespaceURIs(namespaceMap);
              metadataRepeatingElementsXpath.put((String) key, xpath);
            }
          }
        }
      }
    }
  } catch (NullPointerException e) {
    log.info("error.mappingFileExists", new String[] { mappingFilePath, e.getMessage() }, e);
    throw new HarvesterException(e.getMessage(), e);
  } catch (Exception e) {
    log.error("error.populateElementOfInterestsMapsFromMappingFile", new String[] { mappingFile, e.getMessage() }, e);
    throw new HarvesterException(e.getMessage(), e);
  } finally {
    if (is != null) {
      try {
        is.close();
      } catch (IOException e) {
        log.error("An error occurred closing input stream on " + mappingFilePath + ": " + e.getMessage(), e);
      }
    }
  }
}

/**
 * Collect resource metadata, including resource contact metadata.
 * Resources metadata is output to file.
* Resource contact metadata is also outputted to file * * @param metadataResponse as ByteArrayInputStream * @param outputDirectory directory to write to * * @throws HarvesterException thrown if method fails */ private void processAllMetadata(ByteArrayInputStream metadataResponse, String outputDirectory) throws HarvesterException { log.info("start.processAllMetadata"); // create the output directory File directory = new File(outputDirectory); // Prepare directory log.debug("digirmetadatahandler.start.processAllMetadata.prepareDirectory"); if (directory.isDirectory()) { try { // remove all pre-existing contact tab files fileUtils.prepareDirectory(outputDirectory, DigirMetadataHandler.RESOURCES_WITH_COUNT_FILENAME); fileUtils.prepareDirectory(outputDirectory, Constants.CONTACT_FILENAME); log.debug("digirmetadatahandler.end.processAllMetadata.prepareDirectory"); } catch (Exception e) { log.error("digirmetadatahandler.error.processAllMetadata.prepareDirectory", e.getMessage(), e); throw new HarvesterException(e.getMessage(), e); } } // create the output files File resourcesFile = new File(directory, DigirMetadataHandler.RESOURCES_WITH_COUNT_FILENAME.concat(Constants.TEXT_FILENAME_EXTENSION)); File resourceContactsFile = new File(directory, Constants.CONTACT_FILENAME.concat(Constants.TEXT_FILENAME_EXTENSION)); // ensure that they exist anew try { resourcesFile.createNewFile(); resourceContactsFile.createNewFile(); } catch (IOException e) { log.error("digirmetadatahandler.error.processAllMetadata.createFiles", e.getMessage(), e); throw new HarvesterException(e.getMessage(), e); } // create file writers for each file try { resourcesBW = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(resourcesFile, true), "UTF8")); resourceContactsBW = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(resourceContactsFile, true), "UTF8")); } catch (IOException e) { log.error("error.createBWs", e.getMessage(), e); throw new HarvesterException(e.getMessage(), 
e); } // write header column line for each file try { // The header line is derived from the names of the properties fileUtils.writeHeaderLine(resourcesBW, metadataElementsOfInterest.keySet(), true); // an identification number column name is also written fileUtils.writeHeaderLine(resourceContactsBW, metadataResourceContactElementsOfInterest.keySet(), true); } catch (IOException e) { log.error("error.writeHeaders", e.getMessage(), e); throw new HarvesterException(e.getMessage(), e); } // parse metadata setLineNumber(1); try { parseResponseFile(metadataResponse); } catch (Exception e) { log.error("error.metadataRequest.parsing", e.getMessage(), e); throw new HarvesterException(e.getMessage(), e); } // close the buffer writers and inputStream, and log having written the files so that // they appear in the console try { resourcesBW.close(); resourceContactsBW.close(); log.info("Writing to file: " + resourcesFile.getAbsolutePath()); log.info("Writing to file: " + resourceContactsFile.getAbsolutePath()); // close inputStream metadataResponse.close(); } catch (IOException e) { log.error("error.closeBWs", e.getMessage(), e); throw new HarvesterException(e.getMessage(), e); } log.info("end.processAllMetadata"); } /** * Populates a list from the tab delimited Stings taken * from a String. 
* * @param line tab delimited String * * @return list of tab delimited Strings taken from input String */ private List<String> retrieveStringListFromLine(String line) { // put the header column properties into an array String[] array = tabPattern.split(line); // remove line breaking characters array = fileUtils.removeLineBreakingCharacters(array); // add properties to a list, excluding line number (index 0) List<String> list = new LinkedList<String>(); for (int i = 1; i < array.length; i++) { list.add(array[i]); } return list; } public void setBioDatasourceManager(BioDatasourceManager bioDatasourceManager) { this.bioDatasourceManager = bioDatasourceManager; } public void setLineNumber(int lineNumber) { this.lineNumber = lineNumber; } /** * @param synchroniserFactories the synchroniserFactories to set */ public void setSynchroniserFactories(List<AbstractSynchroniserFactory> synchroniserFactories) { this.synchroniserFactories = synchroniserFactories; } }