org.gbif.registry.metasync.protocols.digir.DigirMetadataSynchroniser.java Source code

Introduction

Here is the source code for org.gbif.registry.metasync.protocols.digir.DigirMetadataSynchroniser.java
Source

/*
 * Copyright 2013 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.registry.metasync.protocols.digir;

import org.gbif.api.model.registry.Citation;
import org.gbif.api.model.registry.Contact;
import org.gbif.api.model.registry.Dataset;
import org.gbif.api.model.registry.Endpoint;
import org.gbif.api.model.registry.Identifier;
import org.gbif.api.model.registry.Installation;
import org.gbif.api.model.registry.MachineTag;
import org.gbif.api.vocabulary.ContactType;
import org.gbif.api.vocabulary.EndpointType;
import org.gbif.api.vocabulary.IdentifierType;
import org.gbif.api.vocabulary.InstallationType;
import org.gbif.registry.metasync.api.ErrorCode;
import org.gbif.registry.metasync.api.MetadataException;
import org.gbif.registry.metasync.api.SyncResult;
import org.gbif.registry.metasync.protocols.BaseProtocolHandler;
import org.gbif.registry.metasync.protocols.digir.model.DigirContact;
import org.gbif.registry.metasync.protocols.digir.model.DigirMetadata;
import org.gbif.registry.metasync.protocols.digir.model.DigirResource;
import org.gbif.registry.metasync.util.Constants;

import java.net.URI;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.http.client.HttpClient;

import static org.gbif.registry.metasync.util.Constants.METADATA_NAMESPACE;

import static com.google.common.base.Preconditions.checkArgument;
import static org.apache.commons.lang3.StringUtils.trimToNull;

/**
 * DiGIR synchronisation happens by issuing a metadata request to the single endpoint that the installation should
 * have. The response contains a list of resources which we parse to GBIF {@link Dataset} objects, each having a single
 * Endpoint.
 */
public class DigirMetadataSynchroniser extends BaseProtocolHandler {

    // Source: http://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page#comment24134610_10324802
    private static final Pattern DOI_PATTERN = Pattern
            .compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\\'])\\S)+)\\b");
    // keyword used to identify if an endpoint is of type DIGIR_MANIS (compared against a lower-cased schemaLocation)
    private static final String MANIS_KEYWORD = "manis";
    // only schemaLocation of type DIGIR_MANIS not containing the word "manis"
    private static final String MANIS_SCHEMA_LOCATION = "http://bnhm.berkeley.edu/DwC/bnhm_dc2_schema.xsd";

    /**
     * Creates a synchroniser that hands the given HTTP client to the base protocol handler.
     *
     * @param httpClient client passed on to {@link BaseProtocolHandler}
     */
    public DigirMetadataSynchroniser(HttpClient httpClient) {
        super(httpClient);
    }

    /**
     * @return {@code true} only if the installation's type is {@link InstallationType#DIGIR_INSTALLATION}
     */
    @Override
    public boolean canHandle(Installation installation) {
        return installation.getType() == InstallationType.DIGIR_INSTALLATION;
    }

    /**
     * Requests the metadata from the installation's single endpoint, updates the installation and that endpoint
     * in place and maps the returned resources against the given datasets.
     *
     * @param installation the DiGIR installation to synchronise; it is modified in place
     * @param datasets     the datasets currently known for this installation
     *
     * @return the result listing updated, added and deleted datasets
     *
     * @throws IllegalArgumentException if the installation is not a DiGIR installation
     * @throws MetadataException        if the installation does not have exactly one endpoint or the metadata
     *                                  request fails
     */
    @Override
    public SyncResult syncInstallation(Installation installation, List<Dataset> datasets) throws MetadataException {
        checkArgument(installation.getType() == InstallationType.DIGIR_INSTALLATION,
                "Only supports DiGIR Installations");

        int endpointCount = installation.getEndpoints().size();
        if (endpointCount != 1) {
            throw new MetadataException("A DiGIR Installation should only ever have one Endpoint, this one has ["
                    + endpointCount + "]", ErrorCode.OTHER_ERROR);
        }
        Endpoint endpoint = installation.getEndpoints().get(0);

        DigirMetadata metadata = getDigirMetadata(endpoint);
        updateInstallation(metadata, installation);
        updateInstallationEndpoint(metadata, endpoint);
        return mapToDatasets(metadata, datasets, endpoint.getUrl(), installation);
    }

    /**
     * Issues the HTTP request against the endpoint's URL and parses the response into a {@link DigirMetadata}.
     * The request and the digester come from helpers that are not defined in this class.
     */
    private DigirMetadata getDigirMetadata(Endpoint endpoint) throws MetadataException {
        return doHttpRequest(endpoint.getUrl(), newDigester(DigirMetadata.class));
    }

    /**
     * Updates the Installation in place with all the data gathered from the Endpoint: contacts, description and
     * the machine tags for the host code and the implementation version.
     */
    private void updateInstallation(DigirMetadata metadata, Installation installation) {
        List<Contact> hostContacts = convertToRegistryContacts(metadata.getHost().getContacts());
        installation.setContacts(matchContacts(installation.getContacts(), hostContacts));

        installation.setDescription(metadata.getHost().getDescription());

        installation.addMachineTag(
                MachineTag.newInstance(METADATA_NAMESPACE, Constants.DIGIR_CODE, metadata.getHost().getCode()));
        installation.addMachineTag(MachineTag.newInstance(METADATA_NAMESPACE, Constants.INSTALLATION_VERSION,
                metadata.getImplementation()));
    }

    /**
     * Updates the single Endpoint that a DiGIR Installation has. Only the description is taken over from the host.
     */
    private void updateInstallationEndpoint(DigirMetadata metadata, Endpoint endpoint) {
        endpoint.setDescription(metadata.getHost().getDescription());
    }

    /**
     * Maps the resources we got from the metadata response to Datasets that are currently hosted by this Installation.
     * We identify Datasets by the {@code code} attribute that we're getting from the metadata response. We're saving
     * this code on the Dataset itself as a machine tag.
     *
     * @param metadata     the parsed metadata response
     * @param datasets     the datasets currently registered for the installation
     * @param url          the endpoint URL that every converted dataset's endpoint will point to
     * @param installation the installation being synchronised, passed through to the result
     *
     * @return the datasets sorted into updated (existing -> new), added and deleted
     */
    private SyncResult mapToDatasets(DigirMetadata metadata, Iterable<Dataset> datasets, URI url,
            Installation installation) {
        List<Dataset> added = Lists.newArrayList();
        List<Dataset> deleted = Lists.newArrayList();
        Map<Dataset, Dataset> updated = Maps.newHashMap();

        // Maps currently existing DiGIR codes to the Datasets from our Registry that use those codes
        Map<String, Dataset> codeMap = Maps.newHashMap();
        for (Dataset dataset : datasets) {

            // Find the "code" machine tag; constants go first so a tag with a null namespace or name is skipped
            // instead of raising a NullPointerException
            for (MachineTag tag : dataset.getMachineTags()) {
                if (METADATA_NAMESPACE.equals(tag.getNamespace()) && Constants.DIGIR_CODE.equals(tag.getName())) {
                    codeMap.put(tag.getValue(), dataset);
                }
            }
        }

        // Sort in either updated or added Datasets using the just built Map
        for (DigirResource resource : metadata.getResources()) {
            Dataset newDataset = convertToDataset(resource, url);
            if (codeMap.containsKey(resource.getCode())) {
                updated.put(codeMap.get(resource.getCode()), newDataset);
            } else {
                added.add(newDataset);
            }
        }

        // All Datasets that weren't updated must have been deleted
        for (Dataset dataset : datasets) {
            if (!updated.containsKey(dataset)) {
                deleted.add(dataset);
            }
        }

        return new SyncResult(updated, added, deleted, installation);
    }

    /**
     * Converts a DiGIR resource to a GBIF Dataset.
     *
     * @param resource the resource taken from the metadata response
     * @param url      the URL to set on the dataset's single endpoint
     *
     * @return a new Dataset carrying the resource's descriptive fields, machine tags, an optional DOI identifier
     *         and one endpoint
     */
    private Dataset convertToDataset(DigirResource resource, URI url) {
        Dataset dataset = new Dataset();
        dataset.setTitle(resource.getName());
        dataset.setDescription(resource.getDescription());

        // We're only using the very first related URI even though there might be more
        if (!resource.getRelatedInformation().isEmpty()) {
            dataset.setHomepage(resource.getRelatedInformation().iterator().next());
        }
        dataset.setCitation(new Citation(resource.getCitation(), null));
        dataset.setRights(resource.getUseRestrictions());
        dataset.setContacts(convertToRegistryContacts(resource.getContacts()));

        dataset.addMachineTag(MachineTag.newInstance(METADATA_NAMESPACE, Constants.DIGIR_CODE, resource.getCode()));

        // A value of 0 is treated as "not declared" and no tag is written
        if (resource.getNumberOfRecords() != 0) {
            dataset.addMachineTag(MachineTag.newInstance(METADATA_NAMESPACE, Constants.DECLARED_COUNT,
                    String.valueOf(resource.getNumberOfRecords())));
        }

        if (resource.getMaxSearchResponseRecords() != 0) {
            dataset.addMachineTag(
                    MachineTag.newInstance(METADATA_NAMESPACE, Constants.DIGIR_MAX_SEARCH_RESPONSE_RECORDS,
                            String.valueOf(resource.getMaxSearchResponseRecords())));
        }

        if (resource.getDateLastUpdated() != null) {
            dataset.addMachineTag(MachineTag.newInstance(METADATA_NAMESPACE, Constants.DATE_LAST_UPDATED,
                    resource.getDateLastUpdated().toString()));
        }

        // One machine tag per schemaLocation; the namespace key of the entry is not stored
        for (URI schemaLocation : resource.getConceptualSchemas().values()) {
            dataset.addMachineTag(MachineTag.newInstance(METADATA_NAMESPACE, Constants.CONCEPTUAL_SCHEMA,
                    schemaLocation.toASCIIString()));
        }

        // See if the code contains a DOI and set it as an Identifier.
        // Pattern.matcher(null) throws a NullPointerException, so a resource without a code simply gets no DOI.
        String code = resource.getCode();
        if (code != null) {
            Matcher matcher = DOI_PATTERN.matcher(code);
            if (matcher.find()) {
                Identifier identifier = new Identifier();
                identifier.setType(IdentifierType.DOI);
                identifier.setIdentifier(matcher.group());
                dataset.getIdentifiers().add(identifier);
            }
        }

        // Each DiGIR Dataset has exactly one Endpoint, we create and populate it here
        Endpoint endpoint = new Endpoint();
        endpoint.setDescription(resource.getName());
        endpoint.setUrl(url);
        // normal DiGIR vs MaNIS DiGIR?
        endpoint.setType(determineEndpointType(resource.getConceptualSchemas()));
        dataset.addEndpoint(endpoint);

        return dataset;
    }

    /**
     * Iterates through the resource's map of namespace (conceptualSchemas) / schemaLocation key value pairs.
     * If a DIGIR_MANIS endpoint is found in the list, the EndpointType is equal to DiGIR_MANIS. A DiGIR_MANIS
     * endpoint is identified, by checking if the schemaLocation 1) contains the word "manis" (ignoring case) or
     * 2) is equal (ignoring case) to "http://bnhm.berkeley.edu/DwC/bnhm_dc2_schema.xsd".
     * <p>
     * The keyword check lower-cases with {@link Locale#ROOT} so that the result does not depend on the JVM's default
     * locale (for example, a Turkish default locale would map "I" to a dotless "ı" and miss "MANIS").
     *
     * @param conceptualSchemas map with namespace (conceptualSchemas), schemaLocation key value pairs
     *
     * @return endpoint type, defaulting to (normal) DiGIR
     */
    EndpointType determineEndpointType(Map<String, URI> conceptualSchemas) {
        for (URI schemaLocation : conceptualSchemas.values()) {
            String location = schemaLocation.toString();
            if (location.equalsIgnoreCase(MANIS_SCHEMA_LOCATION)
                    || location.toLowerCase(Locale.ROOT).contains(MANIS_KEYWORD)) {
                return EndpointType.DIGIR_MANIS;
            }
        }
        return EndpointType.DIGIR;
    }

    /**
     * Converts a list of DiGIR contacts to GBIF {@link Contact} objects.
     *
     * @param contacts the DiGIR contacts to convert
     *
     * @return a new list with one converted contact per input contact, in iteration order
     */
    private List<Contact> convertToRegistryContacts(Iterable<DigirContact> contacts) {
        List<Contact> resultList = Lists.newArrayList();
        for (DigirContact contact : contacts) {
            resultList.add(convertToRegistryContact(contact));
        }
        return resultList;
    }

    /**
     * Converts a single DiGIR contact to a GBIF {@link Contact}. Name, title, email and phone are trimmed, with
     * blank values becoming {@code null}; the full DiGIR name is stored as the contact's first name.
     */
    private Contact convertToRegistryContact(DigirContact digirContact) {
        Contact contact = new Contact();
        contact.setFirstName(trimToNull(digirContact.getName()));
        contact.setPosition(trimToNull(digirContact.getTitle()));
        contact.setEmail(trimToNull(digirContact.getEmail()));
        contact.setPhone(trimToNull(digirContact.getPhone()));
        contact.setType(ContactType.inferType(digirContact.getType()));
        return contact;
    }

}