org.ambraproject.rhino.service.impl.IngestionService.java Source code

Introduction

Here is the source code for org.ambraproject.rhino.service.impl.IngestionService.java
Source

/*
 * Copyright (c) 2017 Public Library of Science
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package org.ambraproject.rhino.service.impl;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Optional;
import java.util.Set;

import org.ambraproject.rhino.config.RuntimeConfiguration;
import org.ambraproject.rhino.content.xml.ArticleXml;
import org.ambraproject.rhino.content.xml.CustomMetadataExtractor;
import org.ambraproject.rhino.content.xml.ManifestXml;
import org.ambraproject.rhino.content.xml.XmlContentException;
import org.ambraproject.rhino.identity.ArticleIdentifier;
import org.ambraproject.rhino.identity.Doi;
import org.ambraproject.rhino.model.Article;
import org.ambraproject.rhino.model.ArticleIngestion;
import org.ambraproject.rhino.model.ArticleItem;
import org.ambraproject.rhino.model.article.ArticleCustomMetadata;
import org.ambraproject.rhino.model.article.ArticleMetadata;
import org.ambraproject.rhino.model.ingest.ArticlePackage;
import org.ambraproject.rhino.model.ingest.ArticlePackageBuilder;
import org.ambraproject.rhino.model.ingest.IngestPackage;
import org.ambraproject.rhino.rest.RestClientException;
import org.ambraproject.rhino.service.ArticleCrudService;
import org.ambraproject.rhino.service.HibernatePersistenceService;
import org.ambraproject.rhino.util.Archive;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus;
import org.w3c.dom.Document;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;

public class IngestionService extends AmbraService {

    public static final String MANIFEST_XML = "manifest.xml";

    @Autowired
    private CustomMetadataExtractor.Factory customMetadataExtractorFactory;
    @Autowired
    private HibernatePersistenceService hibernatePersistenceService;
    @Autowired
    private ArticleCrudService articleCrudService;

    public ArticleIngestion ingest(Archive archive, Optional<String> bucketName)
            throws IOException, XmlContentException {
        IngestPackage ingestPackage = createIngestPackage(archive, bucketName);
        return processIngestPackage(ingestPackage);
    }

    private IngestPackage createIngestPackage(Archive archive, Optional<String> bucketName) throws IOException {
        ManifestXml manifestXml = getManifestXml(archive);

        ImmutableSet<String> entryNames = archive.getEntryNames();
        manifestXml.validateManifestCompleteness(entryNames);

        String manuscriptEntry = getManuscriptEntry(entryNames, manifestXml);

        Document document = getDocument(archive, manuscriptEntry);

        ArticleXml parsedArticle = new ArticleXml(document);
        ArticleCustomMetadata customMetadata = customMetadataExtractorFactory.parse(document).build();

        ArticlePackage articlePackage = new ArticlePackageBuilder(resolveBucketName(bucketName), archive,
                parsedArticle, manifestXml).build();

        articlePackage.validateAssetCompleteness(parsedArticle.findAllAssetNodes().getDois());

        ArticleMetadata articleMetadata = parsedArticle.build();
        return new IngestPackage(articlePackage, articleMetadata, customMetadata);
    }

    /**
     * Validate the bucket name against the set of allowed buckets and supply the default if needed.
     *
     * @param bucketName the bucket name specified as the destination for this ingestion, or empty if the client did not
     *                   specify a bucket name
     * @return the specified bucket name, or the default if the client did not specify a bucket name
     * @throws RestClientException if the clietn specified a disallowed (or nonexistent) bucket name
     */
    private String resolveBucketName(Optional<String> bucketName) {
        RuntimeConfiguration.MultiBucketContentRepoEndpoint corpusStorage = runtimeConfiguration.getCorpusStorage();
        if (!bucketName.isPresent()) {
            return corpusStorage.getDefaultBucket();
        }

        String configuredName = bucketName.get();
        Set<String> allowedBuckets = corpusStorage.getAllBuckets();
        if (!allowedBuckets.contains(configuredName)) {
            String message = String.format("" + "Invalid bucket name: %s. Allowed values are: %s. "
                    + "(Allowed values are specified by rhino.yaml.)", configuredName, allowedBuckets);
            throw new RestClientException(message, HttpStatus.BAD_REQUEST);
        }
        return configuredName;
    }

    @VisibleForTesting
    ArticleIngestion processIngestPackage(IngestPackage ingestPackage) {
        Doi doi = ArticleIdentifier.create(ingestPackage.getArticleMetadata().getDoi()).getDoi();

        ArticlePackage articlePackage = ingestPackage.getArticlePackage();

        for (ManifestXml.Asset asset : articlePackage.getManifest().getAssets()) {
            Doi assetDoi = Doi.create(asset.getUri());
            validateAssetUniqueness(doi, articleCrudService.getAllArticleItems(assetDoi));
        }
        validateManuscript(doi, articlePackage.getManifest().getArticleAsset().getUri());

        return persistArticle(ingestPackage, doi, articlePackage);
    }

    /**
     * Loads the manuscript referenced in the archive.
     *
     * @param archive The archive
     * @param manuscriptEntry The manuscript entry
     *
     * @return The manuscript
     *
     * @throws IOException if unable to load manuscript
     */
    @VisibleForTesting
    Document getDocument(Archive archive, String manuscriptEntry) throws IOException {
        Document document;
        try (InputStream manuscriptStream = new BufferedInputStream(archive.openFile(manuscriptEntry))) {
            document = AmbraService.parseXml(manuscriptStream);
        }
        return document;
    }

    private String getManuscriptEntry(ImmutableSet<String> entryNames, ManifestXml manifestXml) {
        ManifestXml.Representation manuscriptRepr = manifestXml.getArticleAsset().getRepresentation("manuscript")
                .orElseThrow(() -> new RestClientException("Manuscript entry not found in manifest",
                        HttpStatus.BAD_REQUEST));

        String manuscriptEntry = manuscriptRepr.getFile().getEntry();
        if (!entryNames.contains(manuscriptEntry)) {
            throw new RestClientException("Manuscript file not found in archive: " + manuscriptEntry,
                    HttpStatus.BAD_REQUEST);
        }
        return manuscriptEntry;
    }

    @VisibleForTesting
    ManifestXml getManifestXml(Archive archive) throws IOException {
        final ImmutableSet<String> entryNames = archive.getEntryNames();
        if (!entryNames.contains(MANIFEST_XML)) {
            throw new RestClientException("Archive has no manifest file", HttpStatus.BAD_REQUEST);
        }

        ManifestXml manifestXml;
        try (InputStream manifestStream = new BufferedInputStream(archive.openFile(MANIFEST_XML))) {
            manifestXml = new ManifestXml(AmbraService.parseXml(manifestStream));
        }
        return manifestXml;
    }

    @VisibleForTesting
    void validateManuscript(Doi doi, String manuscriptAssetUri) {
        if (!doi.equals(Doi.create(manuscriptAssetUri))) {
            String message = String.format(
                    "Article DOI is inconsistent. From manifest: \"%s\" From manuscript: \"%s\"",
                    manuscriptAssetUri, doi.getName());
            throw new RestClientException(message, HttpStatus.BAD_REQUEST);
        }
    }

    private ArticleIngestion persistArticle(IngestPackage ingestPackage, Doi doi, ArticlePackage articlePackage) {
        Article article = hibernatePersistenceService.persistArticle(doi);
        ArticleIngestion ingestion = hibernatePersistenceService.persistIngestion(article, ingestPackage);

        hibernatePersistenceService.persistAssets(articlePackage, ingestion);

        hibernateTemplate.flush();
        hibernateTemplate.refresh(ingestion); // Pick up auto-persisted timestamp
        return ingestion;
    }

    @VisibleForTesting
    void validateAssetUniqueness(Doi articleDoi, Collection<ArticleItem> articleItems) {
        for (ArticleItem existingItem : articleItems) {
            Article existingParentArticle = existingItem.getIngestion().getArticle();
            if (!Doi.create(existingParentArticle.getDoi()).equals(articleDoi)) {
                String errorMessage = String.format(
                        "Incoming article ingestion (doi:%s) has a duplicate "
                                + "article asset. Duplicate asset belongs to article doi: %s.",
                        articleDoi.getName(), existingParentArticle.getDoi());
                throw new RestClientException(errorMessage, HttpStatus.BAD_REQUEST);
            }
        }
    }
}