org.ambraproject.article.service.IngesterImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.ambraproject.article.service.IngesterImpl.java

Source

/*
 * $HeadURL$
 * $Id$
 *
 * Copyright (c) 2006-2010 by Public Library of Science
 *     http://plos.org
 *     http://ambraproject.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.ambraproject.article.service;

import org.ambraproject.admin.service.DocumentManagementService;
import org.ambraproject.admin.service.SyndicationService;
import org.ambraproject.article.ArchiveProcessException;
import org.ambraproject.filestore.FSIDMapper;
import org.ambraproject.filestore.FileStoreException;
import org.ambraproject.filestore.FileStoreService;
import org.ambraproject.models.Article;
import org.ambraproject.models.ArticleRelationship;
import org.ambraproject.models.Category;
import org.ambraproject.models.Issue;
import org.ambraproject.models.Journal;
import org.ambraproject.models.Volume;
import org.ambraproject.service.article.DuplicateArticleIdException;
import org.ambraproject.service.article.NoSuchArticleIdException;
import org.ambraproject.service.hibernate.HibernateServiceImpl;
import org.hibernate.Criteria;
import org.hibernate.FetchMode;
import org.hibernate.HibernateException;
import org.hibernate.Session;
import org.hibernate.StaleObjectStateException;
import org.hibernate.criterion.DetachedCriteria;
import org.hibernate.criterion.Restrictions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeanWrapper;
import org.springframework.beans.BeanWrapperImpl;
import org.springframework.beans.factory.annotation.Required;
import org.springframework.dao.DataAccessException;
import org.springframework.orm.hibernate3.HibernateCallback;
import org.springframework.transaction.annotation.Isolation;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.w3c.dom.Document;

import java.beans.PropertyDescriptor;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.sql.SQLException;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

/**
 * @author Alex Kudlick
 */
/**
 * Hibernate-backed {@link Ingester} implementation.  Ingesting an article consists of:
 * parsing the article XML out of a zip archive, saving (or force-updating) the article to
 * the database, wiring up reciprocal related-article links, creating syndications, and
 * copying the archive's files into the file store.
 *
 * @author Alex Kudlick
 */
public class IngesterImpl extends HibernateServiceImpl implements Ingester {
    private static final Logger log = LoggerFactory.getLogger(IngesterImpl.class);

    private FileStoreService fileStoreService;
    private SyndicationService syndicationService;
    private IngestArchiveProcessor ingestArchiveProcessor;
    private DocumentManagementService documentManagementService;

    /**
     * Set the IngestArchiveProcessor to use to create an Article object from the XML
     *
     * @param ingestArchiveProcessor - the xml processor to use
     */
    @Required
    public void setIngestArchiveProcessor(IngestArchiveProcessor ingestArchiveProcessor) {
        this.ingestArchiveProcessor = ingestArchiveProcessor;
    }

    /**
     * Set the documentManagementService, used to remove files from the filesystem on reingest
     *
     * @param documentManagementService the document management service to use
     */
    @Required
    public void setDocumentManagementService(DocumentManagementService documentManagementService) {
        this.documentManagementService = documentManagementService;
    }

    /**
     * Set the {@link FileStoreService} to use to store files
     *
     * @param fileStoreService - the filestore to use
     */
    @Required
    public void setFileStoreService(FileStoreService fileStoreService) {
        this.fileStoreService = fileStoreService;
    }

    /**
     * Set the Syndication Service to use in creating syndications for ingested articles
     *
     * @param syndicationService - the {@link SyndicationService} to set
     */
    @Required
    public void setSyndicationService(SyndicationService syndicationService) {
        this.syndicationService = syndicationService;
    }

    /**
     * Ingest an article archive: parse the article, persist it (updating an existing row on
     * force/disabled reingest), create reciprocal related-article links and syndications, and
     * store the archive's files.
     * <p/>
     * TODO: Rollback from the filestore if there's a problem storing the files
     *
     * @param archive - the archive to ingest
     * @param force   if true then don't check whether this article already exists but just save this new article.
     * @return the new article
     * @throws DuplicateArticleIdException if an article exists with the same URI as the new article and <var>force</var>
     *                                     is false
     * @throws IngestException             if there's any other problem ingesting the article
     */
    @Transactional(rollbackFor = Throwable.class)
    @SuppressWarnings("unchecked")
    public Article ingest(ZipFile archive, boolean force) throws DuplicateArticleIdException, IngestException {
        Article article = null;
        try {
            final Document articleXml = ingestArchiveProcessor.extractArticleXml(archive);
            article = ingestArchiveProcessor.processArticle(archive, articleXml);
            //swap parsed journals for the persistent journal entities so Hibernate doesn't try to insert duplicates
            updateWithExistingJournal(article);

            final String articleDoi = article.getDoi();
            //Check if we already have an article with this DOI; journals are join-fetched since
            //they are needed after the session closes
            Article existingArticle = (Article) hibernateTemplate.execute(new HibernateCallback() {
                @Override
                public Object doInHibernate(Session session) throws HibernateException, SQLException {
                    return (Article) session.createCriteria(Article.class).setFetchMode("journals", FetchMode.JOIN)
                            .add(Restrictions.eq("doi", articleDoi))
                            .setResultTransformer(Criteria.DISTINCT_ROOT_ENTITY).uniqueResult();
                }
            });

            // if the article is in Disabled state, we allow ingest without force
            if (!force && existingArticle != null && existingArticle.getState() != Article.STATE_DISABLED) {
                throw new DuplicateArticleIdException(article.getDoi());
            }

            if (existingArticle != null) {
                updateArticle(article, existingArticle);
                article = existingArticle;
            } else {
                saveArticle(article);
            }

            // For every RelatedArticle object, create a reciprocal link from old Article to this Article.
            addReciprocalRelatedArticleAssociations(article);

            //if this is an image article, update the issues for which this is the image
            if (article.getDoi().contains("image")) {
                updateIssueForImageArticle(article);
            }
            //create syndications
            syndicationService.createSyndications(article.getDoi());

            //Store files to the file store
            storeFiles(archive, article.getDoi());

            return article;
        } catch (ArchiveProcessException e) {
            throw new IngestException(
                    "Error processing zip archive to extract article information; archive" + archive.getName(), e);
        } catch (IOException e) {
            throw new IngestException("Error reading entries from zip archive: " + archive.getName(), e);
        } catch (FileStoreException e) {
            throw new IngestException("Error storing blobs to the file store; article: " + safeDoi(article)
                    + ", archive: " + archive.getName(), e);
        } catch (DataAccessException e) {
            throw new IngestException(
                    "Error storing information for article " + safeDoi(article) + " to the SQL database", e);
        } catch (HibernateException e) {
            throw new IngestException(
                    "Error storing information for article " + safeDoi(article) + " to the SQL database", e);
        } catch (NoSuchArticleIdException e) {
            throw new IngestException("Article wasn't stored to the database", e);
        }
    }

    /**
     * Null-safe DOI accessor for building error messages; <var>article</var> may still be null
     * if an exception was raised before the archive was fully parsed.
     *
     * @param article the article being ingested, possibly null
     * @return the article's DOI, or "unknown" if the article is null
     */
    private static String safeDoi(Article article) {
        return article == null ? "unknown" : article.getDoi();
    }

    /**
     * Replace the parsed (transient) Journal objects on the article with the matching
     * persistent Journal entities, looked up by eIssn.  Journals for which no persistent
     * match exists are dropped.
     *
     * @param article the freshly-parsed article whose journal set should be normalized
     */
    @SuppressWarnings("unchecked")
    private void updateWithExistingJournal(Article article) {
        Set<Journal> journals = new HashSet<Journal>(article.getJournals().size());
        for (Journal journal : article.getJournals()) {
            journals.addAll(hibernateTemplate.findByCriteria(
                    DetachedCriteria.forClass(Journal.class).add(Restrictions.eq("eIssn", journal.geteIssn()))
                            .setResultTransformer(Criteria.DISTINCT_ROOT_ENTITY)));
        }
        article.getJournals().clear();
        article.getJournals().addAll(journals);
    }

    /**
     * Copy the image article's title and description onto every Issue that uses it as its image.
     *
     * @param imageArticle the image article that was just (re)ingested
     */
    @SuppressWarnings("unchecked")
    private void updateIssueForImageArticle(Article imageArticle) {
        List<Issue> issues = hibernateTemplate.findByCriteria(
                DetachedCriteria.forClass(Issue.class).add(Restrictions.eq("imageUri", imageArticle.getDoi())));
        for (Issue issue : issues) {
            issue.setDescription(imageArticle.getDescription());
            issue.setTitle(imageArticle.getTitle());
            hibernateTemplate.update(issue);
        }
    }

    /**
     * Persist a brand-new article.
     *
     * @param article the transient article to save
     */
    private void saveArticle(Article article) {
        log.debug("Saving article information for article: {}", article.getDoi());
        hibernateTemplate.save(article);
    }

    /**
     * Update an existing article by copying properties from the new one over.  Note that we can't call saveOrUpdate,
     * since the new article is not a persistent instance, but has all the properties that we want.
     * <p/>
     * See <a href="http://stackoverflow.com/questions/4779239/update-persistent-object-with-transient-object-using-hibernate">this
     * post on stack overflow</a> for more information
     * <p/>
     * For collections, we clear the old property and add all the new entries, relying on 'delete-orphan' to delete the
     * old objects. The downside of this approach is that it results in a delete statement for each entry in the old
     * collection, and an insert statement for each entry in the new collection.  There a couple of things we could do to
     * optimize this: <ol> <li>Write a sql statement to delete the old entries in one go</li> <li>copy over collection
     * properties recursively instead of clearing the old collection.  e.g. for {@link Article#assets}, instead of
     * clearing out the old list, we would find the matching asset by DOI and Extension, and update its properties</li>
     * </ol>
     * <p/>
     * Option number 2 is messy and a lot of code (I've done it before)
     *
     * @param article         the new article, parsed from the xml
     * @param existingArticle the article pulled up from the database
     * @throws IngestException if there's a problem copying properties or updating
     */
    @SuppressWarnings("unchecked")
    private void updateArticle(final Article article, final Article existingArticle) throws IngestException {
        log.debug("ReIngesting (force ingest) article: {}", existingArticle.getDoi());
        //Hibernate deletes orphans after inserting the new rows, which violates a unique constraint on (doi, extension) for assets
        //this temporary change gets around the problem, before the old assets are orphaned and deleted
        hibernateTemplate.execute(new HibernateCallback() {
            @Override
            public Object doInHibernate(Session session) throws HibernateException, SQLException {
                session.createSQLQuery("update articleAsset " + "set doi = concat('old-',doi), "
                        + "extension = concat('old-',extension) " + "where articleID = :articleID")
                        .setParameter("articleID", existingArticle.getID()).executeUpdate();
                return null;
            }
        });

        final BeanWrapper source = new BeanWrapperImpl(article);
        final BeanWrapper destination = new BeanWrapperImpl(existingArticle);

        try {
            //copy all properties except the db identity/audit columns and Object.getClass
            for (final PropertyDescriptor property : destination.getPropertyDescriptors()) {
                final String name = property.getName();
                if (!name.equals("ID") && !name.equals("created") && !name.equals("lastModified")
                        && !name.equals("class")) {
                    //Collections shouldn't be dereferenced but have elements added
                    //See http://www.onkarjoshi.com/blog/188/hibernateexception-a-collection-with-cascade-all-delete-orphan-was-no-longer-referenced-by-the-owning-entity-instance/
                    if (Collection.class.isAssignableFrom(property.getPropertyType())) {
                        Collection orig = (Collection) destination.getPropertyValue(name);
                        orig.clear();
                        Collection sourcePropertyValue = (Collection) source.getPropertyValue(name);
                        if (sourcePropertyValue != null) {
                            orig.addAll(sourcePropertyValue);
                        }
                    } else {
                        //just set the new value
                        destination.setPropertyValue(name, source.getPropertyValue(name));
                    }
                }
            }
            //Circular relationship in related articles: the copied relationships must point back
            //at the persistent parent, not the transient one they were parsed with
            for (ArticleRelationship articleRelationship : existingArticle.getRelatedArticles()) {
                articleRelationship.setParentArticle(existingArticle);
            }
        } catch (Exception e) {
            throw new IngestException("Error copying properties for article " + article.getDoi(), e);
        }

        hibernateTemplate.update(existingArticle);
    }

    /**
     * Process files from the archive and store them to the {@link FileStoreService}.  Any files
     * previously stored for this DOI are removed first (relevant on reingest).  The manifest.dtd
     * entry is skipped.
     *
     * @param archive - the archive being ingested
     * @param doi     the DOI of the article being ingested; used to build file store IDs
     * @throws java.io.IOException - if there's a problem reading from the zip file
     * @throws org.ambraproject.filestore.FileStoreException
     *                             - if there's a problem writing files to the file store
     */
    private void storeFiles(final ZipFile archive, String doi) throws IOException, FileStoreException {
        log.info("Removing existing files (if any) for {}", doi);

        try {
            documentManagementService.removeFromFileSystem(doi);
        } catch (Exception e) {
            throw new FileStoreException("Error removing existing files from the file store", e);
        }

        log.info("Storing files from archive {} to the file store", archive.getName());
        Enumeration<? extends ZipEntry> entries = archive.entries();

        while (entries.hasMoreElements()) {
            ZipEntry entry = entries.nextElement();
            if (!entry.getName().equalsIgnoreCase("manifest.dtd")) {
                InputStream inputStream = null;
                OutputStream outputStream = null;
                try {
                    inputStream = archive.getInputStream(entry);
                    outputStream = fileStoreService.getFileOutStream(FSIDMapper.zipToFSID(doi, entry.getName()),
                            entry.getSize());
                    fileStoreService.copy(inputStream, outputStream);
                } finally {
                    //close both streams independently so a failure on one doesn't leak the other
                    if (inputStream != null) {
                        try {
                            inputStream.close();
                        } catch (IOException e) {
                            log.warn("Error closing input stream while writing files", e);
                        }
                    }
                    if (outputStream != null) {
                        try {
                            outputStream.close();
                        } catch (IOException e) {
                            log.warn("Error closing output stream while writing files", e);
                        }
                    }
                }
            }
        }
        log.info("Finished storing files from archive {}", archive.getName());
    }

    /**
     * Add/update reciprocal related article links.  There are two situations in which this is relevant: <ol> <li> We are
     * ingesting Article B. Article A already exists, and relates to Article B. We need to update Article A's relationship
     * to set the 'otherArticleID' property, and make sure that Article B has a link back to Article A. </li> <li> We are
     * ingesting Article B, which has a relationship pointing to Article A.  Article A already exists. We need to make
     * sure Article A has a link back to Article B, and update the 'otherArticleID' property on both of the relationships.
     * </li> </ol>
     *
     * @param newArticle The Article which is being ingested
     */
    @SuppressWarnings("unchecked")
    private void addReciprocalRelatedArticleAssociations(Article newArticle) {
        //keep track of the other articles we already updated
        Set<String> otherArticleDois = new HashSet<String>(newArticle.getRelatedArticles().size());

        //For each of the articles that the new one links to, update the reciprocal relations
        for (ArticleRelationship relationship : newArticle.getRelatedArticles()) {
            otherArticleDois.add(relationship.getOtherArticleDoi());
            //Set the 'otherArticleID' property for any new relationships created by this article
            List matches = hibernateTemplate.findByCriteria(DetachedCriteria.forClass(Article.class)
                    .add(Restrictions.eq("doi", relationship.getOtherArticleDoi())), 0, 1);
            if (matches.isEmpty()) {
                //other article doesn't exist (yet); reciprocal links get created when it's ingested
                continue;
            }
            Article otherArticle = (Article) matches.get(0);
            relationship.setOtherArticleID(otherArticle.getID());
            hibernateTemplate.update(relationship);

            //Now ensure that there is a reciprocal link, i.e. that the 'other article' links back to the new one
            boolean createNewRelationship = true;
            //so we have to check if the other article already has a link to this one
            for (ArticleRelationship otherArticleRelationship : otherArticle.getRelatedArticles()) {
                if (otherArticleRelationship.getOtherArticleDoi().equals(newArticle.getDoi())) {
                    createNewRelationship = false;
                    otherArticleRelationship.setOtherArticleID(newArticle.getID());
                    hibernateTemplate.update(otherArticleRelationship);
                    break;
                }
            }
            //if the other article didn't already have a link to this one, we need to make a new one
            if (createNewRelationship) {
                ArticleRelationship reciprocalLink = new ArticleRelationship();
                reciprocalLink.setParentArticle(otherArticle);
                reciprocalLink.setOtherArticleID(newArticle.getID());
                reciprocalLink.setOtherArticleDoi(newArticle.getDoi());
                reciprocalLink.setType(relationship.getType());
                otherArticle.getRelatedArticles().add(reciprocalLink);
                hibernateTemplate.update(otherArticle);
            }
        }

        //Now we need to find any existing articles that link to the new one, (that we didn't just update) and update the relationships
        List<Article> articlesLinkingToNewOne;
        if (!otherArticleDois.isEmpty()) {
            //articles linking to this one that we didn't already visit
            articlesLinkingToNewOne = hibernateTemplate.findByCriteria(DetachedCriteria.forClass(Article.class)
                    .add(Restrictions.not(Restrictions.in("doi", otherArticleDois)))
                    .createCriteria("relatedArticles")
                    .add(Restrictions.eq("otherArticleDoi", newArticle.getDoi())));
        } else {
            //hibernate throws a sql grammar exception if you do a restrictions.in() with an empty collection
            articlesLinkingToNewOne = hibernateTemplate
                    .findByCriteria(DetachedCriteria.forClass(Article.class).createCriteria("relatedArticles")
                            .add(Restrictions.eq("otherArticleDoi", newArticle.getDoi())));
        }
        for (Article otherArticle : articlesLinkingToNewOne) {
            //update the other article's relationship
            for (ArticleRelationship otherRelationship : otherArticle.getRelatedArticles()) {
                if (otherRelationship.getOtherArticleDoi().equals(newArticle.getDoi())) {
                    otherRelationship.setOtherArticleID(newArticle.getID());
                    hibernateTemplate.update(otherRelationship);
                    //create a relationship linking to the other article
                    ArticleRelationship relationship = new ArticleRelationship();
                    relationship.setParentArticle(newArticle);
                    relationship.setOtherArticleID(otherArticle.getID());
                    relationship.setOtherArticleDoi(otherArticle.getDoi());
                    relationship.setType(otherRelationship.getType());
                    newArticle.getRelatedArticles().add(relationship);
                }
            }
        }
        //if we added new relationships, update the new article
        if (!articlesLinkingToNewOne.isEmpty()) {
            hibernateTemplate.update(newArticle);
        }
    }
}