// Java tutorial
/* * $HeadURL$ * $Id$ * * Copyright (c) 2006-2012 by Public Library of Science * http://plos.org * http://ambraproject.org * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. | */ package org.ambraproject.article.service; import net.sf.saxon.Controller; import net.sf.saxon.TransformerFactoryImpl; import net.sf.saxon.serialize.MessageWarner; import org.ambraproject.article.ArchiveProcessException; import org.ambraproject.models.Article; import org.ambraproject.models.ArticleAsset; import org.ambraproject.models.ArticleAuthor; import org.ambraproject.models.ArticleEditor; import org.ambraproject.models.ArticleRelationship; import org.ambraproject.models.Category; import org.ambraproject.models.CitedArticle; import org.ambraproject.models.CitedArticleAuthor; import org.ambraproject.models.CitedArticleEditor; import org.ambraproject.models.Journal; import org.ambraproject.service.article.ArticleClassifier; import org.ambraproject.service.article.ArticleService; import org.ambraproject.util.Rhino; import org.ambraproject.util.XPathUtil; import org.apache.commons.codec.binary.Base64; import org.apache.commons.configuration.Configuration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Required; import org.ambraproject.xml.transform.EntityResolvingSource; import org.ambraproject.xml.transform.cache.CachedSource; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.EntityResolver; 
import org.xml.sax.InputSource; import org.xml.sax.SAXException; import javax.xml.parsers.DocumentBuilder; import javax.xml.transform.ErrorListener; import javax.xml.transform.OutputKeys; import javax.xml.transform.Source; import javax.xml.transform.Templates; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.URIResolver; import javax.xml.transform.dom.DOMResult; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import javax.xml.xpath.XPathExpressionException; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.io.StringWriter; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Enumeration; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; /** * {@link IngestArchiveProcessor} that uses an xsl stylesheet to format the article xml into an easy to parse xml * * @author Bill OConnor * @author Alex Kudlick Date: 6/20/11 * <p/> * org.ambraproject.article.service */ public class XslIngestArchiveProcessor implements IngestArchiveProcessor { private static final Logger log = LoggerFactory.getLogger(XslIngestArchiveProcessor.class); private DocumentBuilder documentBuilder; private TransformerFactory transformerFactory; private InputStream xslDefaultTemplate; private Map<String, String> xslTemplateMap; private Configuration configuration; private XPathUtil xPathUtil; private ArticleClassifier articleClassifier; private ArticleService articleService; public XslIngestArchiveProcessor() { transformerFactory = 
new TransformerFactoryImpl(); transformerFactory.setURIResolver(new URLResolver()); transformerFactory.setAttribute("http://saxon.sf.net/feature/version-warning", Boolean.FALSE); transformerFactory.setAttribute("http://saxon.sf.net/feature/strip-whitespace", "none"); transformerFactory.setErrorListener(new ErrorListener() { public void warning(TransformerException te) { log.warn("Warning received while processing a stylesheet", te); } public void error(TransformerException te) { log.warn("Error received while processing a stylesheet", te); } public void fatalError(TransformerException te) { log.warn("Fatal error received while processing a stylesheet", te); } }); xPathUtil = new XPathUtil(); } private static class URLResolver implements URIResolver { public Source resolve(String href, String base) throws TransformerException { if (href.length() == 0) return null; // URL doesn't handle this case properly, so let default resolver handle it try { URL url = new URL(new URL(base), href); return new StreamSource(url.toString()); } catch (MalformedURLException mue) { log.warn("Failed to resolve '" + href + "' relative to '" + base + "' - falling back to " + "default URIResolver", mue); return null; } } } /** * This allows the stylesheets to access XML docs (such as pmc.xml) in the zip archive. 
*/ private static class ZipURIResolver extends URLResolver { private final ZipFile zip; public ZipURIResolver(ZipFile zip) { this.zip = zip; } @Override public Source resolve(String href, String base) throws TransformerException { if (log.isDebugEnabled()) log.debug("resolving: base='" + base + "', href='" + href + "'"); if (!base.startsWith("zip:")) return super.resolve(href, base); try { InputSource src = resolveToIS(base, href); if (src == null) return null; return new EntityResolvingSource(src, new EntityResolver() { public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { if (systemId != null && systemId.startsWith("zip:")) return resolveToIS("zip:/", systemId); return CachedSource.getResolver().resolveEntity(publicId, systemId); } }); } catch (IOException ioe) { throw new TransformerException(ioe); } catch (SAXException se) { throw new TransformerException(se); } } private InputSource resolveToIS(String base, String rel) throws IOException { URI uri = URI.create(base).resolve(rel); InputStream is = zip.getInputStream(zip.getEntry(uri.getPath().substring(1))); if (is == null) // hack to deal with broken AP zip's that contain absolute paths is = zip.getInputStream(zip.getEntry(uri.getPath())); if (log.isDebugEnabled()) log.debug("resolved: uri='" + uri + "', found=" + (is != null)); if (is == null) return null; InputSource src = new InputSource(is); src.setSystemId(uri.toString()); return src; } } /** * Setter for XSL Templates. Takes in a string as the filename and searches for it in resource * path and then as a URI. * * @param xslTemplate The xslTemplate to set. 
* @throws java.net.URISyntaxException */ @Required public void setXslDefaultTemplate(String xslTemplate) throws URISyntaxException { InputStream s = getAsStream(xslTemplate); this.xslDefaultTemplate = s; } /** * Set the xsl style sheet to use * * @param xslTemplateMap - A map of classpath-relative location of an xsl stylesheets to use to process the article xml */ @Required public void setXslTemplateMap(Map<String, String> xslTemplateMap) { this.xslTemplateMap = xslTemplateMap; } /** * Set the document builder to use for constructing documents from the zip file entries * * @param documentBuilder - the document builder to use */ @Required public void setDocumentBuilder(DocumentBuilder documentBuilder) { this.documentBuilder = documentBuilder; } @Required public void setConfiguration(Configuration configuration) { this.configuration = configuration; } @Required public void setArticleClassifier(ArticleClassifier articleClassifier) { this.articleClassifier = articleClassifier; } @Required public void setArticleService(ArticleService articleService) { this.articleService = articleService; } @Override public Article processArticle(ZipFile archive, Document articleXml) throws ArchiveProcessException { InputStream xsl = null; try { String zipInfo = describeZip(archive); Document transformedXml = transformZip(archive, zipInfo, articleXml, configuration.getString("ambra.platform.doiUrlPrefix", null)); Article article = parseTransformedXml(transformedXml); String archiveName = archive.getName().contains(File.separator) ? archive.getName().substring(archive.getName().lastIndexOf(File.separator) + 1) : archive.getName(); article.setArchiveName(archiveName); // Attempt to assign categories to the article based on the taxonomy server. However, // we still want to ingest the article even if this process fails. 
List<String> terms = null; try { terms = articleClassifier.classifyArticle(articleXml); } catch (Exception e) { log.warn("Taxonomy server not responding, but ingesting article anyway", e); } if (terms != null && terms.size() > 0) { articleService.setArticleCategories(article, terms); } else { article.setCategories(new HashSet<Category>()); } Journal journal = new Journal(); journal.seteIssn(article.geteIssn()); article.setJournals(new HashSet<Journal>()); article.getJournals().add(journal); article.setStrkImgURI(extractStrikingImageURI(archive)); return article; } catch (IOException e) { throw new ArchiveProcessException("Error reading from Zip archive", e); } catch (TransformerException e) { throw new ArchiveProcessException("Error transforming Article xml", e); } catch (ParseException e) { throw new ArchiveProcessException("Error parsing a date in the xml", e); } catch (XPathExpressionException e) { throw new ArchiveProcessException("Error parsing transformed xml", e); } catch (URISyntaxException e) { throw new ArchiveProcessException("Error malformed uri", e); } catch (Exception e) { throw new ArchiveProcessException("Error parsing xml", e); } finally { if (xsl != null) { try { xsl.close(); } catch (IOException e) { log.warn("Error closing input stream for xsl stylesheet"); } } } } /** * TODO: use an xml unmarshaller to do this - see <a href="http://static.springsource.org/spring-ws/site/reference/html/oxm.html">this * TODO: page</a> for some spring-wrapped versions (mmmmm... spring). 
For this it would behoove us to reformat the * transformed xml TODO: to match the object model * * @param transformedXml the result of the xsl transform on the article xml * @return a fully-populated, unsaved article object * @throws javax.xml.xpath.XPathExpressionException * if there's an error parsing the xml */ private Article parseTransformedXml(Document transformedXml) throws XPathExpressionException, ParseException { Article article = new Article(); //basic article properties article.setDoi(xPathUtil.evaluate(transformedXml, "//Article/@id")); article.setState(Article.STATE_UNPUBLISHED); article.seteIssn(xPathUtil.evaluate(transformedXml, "//Article/eIssn")); //properties that used to be in dublin core if (xPathUtil.selectSingleNode(transformedXml, "//Article/dublinCore/title") != null) { article.setTitle( Rhino.getAllText(xPathUtil.selectSingleNode(transformedXml, "//Article/dublinCore/title"))); } if (!xPathUtil.evaluate(transformedXml, "//Article/dublinCore/format/text()").isEmpty()) { article.setFormat(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/format/text()")); } if (!xPathUtil.evaluate(transformedXml, "//Article/dublinCore/language/text()").isEmpty()) { article.setLanguage(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/language/text()")); } if (xPathUtil.selectSingleNode(transformedXml, "//Article/dublinCore/description") != null) { article.setDescription(Rhino .getAllText(xPathUtil.selectSingleNode(transformedXml, "//Article/dublinCore/description"))); } if (!xPathUtil.evaluate(transformedXml, "//Article/dublinCore/rights/text()").isEmpty()) { article.setRights(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/rights/text()")); } DateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); if (!xPathUtil.evaluate(transformedXml, "//Article/dublinCore/date/text()").isEmpty()) { article.setDate(dateFormatter.parse( xPathUtil.evaluate(transformedXml, "//Article/dublinCore/date/text()").replaceAll(" UTC", ""))); } 
//properties that used to be in bib citation if (xPathUtil.selectSingleNode(transformedXml, "//Article/dublinCore/bibliographicCitation") != null) { String volume = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/volume/text()"); if (!volume.isEmpty()) { article.setVolume(volume); } String publisherLocation = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/publisherLocation/text()"); if (!publisherLocation.isEmpty()) { article.setPublisherLocation(publisherLocation); } String publisherName = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/publisherName/text()"); if (!publisherName.isEmpty()) { article.setPublisherName(publisherName); } String journal = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/journal/text()"); if (!journal.isEmpty()) { article.setJournal(journal); } String issue = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/issue/text()"); if (!issue.isEmpty()) { article.setIssue(issue); } String pages = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/pages/text()"); if (!pages.isEmpty()) { article.setPages(pages); } String eLocationId = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/eLocationId/text()"); if (!eLocationId.isEmpty()) { article.seteLocationId(eLocationId); } String url = xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/url/text()"); if (!url.isEmpty()) { article.setUrl(url); } } //types Set<String> articleTypes = parseArticleTypes(transformedXml); article.setTypes(articleTypes); //authors List<ArticleAuthor> authors = parseArticleAuthors(transformedXml); article.setAuthors(authors); //editors List<ArticleEditor> editors = parseArticleEditors(transformedXml); article.setEditors(editors); //cited articles List<CitedArticle> references = parseCitedArticles(transformedXml); 
article.setCitedArticles(references); //assets List<ArticleAsset> assets = parseArticleAssets(transformedXml, article); article.setAssets(assets); //related articles List<ArticleRelationship> relatedArticles = parseRelatedArticles(transformedXml, article); article.setRelatedArticles(relatedArticles); List<String> collabAuthors = parseCollabAuthors(transformedXml, "//Article/dublinCore/bibliographicCitation"); article.setCollaborativeAuthors(collabAuthors); return article; } private List<String> parseCollabAuthors(Document transformedXml, String nodeXpath) throws XPathExpressionException { int collabAuthorCount = Integer .valueOf(xPathUtil.evaluate(transformedXml, "count(" + nodeXpath + "/collaborativeAuthors)")); List<String> collabAuthors = new ArrayList<String>(collabAuthorCount); for (int i = 1; i <= collabAuthorCount; i++) { collabAuthors .add(xPathUtil.evaluate(transformedXml, nodeXpath + "/collaborativeAuthors[" + i + "]/text()")); } return collabAuthors; } private List<ArticleRelationship> parseRelatedArticles(Document transformedXml, Article article) throws XPathExpressionException { int relatedArticleCount = Integer .valueOf(xPathUtil.evaluate(transformedXml, "count(//Article/relatedArticles)")); List<ArticleRelationship> relatedArticles = new ArrayList<ArticleRelationship>(relatedArticleCount); for (int i = 1; i <= relatedArticleCount; i++) { ArticleRelationship relatedArticle = new ArticleRelationship(); relatedArticle.setParentArticle(article); relatedArticle.setOtherArticleDoi( xPathUtil.evaluate(transformedXml, "//Article/relatedArticles[" + i + "]/article/text()")); relatedArticle.setType( xPathUtil.evaluate(transformedXml, "//Article/relatedArticles[" + i + "]/relationType/text()")); relatedArticles.add(relatedArticle); } return relatedArticles; } private List<CitedArticle> parseCitedArticles(Document transformedXml) throws XPathExpressionException { int referenceCount = Integer .valueOf(xPathUtil.evaluate(transformedXml, "count(" + 
"//Article/dublinCore/references)")); List<CitedArticle> references = new ArrayList<CitedArticle>(referenceCount); for (int i = 1; i <= referenceCount; i++) { String nodeXpath = "//Article/dublinCore/references[" + i + "]"; CitedArticle citedArticle = new CitedArticle(); String type = xPathUtil.evaluate(transformedXml, nodeXpath + "/citationType/text()"); if (!type.isEmpty()) { citedArticle.setCitationType(type); } String year = xPathUtil.evaluate(transformedXml, nodeXpath + "/year/text()"); if (!year.isEmpty()) { citedArticle.setYear(Integer.valueOf(year)); } String displayYear = xPathUtil.evaluate(transformedXml, nodeXpath + "/displayYear/text()"); if (!displayYear.isEmpty()) { citedArticle.setDisplayYear(displayYear); } String month = xPathUtil.evaluate(transformedXml, nodeXpath + "/month/text()"); if (!month.isEmpty()) { citedArticle.setMonth(month); } String day = xPathUtil.evaluate(transformedXml, nodeXpath + "/day/text()"); if (!day.isEmpty()) { citedArticle.setDay(day); } String volume = xPathUtil.evaluate(transformedXml, nodeXpath + "/volume/text()"); if (!volume.isEmpty()) { citedArticle.setVolume(volume); } String volumeNumber = xPathUtil.evaluate(transformedXml, nodeXpath + "/volumeNumber/text()"); if (!volumeNumber.isEmpty()) { citedArticle.setVolumeNumber(Integer.valueOf(volumeNumber)); } String publisherLocation = xPathUtil.evaluate(transformedXml, nodeXpath + "/publisherLocation/text()"); if (!publisherLocation.isEmpty()) { citedArticle.setPublisherLocation(publisherLocation); } String publisherName = xPathUtil.evaluate(transformedXml, nodeXpath + "/publisherName/text()"); if (!publisherName.isEmpty()) { citedArticle.setPublisherName(publisherName); } String pages = xPathUtil.evaluate(transformedXml, nodeXpath + "/pages/text()"); if (!pages.isEmpty()) { citedArticle.setPages(pages); } String eLocationId = xPathUtil.evaluate(transformedXml, nodeXpath + "/eLocationId/text()"); if (!eLocationId.isEmpty()) { citedArticle.seteLocationID(eLocationId); } 
String journal = xPathUtil.evaluate(transformedXml, nodeXpath + "/journal/text()"); if (!journal.isEmpty()) { citedArticle.setJournal(journal); } String issue = xPathUtil.evaluate(transformedXml, nodeXpath + "/issue/text()"); if (!issue.isEmpty()) { citedArticle.setIssue(issue); } String key = xPathUtil.evaluate(transformedXml, nodeXpath + "/key/text()"); if (!key.isEmpty()) { citedArticle.setKey(key); } String url = xPathUtil.evaluate(transformedXml, nodeXpath + "/url/text()"); if (!url.isEmpty()) { citedArticle.setUrl(url); } String doi = xPathUtil.evaluate(transformedXml, nodeXpath + "/doi/text()"); if (!doi.isEmpty()) { citedArticle.setDoi(doi); } Node noteNode = xPathUtil.selectSingleNode(transformedXml, nodeXpath + "/note"); if (noteNode != null) { citedArticle.setNote(Rhino.getAllText(noteNode)); } Node titleNode = xPathUtil.selectSingleNode(transformedXml, nodeXpath + "/title"); if (titleNode != null) { citedArticle.setTitle(Rhino.getAllText(titleNode)); } Node summaryNode = xPathUtil.selectSingleNode(transformedXml, nodeXpath + "/summary"); if (summaryNode != null) { citedArticle.setSummary(Rhino.getAllText(summaryNode)); } //Set the people referenced by the article in this citation int authorCount = Integer .valueOf(xPathUtil.evaluate(transformedXml, "count(" + nodeXpath + "/authors)")); int editorCount = Integer .valueOf(xPathUtil.evaluate(transformedXml, "count(" + nodeXpath + "/editors)")); List<CitedArticleAuthor> authors = new ArrayList<CitedArticleAuthor>(authorCount); List<CitedArticleEditor> editors = new ArrayList<CitedArticleEditor>(editorCount); for (int j = 1; j <= authorCount; j++) { CitedArticleAuthor author = new CitedArticleAuthor(); author.setFullName( xPathUtil.evaluate(transformedXml, nodeXpath + "/authors[" + j + "]/realName/text()")); author.setGivenNames( xPathUtil.evaluate(transformedXml, nodeXpath + "/authors[" + j + "]/givenNames/text()")); author.setSurnames( xPathUtil.evaluate(transformedXml, nodeXpath + "/authors[" + j + 
"]/surnames/text()")); author.setSuffix( xPathUtil.evaluate(transformedXml, nodeXpath + "/authors[" + j + "]/suffix/text()")); authors.add(author); } for (int j = 1; j <= editorCount; j++) { CitedArticleEditor editor = new CitedArticleEditor(); editor.setFullName( xPathUtil.evaluate(transformedXml, nodeXpath + "/editors[" + j + "]/realName/text()")); editor.setGivenNames( xPathUtil.evaluate(transformedXml, nodeXpath + "/editors[" + j + "]/givenNames/text()")); editor.setSurnames( xPathUtil.evaluate(transformedXml, nodeXpath + "/editors[" + j + "]/surnames/text()")); editor.setSuffix( xPathUtil.evaluate(transformedXml, nodeXpath + "/editors[" + j + "]/suffix/text()")); editors.add(editor); } citedArticle.setAuthors(authors); citedArticle.setEditors(editors); List<String> collabAuthors = parseCollabAuthors(transformedXml, nodeXpath); citedArticle.setCollaborativeAuthors(collabAuthors); references.add(citedArticle); } return references; } // TODO: consider deleting this method. It is not presently called. This is the old taxonomy, // where we get category information from the article XML. We are changing this over to the // new AI taxonomy, which we get from a call to an external server. I'm keeping this method // around for the time being since it's not clear what we should do if the taxonomy server // is down (perhaps fall back to this)? 
private Set<Category> parseArticleCategories(Document transformedXml) throws XPathExpressionException { int categoryCount = Integer.valueOf(xPathUtil.evaluate(transformedXml, "count(//Article/categories)")); Set<Category> categories = new HashSet<Category>(categoryCount); for (int i = 1; i <= categoryCount; i++) { String mainCategory = xPathUtil.evaluate(transformedXml, "//Article/categories[" + i + "]/mainCategory/text()"); String subCategory = xPathUtil.evaluate(transformedXml, "//Article/categories[" + i + "]/subCategory/text()"); String categoryStr = ""; Category category = new Category(); if (!mainCategory.isEmpty()) { categoryStr = "/" + mainCategory; } if (!subCategory.isEmpty()) { categoryStr = categoryStr + "/" + subCategory; } category.setPath(categoryStr); categories.add(category); } return categories; } private Set<String> parseArticleTypes(Document transformedXml) throws XPathExpressionException { NodeList articleTypeNodes = xPathUtil.selectNodes(transformedXml, "//Article/articleType/text()"); Set<String> articleTypes = new HashSet<String>(articleTypeNodes.getLength()); for (int i = 0; i < articleTypeNodes.getLength(); i++) { articleTypes.add(articleTypeNodes.item(i).getNodeValue()); } return articleTypes; } private List<ArticleEditor> parseArticleEditors(Document transformedXml) throws XPathExpressionException { int editorCount = Integer.valueOf( xPathUtil.evaluate(transformedXml, "count(//Article/dublinCore/bibliographicCitation/editors)")); List<ArticleEditor> editors = new ArrayList<ArticleEditor>(editorCount); for (int i = 1; i <= editorCount; i++) { ArticleEditor editor = new ArticleEditor(); editor.setFullName(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/editors[" + i + "]/realName/text()")); editor.setGivenNames(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/editors[" + i + "]/givenNames/text()")); editor.setSurnames(xPathUtil.evaluate(transformedXml, 
"//Article/dublinCore/bibliographicCitation/editors[" + i + "]/surnames/text()")); editor.setSuffix(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/editors[" + i + "]/suffix/text()")); editors.add(editor); } return editors; } private List<ArticleAuthor> parseArticleAuthors(Document transformedXml) throws XPathExpressionException { int authorCount = Integer.valueOf( xPathUtil.evaluate(transformedXml, "count(//Article/dublinCore/bibliographicCitation/authors)")); List<ArticleAuthor> authors = new ArrayList<ArticleAuthor>(authorCount); for (int i = 1; i <= authorCount; i++) { ArticleAuthor author = new ArticleAuthor(); author.setFullName(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/authors[" + i + "]/realName/text()")); author.setGivenNames(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/authors[" + i + "]/givenNames/text()")); author.setSurnames(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/authors[" + i + "]/surnames/text()")); author.setSuffix(xPathUtil.evaluate(transformedXml, "//Article/dublinCore/bibliographicCitation/authors[" + i + "]/suffix/text()")); authors.add(author); } return authors; } private List<ArticleAsset> parseArticleAssets(Document transformedXml, Article article) throws XPathExpressionException { //article assets int secondaryObjectCount = Integer.valueOf(xPathUtil.evaluate(transformedXml, "count(//Article/parts)")); //each secondary object has 3 files, plus article has xml and pdf List<ArticleAsset> assets = new ArrayList<ArticleAsset>(4 * secondaryObjectCount + 2); //get the 'representations' of the article (xml and pdf) Integer articleRepCount = Integer .valueOf(xPathUtil.evaluate(transformedXml, "count(//Article/representations)")); for (int i = 1; i <= articleRepCount; i++) { ArticleAsset asset = new ArticleAsset(); asset.setDoi(article.getDoi()); asset.setExtension( xPathUtil.evaluate(transformedXml, 
"//Article/representations[" + i + "]/name/text()")); asset.setContentType( xPathUtil.evaluate(transformedXml, "//Article/representations[" + i + "]/contentType/text()")); asset.setSize(Long.valueOf( xPathUtil.evaluate(transformedXml, "//Article/representations[" + i + "]/size/text()"))); asset.setTitle(article.getTitle()); asset.setDescription(article.getDescription()); assets.add(asset); } for (int i = 1; i <= secondaryObjectCount; i++) { String partXpath = "//Article/parts[" + i + "]"; Integer repCount = Integer .valueOf(xPathUtil.evaluate(transformedXml, "count(" + partXpath + "/representations)")); String title = xPathUtil.evaluate(transformedXml, partXpath + "/dublinCore/title/text()"); String description = xPathUtil.evaluate(transformedXml, partXpath + "/dublinCore/description/text()"); for (int j = 1; j <= repCount; j++) { String repXpath = partXpath + "/representations[" + j + "]"; ArticleAsset asset = new ArticleAsset(); asset.setDoi(xPathUtil.evaluate(transformedXml, partXpath + "/@id")); asset.setContextElement(xPathUtil.evaluate(transformedXml, partXpath + "/contextElement/text()")); asset.setContentType(xPathUtil.evaluate(transformedXml, repXpath + "/contentType/text()")); asset.setExtension(xPathUtil.evaluate(transformedXml, repXpath + "/name/text()")); asset.setSize(Long.valueOf(xPathUtil.evaluate(transformedXml, repXpath + "/size/text()"))); asset.setTitle(title); asset.setDescription(description); assets.add(asset); } } return assets; } /** * Run the zip file through the xsl stylesheet * * @param zip the zip archive containing the items to ingest * @param zipInfo the document describing the zip archive (adheres to zip.dtd) * @param articleXml the stylesheet to run on <var>zipInfo</var>; this is the main script * @param doiUrlPrefix DOI URL prefix * @return a document describing the fedora objects to create (must adhere to fedora.dtd) * @throws javax.xml.transform.TransformerException * if an error occurs during the processing */ private Document 
transformZip(ZipFile zip, String zipInfo, Document articleXml, String doiUrlPrefix) throws TransformerException, URISyntaxException { Transformer t = getTranslet(articleXml); t.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); t.setURIResolver(new ZipURIResolver(zip)); // override the doi url prefix if one is specified in the config if (doiUrlPrefix != null) t.setParameter("doi-url-prefix", doiUrlPrefix); /* * Note: it would be preferable (and correct according to latest JAXP specs) to use * t.setErrorListener(), but Saxon does not forward <xls:message>'s to the error listener. * Hence we need to use Saxon's API's in order to get at those messages. */ final StringWriter msgs = new StringWriter(); MessageWarner em = new MessageWarner(); ((Controller) t).setMessageEmitter(em); t.setErrorListener(new ErrorListener() { public void warning(TransformerException te) { log.warn("Warning received while processing zip", te); } public void error(TransformerException te) { log.warn("Error received while processing zip", te); msgs.write(te.getMessageAndLocation() + '\n'); } public void fatalError(TransformerException te) { log.warn("Fatal error received while processing zip", te); msgs.write(te.getMessageAndLocation() + '\n'); } }); Source inp = new StreamSource(new StringReader(zipInfo), "zip:/"); DOMResult res = new DOMResult(); try { t.transform(inp, res); } catch (TransformerException te) { if (msgs.getBuffer().length() > 0) { log.error(msgs.getBuffer().toString()); throw new TransformerException(msgs.toString(), te); } else throw te; } if (msgs.getBuffer().length() > 0) throw new TransformerException(msgs.toString()); return (Document) res.getNode(); } @Override public Document extractArticleXml(ZipFile archive) throws ArchiveProcessException { try { Document manifest = extractXml(archive, "MANIFEST.xml"); String xmlFileName = xPathUtil.evaluate(manifest, "//article/@main-entry"); return extractXml(archive, xmlFileName); } catch (Exception e) { throw new 
ArchiveProcessException("Error extracting article xml from archive: " + archive.getName(), e); } } @Override public String extractStrikingImageURI(ZipFile archive) throws ArchiveProcessException { String strkImg = ""; try { Document manifest = extractXml(archive, "MANIFEST.xml"); strkImg = xPathUtil.evaluate(manifest, "//object[@strkImage='True']/@uri"); } catch (Exception e) { log.info("No Striking image selected in manifest.xml. " + archive.getName()); } return strkImg; } /** * Helper method to extract XML from the zip file * * @param zipFile - the zip file containing the file to extract * @param fileName - the file to extract * @return - the parsed xml file * @throws java.io.IOException - if there's a problem reading from the zip file * @throws org.xml.sax.SAXException - if there's a problem parsing the xml */ private Document extractXml(ZipFile zipFile, String fileName) throws IOException, SAXException { InputStream inputStream = null; try { inputStream = zipFile.getInputStream(zipFile.getEntry(fileName)); return documentBuilder.parse(inputStream); } finally { if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { log.warn("Error closing zip input stream during ingest processing", e); } } } } /** * Generate a description of the given zip archive. * * @param zip the zip archive to describe * @return the xml doc describing the archive (adheres to zip.dtd) * @throws IOException if an exception occurred reading the zip archive */ public static String describeZip(ZipFile zip) throws IOException { StringBuilder res = new StringBuilder(500); res.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); res.append("<ZipInfo"); if (zip.getName() != null) res.append(" name=\"").append(attrEscape(zip.getName())).append("\""); res.append(">\n"); Enumeration<? 
extends ZipEntry> entries = zip.entries(); while (entries.hasMoreElements()) entry2xml(entries.nextElement(), res); res.append("</ZipInfo>\n"); return res.toString(); } /** * Generate a description for a single zip-entry. * * @param ze the zip entry to describe. * @param buf the buffer to place the description into */ private static void entry2xml(ZipEntry ze, StringBuilder buf) { buf.append("<ZipEntry name=\"").append(attrEscape(ze.getName())).append("\""); if (ze.isDirectory()) buf.append(" isDirectory=\"true\""); if (ze.getCrc() >= 0) buf.append(" crc=\"").append(ze.getCrc()).append("\""); if (ze.getSize() >= 0) buf.append(" size=\"").append(ze.getSize()).append("\""); if (ze.getCompressedSize() >= 0) buf.append(" compressedSize=\"").append(ze.getCompressedSize()).append("\""); if (ze.getTime() >= 0) buf.append(" time=\"").append(ze.getTime()).append("\""); if (ze.getComment() != null || ze.getExtra() != null) { buf.append(">\n"); if (ze.getComment() != null) buf.append("<Comment>").append(xmlEscape(ze.getComment())).append("</Comment>\n"); if (ze.getExtra() != null) buf.append("<Extra>").append(base64Encode(ze.getExtra())).append("</Extra>\n"); buf.append("</ZipEntry>\n"); } else { buf.append("/>\n"); } } private static String xmlEscape(String str) { return str.replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">"); } private static String attrEscape(String str) { return str.replaceAll("&", "&").replaceAll("<", "<").replaceAll("\"", """); } private static String base64Encode(byte[] data) { try { return new String(Base64.encodeBase64(data), "ISO-8859-1"); } catch (IOException ioe) { throw new RuntimeException(ioe); // can't happen } } /** * This method is useful for debugging, so you can put dom2String(xml) in the watch list of the debugger and see the * xml * * @param dom the xml to turn in to a string * @return the xml as a string */ private String dom2String(Node dom) { try { StringWriter sw = new StringWriter(500); Transformer t = 
transformerFactory.newTransformer(); t.transform(new DOMSource(dom), new StreamResult(sw)); return sw.toString(); } catch (TransformerException te) { log.error("Error converting dom to string", te); return ""; } } /** * @param filenameOrURL filenameOrURL * @throws java.net.URISyntaxException URISyntaxException * @return the local or remote file or url as a java.io.File */ public InputStream getAsStream(final String filenameOrURL) throws URISyntaxException { return getClass().getClassLoader().getResourceAsStream(filenameOrURL); } /** * Get a translet, compiled stylesheet, for the xslTemplate. If the doc is null * use the default template. If the doc is not null then get the DTD version. * IF the DTD version does not exist use the default template else use the * template associated with that version. * * @param doc the dtd version of document * @return Translet for the xslTemplate. * @throws javax.xml.transform.TransformerException TransformerException. */ private Transformer getTranslet(Document doc) throws TransformerException, URISyntaxException { final TransformerFactory tFactory = TransformerFactory.newInstance(); // key is "" if the Attribute does not exist InputStream templateStream; String key = doc.getDocumentElement().getAttribute("dtd-version").trim(); if ((doc == null) || (!xslTemplateMap.containsKey(key)) || (key.equalsIgnoreCase(""))) { templateStream = xslDefaultTemplate; } else { String templateName = xslTemplateMap.get(key); templateStream = getAsStream(templateName); } Templates translet = tFactory.newTemplates(new StreamSource(templateStream)); return translet.newTransformer(); } }