nz.govt.natlib.ndha.wctdpsdepositor.extractor.XPathWctMetsExtractor.java Source code

Introduction

Here is the source code for nz.govt.natlib.ndha.wctdpsdepositor.extractor.XPathWctMetsExtractor.java
Source

/**
 * nz.govt.natlib.ndha.wctdpsdepositor - Software License
 *
 * Copyright 2007/2009 National Library of New Zealand.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0 
 *
 * or the file "LICENSE.txt" included with the software.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package nz.govt.natlib.ndha.wctdpsdepositor.extractor;

//import nz.govt.natlib.ndha.common.FixityUtils;
import nz.govt.natlib.ndha.wctdpsdepositor.extractor.filefinder.FileArchiveBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.webcurator.core.archive.dps.DpsDepositFacade.HarvestType;
import org.xml.sax.SAXException;

import com.exlibris.core.sdk.formatting.DublinCore;
import com.exlibris.core.sdk.formatting.DublinCoreFactory;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * This class is responsible for parsing a Wct Met's document and extracting required data.
 * This class uses XPath queries to retrive the data.
 */
public class XPathWctMetsExtractor implements WctDataExtractor {
    private static final Log log = LogFactory.getLog(XPathWctMetsExtractor.class);
    private static final String harvestDateQuery = "//mets:mets/mets:amdSec/mets:techMD/mets:mdWrap/mets:xmlData/wct:wct/wct:TargetInstance/wct:Crawl/wct:StartDate";
    private static final String seedUrlsQuery = "//mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/wct:wct/wct:Target/wct:Seeds/wct:Seed";
    private static final String targetNameQuery = "//mets:mets/mets:dmdSec/mets:mdWrap/mets:xmlData/wct:wct/wct:Target/wct:Name";
    private static final String createdByQuery = "//mets:mets/mets:amdSec/mets:digiprovMD/mets:mdWrap/mets:xmlData/wct:wct/wct:TargetInstance/wct:Owner/wct:UID";
    private static final String creationDateQuery = "//mets:mets/mets:amdSec/mets:digiprovMD/mets:mdWrap/mets:xmlData/wct:wct/wct:TargetInstance/wct:HarvestResult/wct:CreationDate";
    private static final String provenanceNoteQuery = "//mets:mets/mets:amdSec/mets:digiprovMD/mets:mdWrap/mets:xmlData/wct:wct/wct:TargetInstance/wct:HarvestResult/wct:ProvenanceNote";
    private static final String copyrightStatementQuery = "//mets:mets/mets:amdSec/mets:rightsMD/mets:mdWrap/mets:xmlData/wct:wct/wct:Permissions/wct:Permission/wct:CopyrightStatement";
    private static final String copyrightURLQuery = "//mets:mets/mets:amdSec/mets:rightsMD/mets:mdWrap/mets:xmlData/wct:wct/wct:Permissions/wct:Permission/wct:CopyrightURL";
    private static final String accessRestrictionQuery = "//mets:mets/mets:amdSec/mets:rightsMD/mets:mdWrap/mets:xmlData/wct:wct/wct:Permissions/wct:Permission/wct:AccessStatus";

    private static final String metsArchiveFilesQuery = "//mets:mets/mets:fileSec/mets:fileGrp/mets:fileGrp[@USE='ARCHIVE']/mets:file";
    private static final String metsLogFilesQuery = "//mets:mets/mets:fileSec/mets:fileGrp/mets:fileGrp[@USE='LOGS']/mets:file";
    private static final String metsReportFilesQuery = "//mets:mets/mets:fileSec/mets:fileGrp/mets:fileGrp[@USE='REPORTS']/mets:file";
    private static final String metsHomeDirectoryFilesQuery = "//mets:mets/mets:fileSec/mets:fileGrp/mets:fileGrp[@USE='HOME DIRECTORY']/mets:file";

    private static final String ARC_FILE_LOCATION_PREFIX = "file://./";

    private String harvestDate;
    private List<SeedUrl> seedUrls = new ArrayList<SeedUrl>();
    private String targetName;
    private String ilsReference;
    private String createdBy;
    private String creationDate;
    private String provenanceNote;
    private String copyrightStatement;
    private String copyrightURL;
    private String accessRestriction;
    private List<ArchiveFile> archiveFiles = new ArrayList<ArchiveFile>();
    private List<ArchiveFile> logFiles = new ArrayList<ArchiveFile>();
    private List<ArchiveFile> reportFiles = new ArrayList<ArchiveFile>();
    private List<ArchiveFile> homeDirectoryFiles = new ArrayList<ArchiveFile>();
    private ArchiveFile wctMetsFile;
    private ArchiveFile arcIndex;
    private String wctTargetInstanceID;
    private DublinCore additionalDublinCoreElements;
    private HarvestType harvestType;
    private String ieEntityType;

    private static final String XML_MIME_TYPE = "text/xml";

    static {
        /*
         * Temporary fix, until we find a permanent solution, to get rid of the following error from wct-store in Tomcat:
         * 
         * org.webcurator.core.archive.dps.DPSUploadException: java.lang.RuntimeException: 
         * XPathFactory#newInstance() failed to create an XPathFactory for the default object 
         * model: http://java.sun.com/jaxp/xpath/dom with the XPathFactoryConfigurationException: 
         * javax.xml.xpath.XPathFactoryConfigurationException: No XPathFctory implementation 
         * found for the object model: http://java.sun.com/jaxp/xpath/dom
         * 
         */
        System.setProperty("javax.xml.xpath.XPathFactory:http://java.sun.com/jaxp/xpath/dom",
                "com.sun.org.apache.xpath.internal.jaxp.XPathFactoryImpl");
    }

    public void parseFile(File wctMets, FileArchiveBuilder fileBuilder) throws IOException {
        DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(wctMets)));
        byte[] buff = new byte[(int) wctMets.length()];
        dis.readFully(buff);
        dis.close();

        String xmlEscapedString = escapeXml(buff);
        parseFile(xmlEscapedString.getBytes(), wctMets.getName(), fileBuilder);
    }

    public void parseFile(byte[] wctMets, String fileName, FileArchiveBuilder fileBuilder) {
        try {

            InputStream inputStream = populateInputStreamFrom(wctMets);

            Document doc = createXmlDocumentFrom(inputStream);
            XPathFactory xPathFactory = XPathFactory.newInstance();
            XPath xpath = xPathFactory.newXPath();
            xpath.setNamespaceContext(new WctNamespaceContext());

            preprocess(doc, xpath, fileBuilder);
            popualteHarvestDate(doc, xpath);
            populateSeedUrlsFrom(doc, xpath);
            populateTargetName(doc, xpath);
            populateCreatedBy(doc, xpath);
            populateCreationDate(doc, xpath);
            populateProvenanceNote(doc, xpath);
            populateCopyrightStatement(doc, xpath);
            populateCopyrightURL(doc, xpath);
            populateAccessRestrictions(doc, xpath);

            populateArchiveFiles(doc, xpath, fileBuilder);
            populateLogFiles(doc, xpath, fileBuilder);
            populateReportFiles(doc, xpath, fileBuilder);
            populateHomeDirectoryFiles(doc, xpath, fileBuilder);

            // Populate anything additional
            populateAdditional(doc, xpath, fileBuilder);

            inputStream = populateInputStreamFrom(wctMets);
            populateWctMets(inputStream, fileName);

        } catch (XPathExpressionException xpe) {
            throw new RuntimeException("An exception occurred while parsing the WCT METS document for " + fileName,
                    xpe);
        }
    }

    /**
     * An empty implementation allowing the subclasses to remove any unnessary nodes
     * to speed up XPath processing.
     * @param doc
     */
    protected void preprocess(Document doc, XPath xpath, FileArchiveBuilder fileBuilder)
            throws XPathExpressionException {
        // empty implementation
    }

    /**
     * An empty implementation allowing the subclasses to parse any additional
     * details from WCT METS document.
     * @param doc
     * @param xpath
     * @param fileBuilder
     */
    protected void populateAdditional(Document doc, XPath xpath, FileArchiveBuilder fileBuilder)
            throws XPathExpressionException {
        // empty implementation
    }

    public String getTargetName() {
        return targetName;
    }

    public String getHarvestDate() {
        return harvestDate;
    }

    public List<SeedUrl> getSeedUrls() {
        return seedUrls;
    }

    public String getEvents() {
        return null;
    }

    public String getAccessRestriction() {
        return accessRestriction;
    }

    public void setAccessRestriction(String ar) {
        accessRestriction = ar;
    }

    public String getILSReference() {
        return ilsReference;
    }

    public String getCreatedBy() {
        return createdBy;
    }

    public String getCreationDate() {
        return creationDate;
    }

    public String getProvenanceNote() {
        return provenanceNote;
    }

    public String getCopyrightStatement() {
        return copyrightStatement;
    }

    public String getCopyrightURL() {
        return copyrightURL;
    }

    public List<ArchiveFile> getArchiveFiles() {
        return archiveFiles;
    }

    public ArchiveFile getArcIndexFile() {
        return arcIndex;
    }

    public void setArcIndexFile(ArchiveFile arcIndex) {
        this.arcIndex = arcIndex;
    }

    public List<ArchiveFile> getLogFiles() {
        return logFiles;
    }

    public List<ArchiveFile> getReportFiles() {
        return reportFiles;
    }

    public List<ArchiveFile> getHomeDirectoryFiles() {
        return homeDirectoryFiles;
    }

    public ArchiveFile getWctMetsFile() {
        return wctMetsFile;
    }

    public List<ArchiveFile> getAllFiles() {
        List<ArchiveFile> files = new ArrayList<ArchiveFile>();

        if (getArcIndexFile() != null)
            files.add(getArcIndexFile());

        if (getWctMetsFile() != null)
            files.add(getWctMetsFile());

        files.addAll(getHomeDirectoryFiles());
        files.addAll(getReportFiles());
        files.addAll(getLogFiles());
        files.addAll(getArchiveFiles());

        return files;
    }

    public String getWctTargetInstanceID() {
        return wctTargetInstanceID;
    }

    public void setWctTargetInstanceID(String targetInstanceID) {
        this.wctTargetInstanceID = targetInstanceID;
    }

    public void setILSReference(String ilsReference) {
        this.ilsReference = ilsReference;
    }

    public String getIeEntityType() {
        return ieEntityType;
    }

    public void setIeEntityType(String ieEntityType) {
        this.ieEntityType = ieEntityType;
    }

    public DublinCore getAdditionalDublinCoreElements() {
        return additionalDublinCoreElements;
    }

    public void setAdditionalDCTermElement(String name, String value) {
        setAdditionalDublinCoreElement(DublinCore.DCTERMS_NAMESPACE, name, value);
    }

    public void setAdditionalDCElement(String name, String value) {
        setAdditionalDublinCoreElement(DublinCore.DC_NAMESPACE, name, value);
    }

    private void setAdditionalDublinCoreElement(int namespace, String name, String value) {
        if (name == null || value == null)
            return;
        if (additionalDublinCoreElements == null)
            additionalDublinCoreElements = DublinCoreFactory.getInstance().createDocument();
        additionalDublinCoreElements.addElement(namespace, name, value);
    }

    public HarvestType getHarvestType() {
        return harvestType;
    }

    public void setHarvestType(HarvestType harvestType) {
        this.harvestType = harvestType;
    }

    public void cleanUpCdxFile() {
        String errorText = "WCT Target " + wctTargetInstanceID + ": Error deleting the arc index file ";// + file.getAbsolutePath() + " - it needs to be manually deleted";
        try {
            String indexFilePath = ((FileSystemArchiveFile) arcIndex).generateFilePath();
            errorText = errorText + indexFilePath + " - ";
            File indexFile = new File(indexFilePath);
            if (indexFile.exists()) {
                boolean status = indexFile.delete();
                if (status == true) {
                    log.info("WCT Target " + wctTargetInstanceID + ": Arc Index file " + indexFilePath
                            + " has been deleted successfully");
                } else {
                    log.error(errorText);
                }
            } else {
                log.error(errorText + " file does not exit");
            }
        } catch (RuntimeException ex) {
            log.error(errorText, ex);
        }
    }

    private InputStream populateInputStreamFrom(byte[] wctMets) {
        InputStream inputStream = new ByteArrayInputStream(wctMets.clone());
        return inputStream;
    }

    private void populateSeedUrlsFrom(Document doc, XPath xpath) throws XPathExpressionException {
        NodeList seedNodes = (NodeList) xpath.evaluate(seedUrlsQuery, doc, XPathConstants.NODESET);
        //int l = seedNodes.getLength();
        for (int i = 0; i < seedNodes.getLength(); i++) {
            Node node = seedNodes.item(i);
            if (node == null)
                continue;
            NodeList childNodes = node.getChildNodes();
            if (childNodes.getLength() <= 0)
                continue;
            String url = null;
            String type = null;
            for (int j = 0; j < childNodes.getLength(); j++) {
                Node childNode = childNodes.item(j);
                String nodeName = childNode.getLocalName();
                if ("SeedURL".equals(nodeName))
                    url = childNode.getTextContent();
                else if ("SeedType".equals(nodeName))
                    type = childNode.getTextContent();
            }
            SeedUrl.Type typeAsEnum;
            if (url != null) {
                try {
                    typeAsEnum = SeedUrl.Type.valueOf(type);
                } catch (RuntimeException e) {
                    typeAsEnum = SeedUrl.Type.Primary;
                }
                SeedUrl seedUrl = new SeedUrl(url, typeAsEnum);
                seedUrls.add(seedUrl);
            }
        }
    }

    private void popualteHarvestDate(Document doc, XPath xpath) throws XPathExpressionException {
        harvestDate = (String) xpath.evaluate(harvestDateQuery, doc, XPathConstants.STRING);
    }

    private void populateTargetName(Document doc, XPath xpath) throws XPathExpressionException {
        targetName = (String) xpath.evaluate(targetNameQuery, doc, XPathConstants.STRING);
    }

    private void populateCreatedBy(Document doc, XPath xpath) throws XPathExpressionException {
        createdBy = (String) xpath.evaluate(createdByQuery, doc, XPathConstants.STRING);
    }

    private void populateCreationDate(Document doc, XPath xpath) throws XPathExpressionException {
        creationDate = (String) xpath.evaluate(creationDateQuery, doc, XPathConstants.STRING);
    }

    private void populateProvenanceNote(Document doc, XPath xpath) throws XPathExpressionException {
        provenanceNote = (String) xpath.evaluate(provenanceNoteQuery, doc, XPathConstants.STRING);
    }

    private void populateCopyrightStatement(Document doc, XPath xpath) throws XPathExpressionException {
        copyrightStatement = (String) xpath.evaluate(copyrightStatementQuery, doc, XPathConstants.STRING);
    }

    private void populateCopyrightURL(Document doc, XPath xpath) throws XPathExpressionException {
        copyrightURL = (String) xpath.evaluate(copyrightURLQuery, doc, XPathConstants.STRING);
    }

    private void populateAccessRestrictions(Document doc, XPath xpath) throws XPathExpressionException {
        accessRestriction = (String) xpath.evaluate(accessRestrictionQuery, doc, XPathConstants.STRING);
    }

    private void populateArchiveFiles(Document doc, XPath xpath, FileArchiveBuilder fileBuilder)
            throws XPathExpressionException {
        populateFileValueObjectCollectionFrom(doc, xpath, metsArchiveFilesQuery, archiveFiles, fileBuilder);
    }

    private void populateLogFiles(Document doc, XPath xpath, FileArchiveBuilder fileBuilder)
            throws XPathExpressionException {
        populateFileValueObjectCollectionFrom(doc, xpath, metsLogFilesQuery, logFiles, fileBuilder);
    }

    private void populateReportFiles(Document doc, XPath xpath, FileArchiveBuilder fileBuilder)
            throws XPathExpressionException {
        populateFileValueObjectCollectionFrom(doc, xpath, metsReportFilesQuery, reportFiles, fileBuilder);
    }

    private void populateHomeDirectoryFiles(Document doc, XPath xpath, FileArchiveBuilder fileBuilder)
            throws XPathExpressionException {
        populateFileValueObjectCollectionFrom(doc, xpath, metsHomeDirectoryFilesQuery, homeDirectoryFiles,
                fileBuilder);
    }

    private void populateFileValueObjectCollectionFrom(Document doc, XPath xpath, String nodeQuery,
            List<ArchiveFile> fileCollection, FileArchiveBuilder fileBuilder) throws XPathExpressionException {
        NodeList seedNodes = (NodeList) xpath.evaluate(nodeQuery, doc, XPathConstants.NODESET);
        for (int i = 0; i < seedNodes.getLength(); i++) {
            Node metsFileNode = seedNodes.item(i);
            ArchiveFile af = populateFileValueObjectFrom(xpath, metsFileNode, fileBuilder);
            fileCollection.add(af);
        }
    }

    private ArchiveFile populateFileValueObjectFrom(XPath xpath, Node metsFileNode, FileArchiveBuilder fileBuilder)
            throws XPathExpressionException {
        String mimeType = (String) xpath.evaluate("@MIMETYPE", metsFileNode, XPathConstants.STRING);
        String checkSum = (String) xpath.evaluate("@CHECKSUM", metsFileNode, XPathConstants.STRING);
        String fileLocation = (String) xpath.evaluate("mets:FLocat/@xlink:href", metsFileNode,
                XPathConstants.STRING);
        String fileName = getFileName(fileLocation);
        return fileBuilder.createFileFrom(mimeType, checkSum, fileName);
    }

    private String getFileName(String xmlLocation) {
        if (!xmlLocation.startsWith(ARC_FILE_LOCATION_PREFIX))
            throw new RuntimeException("Arc file location was expected to begin with " + ARC_FILE_LOCATION_PREFIX
                    + ", instead: " + xmlLocation);

        return xmlLocation.substring(ARC_FILE_LOCATION_PREFIX.length());
    }

    private Document createXmlDocumentFrom(InputStream wctMets) {
        try {
            DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
            docFactory.setNamespaceAware(true);
            DocumentBuilder builder = docFactory.newDocumentBuilder();
            return builder.parse(wctMets);
        } catch (ParserConfigurationException pce) {
            throw new RuntimeException(pce);
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        } catch (SAXException se) {
            throw new RuntimeException(se);
        }
    }

    private void populateWctMets(InputStream mets, String fileName) {
        //String wctMetsDigest = FixityUtils.calculateMD5(mets);
        String fileLocation = fileName;
        this.wctMetsFile = new InputStreamArchiveFile(XML_MIME_TYPE, fileLocation, mets);
    }

    protected String escapeXml(byte[] buff) {
        String xml = new String(buff);
        return xml.replaceAll("&", "&amp;");
    }

}