net.sourceforge.dita4publishers.word2dita.Word2DitaValidationHelper.java Source code

Java tutorial

Introduction

Here is the source code for net.sourceforge.dita4publishers.word2dita.Word2DitaValidationHelper.java

Source

/**
 * Copyright (c) 2010 DITA for Publishers. Licensed under Apache License 2. 
 * See license files for details.
 */
package net.sourceforge.dita4publishers.word2dita;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import net.sourceforge.dita4publishers.api.bos.BosMemberValidationException;
import net.sourceforge.dita4publishers.impl.bos.BosConstructionOptions;
import net.sourceforge.dita4publishers.util.DataUtil;
import net.sourceforge.dita4publishers.util.DomException;
import net.sourceforge.dita4publishers.util.DomUtil;
import net.sourceforge.dita4publishers.util.SaxUtil;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * Helper class to validate the XML generated from Word and push
 * validation messages back into the Word document.
 */
public class Word2DitaValidationHelper {

    /** Namespace URI registered under the "w" prefix in {@code DocxConstants.nsByPrefix}. */
    public static final String wNs = DocxConstants.nsByPrefix.get("w");

    /**
     * Formatter used to produce the w:date attribute value of generated comments.
     * NOTE(review): java.text.SimpleDateFormat is documented as not synchronized and this
     * instance is shared, public and non-final; concurrent callers would need external
     * synchronization.
     */
    public static SimpleDateFormat timestampFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH':'mm':'ssZ");

    /** Logger for this class; also passed to the BOS construction options and the validating XML reader. */
    public static final Log log = LogFactory.getLog(Word2DitaValidationHelper.class);

    /**
     * Adds one Word comment for each "message" element in the log document: the comment
     * itself is appended to the comments DOM and a matching comment reference is appended
     * to the paragraph selected by the message's "wordParaXPath" attribute. Messages with
     * no (or a blank) XPath are attached to the first paragraph of the document body.
     * <p>
     * New comment IDs start at the number of existing comments, or one past the highest
     * numeric w:id already present if that is larger, so that generated IDs cannot
     * duplicate an existing comment's ID when the existing IDs are not contiguous.
     *
     * @param logDoc DOM holding the validation messages.
     * @param documentDom DOM of the DOCX document.xml; receives the comment references.
     * @param commentsDom DOM of the DOCX comments.xml; receives the comments.
     * @param commentTemplate Template w:comment element that is copied for each message.
     * @throws XPathExpressionException If a paragraph XPath cannot be evaluated.
     */
    static void addMessagesToDocxXml(Document logDoc, Document documentDom, Document commentsDom,
            Element commentTemplate) throws XPathExpressionException {
        NodeList messagesNl = logDoc.getDocumentElement().getElementsByTagName("message");

        // Pick the first free comment ID once, before any new comments are appended.
        NodeList existingComments = commentsDom.getDocumentElement().getElementsByTagNameNS(wNs, "comment");
        int nextCommentId = existingComments.getLength();
        for (int i = 0; i < existingComments.getLength(); i++) {
            String existingId = ((Element) existingComments.item(i)).getAttributeNS(wNs, "id");
            try {
                int idValue = Integer.parseInt(existingId == null ? "" : existingId.trim());
                if (idValue >= nextCommentId) {
                    nextCommentId = idValue + 1;
                }
            } catch (NumberFormatException ignored) {
                // Missing or non-numeric ID: it cannot clash with the numeric IDs generated here.
            }
        }

        for (int i = 0; i < messagesNl.getLength(); i++) {
            Element message = (Element) messagesNl.item(i);
            String commentId = String.valueOf(nextCommentId++);

            addCommentToComments(commentsDom, commentTemplate, message.getTextContent(), commentId);

            String xpath = message.getAttribute("wordParaXPath");
            if (xpath == null || "".equals(xpath.trim())) {
                // No source paragraph recorded for this message: use the first body paragraph.
                xpath = "/w:document/w:body[1]/w:p[1]";
            }

            addCommentRefToParaForXPath(documentDom, commentId, xpath);
        }
    }

    /**
     * Given a set of validation messages and a DOCX file to which those messages apply,
     * creates a Word comment for each message, attached either to the paragraph the
     * message points to (by XPath) or to the first paragraph of the document if there
     * is no XPath for the message. The result is written to a new DOCX file; the input
     * zip file is closed before this method returns, whether or not it succeeds.
     *
     * @param docxFile The DOCX file to be updated.
     * @param newDocxFile New DOCX file that will be a copy of the input DOCX with comments added.
     * @param logDoc The messages document as a DOM.
     * @throws ZipException If the DOCX file cannot be opened as a zip file.
     * @throws IOException If reading or writing fails.
     * @throws BosMemberValidationException
     * @throws DomException
     * @throws Exception
     * @throws XPathExpressionException If a message's paragraph XPath cannot be evaluated.
     * @throws FileNotFoundException If the comments template resource cannot be found, or a file cannot be opened.
     */
    public static void addValidationMessagesToDocxFile(File docxFile, File newDocxFile, Document logDoc)
            throws ZipException, IOException, BosMemberValidationException, DomException, Exception,
            XPathExpressionException, FileNotFoundException {
        Map<URI, Document> domCache = new HashMap<URI, Document>();
        BosConstructionOptions bosOptions = new BosConstructionOptions(log, domCache);
        bosOptions.setCatalogs(new String[0]);

        // Resolve the comments template before opening the zip so a missing resource
        // is reported clearly instead of as a NullPointerException later on.
        URL commentsTemplateUrl = DocxConstants.class.getResource("resources/comments.xml");
        if (commentsTemplateUrl == null) {
            throw new FileNotFoundException("Comments template resource \"resources/comments.xml\" not found");
        }

        ZipFile docxZip = new ZipFile(docxFile);
        try {
            ZipComponents zipComponents = new ZipComponents(docxZip);
            ZipComponent documentXml = zipComponents.getEntry(DocxConstants.DOCUMENT_XML_PATH);

            Element commentTemplate = Word2DitaValidationHelper.getCommentTemplate(commentsTemplateUrl,
                    bosOptions);
            Document commentsDom = Word2DitaValidationHelper.getCommentsDom(bosOptions, zipComponents,
                    commentsTemplateUrl);

            Document documentDom = zipComponents.getDomForZipComponent(bosOptions,
                    DocxConstants.DOCUMENT_XML_PATH);
            addMessagesToDocxXml(logDoc, documentDom, commentsDom, commentTemplate);
            Word2DitaValidationHelper.saveDomToZipComponent(documentDom, documentXml);

            ZipComponent comments = zipComponents.getEntry(DocxConstants.COMMENTS_XML_PATH);
            if (comments == null) {
                comments = zipComponents.createZipComponent(DocxConstants.COMMENTS_XML_PATH);
            }
            Word2DitaValidationHelper.saveDomToZipComponent(commentsDom, comments);

            Word2DitaValidationHelper.addCommentFileRelationship(zipComponents, bosOptions);
            Word2DitaValidationHelper.addCommentFileContentType(zipComponents, bosOptions);

            // Must run while the source zip is still open.
            Word2DitaValidationHelper.saveZipComponents(zipComponents, newDocxFile);
        } finally {
            docxZip.close();
        }
    }

    /**
     * Parses the comments template document and returns its first w:comment element,
     * for use as the pattern from which new comments are copied. The stream opened on
     * the template URL is closed before returning.
     *
     * @param commentsUrl URL of the comments template XML document.
     * @param bosOptions Options passed to the DOM loader.
     * @return The first w:comment element in the template, or null if the template has none.
     * @throws IOException If the template cannot be opened.
     * @throws DomException
     * @throws BosMemberValidationException
     */
    static Element getCommentTemplate(URL commentsUrl, BosConstructionOptions bosOptions)
            throws IOException, BosMemberValidationException, DomException {
        InputStream templateStream = commentsUrl.openStream();
        try {
            InputSource commentsTemplateXmlSource = new InputSource(templateStream);
            commentsTemplateXmlSource.setSystemId(DocxConstants.COMMENTS_XML_PATH);
            Document commentsTemplateDom = DomUtil.getDomForSource(commentsTemplateXmlSource, bosOptions, false,
                    false);
            NodeList comments = commentsTemplateDom.getDocumentElement().getElementsByTagNameNS(wNs, "comment");
            return (Element) comments.item(0);
        } finally {
            // The DOM is fully built at this point, so the stream is no longer needed.
            IOUtils.closeQuietly(templateStream);
        }
    }

    /**
     * Returns the DOM for the DOCX comments.xml component. If the package has no
     * comments component, one is created from the comments template with every
     * w:comment element from the template removed, so the result starts out empty.
     *
     * @param bosOptions Options passed to the DOM loader.
     * @param zipComponents Components of the DOCX package.
     * @param commentsTemplateUrl URL of the comments template XML document.
     * @return The comments DOM, either loaded from the package or newly created from the template.
     * @throws Exception
     */
    static Document getCommentsDom(BosConstructionOptions bosOptions, ZipComponents zipComponents,
            URL commentsTemplateUrl) throws Exception {
        ZipComponent commentsXml = zipComponents.getEntry(DocxConstants.COMMENTS_XML_PATH);
        if (commentsXml != null) {
            return zipComponents.getDomForZipComponent(bosOptions, DocxConstants.COMMENTS_XML_PATH);
        }

        System.err.println("No comments.xml file");
        // NOTE(review): the component is created here and again (with its DOM) below, as in the
        // original code; the first call looks redundant but is kept to preserve behavior.
        zipComponents.createZipComponent(DocxConstants.COMMENTS_XML_PATH);

        // Use the template as the base for the new comments.xml DOM:
        Document commentsDom;
        InputStream templateStream = commentsTemplateUrl.openStream();
        try {
            InputSource templateSource = new InputSource(templateStream);
            templateSource.setSystemId(commentsTemplateUrl.toExternalForm());
            commentsDom = DomUtil.getDomForSource(templateSource, bosOptions, false, false);
        } finally {
            IOUtils.closeQuietly(templateStream);
        }

        // Remove any comments that were in the template. The NodeList is live, so walk it
        // backwards: removing while walking forwards would skip every other element.
        NodeList comments = commentsDom.getDocumentElement().getElementsByTagNameNS(wNs, "comment");
        for (int i = comments.getLength() - 1; i >= 0; i--) {
            Node comment = comments.item(i);
            // Remove via the actual parent; the element need not be a direct child of the root.
            comment.getParentNode().removeChild(comment);
        }
        zipComponents.createZipComponent(DocxConstants.COMMENTS_XML_PATH, commentsDom);
        return commentsDom;
    }

    /**
     * Sets the given DOM on the given zip component.
     *
     * @param doc DOM to store on the component.
     * @param zipComponent Component that receives the DOM; must not be null.
     * @throws IOException If zipComponent is null.
     * @throws Exception
     */
    static void saveDomToZipComponent(Document doc, ZipComponent zipComponent) throws IOException, Exception {
        if (zipComponent != null) {
            zipComponent.setDom(doc);
            return;
        }
        throw new IOException("zipComponent is null");
    }

    /**
     * Appends a copy of the comment template to the root of the comments DOM, stamps it
     * with the given ID, a fixed author and initials and the current time, and puts the
     * message text into the w:t element of the last w:r of the comment's w:p element.
     *
     * @param commentsDom DOM of comments.xml that receives the new comment.
     * @param commentTemplate Template w:comment element to copy (deep import).
     * @param messageText Text of the comment.
     * @param commentId Value for the comment's w:id attribute.
     */
    static void addCommentToComments(Document commentsDom, Element commentTemplate, String messageText,
            String commentId) {
        Element newComment = (Element) commentsDom.importNode(commentTemplate, true);
        commentsDom.getDocumentElement().appendChild(newComment);

        // Identification attributes of the new comment.
        newComment.setAttributeNS(wNs, "w:id", commentId);
        newComment.setAttributeNS(wNs, "w:author", "XML Validator");
        newComment.setAttributeNS(wNs, "w:initials", "XMLVal");
        String timestamp = timestampFormatter.format(Calendar.getInstance().getTime());
        newComment.setAttributeNS(wNs, "w:date", timestamp);

        // The message goes into the text element of the paragraph's last run.
        Element paragraph = DataUtil.getElementNS(newComment, wNs, "p");
        NodeList runs = paragraph.getElementsByTagNameNS(wNs, "r");
        Element lastRun = (Element) runs.item(runs.getLength() - 1);
        DataUtil.getElementNS(lastRun, wNs, "t").setTextContent(messageText);
    }

    /**
     * Evaluates an XPath expression against the document DOM, using the DOCX namespace
     * context, and returns the single node it selects.
     *
     * @param documentDom DOM to evaluate the expression against.
     * @param xpath XPath expression to evaluate.
     * @return The selected node, or null if the evaluation produced no node.
     * @throws XPathExpressionException If the expression cannot be evaluated.
     */
    static Node getWordParaForXPath(Document documentDom, String xpath) throws XPathExpressionException {
        XPath evaluator = DomUtil.getXPathFactory().newXPath();
        evaluator.setNamespaceContext(DocxConstants.docxNamespaceContext);
        // Casting a null result is harmless, so no explicit null test is needed.
        return (Node) evaluator.evaluate(xpath, documentDom, XPathConstants.NODE);
    }

    /**
     * Appends a comment-reference run for the given comment ID to the element selected
     * by the given XPath in the document DOM.
     *
     * @param documentDom DOM of the DOCX document.xml.
     * @param commentId ID of the comment being referenced.
     * @param xpath XPath selecting the paragraph element to which the reference is appended.
     * @throws XPathExpressionException If the XPath cannot be evaluated, or if it does not
     *         select an element (previously this surfaced as a NullPointerException or
     *         ClassCastException with no indication of the offending XPath).
     */
    static void addCommentRefToParaForXPath(Document documentDom, String commentId, String xpath)
            throws XPathExpressionException {
        /*
         * Markup being generated:
         *
         * <w:r>
         *   <w:rPr>
         *     <w:rStyle w:val="CommentReference"/>
         *   </w:rPr>
         *   <w:commentReference w:id="14"/>
         * </w:r>
         */
        Node node = getWordParaForXPath(documentDom, xpath);
        if (!(node instanceof Element)) {
            throw new XPathExpressionException("XPath \"" + xpath
                    + "\" did not select an element in the document; cannot attach reference for comment "
                    + commentId);
        }

        Element p = (Element) node;
        Element commentRef = documentDom.createElementNS(wNs, "w:r");
        Element elem = (Element) commentRef.appendChild(documentDom.createElementNS(wNs, "w:rPr"));
        elem = (Element) elem.appendChild(documentDom.createElementNS(wNs, "w:rStyle"));
        elem.setAttributeNS(wNs, "w:val", "CommentReference");
        elem = (Element) commentRef.appendChild(documentDom.createElementNS(wNs, "w:commentReference"));
        elem.setAttributeNS(wNs, "w:id", commentId);
        p.appendChild(commentRef);
    }

    /**
     * Ensures the document relationships part contains a relationship of the comments
     * type targeting "comments.xml". If one is already present nothing is changed;
     * otherwise a new Relationship element is appended and the updated DOM is set on the
     * component.
     * <p>
     * The new relationship's Id is "rId" followed by the number of existing relationships
     * plus one, advanced further if that Id is already in use, so the generated Id never
     * duplicates an existing one.
     *
     * @param zipComponents Components of the DOCX package.
     * @param bosOptions Options passed to the DOM loader.
     * @throws Exception If the relationships component is missing or cannot be loaded.
     */
    static void addCommentFileRelationship(ZipComponents zipComponents, BosConstructionOptions bosOptions)
            throws Exception {
        ZipComponent comp = zipComponents.getEntry(DocxConstants.DOCUMENT_XML_RELS_PATH);
        if (comp == null) {
            throw new IOException(
                    "DOCX package has no \"" + DocxConstants.DOCUMENT_XML_RELS_PATH + "\" component");
        }
        Document doc = zipComponents.getDomForZipComponent(bosOptions, comp);
        Element docElem = doc.getDocumentElement();
        NodeList nl = docElem.getElementsByTagNameNS(DocxConstants.RELS_NS, "Relationship");

        // Look for an existing comments relationship, recording the Ids in use on the way.
        Map<String, Element> relsById = new HashMap<String, Element>();
        for (int i = 0; i < nl.getLength(); i++) {
            Element elem = (Element) nl.item(i);
            if (DocxConstants.COMMENT_REL_TYPE.equals(elem.getAttribute("Type"))) {
                return; // Already related; nothing to add.
            }
            relsById.put(elem.getAttribute("Id"), elem);
        }

        // Existing Ids need not be contiguous, so skip any candidate that is already taken.
        int idNumber = nl.getLength() + 1;
        while (relsById.containsKey("rId" + idNumber)) {
            idNumber++;
        }

        Element elem = doc.createElementNS(DocxConstants.RELS_NS, "Relationship");
        elem.setAttribute("Type", DocxConstants.COMMENT_REL_TYPE);
        elem.setAttribute("Id", "rId" + idNumber);
        elem.setAttribute("Target", "comments.xml");
        docElem.appendChild(elem);
        comp.setDom(doc);
    }

    /**
     * Validates an XML document, capturing the messages into an XML document that includes
     * any @xtrc values pointing back into the original DOCX file from which the XML was
     * generated. The resulting document can be used to then annotate the original DOCX
     * file with messages bound to the original source paragraphs.
     * <p>
     * The input source carries the input URL as its system ID so that relative references
     * in the document can be resolved. All streams opened here are closed before returning.
     *
     * @param messageFile The file to hold the XML message log.
     * @param inputUrl The URL of the document to be validated.
     * @param catalogs List of entity resolution catalogs to be used by the parser (as for the Resolver class).
     * @return DOM document containing the log messages. Also saves the messages to the specified file.
     * @throws IOException If the input cannot be read or the message file cannot be written.
     * @throws ParserConfigurationException
     * @throws Exception
     * @throws SAXException
     * @throws FileNotFoundException If the message file cannot be opened for writing.
     */
    public static Document validateXml(File messageFile, URL inputUrl, String[] catalogs)
            throws IOException, ParserConfigurationException, Exception, SAXException, FileNotFoundException {
        Document logDoc = DomUtil.getNewDom();
        XMLReader reader = SaxUtil.getXMLFormatLoggingXMLReader(log, logDoc, true, catalogs);

        InputStream inputStream = inputUrl.openStream();
        try {
            InputSource source = new InputSource(inputStream);
            source.setSystemId(inputUrl.toExternalForm());
            reader.parse(source);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }

        InputStream logStream = DomUtil.serializeToInputStream(logDoc, "utf-8");
        System.out.println("Creating message file \"" + messageFile.getAbsolutePath() + "\"...");
        OutputStream fos = new FileOutputStream(messageFile);
        try {
            IOUtils.copy(logStream, fos);
            // Close explicitly on the success path so a failure to flush/close is reported.
            fos.close();
        } finally {
            IOUtils.closeQuietly(fos);
            IOUtils.closeQuietly(logStream);
        }
        return logDoc;
    }

    /**
     * Ensures the "[Content_Types].xml" component declares an Override for the comments
     * part. When no Override with the comments part name exists, one is appended and the
     * updated DOM is set on the component; otherwise nothing is changed.
     *
     * @param zipComponents Components of the DOCX package.
     * @param bosOptions Options passed to the DOM loader.
     * @throws Exception
     */
    public static void addCommentFileContentType(ZipComponents zipComponents, BosConstructionOptions bosOptions)
            throws Exception {

        /*
         * Element being ensured:
         *
         *   <Override
         *     PartName="/word/comments.xml"
         *     ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml"/>
         */

        final String contentTypesNs = "http://schemas.openxmlformats.org/package/2006/content-types";

        ZipComponent contentTypesComp = zipComponents.getEntry("[Content_Types].xml");
        Document contentTypesDoc = zipComponents.getDomForZipComponent(bosOptions, contentTypesComp);
        Element root = contentTypesDoc.getDocumentElement();

        NodeList overrides = root.getElementsByTagNameNS(contentTypesNs, "Override");
        for (int idx = 0; idx < overrides.getLength(); idx++) {
            Element override = (Element) overrides.item(idx);
            if (DocxConstants.COMMENTS_PARTNAME.equals(override.getAttribute("PartName"))) {
                // The comments part is already declared.
                return;
            }
        }

        Element newOverride = contentTypesDoc.createElementNS(contentTypesNs, "Override");
        newOverride.setAttribute("PartName", DocxConstants.COMMENTS_PARTNAME);
        newOverride.setAttribute("ContentType", DocxConstants.COMMENTS_CONTENT_TYPE);
        root.appendChild(newOverride);
        contentTypesComp.setDom(contentTypesDoc);
    }

    /**
     * Writes all zip components to a new zip file, one entry per component. Directory
     * components produce an entry with no content. Every stream opened here is closed
     * even when writing fails part-way through.
     *
     * @param zipComponents Components to write.
     * @param zipFile File to create (or overwrite) with the zip content.
     * @throws FileNotFoundException If the output file cannot be opened for writing.
     * @throws IOException If reading a component or writing the zip fails.
     * @throws Exception
     */
    public static void saveZipComponents(ZipComponents zipComponents, File zipFile)
            throws FileNotFoundException, IOException, Exception {
        OutputStream fileOut = new FileOutputStream(zipFile);
        ZipOutputStream zipOutStream = new ZipOutputStream(fileOut);
        try {
            for (ZipComponent comp : zipComponents.getComponents()) {
                zipOutStream.putNextEntry(new ZipEntry(comp.getName()));
                if (!comp.isDirectory()) {
                    InputStream inputStream = comp.getInputStream();
                    try {
                        IOUtils.copy(inputStream, zipOutStream);
                    } finally {
                        IOUtils.closeQuietly(inputStream);
                    }
                }
                zipOutStream.closeEntry();
            }
            // Close explicitly on the success path so a failure to finish the zip is reported.
            zipOutStream.close();
        } finally {
            // Safety net for the failure path; closing an already-closed stream is harmless.
            // The file stream is closed separately in case finishing the zip itself failed.
            IOUtils.closeQuietly(zipOutStream);
            IOUtils.closeQuietly(fileOut);
        }
    }

}