org.nuclos.server.common.ooxml.WordXMLReader.java Source code

Introduction

Here is the source code for org.nuclos.server.common.ooxml.WordXMLReader.java
Source

//Copyright (C) 2010  Novabit Informationssysteme GmbH
//
//This file is part of Nuclos.
//
//Nuclos is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Nuclos is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Nuclos.  If not, see <http://www.gnu.org/licenses/>.

package org.nuclos.server.common.ooxml;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;

import org.apache.log4j.Logger;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * Helper class providing simple read-only access for examining OOXML wordprocessing documents.
 *
 * <p>Implementation Note: This API is based on Apache POI 3.6 which only covers a very small subset
 * for "common use cases". In particular, structured document tags are not supported by POI 3.6 directly.
 * However, it is possible to access the underlying XML structure directly using precompiled XMLBeans
 * (packages starting with org.openxmlformats.schemas contain the XML Schema Definition (XSD) compiled
 * as XMLBeans). For details about the XML structure, see the ECMA-376 specification (in particular
 * [ECMA-376,2nd], part 1, 17.5.2).
 *
 * <p>The structured document tags are extracted lazily on first access and cached afterwards.
 */
public class WordXMLReader {

    private static final Logger LOG = Logger.getLogger(WordXMLReader.class);

    /** WordprocessingML namespace URI. */
    private static final String WORDPROCESSINGML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    /** Namespace declaration for XMLBeans path selection. */
    private static final String DECLARE_NS_PREFIX = "declare namespace w='" + WORDPROCESSINGML_NS + "' ";

    /**
     * Path expressions of the structured document tag types which are recognized but not supported.
     * The {@code w:} prefix is required: an unprefixed name would select elements in no namespace
     * and therefore never match a WordprocessingML element.
     */
    private static final String[] UNSUPPORTED_SDT_TYPE_PATHS = {
        "w:equation", "w:docPartObj", "w:docPartList", "w:picture",
        "w:citation", "w:group", "w:bibliography"
    };

    private final XWPFDocument document;

    /** Lazily built cache, see {@link #structuredDocumentTags()}; {@code null} until first use. */
    private List<StructuredDocumentTag> structuredDocumentTags;

    /**
     * Creates a reader for the OOXML wordprocessing document read from the given stream.
     *
     * @param is stream providing the document
     * @throws IOException if the document cannot be read
     */
    public WordXMLReader(InputStream is) throws IOException {
        this(new XWPFDocument(is));
    }

    /**
     * Creates a reader for an already loaded document.
     *
     * @param document the document to examine
     */
    public WordXMLReader(XWPFDocument document) {
        this.document = document;
    }

    /**
     * Returns the plain text of the document as extracted by POI's {@link XWPFWordExtractor}.
     */
    public String getText() {
        // Note: in POI 3.6, text extraction does not always work correctly
        return new XWPFWordExtractor(document).getText();
    }

    /**
     * Returns a map with the text content of the structured document tags
     * contained in this document (cf. [ECMA-376,2nd], 17.5.2) with their
     * tag name (17.5.2.42) as key.  Supported tags are comboBox, date,
     * dropDownList, richText and text.
     * <p>
     * If the structured document tag is not filled, i.e. marked as
     * placeholder (17.5.2.25), the text content is {@code null}.
     * If the structured document tag does not provide a tag name, the
     * alias (aka friendly name, 17.5.2.1) is used as key instead. If both
     * are omitted, the structured document tag is skipped. If several
     * structured document tags share the same key, the first one found wins.
     *
     * @return a new map from tag name (or alias) to text content
     */
    public Map<String, String> getStructuredDocumentTagTexts() {
        Map<String, String> tags = new HashMap<String, String>();
        for (StructuredDocumentTag sdt : structuredDocumentTags()) {
            String name = (sdt.tagName != null) ? sdt.tagName : sdt.alias;
            // containsKey (not get) because a stored null is a legitimate first entry
            if (name != null && !tags.containsKey(name)) {
                tags.put(name, sdt.text);
            }
        }
        return tags;
    }

    /**
     * Similar to {@link #getStructuredDocumentTagTexts()}, but returns
     * prepared values.
     * <p>
     * For combobox (17.5.2.5) and drop-down (17.5.2.15) elements,
     * the text content (=display text) is resolved against the given
     * list items (17.5.2.21/22) and replaced with its associated value.
     * For date elements (17.5.2.7), a {@link java.util.Date} object based
     * on the cached full-date is returned ({@code toString} returns the
     * original string). If no (valid) full-date is stored, the original
     * string object is returned.
     *
     * @return a new map from tag name (or alias) to prepared value;
     *         the value is {@code null} for placeholders
     */
    public Map<String, Object> getStructuredDocumentTagValues() {
        Map<String, Object> tags = new HashMap<String, Object>();
        for (StructuredDocumentTag sdt : structuredDocumentTags()) {
            String name = (sdt.tagName != null) ? sdt.tagName : sdt.alias;
            // containsKey (not get) because a stored null is a legitimate first entry
            if (name != null && !tags.containsKey(name)) {
                tags.put(name, sdt.value);
            }
        }
        return tags;
    }

    /**
     * Returns the structured document tags of the document, extracting them on first use.
     * Tags found in paragraphs are listed before those found in tables.
     */
    private List<StructuredDocumentTag> structuredDocumentTags() {
        if (structuredDocumentTags == null) {
            // Collect into a local list and publish it only when extraction completed,
            // so that a failure does not leave a partially filled cache behind.
            List<StructuredDocumentTag> collected = new ArrayList<StructuredDocumentTag>();

            // TODO: tables, ideally via getBodyElements(), see JavaDoc...
            for (XWPFParagraph p : document.getParagraphs()) {
                extractStructuredDocumentTags(p.getCTP().getSdtArray(), collected);
            }

            Iterator<XWPFTable> tableIter = document.getTablesIterator();
            while (tableIter.hasNext()) {
                extractStructuredDocumentTags(tableIter.next(), collected);
            }

            structuredDocumentTags = collected;
        }
        return structuredDocumentTags;
    }

    /**
     * Collects the structured document tags of every cell of the given table: block-level tags
     * directly inside the cell as well as run-level tags inside the cell's paragraphs.
     */
    private static void extractStructuredDocumentTags(XWPFTable t, List<StructuredDocumentTag> target) {
        CTTbl table = t.getCTTbl();
        for (CTRow row : table.getTrArray()) {
            for (CTTc cell : row.getTcArray()) {
                extractStructuredDocumentTags(cell.getSdtArray(), target);
                for (CTP ctp : cell.getPArray()) {
                    extractStructuredDocumentTags(ctp.getSdtArray(), target);
                }
            }
        }
    }

    /** Collects run-level structured document tags; their text is the text of the contained runs. */
    private static void extractStructuredDocumentTags(CTSdtRun[] sdtRuns, List<StructuredDocumentTag> target) {
        for (CTSdtRun sdtRun : sdtRuns) {
            CTSdtContentRun sdtContent = sdtRun.getSdtContent();
            // The content element may be absent; treat that as empty text instead of failing
            String text = (sdtContent != null) ? getRText(sdtContent.getRArray()) : "";
            createStructuredDocumentTag(sdtRun.getSdtPr(), text, target);
        }
    }

    /** Collects block-level structured document tags; their text is the text of the contained paragraphs. */
    private static void extractStructuredDocumentTags(CTSdtBlock[] sdtBlocks, List<StructuredDocumentTag> target) {
        for (CTSdtBlock sdtBlock : sdtBlocks) {
            CTSdtContentBlock sdtContent = sdtBlock.getSdtContent();
            // The content element may be absent; treat that as empty text instead of failing
            String text = (sdtContent != null) ? getPText(sdtContent.getPArray()) : "";
            createStructuredDocumentTag(sdtBlock.getSdtPr(), text, target);
        }
    }

    /**
     * Creates the {@link StructuredDocumentTag} for one {@code <w:sdt>} element and appends it to
     * {@code target}, unless the element has no properties or is of an unsupported type.
     * <p>
     * A {@code <w:sdt>} element contains 2 child elements: {@code <w:sdtPr>} for the properties,
     * and {@code <w:sdtContent>} for content (here text).
     *
     * @param sdtPr the properties element, may be {@code null}
     * @param text the text extracted from the content element, never {@code null}
     * @param target the list receiving the created tag
     */
    private static void createStructuredDocumentTag(CTSdtPr sdtPr, String text,
            List<StructuredDocumentTag> target) {
        if (sdtPr == null) {
            // Without properties there is neither a tag name nor an alias,
            // so the element could not be addressed by the public getters anyway.
            return;
        }

        // The properties contain (among others) aliases (<w:alias>), tag names (w:tag)
        // and a flag (<w:showingPlcHdr>) whether the content is placeholder or real content.
        String alias = getCTStringVal(getFirst(sdtPr.getAliasArray()));
        String tagName = getCTStringVal(getFirst(sdtPr.getTagArray()));
        boolean isPlaceholder = sdtPr.getShowingPlcHdrArray().length > 0;

        Object value = null;
        // If placeholder is set, the element is not filled by the user
        if (!isPlaceholder) {
            value = text;

            // The following child element can occur and determine the type of the structured
            // document tag: equation, comboBox (*), date (*), docPartObj, docPartList,
            // dropDownList (*), picture, richText (*), text (*), citation, group, bibliography.

            // Note that we can't use the typed method (e.g sdtPr.getComboBoxArray()) here
            // because in the small (poi-)ooxml-schemas.jar bundled with POI, the specialized
            // classes (e.g. CTStdComboBox) are missing. Trying to use these methods will fail
            // with a NoClassDefFoundError exception (cf. POI FAQ).
            // But we can work with the plain XmlObjects or DOM nodes, if we extract them by
            // a generic path expression.
            Element sdtType;
            if (getFirstAsDomElement(sdtPr, "w:text") != null
                    || getFirstAsDomElement(sdtPr, "w:richText") != null) {
                // Value is the text (in the case of richText without formatting)
            } else if ((sdtType = getFirstAsDomElement(sdtPr, "w:date")) != null) {
                value = toDateValue(sdtType, text);
            } else if ((sdtType = getFirstAsDomElement(sdtPr, "w:comboBox")) != null
                    || (sdtType = getFirstAsDomElement(sdtPr, "w:dropDownList")) != null) {
                value = resolveListItemValue(sdtType, text);
            } else if (isUnsupportedType(sdtPr)) {
                // ignore (unsupported type)
                return;
            }
            // else: type is unspecified, treat as text
        }

        // As documented on the public API, a placeholder has neither a value nor a text
        target.add(new StructuredDocumentTag(alias, tagName, value, isPlaceholder ? null : text));
    }

    /**
     * Converts the content of a date element (17.5.2.7) into a {@link Date} whose
     * {@code toString} yields the displayed text.
     *
     * @param dateElement the {@code <w:date>} element
     * @param text the displayed text
     * @return the date, or {@code text} itself if no usable full-date is stored
     */
    private static Object toDateValue(Element dateElement, String text) {
        // 17.5.2.7: fullDate contains the "full date and time last entered"
        // in XML Schema DateTime syntax
        String fullDate = dateElement.getAttributeNS(WORDPROCESSINGML_NS, "fullDate");
        // DOM reports a missing attribute as empty string, not as null
        if (fullDate == null || fullDate.length() == 0) {
            return text;
        }
        try {
            XMLGregorianCalendar calendar = DatatypeFactory.newInstance().newXMLGregorianCalendar(fullDate);
            long timeMillis = calendar.toGregorianCalendar(null, null, null).getTimeInMillis();
            return new DisplayTextDate(timeMillis, text);
        } catch (DatatypeConfigurationException e) {
            LOG.warn("createStructuredDocumentTag failed: " + e, e);
        } catch (IllegalArgumentException e) {
            // fullDate is not a valid lexical representation; fall back to the displayed text
            LOG.warn("createStructuredDocumentTag failed: invalid fullDate '" + fullDate + "': " + e, e);
        }
        return text;
    }

    /**
     * Resolves the displayed text of a comboBox (17.5.2.5) or dropDownList (17.5.2.15) against
     * its list items.
     *
     * @param listElement the {@code <w:comboBox>} or {@code <w:dropDownList>} element
     * @param text the displayed text
     * @return the value of the first list item whose display text equals {@code text},
     *         or {@code text} itself if there is no such item
     */
    private static String resolveListItemValue(Element listElement, String text) {
        NodeList listItems = listElement.getElementsByTagNameNS(WORDPROCESSINGML_NS, "listItem");
        for (int i = 0, n = listItems.getLength(); i < n; i++) {
            Element listItem = (Element) listItems.item(i);
            String displayText = listItem.getAttributeNS(WORDPROCESSINGML_NS, "displayText");
            if (text.equals(displayText)) {
                return listItem.getAttributeNS(WORDPROCESSINGML_NS, "value");
            }
        }
        return text;
    }

    /** Tests whether the properties declare one of the types in {@link #UNSUPPORTED_SDT_TYPE_PATHS}. */
    private static boolean isUnsupportedType(CTSdtPr sdtPr) {
        for (String path : UNSUPPORTED_SDT_TYPE_PATHS) {
            if (getFirstAsDomElement(sdtPr, path) != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Evaluates the path expression (which may use the {@code w:} prefix) relative to the given
     * object and returns the first match as DOM element, or {@code null} if nothing matches.
     */
    private static Element getFirstAsDomElement(XmlObject xmlObject, String path) {
        XmlObject[] children = xmlObject.selectPath(DECLARE_NS_PREFIX + path);
        if (children.length >= 1) {
            return (Element) children[0].getDomNode();
        }
        return null;
    }

    /** Returns the concatenated run text of the given paragraphs (no separator is inserted). */
    private static String getPText(CTP... ps) {
        StringBuilder sb = new StringBuilder();
        for (CTP p : ps) {
            sb.append(getRText(p.getRArray()));
        }
        return sb.toString();
    }

    /**
     * Returns the concatenated text of the given runs; tab elements are rendered as {@code "\t"}.
     */
    private static String getRText(CTR[] rs) {
        // This method is inspired by the text extraction algorithm in the XWPFParagraph constructor
        StringBuilder sb = new StringBuilder();
        for (CTR r : rs) {
            XmlCursor c = r.newCursor();
            try {
                c.selectPath("./*");
                while (c.toNextSelection()) {
                    XmlObject o = c.getObject();
                    if (o instanceof CTText) {
                        sb.append(((CTText) o).getStringValue());
                    }
                    if (o instanceof CTPTab) {
                        sb.append("\t");
                    }
                }
            } finally {
                // Cursors should be released explicitly once they are no longer needed
                c.dispose();
            }
        }
        return sb.toString();
    }

    /** Returns the {@code val} of the given element, or {@code null} if the element is {@code null}. */
    private static String getCTStringVal(CTString cts) {
        return (cts != null) ? cts.getVal() : null;
    }

    /** Returns the first element of the array, or {@code null} if the array is empty. */
    private static <T> T getFirst(T[] array) {
        return array.length >= 1 ? array[0] : null;
    }

    /**
     * A {@link Date} whose {@code toString} returns the text displayed in the document.
     * <p>
     * This is a static nested class on purpose: an anonymous subclass created inside an instance
     * method would keep a reference to the enclosing reader (and thereby to the whole document).
     */
    private static final class DisplayTextDate extends Date {

        private static final long serialVersionUID = 1L;

        private final String displayText;

        DisplayTextDate(long timeMillis, String displayText) {
            super(timeMillis);
            this.displayText = displayText;
        }

        @Override
        public String toString() {
            return displayText;
        }
    }

    /** Immutable holder for the extracted data of one structured document tag. */
    private static class StructuredDocumentTag {

        /** Alias (friendly name), may be {@code null}. */
        final String alias;
        /** Tag name, may be {@code null}. */
        final String tagName;
        /** Prepared value; {@code null} if the tag shows its placeholder. */
        final Object value;
        /** Text content; {@code null} if the tag shows its placeholder. */
        final String text;

        StructuredDocumentTag(String alias, String tagName, Object value, String text) {
            this.alias = alias;
            this.tagName = tagName;
            this.value = value;
            this.text = text;
        }
    }
}