org.openflexo.docxparser.OpenXml2Html.java Source code

Java tutorial

Introduction

Here is the source code for org.openflexo.docxparser.OpenXml2Html.java

Source

/*
 * (c) Copyright 2010-2011 AgileBirds
 *
 * This file is part of OpenFlexo.
 *
 * OpenFlexo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * OpenFlexo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with OpenFlexo. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.openflexo.docxparser;

import java.awt.Color;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.swing.text.html.CSS;
import javax.swing.text.html.HTML;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.dom4j.Element;
import org.jaxen.JaxenException;
import org.jaxen.dom4j.Dom4jXPath;
import org.openflexo.docxparser.dto.ParsedHtml;
import org.openflexo.docxparser.dto.ParsedHtmlResource;
import org.openflexo.toolbox.HTMLUtils;

/**
 * Converts WordprocessingML (docx) elements into an HTML fragment.
 * <p>
 * An instance carries mutable conversion state (the list currently being written and the
 * paragraph/span formatting currently opened), which is updated while elements are converted.
 * NOTE(review): because of that state an instance presumably should not be shared between
 * threads - confirm against callers.
 */
public class OpenXml2Html {
    // Package#toString() returns "package <name>..." which is not the package name and therefore
    // not a meaningful logger name; getName() returns the plain package name.
    private static final Logger logger = Logger.getLogger(OpenXml2Html.class.getPackage().getName());

    /** Style names that may be written as an HTML class attribute. */
    private final Set<String> availableCssClasses;
    /** Prefix prepended to image file names in generated img tags; ends with a separator. */
    private String resourcesDirectory;

    /** Document part used to resolve hyperlink and image relationships. */
    private final PackagePart documentPart;
    /** Numbering parser, created on first use by getDocxNumbering(). */
    private DocxNumberingParser docxNumbering;

    /** Numbering id of the list currently opened in the output, or null when no list is opened. */
    private String currentNumId = null;
    /** Nesting level of the list currently opened in the output, or null when no list is opened. */
    private Integer currentNumLevel = null;
    /** Properties of the paragraph currently being converted. */
    private HTMLProperties currentParagraphProperties = new HTMLProperties();
    /** Properties of the span currently opened in the output; empty when no span is opened. */
    private HTMLProperties currentSpanProperties = new HTMLProperties();

    public OpenXml2Html(PackagePart documentPart, Set<String> availableCssClasses, String resourcesDirectory) {
        this.documentPart = documentPart;
        this.availableCssClasses = availableCssClasses;
        this.resourcesDirectory = resourcesDirectory;

        if (!this.resourcesDirectory.endsWith("/") && !this.resourcesDirectory.endsWith("\\")) {
            this.resourcesDirectory = resourcesDirectory + "/";
        }
    }

    /**
     * Convenience entry point: builds a fresh converter and converts the given element with it.
     *
     * @param element
     *            element to convert
     * @param documentPart
     *            part used to resolve relationships and numbering
     * @param availableCssClasses
     *            style names that may be written as HTML class attributes
     * @param resourcesDirectory
     *            prefix for image sources
     * @return the html generated for the element
     */
    public static ParsedHtml getHtml(Element element, PackagePart documentPart, Set<String> availableCssClasses,
            String resourcesDirectory) {
        OpenXml2Html converter = new OpenXml2Html(documentPart, availableCssClasses, resourcesDirectory);
        return converter.getHtml(element);
    }

    /**
     * Entry point of a conversion: converts the element, then closes whatever is still opened in
     * the output (span and list tags).
     *
     * @param element
     *            element to convert
     * @return the html generated for the element
     */
    public ParsedHtml getHtml(Element element) {
        // Start without any opened list
        currentNumId = null;
        currentNumLevel = null;

        ParsedHtml parsedHtml = new ParsedHtml();

        parsedHtml.append(getRecursiveHtml(element));

        // Paragraphs close their own spans, but when the entry element is a run or a hyperlink the
        // span opened by the last run would otherwise be left unclosed.
        closeOpenedSpan(parsedHtml);

        // Close every list level which is still opened
        handleNumberingLevel(parsedHtml, null, null);

        return parsedHtml;
    }

    /**
     * Recursive step of the conversion: dispatches on the OpenXML tag of the element. Elements
     * without a dedicated handler contribute the html of every w:p found among their descendants.
     *
     * @param element
     *            element to convert
     * @return the html generated for the element
     */
    private ParsedHtml getRecursiveHtml(Element element) {
        ParsedHtml result = new ParsedHtml();
        ParsedHtml converted = null;

        switch (OpenXmlTag.getOpenXmlTag(element)) {
        case w_p:
            converted = getHtmlFromW_PElement(element);
            break;
        case w_r:
            converted = getHtmlFromW_RElement(element);
            break;
        case w_t:
            converted = getHtmlFromW_TElement(element);
            break;
        case w_hyperlink:
            converted = getHtmlFromW_HyperlinkElement(element);
            break;
        case w_drawing:
            converted = getHtmlFromW_DrawingElement(element);
            break;
        default:
            // No dedicated handler: convert all the paragraphs located below this element
            for (Object paragraphNode : element.selectNodes("descendant::w:p")) {
                result.append(getRecursiveHtml((Element) paragraphNode));
            }
            break;
        }

        if (converted != null) {
            result.append(converted);
        }

        return result;
    }

    /**
     * Converts a w:p element: updates the opened list tags according to the numbering of the
     * paragraph, then writes the paragraph tag around the html of its children.
     *
     * @param element
     *            the w:p element
     * @return the html generated for the paragraph
     * @throws InvalidElementException
     *             if the element is not a w:p
     */
    private ParsedHtml getHtmlFromW_PElement(Element element) throws InvalidElementException {
        if (OpenXmlTag.getOpenXmlTag(element) != OpenXmlTag.w_p) {
            throw new InvalidElementException("Cannot transform element to html, expecting element w:p and get '"
                    + element.getQualifiedName() + "'");
        }

        Element numPrElement = (Element) element.selectSingleNode("w:pPr/w:numPr");
        String foundNumId = null;
        Integer foundNumLevel = null;
        if (numPrElement != null) {
            Element ilvlElement = numPrElement.element(DocxQName.getQName(OpenXmlTag.w_ilvl));
            Element numIdElement = numPrElement.element(DocxQName.getQName(OpenXmlTag.w_numId));
            if (ilvlElement != null && numIdElement != null) {
                String levelValue = ilvlElement.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
                try {
                    // Integer.valueOf also rejects a missing (null) value with a NumberFormatException
                    foundNumLevel = Integer.valueOf(levelValue);
                    foundNumId = numIdElement.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
                } catch (NumberFormatException e) {
                    // Best effort: the paragraph is converted as a paragraph without numbering
                    logger.warning("Cannot read numbering level of paragraph, w:ilvl value = '" + levelValue + "'");
                }
            }
        }

        ParsedHtml parsedHtml = new ParsedHtml();

        currentParagraphProperties = new HTMLProperties(element);

        // The opened list is left untouched only for a paragraph without numbering whose style is
        // 'ListParagraph' (presumably a continuation paragraph of the current list item).
        if (foundNumId != null || foundNumLevel != null
                || element.selectSingleNode("w:pPr/w:pStyle[@w:val = 'ListParagraph']") == null) {
            handleNumberingLevel(parsedHtml, foundNumId, foundNumLevel);
        }

        parsedHtml.appendHtml(currentParagraphProperties.getOpenTag());

        for (Iterator<?> iterator = element.elementIterator(); iterator.hasNext();) {
            Element childElement = (Element) iterator.next();
            parsedHtml.append(getRecursiveHtml(childElement));
        }

        // A span never outlives the paragraph it was opened in
        closeOpenedSpan(parsedHtml);

        parsedHtml.appendHtml(currentParagraphProperties.getCloseTag());

        return parsedHtml;
    }

    /**
     * Converts a w:hyperlink element into an html anchor wrapping the html of its children. The
     * link target is taken from the relationship referenced by r:id or, failing that, from the
     * w:anchor attribute. When no target can be found, only the children are converted and a
     * warning is logged.
     *
     * @param element
     *            the w:hyperlink element
     * @return the html generated for the hyperlink
     * @throws InvalidElementException
     *             if the element is not a w:hyperlink
     */
    private ParsedHtml getHtmlFromW_HyperlinkElement(Element element) throws InvalidElementException {
        if (OpenXmlTag.getOpenXmlTag(element) != OpenXmlTag.w_hyperlink) {
            throw new InvalidElementException(
                    "Cannot transform element to html, expecting element w:hyperlink and get '"
                            + element.getQualifiedName() + "'");
        }

        ParsedHtml parsedHtml = new ParsedHtml();

        String href = null;

        String linkRid = element.attributeValue(DocxQName.getQName(OpenXmlTag.r_id));
        if (linkRid != null) {
            PackageRelationship linkRelationship = documentPart.getRelationship(linkRid);
            if (linkRelationship != null) {
                href = linkRelationship.getTargetURI().toString();
            }
        }

        if (href == null) { // Anchor ?
            String anchor = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_anchor));
            if (anchor != null) {
                href = "#" + anchor;
            }
        }

        String closeTag;

        if (href != null) {
            String target = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_tgtFrame));
            String title = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_tooltip));

            // A span opened by a previous run must be closed before the anchor is opened,
            // otherwise it could be closed inside the anchor and the tags would be mis-nested.
            closeOpenedSpan(parsedHtml);

            // The href comes from the document, escape it like the other attribute values
            parsedHtml.appendHtml("<a href=\"" + StringEscapeUtils.escapeHtml(href) + "\"");
            if (target != null) {
                parsedHtml.appendHtml(" target=\"" + StringEscapeUtils.escapeHtml(target) + "\"");
            }
            if (title != null) {
                parsedHtml.appendHtml(" title=\"" + StringEscapeUtils.escapeHtml(title) + "\"");
            }

            parsedHtml.appendHtml(">");

            closeTag = "</a>";
        } else {
            logger.log(Level.WARNING,
                    "OpenXml to Html: cannot get hyperlink relationship with id '" + linkRid + "'");
            closeTag = "";
        }

        for (Iterator<?> iterator = element.elementIterator(); iterator.hasNext();) {
            Element childElement = (Element) iterator.next();
            parsedHtml.append(getRecursiveHtml(childElement));
        }

        if (href != null) {
            // A span opened by a run of the hyperlink must be closed before the anchor itself
            closeOpenedSpan(parsedHtml);
        }

        parsedHtml.appendHtml(closeTag);

        return parsedHtml;
    }

    /**
     * Converts a w:r element. The formatting of the run, minus what the current paragraph already
     * defines, is compared with the currently opened span: when it differs, the opened span is
     * closed and a new one is opened (unless the run has no formatting of its own). The span is
     * left opened so that a following run with the same formatting can reuse it.
     *
     * @param element
     *            the w:r element
     * @return the html generated for the run
     * @throws InvalidElementException
     *             if the element is not a w:r
     */
    private ParsedHtml getHtmlFromW_RElement(Element element) throws InvalidElementException {
        if (OpenXmlTag.getOpenXmlTag(element) != OpenXmlTag.w_r) {
            throw new InvalidElementException("Cannot transform element to html, expecting element w:r and get '"
                    + element.getQualifiedName() + "'");
        }

        ParsedHtml runHtml = new ParsedHtml();

        HTMLProperties runProperties = new HTMLProperties(element);
        runProperties.removePropertiesFrom(currentParagraphProperties);

        boolean formattingChanged = !runProperties.equals(currentSpanProperties);
        if (formattingChanged) {
            closeOpenedSpan(runHtml);
            currentSpanProperties = runProperties;
            if (!runProperties.isEmpty()) {
                runHtml.appendHtml(runProperties.getOpenTag());
            }
        }

        Iterator<?> children = element.elementIterator();
        while (children.hasNext()) {
            runHtml.append(getRecursiveHtml((Element) children.next()));
        }

        return runHtml;
    }

    /**
     * Converts a w:drawing element into an img tag. The image is found through the r:embed
     * relationship of the first a:blip descendant; its bytes are registered as a needed resource
     * of the returned html and its size is read from wp:inline/wp:extent when present.
     * <p>
     * An empty html is returned (and a warning logged) when the image reference cannot be
     * resolved or when reading it fails.
     *
     * @param element
     *            the w:drawing element
     * @return the html generated for the drawing, possibly empty
     * @throws InvalidElementException
     *             if the element is not a w:drawing
     */
    private ParsedHtml getHtmlFromW_DrawingElement(Element element) {
        if (OpenXmlTag.getOpenXmlTag(element) != OpenXmlTag.w_drawing) {
            throw new InvalidElementException(
                    "Cannot transform element to html, expecting element w:drawing and get '"
                            + element.getQualifiedName() + "'");
        }

        ParsedHtml parsedHtml = new ParsedHtml();

        try {
            Dom4jXPath xpath = new Dom4jXPath("descendant::a:blip");
            xpath.addNamespace(DocxXmlUtil.NAMESPACE_DRAWINGMAIN.getPrefix(),
                    DocxXmlUtil.NAMESPACE_DRAWINGMAIN.getURI());

            Element ablipElement = (Element) xpath.selectSingleNode(element);
            if (ablipElement == null) {
                logger.warning("Cannot handle drawing tag: a:blip element not found");
                return parsedHtml;
            }

            String imageRid = ablipElement.attributeValue(DocxQName.getQName(OpenXmlTag.r_embed));
            if (imageRid == null) {
                logger.warning("Cannot handle drawing tag: r:embed attribute in a:blip element not found");
                return parsedHtml;
            }

            PackageRelationship imageRelationship = documentPart.getRelationship(imageRid);
            if (imageRelationship == null) {
                logger.warning("Cannot handle drawing tag: imageRelationship with id '" + imageRid + "' not found");
                return parsedHtml;
            }

            PackagePartName imagePartName = PackagingURIHelper.createPartName(imageRelationship.getTargetURI());
            PackagePart imagePart = documentPart.getPackage().getPart(imagePartName);
            String imageFileName;
            if (imagePart != null) {
                byte[] imageBytes;
                java.io.InputStream imageStream = imagePart.getInputStream();
                try {
                    imageBytes = DocxXmlUtil.getByteArrayFromInputStream(imageStream);
                } finally {
                    // Release the stream here, whether or not the helper closes it itself
                    if (imageStream != null) {
                        try {
                            imageStream.close();
                        } catch (IOException closeFailure) {
                            // The bytes are already read (or the read failure is propagating)
                            logger.log(Level.FINE, "Cannot close image stream of drawing tag", closeFailure);
                        }
                    }
                }

                // Keep only the last segment of the part name as file name
                imageFileName = imagePartName.getName().substring(imagePartName.getName().lastIndexOf('/') + 1);

                parsedHtml.addNeededResource(new ParsedHtmlResource(imageFileName, imageBytes));
            } else {
                // The img tag is still written, with a source limited to the resources directory
                logger.warning("Cannot handle drawing tag: image part '" + imagePartName.getName()
                        + "' not found in package");
                imageFileName = "";
            }
            Integer imageWidth = null;
            Integer imageHeight = null;

            Element extentElement = (Element) element.selectSingleNode("wp:inline/wp:extent");
            if (extentElement != null) {
                String imageCx = extentElement.attributeValue("cx");
                String imageCy = extentElement.attributeValue("cy");

                if (imageCx != null) {
                    imageWidth = getEnglishMetricUnitInPixel(imageCx);
                }
                if (imageCy != null) {
                    imageHeight = getEnglishMetricUnitInPixel(imageCy);
                }
            }

            parsedHtml.appendHtml("<img src=\"" + resourcesDirectory + imageFileName + "\"");
            if (imageWidth != null) {
                parsedHtml.appendHtml(" width=\"" + imageWidth + "\"");
            }
            if (imageHeight != null) {
                parsedHtml.appendHtml(" height=\"" + imageHeight + "\"");
            }
            parsedHtml.appendHtml(" />");

            return parsedHtml;
        } catch (InvalidFormatException e) {
            logger.log(Level.WARNING, "Cannot handle drawing tag: InvalidFormatException catched", e);
            return new ParsedHtml();
        } catch (IOException e) {
            logger.log(Level.WARNING, "Cannot handle drawing tag: IOException catched", e);
            return new ParsedHtml();
        } catch (JaxenException e) {
            logger.log(Level.WARNING, "Cannot handle drawing tag: JaxenException catched", e);
            return new ParsedHtml();
        }
    }

    /**
     * Converts a size expressed in English Metric Units into pixels, using an integer division
     * by 9525 (the result is truncated).
     *
     * @param value
     *            textual integer value in EMU
     * @return the size in pixels, or null (after logging a warning) if the value is not an integer
     */
    private Integer getEnglishMetricUnitInPixel(String value) {
        Integer pixels = null;
        try {
            int emu = Integer.parseInt(value);
            pixels = emu / 9525;
        } catch (NumberFormatException e) {
            logger.warning("Cannot transform EMU in pixel, input = '" + value + "'");
        }
        return pixels;
    }

    /**
     * Converts a w:t element into its html-escaped text.
     *
     * @param element
     *            the w:t element
     * @return the html generated for the text
     * @throws InvalidElementException
     *             if the element is not a w:t
     */
    private ParsedHtml getHtmlFromW_TElement(Element element) throws InvalidElementException {
        boolean isTextElement = OpenXmlTag.getOpenXmlTag(element) == OpenXmlTag.w_t;
        if (!isTextElement) {
            throw new InvalidElementException("Cannot transform element to html, expecting element w:t and get '"
                    + element.getQualifiedName() + "'");
        }

        ParsedHtml textHtml = new ParsedHtml();
        String escapedText = StringEscapeUtils.escapeHtml(element.getText());
        textHtml.appendHtml(escapedText);
        return textHtml;
    }

    /**
     * Writes the list tags needed to go from the currently opened list (currentNumId and
     * currentNumLevel) to the numbering of the paragraph about to be written, then records that
     * numbering as the current one.
     * <ul>
     * <li>same list, deeper level: the missing nested lists are opened</li>
     * <li>same list, shallower level: the extra nested lists are closed, then a new item starts</li>
     * <li>same list, same level: a new item starts</li>
     * <li>other list or no list: every opened level is closed, then the new list (if any) is opened
     * from level 0 up to the found level</li>
     * </ul>
     *
     * @param parsedHtml
     *            html receiving the list tags
     * @param foundNumId
     *            numbering id of the paragraph, or null if it has no numbering
     * @param foundNumLevel
     *            numbering level of the paragraph; unboxed whenever foundNumId differs from the
     *            current numbering id and is not null
     */
    private void handleNumberingLevel(ParsedHtml parsedHtml, String foundNumId, Integer foundNumLevel) {
        final boolean sameList = foundNumId != null && foundNumId.equals(currentNumId);
        // Level to reach inside the current list; null means the current list must be fully closed
        final Integer targetLevel = sameList ? foundNumLevel : null;

        boolean itemBoundaryNeeded = true;
        // Integer.equals(null) is false, so a null target keeps the loop running until all is closed
        while (currentNumLevel != null && !currentNumLevel.equals(targetLevel)) {
            boolean closeOneLevel = targetLevel == null || currentNumLevel > targetLevel;
            if (closeOneLevel) {
                parsedHtml.appendHtml("</li>");
                parsedHtml.appendHtml(isOrderedNumbering(currentNumId, currentNumLevel) ? "</ol>" : "</ul>");

                int lowered = currentNumLevel - 1;
                currentNumLevel = lowered < 0 ? null : lowered;
            } else {
                currentNumLevel++;

                parsedHtml.appendHtml(isOrderedNumbering(currentNumId, currentNumLevel) ? "<ol>" : "<ul>");
                parsedHtml.appendHtml("<li>");
                itemBoundaryNeeded = false;
            }
        }

        if (foundNumId != null && !sameList) {
            // New list: open every level from the outermost one up to the level of the paragraph
            for (int level = 0; level <= foundNumLevel; level++) {
                parsedHtml.appendHtml(isOrderedNumbering(foundNumId, level) ? "<ol>" : "<ul>");
                parsedHtml.appendHtml("<li>");
            }
            currentNumLevel = foundNumLevel;
            itemBoundaryNeeded = false;
        }

        currentNumId = foundNumId;

        if (itemBoundaryNeeded && foundNumId != null) {
            // Still in the same list and no item was opened above: start the next item
            parsedHtml.appendHtml("</li><li>");
        }
    }

    /**
     * Returns the numbering parser of the document part, creating it on the first call.
     *
     * @return the numbering parser, never null
     */
    private DocxNumberingParser getDocxNumbering() {
        if (docxNumbering != null) {
            return docxNumbering;
        }
        docxNumbering = new DocxNumberingParser(documentPart);
        return docxNumbering;
    }

    /**
     * Asks the numbering parser whether the given level of the given numbering is ordered.
     *
     * @param numId
     *            numbering id
     * @param levelNumber
     *            numbering level, must not be null
     * @return the answer of the numbering parser
     */
    private boolean isOrderedNumbering(String numId, Integer levelNumber) {
        DocxNumberingParser numbering = getDocxNumbering();
        String levelAsText = levelNumber.toString();
        return numbering.isOrderedNumbering(numId, levelAsText);
    }

    /**
     * Closes the currently opened span, if any, and resets the current span properties.
     *
     * @param parsedHtml
     *            html receiving the close tag
     */
    private void closeOpenedSpan(ParsedHtml parsedHtml) {
        if (currentSpanProperties.isEmpty()) {
            // Empty properties mean that no span is opened
            return;
        }

        String closeTag = currentSpanProperties.getCloseTag();
        parsedHtml.appendHtml(closeTag);
        currentSpanProperties = new HTMLProperties();
    }

    @SuppressWarnings("serial")
    private class InvalidElementException extends RuntimeException {
        public InvalidElementException(String msg) {
            super(msg);
        }
    }

    /**
     * Formatting read from the properties of a paragraph or a run, together with the html tag
     * used to write it. The formatting is kept as a map of property names to values, filled by
     * fillPropertyMap.
     */
    private class HTMLProperties {
        // Property name (a CSS property name or the html "class" attribute name) mapped to its value
        private Map<String, String> properties = new HashMap<String, String>();
        // Tag written by getOpenTag()/getCloseTag(); stays null when the no-arg constructor is used
        private HTML.Tag tagToUse;

        /**
         * Creates an empty set of properties, without tag.
         */
        public HTMLProperties() {

        }

        /**
         * Reads the properties of a paragraph (w:p, through its w:pPr child, or a w:pPr itself) or
         * of a run (w:r, through its w:rPr child, or a w:rPr itself). For any other element the
         * properties stay empty and no tag is set.
         *
         * @param element
         *            the element to read the properties from
         */
        public HTMLProperties(Element element) {
            final OpenXmlTag kind = OpenXmlTag.getOpenXmlTag(element);

            // Paragraphs use a p tag, which fillPropertyMap can change to h1, h2 ...; runs use a span
            switch (kind) {
            case w_p:
            case w_pPr:
                tagToUse = HTML.Tag.P;
                break;
            case w_r:
            case w_rPr:
                tagToUse = HTML.Tag.SPAN;
                break;
            }

            // Locate the element which holds the properties
            Element propertyHolder = null;
            switch (kind) {
            case w_p:
                propertyHolder = (Element) element.selectSingleNode("w:pPr");
                break;
            case w_r:
                propertyHolder = (Element) element.selectSingleNode("w:rPr");
                break;
            case w_pPr:
            case w_rPr:
                propertyHolder = element;
                break;
            }

            if (propertyHolder == null) {
                return;
            }
            fillPropertyMap(propertyHolder);
        }

        /**
         * Removes from this instance every property which src defines with the same value.
         *
         * @param src
         *            properties to subtract; must be another instance than this one
         */
        public void removePropertiesFrom(HTMLProperties src) {
            for (Map.Entry<String, String> inherited : src.properties.entrySet()) {
                String name = inherited.getKey();
                boolean sameValueHere = inherited.getValue().equals(this.properties.get(name));
                if (sameValueHere) {
                    this.properties.remove(name);
                }
            }
        }

        /**
         * Two instances are equal when they hold the same properties with the same values. The tag
         * is not part of the comparison, so two instances without properties are always equal.
         *
         * @param obj
         *            the object to compare with
         * @return true if obj is an HTMLProperties holding the same property map
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof HTMLProperties)) {
                return false;
            }

            return this.properties.equals(((HTMLProperties) obj).properties);
        }

        /**
         * Hash code derived from the property map only, so that it stays consistent with equals.
         *
         * @return the hash code of the property map
         */
        @Override
        public int hashCode() {
            return properties.hashCode();
        }

        /**
         * Tells whether every property of the argument is also defined, with the same value, by
         * this instance.
         *
         * @param htmlProperties
         *            the properties which must be covered by this instance
         * @return true if this instance holds all the properties of the argument with equal values,
         *         which is always the case when the argument holds no property
         */
        public boolean isContained(HTMLProperties htmlProperties) {
            for (Map.Entry<String, String> expected : htmlProperties.properties.entrySet()) {
                String valueHere = this.properties.get(expected.getKey());
                if (!expected.getValue().equals(valueHere)) {
                    return false;
                }
            }

            return true;
        }

        /**
         * Builds the opening html of these properties: the tag itself, carrying the class
         * attribute and a style attribute made of all the properties except font size and color,
         * followed by a font tag when a font size and/or a color is defined.
         *
         * @return the opening html
         */
        public String getOpenTag() {
            final String classKey = HTML.Attribute.CLASS.toString();
            final String fontSizeKey = CSS.Attribute.FONT_SIZE.toString();
            final String colorKey = CSS.Attribute.COLOR.toString();

            String cssClass = null;
            StringBuilder inlineStyles = new StringBuilder();
            StringBuilder fontAttributes = new StringBuilder();

            for (Map.Entry<String, String> property : properties.entrySet()) {
                String name = property.getKey();
                String value = property.getValue();

                if (classKey.equals(name)) {
                    cssClass = value;
                } else if (fontSizeKey.equals(name)) {
                    fontAttributes.append(" ").append(HTML.Attribute.STYLE).append("=\"").append(name).append(": ")
                            .append(value).append(";\"");
                } else if (colorKey.equals(name)) {
                    fontAttributes.append(" ").append(HTML.Attribute.COLOR).append("=\"").append(value)
                            .append("\"");
                } else {
                    inlineStyles.append(name).append(": ").append(value).append(";");
                }
            }

            StringBuilder openTag = new StringBuilder("<").append(tagToUse);
            if (cssClass != null) {
                openTag.append(" class=\"").append(cssClass).append("\"");
            }
            if (inlineStyles.length() > 0) {
                openTag.append(" style=\"").append(inlineStyles).append("\"");
            }
            openTag.append(">");
            if (fontAttributes.length() > 0) {
                openTag.append("<").append(HTML.Tag.FONT).append(fontAttributes).append(">");
            }
            return openTag.toString();
        }

        /**
         * Builds the closing html matching getOpenTag(): the font tag is closed first when a font
         * size or a color is defined, then the tag itself.
         *
         * @return the closing html
         */
        public String getCloseTag() {
            boolean wrappedInFont = properties.containsKey(CSS.Attribute.FONT_SIZE.toString())
                    || properties.containsKey(CSS.Attribute.COLOR.toString());
            String closeOwnTag = "</" + tagToUse + ">";
            if (!wrappedInFont) {
                return closeOwnTag;
            }
            return "</" + HTML.Tag.FONT + ">" + closeOwnTag;
        }

        /**
         * @return true if no property is defined (the tag is not taken into account)
         */
        public boolean isEmpty() {
            return properties.size() == 0;
        }

        /**
         * Fills the property map (and possibly changes the tag to a heading tag) from the children
         * of a property element, looking only at the tags returned by
         * OpenXmlTag.getStylePropertyTags().
         * <p>
         * Bold and italic are on/off properties: they are applied unless their w:val attribute
         * switches them off. Underline is applied unless its w:val attribute is "none".
         *
         * @param propertyElement
         *            element holding the properties (the caller passes a w:pPr or a w:rPr)
         */
        private void fillPropertyMap(Element propertyElement) {
            for (OpenXmlTag tag : OpenXmlTag.getStylePropertyTags()) {
                Element element = propertyElement.element(DocxQName.getQName(tag));
                if (element == null) {
                    continue;
                }

                String value;
                String cssColor;
                switch (tag) {
                case w_b:
                    if (isToggleOn(element)) {
                        properties.put(CSS.Attribute.FONT_WEIGHT.toString(), "bold");
                    }
                    break;
                case w_u:
                    value = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
                    // w:val="none" explicitly removes the underline
                    if (!"none".equals(value)) {
                        properties.put(CSS.Attribute.TEXT_DECORATION.toString(), "underline");
                    }
                    break;
                case w_i:
                    if (isToggleOn(element)) {
                        properties.put(CSS.Attribute.FONT_STYLE.toString(), "italic");
                    }
                    break;
                case w_color:
                    cssColor = toCssColor(element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val)));
                    if (cssColor != null) {
                        properties.put(CSS.Attribute.COLOR.toString(), cssColor);
                    }
                    break;
                case w_highlight:
                    cssColor = toCssColor(element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val)));
                    if (cssColor != null) {
                        properties.put(CSS.Attribute.BACKGROUND_COLOR.toString(), cssColor);
                    }
                    break;
                case w_shd:
                    // The shading color is held by the w:fill attribute, not by w:val
                    cssColor = toCssColor(element.attributeValue(DocxQName.getQName(OpenXmlTag.w_fill)));
                    if (cssColor != null) {
                        properties.put(CSS.Attribute.BACKGROUND_COLOR.toString(), cssColor);
                    }
                    break;
                case w_jc:
                    value = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
                    String alignValue = null;
                    if ("left".equals(value)) {
                        alignValue = "left";
                    } else if ("right".equals(value)) {
                        alignValue = "right";
                    } else if ("center".equals(value)) {
                        alignValue = "center";
                    } else if ("both".equals(value)) {
                        alignValue = "justify";
                    }

                    if (alignValue != null) {
                        properties.put(CSS.Attribute.TEXT_ALIGN.toString(), alignValue);
                    }
                    break;
                case w_szCs:
                case w_sz:
                    value = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
                    if (value != null) {
                        try {
                            // The value is halved to get points (integer division)
                            int size = Integer.parseInt(value);
                            properties.put(CSS.Attribute.FONT_SIZE.toString(), String.valueOf(size / 2) + "pt");
                        } catch (NumberFormatException e) {
                            // Ok not a number, skip it
                        }
                    }
                    break;
                case w_pStyle:
                    value = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
                    if (value != null) {
                        if (value.equals("Heading1")) {
                            tagToUse = HTML.Tag.H1;
                        } else if (value.equals("Heading2")) {
                            tagToUse = HTML.Tag.H2;
                        } else if (value.equals("Heading3")) {
                            tagToUse = HTML.Tag.H3;
                        } else if (value.equals("Heading4")) {
                            tagToUse = HTML.Tag.H4;
                        } else if (value.equals("Heading5")) {
                            tagToUse = HTML.Tag.H5;
                        } else if (value.equals("Heading6")) {
                            tagToUse = HTML.Tag.H6;
                        } else if (availableCssClasses.contains(value)) {
                            properties.put(HTML.Attribute.CLASS.toString(), value);
                        }
                    }
                    break;
                case w_rStyle:
                    value = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
                    if (value != null && availableCssClasses.contains(value)) {
                        properties.put(HTML.Attribute.CLASS.toString(), value);
                    }
                    break;
                default:
                    break;
                }
            }
        }

        /**
         * Tells whether an on/off property element switches its property on: it does unless its
         * w:val attribute is "0", "false" or "off".
         */
        private boolean isToggleOn(Element toggleElement) {
            String value = toggleElement.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
            if (value == null) {
                // No value: the presence of the element alone switches the property on
                return true;
            }
            return !("0".equals(value) || "false".equalsIgnoreCase(value) || "off".equalsIgnoreCase(value));
        }

        /**
         * Converts a color value into "#" followed by its hexadecimal form; the value is parsed as
         * it is and then, if that fails, prefixed with "#". Returns null if the value is null or
         * cannot be parsed either way.
         */
        private String toCssColor(String value) {
            if (value == null) {
                return null;
            }

            Color color = HTMLUtils.extractColorFromString(value);
            if (color == null) {
                color = HTMLUtils.extractColorFromString("#" + value);
            }

            return color != null ? "#" + HTMLUtils.toHexString(color) : null;
        }
    }
}