nl.strohalm.cyclos.utils.conversion.HtmlConverter.java Source code

Introduction

Here is the source code for nl.strohalm.cyclos.utils.conversion.HtmlConverter.java
Source

/*
This file is part of Cyclos (www.cyclos.org).
A project of the Social Trade Organisation (www.socialtrade.org).
    
Cyclos is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
    
Cyclos is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with Cyclos; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
    
 */
package nl.strohalm.cyclos.utils.conversion;

import java.io.StringReader;

import nl.strohalm.cyclos.utils.XmlHelper;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

/**
 * Converter for HTML formatted values. The input is parsed with JTidy, elements whose tag name is listed in
 * {@link #BAD_TAGS} and attributes whose name starts with <code>on</code> (event handlers) are removed, and the
 * serialized content of the resulting <code>body</code> element is returned.
 * <p>
 * NOTE(review): only the element names and the <code>on*</code> attributes mentioned above are filtered. Other
 * script vectors (for example <code>javascript:</code> URLs in <code>href</code> / <code>src</code> attributes) are
 * not handled by this class.
 * @author luis
 */
public class HtmlConverter implements Converter<String> {

    private static final long serialVersionUID = -1184040713929519035L;

    /** Names of the elements which are removed from the document, together with their content */
    private static final String[] BAD_TAGS = { "script", "style", "iframe", "form" };

    /** Character code of the non-breaking space */
    private static final int NBSP = 160;

    /** Shared instance which strips leading / trailing blanks and non-breaking spaces (the most used mode) */
    private static final HtmlConverter INSTANCE = new HtmlConverter(true);

    /**
     * Shared instance which keeps non-breaking spaces. It is created eagerly: the object only holds a boolean, and
     * eager creation avoids the unsynchronized lazy initialization of a mutable static field.
     */
    private static final HtmlConverter INSTANCE_NBSP = new HtmlConverter(false);

    /**
     * Returns the shared converter which removes leading / trailing blank spaces
     */
    public static HtmlConverter instance() {
        return instance(true);
    }

    /**
     * Returns a shared converter
     * @param removeBlankspaces When true, the returned converter strips leading / trailing whitespace and
     * non-breaking spaces from the converted value. When false, non-breaking spaces are kept
     */
    public static HtmlConverter instance(final boolean removeBlankspaces) {
        return removeBlankspaces ? INSTANCE : INSTANCE_NBSP;
    }

    /**
     * Extracts the text between the opening and the closing <code>body</code> tags of a serialized body element.
     * The opening tag may carry attributes, and the element may have been serialized in the self-closing form.
     * @param xml The serialized body element, possibly preceded by an xml header
     * @return The inner content, or null when there is no body tag, the tag is self-closing or it is malformed
     */
    private static String extractBodyContent(final String xml) {
        if (xml == null) {
            return null;
        }
        final String openPrefix = "<body";
        final int open = xml.indexOf(openPrefix);
        if (open < 0) {
            return null;
        }
        // Find the '>' which ends the opening tag, ignoring any '>' inside a quoted attribute value
        int tagEnd = -1;
        char quote = 0;
        for (int i = open + openPrefix.length(); i < xml.length(); i++) {
            final char c = xml.charAt(i);
            if (quote != 0) {
                if (c == quote) {
                    quote = 0;
                }
            } else if (c == '"' || c == '\'') {
                quote = c;
            } else if (c == '>') {
                tagEnd = i;
                break;
            }
        }
        if (tagEnd < 0 || xml.charAt(tagEnd - 1) == '/') {
            // Either a malformed tag or a self-closing <body/>: there is no content
            return null;
        }
        // Use the last closing tag, so that a literal "</body>" inside the content cannot truncate the result
        final int close = xml.lastIndexOf("</body>");
        if (close <= tagEnd) {
            return null;
        }
        return xml.substring(tagEnd + 1, close);
    }

    /**
     * Tells whether the given character should be stripped from the edges of a converted value: either a
     * non-breaking space or any character which {@link String#trim()} would remove
     */
    private static boolean isEdgeBlank(final char c) {
        return c == NBSP || c <= ' ';
    }

    /**
     * Removes the bad elements (see {@link #BAD_TAGS}) and every attribute whose name starts with <code>on</code>
     * from the given document
     */
    private static void removeBadNodes(final Document document) {
        final NodeList elements = document.getElementsByTagName("*");
        // DOM node lists are live: removing a node shifts the following ones down by one position. Iterating
        // backwards guarantees no element is skipped, as a removal only affects the element itself and its
        // descendants, which are all located at higher indexes in document order.
        for (int i = elements.getLength() - 1; i >= 0; i--) {
            final Element element = (Element) elements.item(i);
            if (ArrayUtils.contains(BAD_TAGS, element.getTagName())) {
                if (element.getParentNode() != null) {
                    element.getParentNode().removeChild(element);
                }
                // The element is no longer part of the document: no need to clean its attributes
                continue;
            }
            final NamedNodeMap attributes = element.getAttributes();
            // The attribute map is live as well, hence the backwards iteration
            for (int j = attributes.getLength() - 1; j >= 0; j--) {
                final Attr attr = (Attr) attributes.item(j);
                if (attr.getNodeName().startsWith("on")) {
                    // This is an event handler: remove it
                    element.removeAttributeNode(attr);
                }
            }
        }
    }

    /**
     * Strips whitespace and non-breaking spaces from both ends of the given text
     * @return The stripped text, which may be empty but is never null
     */
    private static String stripEdgeBlanks(final String text) {
        int begin = 0;
        int end = text.length();
        while (begin < end && isEdgeBlank(text.charAt(begin))) {
            begin++;
        }
        while (end > begin && isEdgeBlank(text.charAt(end - 1))) {
            end--;
        }
        return text.substring(begin, end);
    }

    private final boolean removeBlankspaces;

    private HtmlConverter(final boolean removeBlankspaces) {
        this.removeBlankspaces = removeBlankspaces;
    }

    /**
     * Returns the given string unchanged
     */
    public String toString(final String string) {
        return string;
    }

    /**
     * Parses the given string as HTML, removes the bad elements and event handler attributes and returns the
     * content of the resulting body element.
     * @param string The HTML formatted input
     * @return The cleaned body content. For a blank input, null is returned when this converter removes blank
     * spaces, otherwise the input itself. Null is also returned when there is no body element or when the
     * resulting content is empty
     */
    public String valueOf(final String string) {
        if (StringUtils.isBlank(string)) {
            return removeBlankspaces ? null : string;
        }

        final Tidy tidy = new Tidy(); // obtain a new Tidy instance
        tidy.setXHTML(false); // set desired config options using tidy setters
        tidy.setQuiet(true);
        tidy.setShowErrors(0);
        tidy.setShowWarnings(false);
        tidy.setIndentContent(false);
        tidy.setXmlOut(true);

        final Document document = tidy.parseDOM(new StringReader(string), null);
        removeBadNodes(document);

        final NodeList bodies = document.getElementsByTagName("body");
        if (bodies.getLength() == 0) {
            // No body element? return null
            return null;
        }

        // The serialized form is expected to contain the xml header plus the body element itself, but only the
        // body content is needed
        final String content = extractBodyContent(XmlHelper.toString(bodies.item(0)));
        if (content == null) {
            return null;
        }
        return StringUtils.trimToNull(removeBlankspaces ? stripEdgeBlanks(content) : content);
    }
}