HTMLRewriter.java Source code

Java tutorial

Introduction

Here is the source code for HTMLRewriter.java

Source

/*
 * Copyright 2000-2004 The Apache Software Foundation.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 *
 *
 *  COMPATIBILITY
 *  
 *      [28.01.2001, RammerI] Tested on W2K, with J2SE, JDK 1.3
 *      [29.01.2001, RammerI] Tested on W2K, with JDK 1.2.2
 *
 *
 *
 *  FEATURES
 *      = Rewriting of <A HREFs, <IMG SRCes, <FORM ACTIONs, <TD BACKGROUNDs,
 *          <INPUT SRCs, <APPLET CODEBASEs
 *      = Removal of <SCRIPT>, <STYLE>, <HEAD>, <EMBED>, <OBJECT>, <APPLET>,
 *          <NOSCRIPT>
 * 
 ****
 * Please include the following section in the WebPagePortlet documentation     
 ****
 * <CODE>
 *
 * The following describes how HTML tags are rewritten
 *
 * <!-- --> (HTML Comments)
 *   o Unless otherwise mentioned, comments are stripped.
 * 
 * <A>
 *   o HREF attribute   - URL merged with base URL (See Note 1)
 *   o TARGET attribute - Set to "_BLANK" if it does not exist 
 *                        and openInNewWindow = TRUE
 * <AREA>
 *   o HREF attribute   - URL merged with base URL (See Note 1)
 *   o TARGET attribute - Set to "_BLANK" if it does not exist 
 *                        and openInNewWindow = TRUE
 * <APPLET>
 *   o Optionally included
 *   o CODEBASE attribute - Set to the current path if it does
 *                          not exist.
 * 
 * <BASE>
 *   o <HEAD> does NOT have to be included.
 *   o HREF attribute  - Set the Base URL of the page, but the tag
 *                       not set in resulting HTML. URL merged with
 *                       base URL (See Note 1)
 * 
 * <BODY>
 *   o Background attribute - Always stripped.
 * 
 * <EMBED>
 *   o May not work.  Not supported by JDK 1.3.
 * 
 * <FORM>
 *   o ACTION attribute - Set to the current URL if it does
 *                        not exist. URL merged with base
 *                        URL (See Note 1)
 * 
 * <IMG>
 *   o SRC attribute - URL merged with base URL (See Note 1)
 * 
 * <INPUT>
 *   o SRC attribute - URL merged with base URL (See Note 1)
 * 
 * <LINK>
 *   o HREF attribute - URL merged with base URL (See Note 1)
 *
 * <OBJECT>
 *   o Optionally included
 *   o CODEBASE attribute - Set to the current path if it does
 *                          not exist. URL merged with base
 *                          URL (See Note 1)
 * 
 * <SCRIPT>
 *   o Optionally included
 *   o Contents may be stripped if this tag appears in the <HEAD>
 *     and the contents are NOT in a comment
 *   o SRC attribute - URL merged with base URL (See Note 1)
 *   o Script code that is NOT enclosed in a comment (<!-- -->)
 *     and in the <HEAD> may NOT be in the resulting HTML.  This
 *     is related to the HTML parser included in the JDK.
 * 
 * <TD>
 *   o BACKGROUND attribute - URL merged with base URL (See Note 1)
 * 
 * Note 1: URL Merging.
 *   This is done because the source of the page sent to the
 *   user's browser is different than the source of the current page.
 *   Example:
 *     Base URL........ http://jakarta.apache.org/jetspeed
 *     URL............. logo.gif
 *     Resulting URL... http://jakarta.apache.org/jetspeed/logo.gif
 * 
 * </CODE>
 *  KNOWN PROBLEMS
 *
 *
 *  == Seems to have problems with international characters, when the web-pages
 *     are not downloaded from the original URL but taken from the cache.
 *     (To reproduce do the following
 *      1. create a new portlet from the url http://www.sycom.at/default.htm
 *      2. stop tomcat & restart tomcat
 *      3. login and customize your page to include this portlet
 *      4. everything should appear fine, the webpage will show some german 
 *         umlauts
 *      5. shutdown tomcat and restart it
 *      6. jetspeed is now taking the HTML not from www.sycom.at, but from the
 *         cache. Instead of the umlauts, you will see weird characters.)
 *
 *
 *  == Does not yet work with XHTML-Pages but only plain-old HTMLs. I.e. Closed
 *     single tags like <BR /> screw the output up.
 *      
 *
 *
 */
//package org.apache.jetspeed.util;

import java.io.Reader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Enumeration;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.MutableAttributeSet;

/**
 * Rewrites an HTML document so that it can be embedded in a page that is
 * served from a different location than the original document.
 *
 * <p>Relative URLs in selected attributes (A/AREA/LINK HREF, IMG/INPUT/SCRIPT
 * SRC, FORM ACTION, TD BACKGROUND, APPLET/OBJECT CODEBASE) are merged with the
 * base URL of the document, and, depending on the constructor flags, SCRIPT,
 * STYLE, NOSCRIPT, META, APPLET, OBJECT and HEAD elements as well as
 * <code>on...</code> event attributes are removed.
 *
 * <p>Text and attribute values are re-escaped when written, because the Swing
 * parser hands them over with character entities already decoded.
 *
 * <p>All per-conversion state lives in one parser callback.
 * {@link #convertURLs(Reader, String)} is synchronized and resets that state
 * at the start of every call, so an instance may be reused.
 *
 * <p>The code deliberately avoids language features newer than those already
 * used in this file (no generics, annotations or StringBuilder).
 *
 * @author  Ingo Rammer (rammer@sycom.at)
 * @author <a href="mailto:sgala@apache.org">Santiago Gala</a>
 * @author <a href="mailto:paulsp@apache.org">Paul Spencer</a>
 * @version 0.2
 */

public class HTMLRewriter {

    /** Parser callback holding the rewriting options and the accumulated output. */
    private final HTMLRewriter.Callback cb = new HTMLRewriter.Callback();

    /**
     * Creates a rewriter that does not add a TARGET attribute to links.
     *
     * @param removeScript Shall SCRIPT-Tags and their content be removed
     * @param removeStyle Shall STYLE-Tags and their content be removed
     * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
     * @param removeMeta Shall META-Tags be removed
     * @param removeApplet Shall APPLET-Tags and their content be removed
     * @param removeObject Shall OBJECT-Tags and their content be removed
     * @param removeHead Shall HEAD-Tags and their content be removed
     * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
     */
    public HTMLRewriter(boolean removeScript, boolean removeStyle, boolean removeNoScript, boolean removeMeta,
            boolean removeApplet, boolean removeObject, boolean removeHead, boolean removeOnSomething) {
        init(removeScript, removeStyle, removeNoScript, removeMeta, removeApplet, removeObject, removeHead,
                removeOnSomething, false);
    }

    /**
     * Creates a rewriter with all options given explicitly.
     *
     * @param removeScript Shall SCRIPT-Tags and their content be removed
     * @param removeStyle Shall STYLE-Tags and their content be removed
     * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
     * @param removeMeta Shall META-Tags be removed
     * @param removeApplet Shall APPLET-Tags and their content be removed
     * @param removeObject Shall OBJECT-Tags and their content be removed
     * @param removeHead Shall HEAD-Tags and their content be removed
     * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
     * @param openInNewWindow Shall A and AREA tags without a TARGET get TARGET="_BLANK"
     */
    public HTMLRewriter(boolean removeScript, boolean removeStyle, boolean removeNoScript, boolean removeMeta,
            boolean removeApplet, boolean removeObject, boolean removeHead, boolean removeOnSomething,
            boolean openInNewWindow) {
        init(removeScript, removeStyle, removeNoScript, removeMeta, removeApplet, removeObject, removeHead,
                removeOnSomething, openInNewWindow);
    }

    /**
     * Copies the constructor options into the parser callback.
     *
     * @param removeScript Shall SCRIPT-Tags and their content be removed
     * @param removeStyle Shall STYLE-Tags and their content be removed
     * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
     * @param removeMeta Shall META-Tags be removed
     * @param removeApplet Shall APPLET-Tags and their content be removed
     * @param removeObject Shall OBJECT-Tags and their content be removed
     * @param removeHead Shall HEAD-Tags and their content be removed
     * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
     * @param openInNewWindow Shall links set Target="_blank"
     */
    private void init(boolean removeScript, boolean removeStyle, boolean removeNoScript, boolean removeMeta,
            boolean removeApplet, boolean removeObject, boolean removeHead, boolean removeOnSomething,
            boolean openInNewWindow) {
        cb.removeScript = removeScript;
        cb.removeStyle = removeStyle;
        cb.removeNoScript = removeNoScript;
        cb.removeMeta = removeMeta;
        cb.removeApplet = removeApplet;
        cb.removeObject = removeObject;
        cb.removeHead = removeHead;
        cb.removeOnSomething = removeOnSomething;
        cb.openInNewWindow = openInNewWindow;
    }

    /**
     * Does the conversion of the HTML.
     *
     * @param HTMLrdr Reader for HTML to be converted
     * @param BaseUrl URL from which this HTML was taken. Will be the base URL
     * for all URL rewritings (unless the document overrides it with a BASE tag).
     * @throws MalformedURLException If the BaseUrl is not a valid URL, or if
     * reading or parsing the document fails for any other reason. The original
     * exception is attached as the cause.
     * @return HTML-String with rewritten URLs and removed (according
     * to constructor-settings) tags. The string starts with a single blank.
     */
    public synchronized String convertURLs(Reader HTMLrdr, String BaseUrl) throws MalformedURLException {
        HTMLEditorKit.Parser parse = new HTMLRewriter.ParserGetter().getParser();
        try {
            // Start every conversion from a clean slate so that a previous
            // (possibly aborted) run cannot leave the callback "ignoring".
            cb.reset(new URL(BaseUrl));
            parse.parse(HTMLrdr, cb, true);
            return cb.getResult();
        } catch (Exception e) {
            // The declared contract only allows MalformedURLException; keep the
            // historical message format but do not lose the real cause.
            MalformedURLException wrapped = new MalformedURLException(e.toString());
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /** That class is needed, because getParser is protected and therefore
     *  only accessible by a subclass.
     */
    class ParserGetter extends HTMLEditorKit {
        /** This is needed, because getParser is protected
         * @return Html Parser
         */
        public HTMLEditorKit.Parser getParser() {
            return super.getParser();
        }
    }

    /**
     * Parser callback that writes the rewritten document into a StringWriter.
     * While {@link #ignoreLevel} is greater than zero nothing is written
     * through the addToResult methods.
     */
    class Callback extends HTMLEditorKit.ParserCallback {

        // the base-url of which the given html comes from.
        private URL baseUrl;

        // either handling of <FORM> is buggy, or I made some weird mistake ...
        // ... JDK 1.3 sends double "</form>"-tags on closing <form>
        private boolean inForm = false;

        // when in multi-part ignored tags (like <script> foobar </script>,
        // <style> foobar </style>, a counter for the nesting-level will be
        // kept here
        private int ignoreLevel = 0;

        // number of removed NOSCRIPT start tags that have not been closed yet;
        // used so that a stray </noscript> cannot lower ignoreLevel
        private int openNoScript = 0;

        private boolean removeScript = true;
        private boolean removeStyle = true;
        private boolean removeNoScript = true;
        private boolean removeMeta = true;
        private boolean removeApplet = true;
        private boolean removeObject = true;
        private boolean removeHead = true;
        private boolean openInNewWindow = false;

        // remove the onClick=, onBlur=, etc. - Attributes
        private boolean removeOnSomething = true;

        // true while inside a SCRIPT / STYLE element that is kept in the output
        private boolean inScript = false;
        private boolean inStyle = false;

        private StringWriter result = new StringWriter();

        private Callback() {
        }

        /**
         * Clears all per-document state and sets the base URL for the next run.
         * The remove/open options are left untouched.
         */
        private void reset(URL newBaseUrl) {
            baseUrl = newBaseUrl;
            inForm = false;
            inScript = false;
            inStyle = false;
            ignoreLevel = 0;
            openNoScript = 0;
            result = new StringWriter();
        }

        /** Appends txt.toString() to the output unless output is currently ignored. */
        private Callback addToResult(Object txt) {
            if (ignoreLevel > 0 || txt == null) {
                return this;
            }
            result.write(txt.toString());
            return this;
        }

        /** Appends the characters to the output unless output is currently ignored. */
        private Callback addToResult(char[] txt) {
            if (ignoreLevel > 0 || txt == null) {
                return this;
            }
            result.write(txt, 0, txt.length);
            return this;
        }

        /**
         * Escapes the HTML meta characters of a decoded text or attribute value
         * so that it can be written back into markup.
         *
         * @param raw decoded text
         * @param inAttribute true if the text is written inside a double-quoted
         * attribute value; double quotes are then escaped as well
         * @return escaped text
         */
        private String escapeHtml(String raw, boolean inAttribute) {
            StringBuffer escaped = new StringBuffer(raw.length() + 16);
            for (int i = 0; i < raw.length(); i++) {
                char c = raw.charAt(i);
                switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    if (inAttribute) {
                        escaped.append("&quot;");
                    } else {
                        escaped.append(c);
                    }
                    break;
                default:
                    escaped.append(c);
                }
            }
            return escaped.toString();
        }

        /** Accessor to the Callback's content-String
         * @return Cleaned and rewritten HTML-Content, prefixed with one blank
         */
        public String getResult() {
            result.flush();

            // WARNING (from the original author): doesn't work, if you remove
            // " " + ... but don't know why. The leading blank is kept.
            return " " + result.toString();
        }

        public void flush() throws javax.swing.text.BadLocationException {
            // nothing to do here ...
        }

        /**
         * Because Scripts and Styles sometimes are defined in comments, those
         * will be written. Otherwise comments are removed.
         *
         * Note: this writes to the output directly, i.e. also while output is
         * otherwise ignored (for example a kept SCRIPT inside a removed HEAD).
         */
        public void handleComment(char[] values, int param) {
            if (!(inStyle || inScript) || values == null) {
                return;
            }
            result.write("<!--");
            result.write(values, 0, values.length);
            result.write("-->");
        }

        public void handleEndOfLineString(java.lang.String str) {
            addToResult("\n");
        }

        public void handleError(java.lang.String str, int param) {
            // ignored
        }

        /**
         * Handles tags without content. META tags are dropped if requested.
         * A simple tag carrying the HTML.Attribute.ENDTAG attribute is treated
         * as an end tag: the Swing parser appears to report the closing tag of
         * elements it does not know (e.g. NOSCRIPT) this way, and handling it
         * as a second start tag would raise the ignore level twice.
         */
        public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrs, int param) {
            if (removeMeta && (tag == HTML.Tag.META)) {
                return;
            }
            if (attrs.isDefined(HTML.Attribute.ENDTAG)) {
                handleEndTag(tag, param);
                return;
            }
            appendTagToResult(tag, attrs);
        }

        public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs, int position) {
            appendTagToResult(tag, attrs);
        }

        /**
         * Writes the end tag (duplicate FORM end tags are dropped) and then
         * leaves the "kept script/style" and "ignored element" states.
         * The tag is written before the ignore level is lowered, so the end
         * tag of a removed element is itself not written.
         */
        public void handleEndTag(HTML.Tag tag, int position) {
            if (tag == HTML.Tag.FORM) {
                // form handling seems to be buggy: only the first </form>
                // after a <form> is written, any further one is dropped
                if (inForm) {
                    addToResult("</").addToResult(tag).addToResult(">");
                    inForm = false;
                }
            } else {
                addToResult("</").addToResult(tag).addToResult(">");
            }

            if (!removeScript && (tag == HTML.Tag.SCRIPT)) {
                inScript = false;
            } else if (!removeStyle && (tag == HTML.Tag.STYLE)) {
                inStyle = false;
            }

            leaveIgnoredElement(tag);
        }

        /**
         * True for tags known to the parser (HTML.Tag constants) that are
         * configured to be removed together with their content.
         */
        private boolean isRemovedKnownTag(HTML.Tag tag) {
            return (removeScript && (tag == HTML.Tag.SCRIPT))
                    || (removeStyle && (tag == HTML.Tag.STYLE))
                    || (removeHead && (tag == HTML.Tag.HEAD))
                    || (removeApplet && (tag == HTML.Tag.APPLET))
                    || (removeObject && (tag == HTML.Tag.OBJECT));
        }

        /** True if the tag is a NOSCRIPT tag and NOSCRIPT is configured to be removed. */
        private boolean isRemovedNoScript(HTML.Tag tag) {
            // there is no HTML.Tag constant for NOSCRIPT, so compare by name
            return removeNoScript && tag.toString().equalsIgnoreCase("NOSCRIPT");
        }

        /** Raises the ignore level if the given start tag opens a removed element. */
        private void enterIgnoredElement(HTML.Tag tag) {
            if (isRemovedKnownTag(tag)) {
                ignoreLevel++;
            } else if (isRemovedNoScript(tag)) {
                ignoreLevel++;
                openNoScript++;
            }
        }

        /**
         * Lowers the ignore level if the given end tag closes a removed
         * element. The level never drops below zero, and a NOSCRIPT end tag
         * only counts if a matching start tag was seen.
         */
        private void leaveIgnoredElement(HTML.Tag tag) {
            if (isRemovedKnownTag(tag)) {
                if (ignoreLevel > 0) {
                    ignoreLevel--;
                }
            } else if (isRemovedNoScript(tag) && (openNoScript > 0)) {
                openNoScript--;
                if (ignoreLevel > 0) {
                    ignoreLevel--;
                }
            }
        }

        /**
         * Rewrites the attributes of a start/simple tag and writes the tag.
         * BASE tags and the parser's artificial tags are never written.
         */
        private void appendTagToResult(HTML.Tag tag, MutableAttributeSet attrs) {
            String tagName = tag.toString();

            // jdk 1.2.2 places a tag <__ENDOFLINETAG__> and jdk 1.3 a tag
            // <__IMPLIED__> in the result ... we don't want these
            if (tagName.equalsIgnoreCase("__ENDOFLINETAG__") || tagName.equalsIgnoreCase("__IMPLIED__")) {
                return;
            }

            // may change attrs, baseUrl, inForm, inScript/inStyle and ignoreLevel
            convertURLS(tag, attrs);
            if (tag == HTML.Tag.BASE) {
                return;
            }

            addToResult("<").addToResult(tag);
            Enumeration names = attrs.getAttributeNames();
            while (names.hasMoreElements()) {
                Object attr = names.nextElement();

                // marker the parser adds to tags it generated itself; it is
                // not part of the document and must not end up in the output
                if (HTMLEditorKit.ParserCallback.IMPLIED.equals(attr)) {
                    continue;
                }

                String attrName = attr.toString();
                boolean isEventHandler = attrName.toLowerCase().startsWith("on") && (attrName.length() > 2);
                if (removeOnSomething && isEventHandler) {
                    continue;
                }

                // the parser delivers values with entities decoded, so they
                // have to be escaped again before being put between quotes
                String value = escapeHtml(attrs.getAttribute(attr).toString(), true);
                addToResult(" ").addToResult(attr).addToResult("=\"").addToResult(value).addToResult("\"");
            }
            addToResult(">");
        }

        /** Here the magic happens.
         *
         * If someone wants new types of URLs to be rewritten, add them here
         * @param tag TAG from the Callback-Interface
         * @param attrs Attribute-Set from the Callback-Interface
         */
        private void convertURLS(HTML.Tag tag, MutableAttributeSet attrs) {

            // first we do an URL-rewrite on different tags

            if ((tag == HTML.Tag.A) || (tag == HTML.Tag.AREA)) {
                // ---- CHECKING <A HREF / <AREA HREF
                addConvertedAttribute(HTML.Attribute.HREF, attrs);
                if (openInNewWindow && (attrs.getAttribute(HTML.Attribute.TARGET) == null)) {
                    attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
                }
            } else if ((tag == HTML.Tag.IMG) || (tag == HTML.Tag.INPUT) || (tag == HTML.Tag.SCRIPT)) {
                // ---- CHECKING <IMG SRC, <INPUT SRC & <SCRIPT SRC
                addConvertedAttribute(HTML.Attribute.SRC, attrs);
            } else if (tag == HTML.Tag.LINK) {
                // ---- CHECKING <LINK HREF
                addConvertedAttribute(HTML.Attribute.HREF, attrs);
            } else if ((tag == HTML.Tag.APPLET) || (tag == HTML.Tag.OBJECT)) {
                // ---- CHECKING <APPLET CODEBASE= / <OBJECT CODEBASE=
                if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
                    // default: everything of the base URL up to its last "/"
                    String base = baseUrl.toString();
                    int endOfPath = base.lastIndexOf("/");
                    attrs.addAttribute(HTML.Attribute.CODEBASE, base.substring(0, endOfPath + 1));
                } else {
                    addConvertedAttribute(HTML.Attribute.CODEBASE, attrs);
                }
            } else if (tag == HTML.Tag.BODY) {
                // background images are applied to the ENTIRE page, this removes them!
                if (attrs.getAttribute(HTML.Attribute.BACKGROUND) != null) {
                    attrs.removeAttribute(HTML.Attribute.BACKGROUND);
                }
            } else if (tag == HTML.Tag.BASE) {
                Object href = attrs.getAttribute(HTML.Attribute.HREF);
                if (href != null) {
                    try {
                        // merged with the current base, so a relative BASE
                        // HREF works as well as an absolute one
                        baseUrl = new URL(baseUrl, href.toString());
                    } catch (MalformedURLException ignored) {
                        // unusable BASE HREF: keep the previous base URL
                    }
                    attrs.removeAttribute(HTML.Attribute.HREF);
                }
            } else if (tag == HTML.Tag.FORM) {
                // ---- CHECKING <FORM ACTION=
                inForm = true; // buggy <form> handling in jdk 1.3
                if (attrs.getAttribute(HTML.Attribute.ACTION) == null) {
                    //self referencing <FORM>
                    attrs.addAttribute(HTML.Attribute.ACTION, baseUrl.toString());
                } else {
                    addConvertedAttribute(HTML.Attribute.ACTION, attrs);
                }
            } else if (tag == HTML.Tag.TD) {
                // ---- CHECKING <TD BACKGROUND=
                addConvertedAttribute(HTML.Attribute.BACKGROUND, attrs);
            }

            // then we track kept SCRIPT/STYLE elements (their content is
            // written verbatim) and removed elements (their content is ignored)

            if (!removeScript && (tag == HTML.Tag.SCRIPT)) {
                inScript = true;
            } else if (!removeStyle && (tag == HTML.Tag.STYLE)) {
                inStyle = true;
            }

            enterIgnoredElement(tag);
        }

        /**
         * Replaces the value of the given attribute with its value merged
         * with the base URL. Does nothing if the attribute is not set.
         */
        private void addConvertedAttribute(HTML.Attribute attr, MutableAttributeSet attrs) {
            Object current = attrs.getAttribute(attr);
            if (current != null) {
                attrs.addAttribute(attr, generateNewUrl(current.toString()));
            }
        }

        /**
         * Merges oldURL with the base URL.
         *
         * @return the merged absolute URL, or oldURL unchanged if it cannot be
         * parsed as a URL relative to the base
         */
        private String generateNewUrl(String oldURL) {
            try {
                return new URL(baseUrl, oldURL).toString();
            } catch (MalformedURLException ignored) {
                // NOTE(review): this pass-through is also what the original
                // did for "javascript:" URLs - confirm that is intended when
                // removeScript is set.
                return oldURL; // default behaviour ...
            }
        }

        /**
         * Writes text content. Outside of kept SCRIPT/STYLE elements the text
         * is escaped, because the parser delivers it with entities decoded and
         * writing it verbatim would turn "&lt;" back into markup.
         */
        public void handleText(char[] values, int param) {
            if (values == null) {
                return;
            }
            if (inScript || inStyle) {
                addToResult(values);
            } else {
                addToResult(escapeHtml(new String(values), false));
            }
        }
    }
}