HtmlUtil.java Source code

Introduction

Here is the source code for HtmlUtil.java
Source

import java.util.Locale;

/**
 * HTML utilities for reading tag names and for reading and adding tag attributes.
 * All methods work by plain character scanning of the given string; no HTML
 * parsing is performed.
 * http://jodd.sourceforge.net
 *
 * @author najgor@users.sourceforge.net
 * @author Lingo
 * @since 2007-03-17
 * @version 1.0
 */
public final class HtmlUtil {
    // ---------------------------------------------------------------- tag name

    /**
     * Returns the tag's name. The given string represents the HTML body of a tag,
     * therefore it <b>must</b> start with '&lt;'.
     *
     * @param tagBody tag's body
     *
     * @return tag's name, or <code>null</code> if tag not found
     */
    public static String getTagName(String tagBody) {
        return getTagName(tagBody, 0);
    }

    /**
     * Returns the name of the tag that starts at the given index. The character at
     * that index <b>must</b> be '&lt;'.
     * <p>
     * Whitespace and '/' characters between '&lt;' and the name are skipped. Names of
     * ending tags are always returned with a leading '/' character. The name ends
     * at the first whitespace, '/' or '&gt;' character, so a self-closing tag such
     * as <code>&lt;br/&gt;</code> yields <code>br</code>.
     *
     * @param body   html body
     * @param i      index of tag's start
     *
     * @return tag's name, or <code>null</code> if the body is <code>null</code>, the
     *         index is outside the body, no tag starts at the index, or the name is
     *         missing or not terminated
     */
    public static String getTagName(String body, int i) {
        if (body == null) {
            return null;
        }

        int len = body.length();

        // An out-of-range index (including any index into an empty string) is "no tag".
        if ((i < 0) || (i >= len) || (body.charAt(i) != '<')) {
            return null;
        }

        int start = i + 1; // skip '<'
        boolean isEndTag = false;

        // skip whitespace and '/' in front of the name
        while (start < len) {
            char c = body.charAt(start);

            if (c == '>') {
                return null; // tag end found => name not found
            }

            if (c == '/') {
                isEndTag = true; // this is an end tag
            } else if (!Character.isWhitespace(c)) {
                break; // first character of the name
            }

            start++;
        }

        if (start == len) {
            return null; // tag name not found
        }

        int end = start;

        // consume the name; '/' also terminates it so "<br/>" gives "br", not "br/"
        while (end < len) {
            char c = body.charAt(end);

            if (Character.isWhitespace(c) || (c == '>') || (c == '/')) {
                break;
            }

            end++;
        }

        if (end == len) {
            return null; // name runs to the end of the body: tag end not found
        }

        String tagName = body.substring(start, end);

        return isEndTag ? ("/" + tagName) : tagName;
    }

    // ---------------------------------------------------------------- tag attribute

    /**
     * Returns the value of the first found attribute that matches the given name.
     * It is assumed that the given string represents a tag's body.
     * Note: the attribute name <b>must</b> be directly followed by <code>="</code>
     * or <code>='</code>. Attribute name is not case sensitive.
     *
     * @param tagBody  tag body
     * @param attrName attribute name
     *
     * @return attribute value or <code>null</code> if attribute not found
     */
    public static String getAttribute(String tagBody, String attrName) {
        return getAttribute(tagBody, attrName, 0);
    }

    /**
     * Returns the value of the first found attribute that matches the given name.
     * The given string does not have to be just a tag's body: the tag is taken to
     * begin at <code>start</code> and to end at the first '&gt;' at or after it.
     * <p>
     * Note: the attribute name <b>must</b> be directly followed by <code>="</code>
     * or <code>='</code>. The double-quoted form is looked up first. Attribute name
     * is not case sensitive, and it is matched as a whole name: looking up
     * <code>href</code> does not match <code>data-href</code>. A quote preceded by
     * a backslash is not treated as the closing quote.
     *
     * @param body     html body
     * @param attrName attribute name
     * @param start    index of tag's start
     *
     * @return attribute value, or <code>null</code> if the body or the name is
     *         <code>null</code>, the tag end or the attribute is not found, or the
     *         closing quote is missing
     */
    public static String getAttribute(String body, String attrName, int start) {
        if ((body == null) || (attrName == null)) {
            return null;
        }

        // Search for the tag end from 'start', not from the beginning of the body;
        // otherwise only the very first tag of a larger body could ever be examined.
        int end = body.indexOf('>', start);

        if (end == -1) {
            return null; // tag's end not found
        }

        char quote = '"';
        int i = indexOfAttribute(body, attrName, quote, start, end);

        if (i == -1) {
            quote = '\'';
            i = indexOfAttribute(body, attrName, quote, start, end);

            if (i == -1) {
                return null;
            }
        }

        int valueStart = i + attrName.length() + 2; // skip name, '=' and opening quote
        int searchFrom = valueStart;

        while (true) {
            int j = body.indexOf(quote, searchFrom);

            if (j == -1) {
                return null; // closing quote not found
            }

            // j - 1 is always a valid index: j >= valueStart >= 2
            if (body.charAt(j - 1) == '\\') {
                searchFrom = j + 1; // escaped quote, keep looking

                continue;
            }

            return body.substring(valueStart, j);
        }
    }

    /**
     * Finds the first occurrence of <code>attrName</code> followed by '=' and the
     * given quote, located between <code>start</code> and <code>end</code>, that is
     * not the tail of a longer attribute name.
     *
     * @return index of the attribute name, or -1 if there is no such occurrence
     */
    private static int indexOfAttribute(String body, String attrName, char quote, int start, int end) {
        String pattern = attrName + '=' + quote;
        int i = indexOfIgnoreCase(body, pattern, start);

        while ((i != -1) && (i <= end)) {
            if ((i == 0) || !isAttributeNameChar(body.charAt(i - 1))) {
                return i;
            }

            // matched the tail of a longer name (e.g. "data-href"): try the next one
            i = indexOfIgnoreCase(body, pattern, i + 1);
        }

        return -1;
    }

    /**
     * Tells whether the character is treated as part of an attribute name when
     * checking what precedes a match.
     */
    private static boolean isAttributeNameChar(char c) {
        return Character.isLetterOrDigit(c) || (c == '-') || (c == '_') || (c == ':') || (c == '.');
    }

    // ---------------------------------------------------------------- add attribute & value

    /**
     * Adds attribute and its value to a tag. Attribute is added to the end of
     * the tag, just before closing '&gt;'. If name is not specified, nothing will
     * be added. If value is not specified, it will be set to an empty string.
     *
     * @param tagBody tag body
     * @param name    attribute name
     * @param value   attribute value
     *
     * @return tag string with added attribute and value
     */
    public static String addAttribute(String tagBody, String name, String value) {
        return addAttribute(tagBody, name, value, 0);
    }

    /**
     * Adds attribute and its value to a tag. Attribute is added just before the
     * first '&gt;' found at or after the given offset, as
     * <code> name="value"</code>; the value is passed through
     * <code>HtmlEncoder.encode</code>. If name is <code>null</code>, nothing will be
     * added. If value is <code>null</code>, it will be set to an empty string.
     * <p>
     * The result starts at offset <code>i</code>: body content in front of the
     * offset is not part of the returned string, content after the tag is.
     *
     * @param body   html body
     * @param name   attribute name
     * @param value  attribute value
     * @param i      tag's offset in html body
     *
     * @return tag string with added attribute and value; the unchanged body if
     *         name is <code>null</code> or no '&gt;' is found; <code>null</code> if
     *         body is <code>null</code>
     */
    public static String addAttribute(String body, String name, String value, int i) {
        if (body == null) {
            return null;
        }

        if (name == null) {
            return body;
        }

        int end = body.indexOf('>', i);

        if (end == -1) {
            return body;
        }

        String encoded = HtmlEncoder.encode((value == null) ? "" : value);

        return insertBeforeTagEnd(body, i, end, name + "=\"" + encoded + '"');
    }

    // ---------------------------------------------------------------- add attribute, no value

    /**
     * Adds single attribute without value to a tag. Attribute is added to the
     * end of the tag, just before closing '&gt;'. If name is not specified, nothing
     * will be added.
     *
     * @param tagBody tag body
     * @param name    attribute name
     *
     * @return tag string with added attribute
     */
    public static String addAttribute(String tagBody, String name) {
        return addAttribute(tagBody, name, 0);
    }

    /**
     * Adds single attribute without value to a tag. Attribute is added just
     * before the first '&gt;' found at or after the given offset. If name is
     * <code>null</code>, nothing will be added.
     * <p>
     * The result starts at offset <code>i</code>: body content in front of the
     * offset is not part of the returned string, content after the tag is.
     *
     * @param body   html body
     * @param name   attribute name
     * @param i      tag's offset in html body
     *
     * @return tag string with added attribute; the unchanged body if name is
     *         <code>null</code> or no '&gt;' is found; <code>null</code> if body is
     *         <code>null</code>
     */
    public static String addAttribute(String body, String name, int i) {
        if (body == null) {
            return null;
        }

        if (name == null) {
            return body;
        }

        int end = body.indexOf('>', i);

        if (end == -1) {
            return body;
        }

        return insertBeforeTagEnd(body, i, end, name);
    }

    /**
     * Builds <code>body[i, end) + ' ' + attribute + body[end, length)</code>, i.e.
     * inserts the attribute text in front of the tag's closing character.
     */
    private static String insertBeforeTagEnd(String body, int i, int end, String attribute) {
        StringBuilder result = new StringBuilder(body.length() + attribute.length() + 1);
        result.append(body.substring(i, end)).append(' ');
        result.append(attribute).append(body.substring(end));

        return result.toString();
    }

    // ---------------------------------------------------------------- search

    /**
     * Finds the first index of a substring in the given source string, ignoring
     * case. Both strings are lower-cased character by character with
     * {@link Character#toLowerCase(char)}, so the comparison is symmetric and does
     * not depend on any locale.
     *
     * @param src        source string for examination
     * @param subS       substring to find
     * @param startIndex starting index from where search begins; negative values
     *                   are treated as 0
     *
     * @return index of the found substring, or -1 if the substring is not found
     *         or either string is <code>null</code>
     */
    public static int indexOfIgnoreCase(String src, String subS, int startIndex) {
        if ((src == null) || (subS == null)) {
            return -1;
        }

        // Lower-case the needle once, with the same per-character mapping that is
        // applied to the source below. String.toLowerCase applies context rules
        // (e.g. Greek final sigma) that the per-character mapping does not.
        int sublen = subS.length();
        char[] sub = new char[sublen];

        for (int k = 0; k < sublen; k++) {
            sub[k] = Character.toLowerCase(subS.charAt(k));
        }

        int total = src.length() - sublen + 1;

        for (int i = Math.max(startIndex, 0); i < total; i++) {
            int j = 0;

            while ((j < sublen) && (sub[j] == Character.toLowerCase(src.charAt(i + j)))) {
                j++;
            }

            if (j == sublen) {
                return i;
            }
        }

        return -1;
    }
}

/**
 * Encodes plain text into HTML-safe strings using lookup tables for the
 * characters below 256 and decimal character references for everything else.
 */
class HtmlEncoder {
    /** Factor applied to the input length to presize the output buffer. */
    public static final float NEW_SIZE_FACTOR = 1.3f;

    /**
     * Lookup table for use in encode() method.
     *
     * @see #encode
     */
    private static final String[] TABLE_HTML = new String[256];

    /**
     * Lookup table for use in encodeTextXxx() methods.
     *
     * @see #encodeText
     * @see #encodeTextSmart
     * @see #encodeTextStrict
     */
    private static final String[] TABLE_HTML_STRICT = new String[256];

    static {
        // control characters: zero-padded, three-digit decimal references
        for (int i = 0; i < 10; i++) {
            TABLE_HTML[i] = "&#00" + i + ";";
        }

        for (int i = 10; i < 32; i++) {
            TABLE_HTML[i] = "&#0" + i + ";";
        }

        // printable ASCII is kept as is
        for (int i = 32; i < 128; i++) {
            TABLE_HTML[i] = String.valueOf((char) i);
        }

        // 128..255: decimal references
        for (int i = 128; i < 256; i++) {
            TABLE_HTML[i] = "&#" + i + ";";
        }

        // special characters
        TABLE_HTML['\''] = "&#039;"; // apostrophe ('&apos;' doesn't work - it is not by the w3 specs).
        TABLE_HTML['\"'] = "&quot;"; // double quote.
        TABLE_HTML['&'] = "&amp;"; // ampersand.
        TABLE_HTML['<'] = "&lt;"; // lower than.
        TABLE_HTML['>'] = "&gt;"; // greater than.

        // strict table: same as above, plus non-breaking spaces and line breaks
        System.arraycopy(TABLE_HTML, 0, TABLE_HTML_STRICT, 0, 256);
        TABLE_HTML_STRICT[' '] = "&nbsp;"; // space.
        TABLE_HTML_STRICT['\n'] = "<br>"; // ascii 10.
        TABLE_HTML_STRICT['\r'] = "<br>"; // ascii 13.
    }

    // ---------------------------------------------------------------- encoding

    /**
     * Encodes a string to HTML-safe text. Characters outside printable ASCII are
     * encoded as decimal character references, and five special characters are
     * replaced with their HTML values:
     * <ul>
     * <li>' with &amp;#039;</li>
     * <li>" with &amp;quot;</li>
     * <li>&amp; with &amp;amp;</li>
     * <li>&lt; with &amp;lt;</li>
     * <li>&gt; with &amp;gt;</li>
     * </ul>
     * A surrogate pair is encoded as one reference to its code point.
     *
     * @param string input string
     *
     * @return HTML-safe string; empty string for <code>null</code> or empty input
     * @see #encodeText
     */
    public static String encode(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));

        for (int i = 0; i < n; i++) {
            i += appendEncoded(buffer, string, i, TABLE_HTML);
        }

        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format. Additionally, the
     * following characters are replaced:
     * <ul>
     * <li>' ' with &amp;nbsp;</li>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF sequence produces a single &lt;br&gt;.
     * <p>
     * Common problem with this method is that spaces are not breakable, so they
     * may break the outline of the page.
     *
     * @param string input string
     *
     * @return HTML-safe format; empty string for <code>null</code> or empty input
     */
    public static String encodeTextStrict(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        char c = 0;
        char prev = 0;

        for (int i = 0; i < n; i++, prev = c) {
            c = string.charAt(i);

            if ((c == '\n') && (prev == '\r')) {
                continue; // previously '\r' (CR) was encoded, so skip '\n' (LF)
            }

            i += appendEncoded(buffer, string, i, TABLE_HTML_STRICT);
        }

        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format except spaces, which
     * are copied unchanged. Additionally, the following characters are replaced:
     * <ul>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF sequence produces a single &lt;br&gt;.
     *
     * @param string input string
     *
     * @return HTML-safe format; empty string for <code>null</code> or empty input
     */
    public static String encodeText(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        char c = 0;
        char prev = 0;

        for (int i = 0; i < n; i++, prev = c) {
            c = string.charAt(i);

            if (c == ' ') {
                buffer.append(' '); // the strict table would turn it into &nbsp;

                continue;
            }

            if ((c == '\n') && (prev == '\r')) {
                continue; // previously '\r' (CR) was encoded, so skip '\n' (LF)
            }

            i += appendEncoded(buffer, string, i, TABLE_HTML_STRICT);
        }

        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format using smart spaces.
     * Additionally, the following characters are replaced:
     * <ul>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF sequence produces a single &lt;br&gt;.
     * <p>
     * Within a run of spaces the output alternates between a common space and
     * &amp;nbsp;, starting with a common space, so the width of the run is kept
     * while line breaks stay possible.
     *
     * @param string input string
     *
     * @return HTML-safe format; empty string for <code>null</code> or empty input
     */
    public static String encodeTextSmart(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        char c = 0;
        char prev = 0;
        boolean prevSpace = false; // true when the next space of the run must be &nbsp;

        for (int i = 0; i < n; i++, prev = c) {
            c = string.charAt(i);

            if (c == ' ') {
                if (prev != ' ') {
                    prevSpace = false; // a new run of spaces starts here
                }

                buffer.append(prevSpace ? "&nbsp;" : " ");
                prevSpace = !prevSpace;

                continue;
            }

            if ((c == '\n') && (prev == '\r')) {
                continue; // previously '\r' (CR) was encoded, so skip '\n' (LF)
            }

            i += appendEncoded(buffer, string, i, TABLE_HTML_STRICT);
        }

        return buffer.toString();
    }

    /**
     * Appends the encoded form of the character at the given index. Characters
     * covered by the table are looked up; all others are written as a decimal
     * reference to their code point, so that a surrogate pair becomes one valid
     * reference instead of two references to the individual surrogates.
     *
     * @return number of additional chars consumed: 1 if a surrogate pair was
     *         encoded, 0 otherwise
     */
    private static int appendEncoded(StringBuilder buffer, String string, int index, String[] table) {
        char c = string.charAt(index);

        if (c < table.length) {
            buffer.append(table[c]);

            return 0;
        }

        // For an unpaired surrogate codePointAt returns the surrogate itself.
        int codePoint = string.codePointAt(index);
        buffer.append("&#").append(codePoint).append(';');

        return Character.isSupplementaryCodePoint(codePoint) ? 1 : 0;
    }

}