// Html utils for working with tag's names and attributes.


import java.util.Locale;

/**
 * Html utils for working with tag's names and attributes.
 *
 * @author Lingo
 * @since 2007-03-17
 * @version 1.0
 */
public final class HtmlUtil {
    // ---------------------------------------------------------------- tag name

    /**
     * Returns the name of a tag. The given string represents the HTML body of a
     * tag, therefore it <b>must</b> start with {@code <}.
     *
     * @param tagBody tag's body
     * @return tag's name, or <code>null</code> if tag not found
     */
    public static String getTagName(String tagBody) {
        return getTagName(tagBody, 0);
    }

    /**
     * Returns the name of the tag that starts at the given index. The index
     * <b>must</b> point at the tag's opening {@code <}.
     * <p>
     * Names of ending tags always start with the '/' character. The name ends at
     * the first whitespace, {@code >} or '/', so a self-closing tag such as
     * {@code <br/>} yields {@code br}.
     *
     * @param body   html body
     * @param i      index of tag's start
     * @return tag's name, or <code>null</code> if the body is <code>null</code>,
     *         the index is out of range, or no complete tag name is found there
     */
    public static String getTagName(String body, int i) {
        if (body == null || i < 0 || i >= body.length() || body.charAt(i) != '<') {
            return null; // no tag at the given position
        }

        int len = body.length();
        int start = i + 1; // skip '<'
        boolean isEndTag = false;

        // skip whitespace and the end-tag slash in front of the name
        while (start < len) {
            char c = body.charAt(start);

            if (c == '>') {
                return null; // tag end found => name not found
            }

            if (c == '/') {
                isEndTag = true; // this is an end tag
            } else if (!Character.isWhitespace(c)) {
                break; // first character of the name
            }

            start++;
        }

        if (start == len) {
            return null; // tag name not found
        }

        // advance to the first character that cannot belong to the name
        int end = start;

        while (end < len) {
            char c = body.charAt(end);

            if (Character.isWhitespace(c) || c == '>' || c == '/') {
                break;
            }

            end++;
        }

        if (end == len) {
            return null; // tag end not found
        }

        String tagName = body.substring(start, end);

        return isEndTag ? "/" + tagName : tagName;
    }

    // ---------------------------------------------------------------- tag attribute

    /**
     * Returns the value of the first attribute that matches the given name.
     * It is assumed that the given string represents a tag's body.
     * Note: the attribute name <b>must</b> be directly followed by
     * <code>="</code> or <code>='</code>. Attribute name is not case sensitive.
     *
     * @param tagBody  tag body
     * @param attrName attribute name
     * @return attribute value or <code>null</code> if attribute not found
     */
    public static String getAttribute(String tagBody, String attrName) {
        return getAttribute(tagBody, attrName, 0);
    }

    /**
     * Returns the value of the first attribute that matches the given name,
     * looking only inside the tag that starts at <code>start</code>.
     * <p>
     * The tag is considered to end at the first {@code >} at or after
     * <code>start</code>; a {@code >} inside a quoted attribute value therefore
     * cuts the search short. The double-quoted form is looked up first, then the
     * single-quoted one. A closing quote preceded by a backslash is skipped.
     * Attribute name is not case sensitive, and a match that is only the tail of
     * a longer attribute name (e.g. <code>id</code> inside <code>xid</code>) is
     * ignored.
     *
     * @param body     html body
     * @param attrName attribute name
     * @param start    index of tag's start
     * @return attribute value or <code>null</code> if the body or name is
     *         missing, the tag has no end, or the attribute is not found
     */
    public static String getAttribute(String body, String attrName,
        int start) {
        if (body == null || attrName == null || attrName.length() == 0) {
            return null;
        }

        // search for the end of THIS tag, not of the first tag in the body
        int end = body.indexOf('>', start);

        if (end == -1) {
            return null; // tag's end not found
        }

        char quote = '\"';
        int i = findAttribute(body, attrName, quote, start, end);

        if (i == -1) {
            quote = '\'';
            i = findAttribute(body, attrName, quote, start, end);

            if (i == -1) {
                return null;
            }
        }

        int valueStart = i + attrName.length() + 2; // skip name, '=' and the opening quote
        int searchFrom = valueStart;

        while (true) {
            int close = body.indexOf(quote, searchFrom);

            if (close == -1) {
                return null; // closing quote not found
            }

            if (body.charAt(close - 1) != '\\') {
                return body.substring(valueStart, close);
            }

            searchFrom = close + 1; // escaped quote, keep looking
        }
    }

    /**
     * Finds the index of <code>attrName=quote</code> between <code>from</code>
     * and <code>tagEnd</code>, skipping matches that are the tail of a longer
     * attribute name. Returns -1 when there is no such match.
     */
    private static int findAttribute(String body, String attrName, char quote,
        int from, int tagEnd) {
        String needle = attrName + '=' + quote;
        int pos = indexOfIgnoreCase(body, needle, from);

        while (pos != -1 && pos <= tagEnd) {
            if (pos == 0 || !isAttributeNameChar(body.charAt(pos - 1))) {
                return pos;
            }

            pos = indexOfIgnoreCase(body, needle, pos + 1);
        }

        return -1;
    }

    /** Tells whether the character may be part of an attribute name. */
    private static boolean isAttributeNameChar(char c) {
        return Character.isLetterOrDigit(c) || c == '-' || c == '_' || c == ':' || c == '.';
    }

    // ---------------------------------------------------------------- add attribute & value

    /**
     * Adds an attribute and its value to a tag. The attribute is added to the end
     * of the tag, just before the closing {@code >}. If name is not specified,
     * nothing will be added. If value is not specified, it will be set to an
     * empty string.
     *
     * @param tagBody tag body
     * @param name    attribute name
     * @param value   attribute value
     * @return tag string with added attribute and value
     */
    public static String addAttribute(String tagBody, String name,
        String value) {
        return addAttribute(tagBody, name, value, 0);
    }

    /**
     * Adds an attribute and its value to the tag that starts at offset
     * <code>i</code>. The attribute is written as <code>name="value"</code> just
     * before the tag's closing {@code >}. The value is escaped
     * (<code>&amp; &lt; &gt; " '</code>) so that it cannot terminate the
     * attribute or the tag.
     * <p>
     * Only the tag itself is returned: the text from <code>i</code> up to and
     * including the closing {@code >}. If name is <code>null</code> or the body
     * contains no {@code >} at or after <code>i</code>, the body is returned
     * unchanged.
     *
     * @param body   html body
     * @param name   attribute name
     * @param value  attribute value, <code>null</code> is treated as an empty string
     * @param i      tag's offset in html body
     * @return tag string with added attribute and value, or <code>null</code>
     *         if the body is <code>null</code>
     */
    public static String addAttribute(String body, String name,
        String value, int i) {
        if (body == null) {
            return null;
        }

        if (name == null) {
            return body;
        }

        int end = body.indexOf('>', i);

        if (end == -1) {
            return body;
        }

        String safeValue = (value == null) ? "" : escapeAttributeValue(value);
        StringBuilder result = new StringBuilder(body.length() + name.length() + safeValue.length() + 4);
        result.append(body.substring(i, end)).append(' ');
        result.append(name).append('=').append('"').append(safeValue).append('"');
        result.append('>');

        return result.toString();
    }

    /**
     * Escapes the characters that are significant inside a quoted attribute
     * value. Kept local so this class does not depend on any other class.
     */
    private static String escapeAttributeValue(String value) {
        StringBuilder out = new StringBuilder(value.length() + 16);

        for (int k = 0; k < value.length(); k++) {
            char c = value.charAt(k);

            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#039;");
                    break;
                default:
                    out.append(c);
                    break;
            }
        }

        return out.toString();
    }

    // ---------------------------------------------------------------- add attribute, no value

    /**
     * Adds a single attribute without value to a tag. The attribute is added to
     * the end of the tag, just before the closing {@code >}. If name is not
     * specified, nothing will be added.
     *
     * @param tagBody tag body
     * @param name    attribute name
     * @return tag string with added attribute
     */
    public static String addAttribute(String tagBody, String name) {
        return addAttribute(tagBody, name, 0);
    }

    /**
     * Adds a single attribute without value to the tag that starts at offset
     * <code>i</code>. Only the tag itself is returned: the text from
     * <code>i</code> up to and including the closing {@code >}. If name is
     * <code>null</code> or the body contains no {@code >} at or after
     * <code>i</code>, the body is returned unchanged.
     *
     * @param body   html body
     * @param name   attribute name
     * @param i      tag's offset in html body
     * @return tag string with added attribute, or <code>null</code> if the body
     *         is <code>null</code>
     */
    public static String addAttribute(String body, String name, int i) {
        if (body == null) {
            return null;
        }

        if (name == null) {
            return body;
        }

        int end = body.indexOf('>', i);

        if (end == -1) {
            return body;
        }

        StringBuilder result = new StringBuilder(body.length() + name.length() + 2);
        result.append(body.substring(i, end)).append(' ');
        result.append(name).append('>');

        return result.toString();
    }

    // ---------------------------------------------------------------- search

    /**
     * Finds the first index of a substring in the given source string, ignoring
     * case. Both sides are compared character by character with
     * {@link Character#toLowerCase(char)}, so the result does not depend on any
     * locale. A negative start index is treated as 0.
     *
     * @param src        source string for examination
     * @param subS       substring to find
     * @param startIndex starting index from where search begins
     * @return index of the found substring or -1 if substring is not found
     * @throws NullPointerException if <code>src</code> or <code>subS</code> is <code>null</code>
     */
    public static int indexOfIgnoreCase(String src, String subS,
        int startIndex) {
        int sublen = subS.length();
        int total = src.length() - sublen + 1;

        for (int i = Math.max(startIndex, 0); i < total; i++) {
            int j = 0;

            while (j < sublen
                    && Character.toLowerCase(src.charAt(i + j)) == Character.toLowerCase(subS.charAt(j))) {
                j++;
            }

            if (j == sublen) {
                return i;
            }
        }

        return -1;
    }
}
/**
 * Encodes plain text into HTML-safe text using lookup tables for the first 256
 * characters and decimal character references for everything above.
 */
class HtmlEncoder {
    /** Growth factor used to pre-size the output buffer relative to the input length. */
    public static final float NEW_SIZE_FACTOR = 1.3f;

    /**
     * Lookup table for use in encode() method.
     * @see #encode
     */
    private static final String[] TABLE_HTML = new String[256];

    /**
     * Lookup table for use in encodeTextXxx() methods.
     * @see #encodeText
     * @see #encodeTextSmart
     * @see #encodeTextStrict
     */
    private static final String[] TABLE_HTML_STRICT = new String[256];

    static {
        // control characters: zero-padded, three-digit decimal references
        for (int i = 0; i < 10; i++) {
            TABLE_HTML[i] = "&#00" + i + ";";
        }

        for (int i = 10; i < 32; i++) {
            TABLE_HTML[i] = "&#0" + i + ";";
        }

        // printable ASCII is passed through unchanged
        for (int i = 32; i < 128; i++) {
            TABLE_HTML[i] = String.valueOf((char) i);
        }

        // upper half: decimal references
        for (int i = 128; i < 256; i++) {
            TABLE_HTML[i] = "&#" + i + ";";
        }

        // special characters
        TABLE_HTML['\''] = "&#039;"; // apostrophe ('&apos;' doesn't work - it is not by the w3 specs).
        TABLE_HTML['\"'] = "&quot;"; // double quote.
        TABLE_HTML['&'] = "&amp;"; // ampersand.
        TABLE_HTML['<'] = "&lt;"; // lower than.
        TABLE_HTML['>'] = "&gt;"; // greater than.

        // strict table
        System.arraycopy(TABLE_HTML, 0, TABLE_HTML_STRICT, 0, 256);
        TABLE_HTML_STRICT[' '] = "&nbsp;"; // space becomes a non-breaking space.
        TABLE_HTML_STRICT['\n'] = "<br>"; // ascii 10.
        TABLE_HTML_STRICT['\r'] = "<br>"; // ascii 13.
    }

    // ---------------------------------------------------------------- encoding

    /**
     * Encodes a string to HTML-safe text. Characters outside printable ASCII are
     * encoded as decimal references, and five special characters are replaced
     * with their HTML values:
     * <ul>
     * <li>' with &amp;#039;</li>
     * <li>" with &amp;quot;</li>
     * <li>&amp; with &amp;amp;</li>
     * <li>&lt; with &amp;lt;</li>
     * <li>&gt; with &amp;gt;</li>
     * </ul>
     * A surrogate pair is written as a single reference to its code point.
     *
     * @param string input string
     * @return HTML-safe string, empty for <code>null</code> or empty input
     * @see #encodeText
     */
    public static String encode(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int i = 0;

        while (i < n) {
            int codePoint = string.codePointAt(i);
            i += Character.charCount(codePoint);
            appendEncoded(buffer, TABLE_HTML, codePoint);
        }

        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format. In addition to what
     * {@link #encode} does, the following characters are replaced:
     * <ul>
     * <li>' ' with &amp;nbsp;</li>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF pair produces a single &lt;br&gt;.
     * Common problem with this method is that spaces are not breakable, so they
     * may break the outline of the page.
     *
     * @param string input string
     * @return HTML-safe format, empty for <code>null</code> or empty input
     */
    public static String encodeTextStrict(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int prev = 0;
        int i = 0;

        while (i < n) {
            int codePoint = string.codePointAt(i);
            i += Character.charCount(codePoint);

            // '\r' (CR) was already encoded as a break, so skip the '\n' (LF) of a CRLF pair
            if (!((codePoint == '\n') && (prev == '\r'))) {
                appendEncoded(buffer, TABLE_HTML_STRICT, codePoint);
            }

            prev = codePoint;
        }

        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format except spaces, which
     * are written unchanged. The following characters are replaced:
     * <ul>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF pair produces a single &lt;br&gt;.
     *
     * @param string input string
     * @return HTML-safe format, empty for <code>null</code> or empty input
     */
    public static String encodeText(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int prev = 0;
        int i = 0;

        while (i < n) {
            int codePoint = string.codePointAt(i);
            i += Character.charCount(codePoint);

            if (codePoint == ' ') {
                buffer.append(' '); // plain space instead of the strict table's &nbsp;
            } else if (!((codePoint == '\n') && (prev == '\r'))) {
                appendEncoded(buffer, TABLE_HTML_STRICT, codePoint);
            }

            prev = codePoint;
        }

        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format using smart spaces.
     * The following characters are replaced:
     * <ul>
     * <li>\n with &lt;br&gt;</li>
     * <li>\r with &lt;br&gt;</li>
     * </ul>
     * A CRLF pair produces a single &lt;br&gt;.<br>
     * Within a run of spaces, plain spaces and &amp;nbsp; alternate, starting
     * with a plain space, so the run keeps its width while line breaks stay
     * available.
     *
     * @param string input string
     * @return HTML-safe format, empty for <code>null</code> or empty input
     */
    public static String encodeTextSmart(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }

        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int prev = 0;
        boolean prevSpace = false;
        int i = 0;

        while (i < n) {
            int codePoint = string.codePointAt(i);
            i += Character.charCount(codePoint);

            if (codePoint == ' ') {
                if (prev != ' ') {
                    prevSpace = false; // a new run of spaces starts with a plain space
                }

                buffer.append(prevSpace ? "&nbsp;" : " ");
                prevSpace = !prevSpace;
            } else if (!((codePoint == '\n') && (prev == '\r'))) {
                appendEncoded(buffer, TABLE_HTML_STRICT, codePoint);
            }

            prev = codePoint;
        }

        return buffer.toString();
    }

    /**
     * Appends the table entry for the code point, or a decimal character
     * reference when the code point lies beyond the table.
     */
    private static void appendEncoded(StringBuilder buffer, String[] table, int codePoint) {
        if (codePoint < table.length) {
            buffer.append(table[codePoint]);
        } else {
            buffer.append("&#").append(codePoint).append(';');
        }
    }
}



