// Java tutorial
import java.util.Locale;

/**
 * Html utils for working with tag's names and attributes.
 * http://jodd.sourceforge.net
 *
 * @author najgor@users.sourceforge.net
 * @author Lingo
 * @since 2007-03-17
 * @version 1.0
 */
public final class HtmlUtil {

    // ---------------------------------------------------------------- tag name

    /**
     * Returns the name of a tag. The given string is the body of a single tag,
     * so it <b>must</b> start with '&lt;'.
     *
     * @param tagBody tag's body
     *
     * @return tag's name, or {@code null} if no tag name is found
     */
    public static String getTagName(String tagBody) {
        return getTagName(tagBody, 0);
    }

    /**
     * Returns the name of the tag that starts at the given index. The character at
     * index {@code i} <b>must</b> be the tag's opening '&lt;'.
     * <p>
     * Names of ending tags always start with the '/' character. The name ends at the
     * first whitespace, '/' or '&gt;' character, so a self-closing tag such as
     * {@code <br/>} yields {@code br}.
     *
     * @param body html body
     * @param i    index of tag's start
     *
     * @return tag's name, or {@code null} if {@code body} is {@code null}, {@code i} is
     *         outside of the body, there is no '&lt;' at {@code i}, or no name is found
     */
    public static String getTagName(String body, int i) {
        if (body == null) {
            return null;
        }
        int len = body.length();
        // Bounds are checked first so that an empty body or a bad index reports
        // "no tag" instead of throwing.
        if ((i < 0) || (i >= len) || (body.charAt(i) != '<')) {
            return null;
        }

        int start = i + 1;      // skip '<'
        boolean isEndTag = false;

        // skip leading whitespace and the '/' of an end tag
        while (start < len) {
            char c = body.charAt(start);
            if (c == '>') {
                return null;    // tag end found => name not found
            }
            if (c == '/') {
                isEndTag = true;
                start++;
                continue;
            }
            if (!Character.isWhitespace(c)) {
                break;
            }
            start++;
        }
        if (start == len) {
            return null;        // tag name not found
        }

        // consume the name itself
        int end = start;
        while (end < len) {
            char c = body.charAt(end);
            if (Character.isWhitespace(c) || (c == '>') || (c == '/')) {
                break;
            }
            end++;
        }
        if (end == len) {
            return null;        // tag end not found
        }

        String tagName = body.substring(start, end);
        return isEndTag ? "/" + tagName : tagName;
    }

    // ---------------------------------------------------------------- tag attribute

    /**
     * Returns the value of the first attribute that matches the given name.
     * The given string is assumed to be the body of a single tag.
     * Note: the attribute name <b>must</b> be followed directly by {@code ="} or {@code ='}.
     * Attribute name is not case sensitive.
     *
     * @param tagBody  tag body
     * @param attrName attribute name
     *
     * @return attribute value or {@code null} if attribute not found
     */
    public static String getAttribute(String tagBody, String attrName) {
        return getAttribute(tagBody, attrName, 0);
    }

    /**
     * Returns the value of the first attribute that matches the given name inside the
     * tag that starts at {@code start}. The body may contain more than the tag itself;
     * the search is limited to the text between {@code start} and the first '&gt;' that
     * follows it.
     * Note: the attribute name <b>must</b> be followed directly by {@code ="} or {@code ='}.
     * Attribute name is not case sensitive. A quote preceded by a backslash is treated
     * as part of the value.
     *
     * @param body     html body
     * @param attrName attribute name
     * @param start    index of tag's start
     *
     * @return attribute value, or {@code null} if {@code body} or {@code attrName} is
     *         {@code null}, the tag has no closing '&gt;', the attribute is not found,
     *         or its closing quote is missing
     */
    public static String getAttribute(String body, String attrName, int start) {
        if ((body == null) || (attrName == null)) {
            return null;
        }
        // The tag end must be searched from 'start'; searching from 0 would pick the
        // end of an earlier tag and hide every attribute of this one.
        int end = body.indexOf('>', start);
        if (end == -1) {
            return null;        // tag's end not found
        }

        char quote = '"';
        int i = indexOfIgnoreCase(body, attrName + "=\"", start);
        if ((i == -1) || (i > end)) {
            quote = '\'';
            i = indexOfIgnoreCase(body, attrName + "='", start);
            if ((i == -1) || (i > end)) {
                return null;
            }
        }

        int valueStart = i + attrName.length() + 2;     // skip name, '=' and opening quote
        int from = valueStart;
        while (true) {
            int j = body.indexOf(quote, from);
            if (j == -1) {
                return null;    // closing quote not found
            }
            if (body.charAt(j - 1) == '\\') {
                from = j + 1;   // escaped quote, keep looking
                continue;
            }
            return body.substring(valueStart, j);
        }
    }

    // ---------------------------------------------------------------- add attribute & value

    /**
     * Adds an attribute and its value to a tag. The attribute is added to the end of
     * the tag, just before the closing '&gt;'. If name is not specified, nothing is
     * added. If value is not specified, it is set to an empty string.
     *
     * @param tagBody tag body
     * @param name    attribute name
     * @param value   attribute value
     *
     * @return tag string with added attribute and value
     */
    public static String addAttribute(String tagBody, String name, String value) {
        return addAttribute(tagBody, name, value, 0);
    }

    /**
     * Adds an attribute and its value to the tag that starts at offset {@code i}. The
     * attribute is inserted just before the first '&gt;' found at or after {@code i}.
     * The value is HTML-encoded with {@link HtmlEncoder#encode(String)} and wrapped in
     * double quotes. If name is not specified, nothing is added. If value is not
     * specified, it is set to an empty string.
     * <p>
     * The returned string starts at offset {@code i}: text of the body that precedes
     * {@code i} is not part of the result, while everything after the closing '&gt;'
     * is kept.
     *
     * @param body  html body
     * @param name  attribute name
     * @param value attribute value
     * @param i     tag's offset in html body
     *
     * @return tag string with added attribute and value; the unchanged body when
     *         {@code name} is {@code null} or no '&gt;' is found; {@code null} when
     *         {@code body} is {@code null}
     */
    public static String addAttribute(String body, String name, String value, int i) {
        if (body == null) {
            return null;
        }
        if (name == null) {
            return body;
        }
        int end = body.indexOf('>', i);
        if (end == -1) {
            return body;
        }
        String encoded = HtmlEncoder.encode(value);     // null is encoded as ""

        StringBuilder result = new StringBuilder(body.length() + name.length() + encoded.length() + 4);
        result.append(body.substring(i, end)).append(' ');
        result.append(name).append('=').append('"');
        result.append(encoded).append('"');
        result.append(body.substring(end));
        return result.toString();
    }

    // ---------------------------------------------------------------- add attribute, no value

    /**
     * Adds a single attribute without value to a tag. The attribute is added to the
     * end of the tag, just before the closing '&gt;'. If name is not specified, nothing
     * is added.
     *
     * @param tagBody tag body
     * @param name    attribute name
     *
     * @return tag string with added attribute
     */
    public static String addAttribute(String tagBody, String name) {
        return addAttribute(tagBody, name, 0);
    }

    /**
     * Adds a single attribute without value to the tag that starts at offset {@code i}.
     * The attribute is inserted just before the first '&gt;' found at or after
     * {@code i}. If name is not specified, nothing is added.
     * <p>
     * The returned string starts at offset {@code i}: text of the body that precedes
     * {@code i} is not part of the result, while everything after the closing '&gt;'
     * is kept.
     *
     * @param body html body
     * @param name attribute name
     * @param i    tag's offset in html body
     *
     * @return tag string with added attribute; the unchanged body when {@code name} is
     *         {@code null} or no '&gt;' is found; {@code null} when {@code body} is
     *         {@code null}
     */
    public static String addAttribute(String body, String name, int i) {
        if (body == null) {
            return null;
        }
        if (name == null) {
            return body;
        }
        int end = body.indexOf('>', i);
        if (end == -1) {
            return body;
        }
        StringBuilder result = new StringBuilder(body.length() + name.length() + 1);
        result.append(body.substring(i, end)).append(' ');
        result.append(name).append(body.substring(end));
        return result.toString();
    }

    /**
     * Finds the first index of a substring in the given source string, ignoring case.
     * Both strings are compared character by character after
     * {@link Character#toLowerCase(char)}, so the comparison does not depend on any
     * locale and the two sides are always folded the same way.
     *
     * @param src        source string for examination
     * @param subS       substring to find
     * @param startIndex starting index from where search begins; negative values are
     *                   treated as 0
     *
     * @return index of the found substring or -1 if substring is not found
     */
    public static int indexOfIgnoreCase(String src, String subS, int startIndex) {
        int sublen = subS.length();
        // Lowercase the needle once, per char, exactly like the haystack is folded below.
        char[] needle = new char[sublen];
        for (int k = 0; k < sublen; k++) {
            needle[k] = Character.toLowerCase(subS.charAt(k));
        }

        int total = src.length() - sublen + 1;
        for (int i = Math.max(startIndex, 0); i < total; i++) {
            int j = 0;
            while ((j < sublen) && (needle[j] == Character.toLowerCase(src.charAt(i + j)))) {
                j++;
            }
            if (j == sublen) {
                return i;
            }
        }
        return -1;
    }
}

/**
 * Encodes plain text into HTML-safe text using lookup tables for the first 256
 * characters and decimal character references for everything else.
 */
class HtmlEncoder {

    /** Growth factor used to presize the output buffer relative to the input length. */
    public static final float NEW_SIZE_FACTOR = 1.3f;

    /**
     * Lookup table for use in encode() method.
     *
     * @see #encode
     */
    private static final String[] TABLE_HTML = new String[256];

    /**
     * Lookup table for use in encodeTextXxx() methods.
     *
     * @see #encodeText
     * @see #encodeTextSmart
     * @see #encodeTextStrict
     */
    private static final String[] TABLE_HTML_STRICT = new String[256];

    static {
        // control characters: decimal references zero-padded to three digits
        for (int i = 0; i < 10; i++) {
            TABLE_HTML[i] = "&#00" + i + ";";
        }
        for (int i = 10; i < 32; i++) {
            TABLE_HTML[i] = "&#0" + i + ";";
        }
        // printable ASCII is passed through unchanged
        for (int i = 32; i < 128; i++) {
            TABLE_HTML[i] = String.valueOf((char) i);
        }
        for (int i = 128; i < 256; i++) {
            TABLE_HTML[i] = "&#" + i + ";";
        }

        // special characters
        TABLE_HTML['\''] = "&#039;";    // apostrophe (a named entity is not used for it)
        TABLE_HTML['"'] = "&quot;";     // double quote
        TABLE_HTML['&'] = "&amp;";      // ampersand
        TABLE_HTML['<'] = "&lt;";       // lower than
        TABLE_HTML['>'] = "&gt;";       // greater than

        // strict table: same as above plus format-preserving replacements
        System.arraycopy(TABLE_HTML, 0, TABLE_HTML_STRICT, 0, 256);
        TABLE_HTML_STRICT[' '] = "&nbsp;";  // non-breaking space
        TABLE_HTML_STRICT['\n'] = "<br>";   // ascii 10
        TABLE_HTML_STRICT['\r'] = "<br>";   // ascii 13
    }

    // ---------------------------------------------------------------- encoding

    /**
     * Encodes a string to HTML-safe text. Characters outside of printable ASCII are
     * encoded as decimal character references, and five special characters are
     * replaced with their HTML values:
     * <ul>
     * <li>apostrophe with {@code &#039;}</li>
     * <li>double quote with {@code &quot;}</li>
     * <li>ampersand with {@code &amp;}</li>
     * <li>lower than with {@code &lt;}</li>
     * <li>greater than with {@code &gt;}</li>
     * </ul>
     *
     * @param string input string
     *
     * @return HTML-safe string; an empty string for {@code null} or empty input
     * @see #encodeText
     */
    public static String encode(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }
        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        for (int i = 0; i < n; ) {
            int cp = string.codePointAt(i);
            i += Character.charCount(cp);
            appendEncoded(buffer, cp, TABLE_HTML);
        }
        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves its format. In addition to what
     * {@link #encode(String)} does, the following characters are replaced:
     * <ul>
     * <li>space with {@code &nbsp;}</li>
     * <li>line feed with {@code <br>}</li>
     * <li>carriage return with {@code <br>}</li>
     * </ul>
     * A CR LF pair produces a single {@code <br>}.
     * <p>
     * A common problem with this method is that the spaces are not breakable, so they
     * may break the outline of the page.
     *
     * @param string input string
     *
     * @return HTML-safe format; an empty string for {@code null} or empty input
     */
    public static String encodeTextStrict(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }
        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int prev = 0;
        for (int i = 0; i < n; ) {
            int cp = string.codePointAt(i);
            i += Character.charCount(cp);
            // the CR of a CR LF pair was already encoded, so the LF is skipped
            if (!((cp == '\n') && (prev == '\r'))) {
                appendEncoded(buffer, cp, TABLE_HTML_STRICT);
            }
            prev = cp;
        }
        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves its format except for spaces,
     * which are copied unchanged. In addition to what {@link #encode(String)} does,
     * the following characters are replaced:
     * <ul>
     * <li>line feed with {@code <br>}</li>
     * <li>carriage return with {@code <br>}</li>
     * </ul>
     * A CR LF pair produces a single {@code <br>}.
     *
     * @param string input string
     *
     * @return HTML-safe format; an empty string for {@code null} or empty input
     */
    public static String encodeText(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }
        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int prev = 0;
        for (int i = 0; i < n; ) {
            int cp = string.codePointAt(i);
            i += Character.charCount(cp);
            if (cp == ' ') {
                buffer.append(' ');
            } else if (!((cp == '\n') && (prev == '\r'))) {
                appendEncoded(buffer, cp, TABLE_HTML_STRICT);
            }
            prev = cp;
        }
        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves its format using smart spaces.
     * In addition to what {@link #encode(String)} does, the following characters are
     * replaced:
     * <ul>
     * <li>line feed with {@code <br>}</li>
     * <li>carriage return with {@code <br>}</li>
     * </ul>
     * A CR LF pair produces a single {@code <br>}.
     * <p>
     * Within a run of spaces, plain spaces and {@code &nbsp;} alternate, starting with
     * a plain space, so the width of the run is preserved while line breaks remain
     * possible.
     *
     * @param string input string
     *
     * @return HTML-safe format; an empty string for {@code null} or empty input
     */
    public static String encodeTextSmart(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }
        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int prev = 0;
        boolean prevSpace = false;
        for (int i = 0; i < n; ) {
            int cp = string.codePointAt(i);
            i += Character.charCount(cp);
            if (cp == ' ') {
                if (prev != ' ') {
                    prevSpace = false;      // a new run of spaces starts with a plain space
                }
                buffer.append(prevSpace ? "&nbsp;" : " ");
                prevSpace = !prevSpace;
            } else if (!((cp == '\n') && (prev == '\r'))) {
                appendEncoded(buffer, cp, TABLE_HTML_STRICT);
            }
            prev = cp;
        }
        return buffer.toString();
    }

    /**
     * Appends the encoded form of one code point: the table entry when the code point
     * is inside the table, a decimal character reference otherwise.
     */
    private static void appendEncoded(StringBuilder out, int codePoint, String[] table) {
        if (codePoint < table.length) {
            out.append(table[codePoint]);
        } else {
            out.append("&#").append(codePoint).append(';');
        }
    }
}