// HTML utilities for working with tag names and attributes.
import java.util.Locale;
/**
 * HTML utilities for reading tag names and for reading and adding tag
 * attributes.
 * http://jodd.sourceforge.net
 *
 * <p>All methods are static and operate on plain strings by scanning for
 * {@code <}, {@code >} and quote characters; no real HTML parsing is done.
 *
 * @author najgor@users.sourceforge.net
 * @author Lingo
 * @since 2007-03-17
 * @version 1.0
 */
public final class HtmlUtil {

    // ---------------------------------------------------------------- tag name

    /**
     * Returns the name of a tag. The given string is the text of a tag and
     * therefore <b>must</b> start with {@code <}.
     *
     * @param tagBody tag's text
     *
     * @return tag's name, or <code>null</code> if no tag name is found
     */
    public static String getTagName(String tagBody) {
        return getTagName(tagBody, 0);
    }

    /**
     * Returns the name of the tag that starts at the given index. The index
     * <b>must</b> point at the tag's opening {@code <}.
     * <p>
     * Names of ending tags always start with the '/' character. A trailing
     * '/' of a self-closing tag (e.g. {@code <br/>}) is not part of the name.
     *
     * @param body html body
     * @param i index of tag's start
     *
     * @return tag's name, or <code>null</code> if the body is
     *         <code>null</code>, the index is out of range, the character at
     *         the index is not {@code <}, or no name is found before the
     *         tag or the string ends
     */
    public static String getTagName(String body, int i) {
        if (body == null || i < 0 || i >= body.length()) {
            return null; // nothing to inspect (previously threw for bad indexes)
        }
        if (body.charAt(i) != '<') {
            return null; // no tag
        }
        int len = body.length();
        int start = i + 1; // skip '<'
        boolean isEndTag = false;

        // skip whitespace and end-tag slashes that precede the name
        while (start < len) {
            char c = body.charAt(start);
            if (c == '>') {
                return null; // tag end found => name not found
            }
            if (c == '/') {
                isEndTag = true;
            } else if (!Character.isWhitespace(c)) {
                break;
            }
            start++;
        }
        if (start == len) {
            return null; // tag name not found
        }

        // the name runs up to whitespace, '>' or the '/' of a self-closing tag
        int end = start;
        while (end < len) {
            char c = body.charAt(end);
            if (Character.isWhitespace(c) || c == '>' || c == '/') {
                break;
            }
            end++;
        }
        if (end == len) {
            return null; // tag end not found
        }

        String tagName = body.substring(start, end);
        return isEndTag ? "/" + tagName : tagName;
    }

    // ---------------------------------------------------------------- tag attribute

    /**
     * Returns the value of the first attribute that matches the given name.
     * It is assumed that the given string is the text of a tag.
     * Note: the attribute name <b>must</b> be followed directly by
     * <code>="</code> or <code>='</code>.
     * Attribute name is not case sensitive.
     *
     * @param tagBody tag's text
     * @param attrName attribute name
     *
     * @return attribute value or <code>null</code> if attribute not found
     */
    public static String getAttribute(String tagBody, String attrName) {
        return getAttribute(tagBody, attrName, 0);
    }

    /**
     * Returns the value of the first attribute that matches the given name
     * inside the tag that starts at {@code start}. The tag is taken to end at
     * the first {@code >} found at or after {@code start}, so a {@code >}
     * inside an earlier attribute value cuts the search short.
     * Note: the attribute name <b>must</b> be followed directly by
     * <code>="</code> or <code>='</code>.
     * Attribute name is not case sensitive. A match that is merely the tail
     * of a longer attribute name (e.g. {@code name} inside {@code data-name})
     * is ignored. A quote preceded by a backslash does not close the value.
     *
     * @param body html body
     * @param attrName attribute name
     * @param start index of tag's start
     *
     * @return attribute value or <code>null</code> if the body or name is
     *         <code>null</code>, the tag has no closing {@code >}, the
     *         attribute is not found, or its closing quote is missing
     */
    public static String getAttribute(String body, String attrName,
            int start) {
        if (body == null || attrName == null) {
            return null;
        }
        // search for the end of THIS tag, not of the first tag in the body
        int end = body.indexOf('>', start);
        if (end == -1) {
            return null; // tag's end not found
        }

        int doubleQuoted = findAttributeValue(body, attrName + "=\"", start, end);
        int singleQuoted = findAttributeValue(body, attrName + "='", start, end);

        // pick whichever form occurs first in the tag
        int valueStart;
        char quote;
        if (doubleQuoted != -1
                && (singleQuoted == -1 || doubleQuoted < singleQuoted)) {
            valueStart = doubleQuoted;
            quote = '"';
        } else if (singleQuoted != -1) {
            valueStart = singleQuoted;
            quote = '\'';
        } else {
            return null;
        }

        int from = valueStart;
        while (true) {
            int close = body.indexOf(quote, from);
            if (close == -1) {
                return null; // closing quote not found
            }
            // close - 1 is never negative: valueStart follows at least ="
            if (body.charAt(close - 1) != '\\') {
                return body.substring(valueStart, close);
            }
            from = close + 1; // escaped quote, keep looking
        }
    }

    /**
     * Finds the first case-insensitive occurrence of {@code prefix}
     * (attribute name, '=' and opening quote) that starts no later than
     * {@code end} and is not the tail of a longer attribute name.
     *
     * @return index of the first character of the attribute value, or -1
     */
    private static int findAttributeValue(String body, String prefix,
            int start, int end) {
        int from = start;
        while (true) {
            int idx = indexOfIgnoreCase(body, prefix, from);
            if (idx == -1 || idx > end) {
                return -1;
            }
            if (idx == 0 || !isAttributeNameChar(body.charAt(idx - 1))) {
                return idx + prefix.length();
            }
            from = idx + 1; // suffix of another name, try the next occurrence
        }
    }

    /**
     * Tells whether the character can be part of an attribute name for the
     * purpose of rejecting partial-name matches.
     */
    private static boolean isAttributeNameChar(char c) {
        return Character.isLetterOrDigit(c)
                || c == '-' || c == '_' || c == ':' || c == '.';
    }

    // ---------------------------------------------------------------- add attribute & value

    /**
     * Adds an attribute and its value to a tag. The attribute is added to the
     * end of the tag, just before the closing {@code >} (or before the
     * {@code />} of a self-closing tag). If name is not specified, nothing
     * is added. If value is not specified, it is set to an empty string.
     *
     * @param tagBody tag's text
     * @param name attribute name
     * @param value attribute value
     *
     * @return tag string with added attribute and value
     */
    public static String addAttribute(String tagBody, String name,
            String value) {
        return addAttribute(tagBody, name, value, 0);
    }

    /**
     * Adds an attribute and its value to the tag that starts at offset
     * {@code i}. The attribute is added to the end of the tag, just before
     * the closing {@code >} (or before the {@code />} of a self-closing
     * tag). The value is HTML-encoded with {@code HtmlEncoder.encode} and
     * wrapped in double quotes. If name is not specified, nothing is added.
     * If value is not specified, it is set to an empty string.
     * <p>
     * The result starts at offset {@code i}: text before the tag is not
     * included, text after the tag is kept. If no {@code >} is found at or
     * after {@code i}, the whole body is returned unchanged.
     *
     * @param body html body
     * @param name attribute name
     * @param value attribute value
     * @param i tag's offset in html body
     *
     * @return tag string with added attribute and value, or
     *         <code>null</code> if body is <code>null</code>
     */
    public static String addAttribute(String body, String name,
            String value, int i) {
        if (body == null) {
            return null;
        }
        if (name == null) {
            return body;
        }
        String encoded = HtmlEncoder.encode(value == null ? "" : value);
        return insertAttribute(body, i, name + "=\"" + encoded + '"');
    }

    // ---------------------------------------------------------------- add attribute, no value

    /**
     * Adds a single attribute without value to a tag. The attribute is added
     * to the end of the tag, just before the closing {@code >} (or before
     * the {@code />} of a self-closing tag). If name is not specified,
     * nothing is added.
     *
     * @param tagBody tag's text
     * @param name attribute name
     *
     * @return tag string with added attribute
     */
    public static String addAttribute(String tagBody, String name) {
        return addAttribute(tagBody, name, 0);
    }

    /**
     * Adds a single attribute without value to the tag that starts at offset
     * {@code i}. The attribute is added to the end of the tag, just before
     * the closing {@code >} (or before the {@code />} of a self-closing
     * tag). If name is not specified, nothing is added.
     * <p>
     * The result starts at offset {@code i}: text before the tag is not
     * included, text after the tag is kept. If no {@code >} is found at or
     * after {@code i}, the whole body is returned unchanged.
     *
     * @param body html body
     * @param name attribute name
     * @param i tag's offset in html body
     *
     * @return tag string with added attribute, or <code>null</code> if body
     *         is <code>null</code>
     */
    public static String addAttribute(String body, String name, int i) {
        if (body == null) {
            return null;
        }
        if (name == null) {
            return body;
        }
        return insertAttribute(body, i, name);
    }

    /**
     * Inserts the ready-made attribute text, preceded by one space, at the
     * end of the tag starting at {@code i}. Shared by both addAttribute
     * variants.
     */
    private static String insertAttribute(String body, int i,
            String attribute) {
        int tagStart = Math.max(i, 0);
        int end = body.indexOf('>', tagStart);
        if (end == -1) {
            return body;
        }
        int insertAt = end;
        if (endsWithSelfClosingSlash(body, tagStart, end)) {
            // go before the slash and any whitespace in front of it, so that
            // "<br />" becomes "<br attr />" and not "<br / attr>"
            insertAt = end - 1;
            while (insertAt > tagStart + 1
                    && Character.isWhitespace(body.charAt(insertAt - 1))) {
                insertAt--;
            }
        }
        StringBuilder result =
                new StringBuilder(body.length() + attribute.length() + 1);
        result.append(body, tagStart, insertAt).append(' ').append(attribute);
        result.append(body, insertAt, body.length());
        return result.toString();
    }

    /**
     * Tells whether the '/' right before the closing '>' marks a self-closing
     * tag. A slash that directly follows an unquoted attribute value (e.g.
     * {@code <a href=/>}) is treated as part of that value instead.
     */
    private static boolean endsWithSelfClosingSlash(String body, int tagStart,
            int tagEnd) {
        int slash = tagEnd - 1;
        if (slash <= tagStart + 1 || body.charAt(slash) != '/') {
            return false;
        }
        char before = body.charAt(slash - 1);
        if (Character.isWhitespace(before) || before == '"' || before == '\'') {
            return true;
        }
        // slash glued to a word: self-closing only if the tag has no '=' at
        // all, i.e. the word cannot be an unquoted attribute value
        int equals = body.indexOf('=', tagStart);
        return equals == -1 || equals > slash;
    }

    // ---------------------------------------------------------------- search

    /**
     * Finds the first index of a substring in the given source string,
     * ignoring case. Both strings are compared character by character after
     * {@link Character#toLowerCase(char)}, so the comparison does not depend
     * on any locale.
     *
     * @param src source string for examination
     * @param subS substring to find
     * @param startIndex starting index from where search begins; negative
     *        values are treated as 0
     *
     * @return index of the found substring, or -1 if the substring is not
     *         found or either string is <code>null</code>
     */
    public static int indexOfIgnoreCase(String src, String subS,
            int startIndex) {
        if (src == null || subS == null) {
            return -1;
        }
        int sublen = subS.length();
        // lowercase the needle once, with the same function used on src
        char[] needle = new char[sublen];
        for (int k = 0; k < sublen; k++) {
            needle[k] = Character.toLowerCase(subS.charAt(k));
        }
        int total = src.length() - sublen + 1;
        for (int i = Math.max(startIndex, 0); i < total; i++) {
            int j = 0;
            while (j < sublen
                    && needle[j] == Character.toLowerCase(src.charAt(i + j))) {
                j++;
            }
            if (j == sublen) {
                return i;
            }
        }
        return -1;
    }
}
/**
 * Encodes plain text into HTML-safe text using per-character lookup tables
 * for the first 256 characters and decimal character references for
 * everything above.
 */
class HtmlEncoder {

    /** Factor applied to the input length when sizing the output buffer. */
    public static final float NEW_SIZE_FACTOR = 1.3f;

    /** Non-breaking space entity. */
    private static final String NBSP = "&nbsp;";

    /**
     * Lookup table for use in encode() method.
     *
     * @see #encode
     */
    private static final String[] TABLE_HTML = new String[256];

    /**
     * Lookup table for use in encodeTextXxx() methods.
     *
     * @see #encodeText
     * @see #encodeTextSmart
     * @see #encodeTextStrict
     */
    private static final String[] TABLE_HTML_STRICT = new String[256];

    /** How the encodeTextXxx() methods treat the space character. */
    private enum SpaceMode {
        /** every space becomes a non-breaking space */
        NON_BREAKING,
        /** spaces are copied unchanged */
        PLAIN,
        /** runs of spaces alternate between plain and non-breaking */
        SMART
    }

    static {
        // control characters: zero-padded three-digit decimal references
        for (int i = 0; i < 10; i++) {
            TABLE_HTML[i] = "&#00" + i + ";";
        }
        for (int i = 10; i < 32; i++) {
            TABLE_HTML[i] = "&#0" + i + ";";
        }
        // printable ASCII is copied as is
        for (int i = 32; i < 128; i++) {
            TABLE_HTML[i] = String.valueOf((char) i);
        }
        for (int i = 128; i < 256; i++) {
            TABLE_HTML[i] = "&#" + i + ";";
        }
        // special characters
        TABLE_HTML['\''] = "&#039;"; // apostrophe, as a numeric reference
        TABLE_HTML['\"'] = "&quot;"; // double quote
        TABLE_HTML['&'] = "&amp;";   // ampersand
        TABLE_HTML['<'] = "&lt;";    // lower than
        TABLE_HTML['>'] = "&gt;";    // greater than
        // strict table
        System.arraycopy(TABLE_HTML, 0, TABLE_HTML_STRICT, 0, 256);
        TABLE_HTML_STRICT[' '] = NBSP;
        TABLE_HTML_STRICT['\n'] = "<br>"; // ascii 10
        TABLE_HTML_STRICT['\r'] = "<br>"; // ascii 13
    }

    // ---------------------------------------------------------------- encoding

    /**
     * Encodes a string to HTML-safe text. Characters outside printable ASCII
     * are encoded as decimal character references, and five special
     * characters are replaced with their HTML values:
     * <ul>
     * <li>{@code '} with {@code &#039;}</li>
     * <li>{@code "} with {@code &quot;}</li>
     * <li>{@code &} with {@code &amp;}</li>
     * <li>{@code <} with {@code &lt;}</li>
     * <li>{@code >} with {@code &gt;}</li>
     * </ul>
     * A valid surrogate pair is written as one reference to its code point.
     *
     * @param string input string
     *
     * @return HTML-safe string, empty for <code>null</code> or empty input
     * @see #encodeText
     */
    public static String encode(String string) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }
        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        int i = 0;
        while (i < n) {
            i += appendEncoded(buffer, TABLE_HTML, string, i);
        }
        return buffer.toString();
    }

    /**
     * Encodes text into HTML-safe text and preserves format. In addition to
     * what {@link #encode} does, the following characters are replaced:
     * <ul>
     * <li>space with {@code &nbsp;}</li>
     * <li>\n with {@code <br>}</li>
     * <li>\r with {@code <br>}</li>
     * </ul>
     * A CR LF pair produces a single {@code <br>}.
     * <p>
     * Common problem with this method is that spaces are not breakable, so
     * they may break the outline of the page.
     *
     * @param string input string
     *
     * @return HTML-safe format, empty for <code>null</code> or empty input
     */
    public static String encodeTextStrict(String string) {
        return encodeLines(string, SpaceMode.NON_BREAKING);
    }

    /**
     * Encodes text into HTML-safe text and preserves format except spaces,
     * which are copied unchanged. In addition to what {@link #encode} does,
     * the following characters are replaced:
     * <ul>
     * <li>\n with {@code <br>}</li>
     * <li>\r with {@code <br>}</li>
     * </ul>
     * A CR LF pair produces a single {@code <br>}.
     *
     * @param string input string
     *
     * @return HTML-safe format, empty for <code>null</code> or empty input
     */
    public static String encodeText(String string) {
        return encodeLines(string, SpaceMode.PLAIN);
    }

    /**
     * Encodes text into HTML-safe text and preserves format using smart
     * spaces. In addition to what {@link #encode} does, the following
     * characters are replaced:
     * <ul>
     * <li>\n with {@code <br>}</li>
     * <li>\r with {@code <br>}</li>
     * </ul>
     * A CR LF pair produces a single {@code <br>}.
     * <p>
     * Within a run of spaces the first one stays a plain space, the second
     * becomes {@code &nbsp;}, the third is plain again and so on, so the
     * width of the run is kept while line breaks remain possible.
     *
     * @param string input string
     *
     * @return HTML-safe format, empty for <code>null</code> or empty input
     */
    public static String encodeTextSmart(String string) {
        return encodeLines(string, SpaceMode.SMART);
    }

    /**
     * Shared implementation of the encodeTextXxx() methods: encodes with the
     * strict table, collapses CR LF into one line break and treats spaces
     * according to the given mode.
     */
    private static String encodeLines(String string, SpaceMode mode) {
        if ((string == null) || (string.length() == 0)) {
            return "";
        }
        int n = string.length();
        StringBuilder buffer = new StringBuilder((int) (n * NEW_SIZE_FACTOR));
        char prev = 0;
        boolean nextSpaceIsNbsp = false;
        int i = 0;
        while (i < n) {
            char c = string.charAt(i);
            int consumed = 1;
            if ((c == ' ') && (mode != SpaceMode.NON_BREAKING)) {
                if (mode == SpaceMode.SMART) {
                    if (prev != ' ') {
                        nextSpaceIsNbsp = false; // a new run of spaces starts
                    }
                    buffer.append(nextSpaceIsNbsp ? NBSP : " ");
                    nextSpaceIsNbsp = !nextSpaceIsNbsp;
                } else {
                    buffer.append(' ');
                }
            } else if ((c == '\n') && (prev == '\r')) {
                // the preceding CR already produced the line break: skip LF
            } else {
                consumed = appendEncoded(buffer, TABLE_HTML_STRICT, string, i);
            }
            prev = c;
            i += consumed;
        }
        return buffer.toString();
    }

    /**
     * Appends the encoded form of the character at {@code index}: the table
     * entry for characters covered by the table, otherwise a decimal
     * reference to the code point at that index. Surrogates are never emitted
     * as two separate references, which would not be valid HTML.
     *
     * @return number of chars of {@code text} that were consumed (1 or 2)
     */
    private static int appendEncoded(StringBuilder out, String[] table,
            String text, int index) {
        char c = text.charAt(index);
        if (c < table.length) {
            out.append(table[c]);
            return 1;
        }
        // codePointAt yields the char itself for an unpaired surrogate
        int codePoint = text.codePointAt(index);
        out.append("&#").append(codePoint).append(';');
        return Character.charCount(codePoint);
    }
}
// Related examples in the same category