Escape HTML : Document HTML « Development Class « Java






Escape HTML

   
/**
 * Berlin Brown
 * Dec 26, 2006
 */
//package org.bresearch.websec.utils.botlist.xml;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;

/**
 * Helpers for escaping text before it is embedded in a URL query string or in HTML.
 *
 * <p>This class is used by botverse. It has no fields; every method depends only on
 * its argument.
 *
 * @author Berlin Brown
 */
public final class EscapeHTML {

  /**
   * URL-encodes a fragment; a synonym for {@code URLEncoder.encode(String, "UTF-8")}.
   *
   * <p>Used to ensure that HTTP query strings are in proper form, by escaping
   * special characters such as spaces.
   *
   * <p>An example use case for this method is a login scheme in which, after successful
   * login, the user is redirected to the "original" target destination. Such a target
   * might be passed around as a request parameter. Such a request parameter
   * will have a URL as its value, as in {@code LoginTarget=Blah.jsp?this=that&blah=boo},
   * and would need to be URL-encoded in order to escape its special characters.
   *
   * <p>It is important to note that if a query string appears in an {@code HREF}
   * attribute, then there are two issues - ensuring the query string is valid HTTP
   * (it is URL-encoded), and ensuring it is valid HTML (ensuring the ampersand is escaped).
   *
   * @param aURLFragment the text to encode; may be {@code null}
   * @return the UTF-8 URL-encoded form of {@code aURLFragment}, or {@code null} if the
   *         argument was {@code null}
   * @throws RuntimeException if the runtime reports that UTF-8 is unsupported
   */
  public String escapeURL(String aURLFragment) {
    if (aURLFragment == null) {
      return null;
    }
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex) {
      // Keep the original cause so the failure is diagnosable.
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   * Replaces characters having special meaning <em>inside</em> HTML tags
   * with their escaped equivalents, using character entities such as {@code &amp;}.
   *
   * <p>The escaped characters are:
   * <ul>
   * <li>{@code <} becomes {@code &lt;}
   * <li>{@code >} becomes {@code &gt;}
   * <li>{@code "} becomes {@code &quot;}
   * <li>{@code '} becomes {@code &#039;}
   * <li>the backslash becomes {@code &#092;}
   * <li>{@code &} becomes {@code &amp;}
   * </ul>
   *
   * <p>This method ensures that arbitrary text appearing inside a tag does not "confuse"
   * the tag. For example, {@code HREF='Blah.do?Page=1&Sort=ASC'}
   * does not comply with strict HTML because of the ampersand, and should be changed to
   * {@code HREF='Blah.do?Page=1&amp;Sort=ASC'}. This is commonly seen in building
   * query strings. (In JSTL, the c:url tag performs this task automatically.)
   *
   * @param aTagFragment the text to escape; may be {@code null}
   * @return the escaped text, or {@code null} if the argument was {@code null}
   */
  public String escapeHTMLTag(String aTagFragment) {
    if (aTagFragment == null) {
      return null;
    }
    final int length = aTagFragment.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Walk by index rather than with a CharacterIterator sentinel loop: the sentinel
    // CharacterIterator.DONE is the real char U+FFFF, so such a loop would stop early
    // and truncate any input that contains that character.
    for (int index = 0; index < length; index++) {
      final char character = aTagFragment.charAt(index);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '\\':
          result.append("&#092;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          // Not a special character: copy it through unchanged.
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Returns {@code aText} with all start-of-tag and end-of-tag characters
   * ({@code <} and {@code >}) replaced by {@code &lt;} and {@code &gt;}.
   * All other characters, including the ampersand, are left untouched.
   *
   * <p>If user input may contain tags which must be disabled, then call
   * this method, not {@link #escapeHTMLTag}. This method is used for text appearing
   * <em>outside</em> of a tag, while {@link #escapeHTMLTag} is used for text appearing
   * <em>inside</em> an HTML tag.
   *
   * <p>It is not uncommon to see text on a web page presented erroneously, because
   * <em>all</em> special characters are escaped (as in {@link #escapeHTMLTag}), instead of
   * just the start-of-tag and end-of-tag characters. In
   * particular, the ampersand character is often escaped not once but <em>twice</em> :
   * once when the original input occurs, and then a second time when the same item is
   * retrieved from the database. This occurs because the ampersand is the only escaped
   * character which appears in a character entity.
   *
   * @param aText the text to escape; may be {@code null}
   * @return the escaped text, or {@code null} if the argument was {@code null}
   */
  public String escapeDisableTags(String aText) {
    if (aText == null) {
      return null;
    }
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index-based loop for the same reason as in escapeHTMLTag: a loop that stops on
    // CharacterIterator.DONE (U+FFFF) would truncate input containing that character.
    for (int index = 0; index < length; index++) {
      final char character = aText.charAt(index);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        // Not a tag delimiter: copy it through unchanged.
        result.append(character);
      }
    }
    return result.toString();
  }

}

   
    
    
  








Related examples in the same category

1.HTMLDocument: Element Iterator Example
2.HTMLEditorKit Demo
3.SimpleAttributeSet Example
4.Text Tab Sample
5.Styled Document
6.Html utils for working with tag's names and attributes.
7.Escape HTML
8.Encode HTML
9.Replace all the occurrences of HTML escape strings with the respective characters.
10.HTML Rewriter
11.HTML Encode
12.XMLWriter is a generic class that provides common behavior to writers of a tagged language such as XML, WordML and HTML.
13.Escape html entities.
14.Html Encoder
15.Remove Comment